Commit 6be7db23181974114af002ebfe875ceaf23f24af

Authored by Eric Lee
1 parent d999f49ca0

Drop PFNs busy printk in an expected path

Showing 1 changed file with 2 additions and 3 deletions

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kasan.h> 27 #include <linux/kasan.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/memremap.h> 45 #include <linux/memremap.h>
46 #include <linux/stop_machine.h> 46 #include <linux/stop_machine.h>
47 #include <linux/sort.h> 47 #include <linux/sort.h>
48 #include <linux/pfn.h> 48 #include <linux/pfn.h>
49 #include <linux/backing-dev.h> 49 #include <linux/backing-dev.h>
50 #include <linux/fault-inject.h> 50 #include <linux/fault-inject.h>
51 #include <linux/page-isolation.h> 51 #include <linux/page-isolation.h>
52 #include <linux/page_ext.h> 52 #include <linux/page_ext.h>
53 #include <linux/debugobjects.h> 53 #include <linux/debugobjects.h>
54 #include <linux/kmemleak.h> 54 #include <linux/kmemleak.h>
55 #include <linux/compaction.h> 55 #include <linux/compaction.h>
56 #include <trace/events/kmem.h> 56 #include <trace/events/kmem.h>
57 #include <trace/events/oom.h> 57 #include <trace/events/oom.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/hugetlb.h> 61 #include <linux/hugetlb.h>
62 #include <linux/sched/rt.h> 62 #include <linux/sched/rt.h>
63 #include <linux/sched/mm.h> 63 #include <linux/sched/mm.h>
64 #include <linux/page_owner.h> 64 #include <linux/page_owner.h>
65 #include <linux/kthread.h> 65 #include <linux/kthread.h>
66 #include <linux/memcontrol.h> 66 #include <linux/memcontrol.h>
67 #include <linux/ftrace.h> 67 #include <linux/ftrace.h>
68 #include <linux/lockdep.h> 68 #include <linux/lockdep.h>
69 #include <linux/nmi.h> 69 #include <linux/nmi.h>
70 70
71 #include <asm/sections.h> 71 #include <asm/sections.h>
72 #include <asm/tlbflush.h> 72 #include <asm/tlbflush.h>
73 #include <asm/div64.h> 73 #include <asm/div64.h>
74 #include "internal.h" 74 #include "internal.h"
75 75
76 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 76 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
77 static DEFINE_MUTEX(pcp_batch_high_lock); 77 static DEFINE_MUTEX(pcp_batch_high_lock);
78 #define MIN_PERCPU_PAGELIST_FRACTION (8) 78 #define MIN_PERCPU_PAGELIST_FRACTION (8)
79 79
80 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 80 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
81 DEFINE_PER_CPU(int, numa_node); 81 DEFINE_PER_CPU(int, numa_node);
82 EXPORT_PER_CPU_SYMBOL(numa_node); 82 EXPORT_PER_CPU_SYMBOL(numa_node);
83 #endif 83 #endif
84 84
85 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 85 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
86 /* 86 /*
87 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 87 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
88 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 88 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
89 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 89 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
90 * defined in <linux/topology.h>. 90 * defined in <linux/topology.h>.
91 */ 91 */
92 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 92 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
93 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 93 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
94 int _node_numa_mem_[MAX_NUMNODES]; 94 int _node_numa_mem_[MAX_NUMNODES];
95 #endif 95 #endif
96 96
97 /* work_structs for global per-cpu drains */ 97 /* work_structs for global per-cpu drains */
98 DEFINE_MUTEX(pcpu_drain_mutex); 98 DEFINE_MUTEX(pcpu_drain_mutex);
99 DEFINE_PER_CPU(struct work_struct, pcpu_drain); 99 DEFINE_PER_CPU(struct work_struct, pcpu_drain);
100 100
101 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 101 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
102 volatile unsigned long latent_entropy __latent_entropy; 102 volatile unsigned long latent_entropy __latent_entropy;
103 EXPORT_SYMBOL(latent_entropy); 103 EXPORT_SYMBOL(latent_entropy);
104 #endif 104 #endif
105 105
106 /* 106 /*
107 * Array of node states. 107 * Array of node states.
108 */ 108 */
109 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 109 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
110 [N_POSSIBLE] = NODE_MASK_ALL, 110 [N_POSSIBLE] = NODE_MASK_ALL,
111 [N_ONLINE] = { { [0] = 1UL } }, 111 [N_ONLINE] = { { [0] = 1UL } },
112 #ifndef CONFIG_NUMA 112 #ifndef CONFIG_NUMA
113 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 113 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
114 #ifdef CONFIG_HIGHMEM 114 #ifdef CONFIG_HIGHMEM
115 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 115 [N_HIGH_MEMORY] = { { [0] = 1UL } },
116 #endif 116 #endif
117 [N_MEMORY] = { { [0] = 1UL } }, 117 [N_MEMORY] = { { [0] = 1UL } },
118 [N_CPU] = { { [0] = 1UL } }, 118 [N_CPU] = { { [0] = 1UL } },
119 #endif /* NUMA */ 119 #endif /* NUMA */
120 }; 120 };
121 EXPORT_SYMBOL(node_states); 121 EXPORT_SYMBOL(node_states);
122 122
123 /* Protect totalram_pages and zone->managed_pages */ 123 /* Protect totalram_pages and zone->managed_pages */
124 static DEFINE_SPINLOCK(managed_page_count_lock); 124 static DEFINE_SPINLOCK(managed_page_count_lock);
125 125
126 unsigned long totalram_pages __read_mostly; 126 unsigned long totalram_pages __read_mostly;
127 unsigned long totalreserve_pages __read_mostly; 127 unsigned long totalreserve_pages __read_mostly;
128 unsigned long totalcma_pages __read_mostly; 128 unsigned long totalcma_pages __read_mostly;
129 129
130 int percpu_pagelist_fraction; 130 int percpu_pagelist_fraction;
131 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 131 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
132 132
133 /* 133 /*
134 * A cached value of the page's pageblock's migratetype, used when the page is 134 * A cached value of the page's pageblock's migratetype, used when the page is
135 * put on a pcplist. Used to avoid the pageblock migratetype lookup when 135 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
136 * freeing from pcplists in most cases, at the cost of possibly becoming stale. 136 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
137 * Also the migratetype set in the page does not necessarily match the pcplist 137 * Also the migratetype set in the page does not necessarily match the pcplist
138 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 138 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
139 * other index - this ensures that it will be put on the correct CMA freelist. 139 * other index - this ensures that it will be put on the correct CMA freelist.
140 */ 140 */
141 static inline int get_pcppage_migratetype(struct page *page) 141 static inline int get_pcppage_migratetype(struct page *page)
142 { 142 {
143 return page->index; 143 return page->index;
144 } 144 }
145 145
146 static inline void set_pcppage_migratetype(struct page *page, int migratetype) 146 static inline void set_pcppage_migratetype(struct page *page, int migratetype)
147 { 147 {
148 page->index = migratetype; 148 page->index = migratetype;
149 } 149 }
150 150
151 #ifdef CONFIG_PM_SLEEP 151 #ifdef CONFIG_PM_SLEEP
152 /* 152 /*
153 * The following functions are used by the suspend/hibernate code to temporarily 153 * The following functions are used by the suspend/hibernate code to temporarily
154 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 154 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
155 * while devices are suspended. To avoid races with the suspend/hibernate code, 155 * while devices are suspended. To avoid races with the suspend/hibernate code,
156 * they should always be called with pm_mutex held (gfp_allowed_mask also should 156 * they should always be called with pm_mutex held (gfp_allowed_mask also should
157 * only be modified with pm_mutex held, unless the suspend/hibernate code is 157 * only be modified with pm_mutex held, unless the suspend/hibernate code is
158 * guaranteed not to run in parallel with that modification). 158 * guaranteed not to run in parallel with that modification).
159 */ 159 */
160 160
161 static gfp_t saved_gfp_mask; 161 static gfp_t saved_gfp_mask;
162 162
163 void pm_restore_gfp_mask(void) 163 void pm_restore_gfp_mask(void)
164 { 164 {
165 WARN_ON(!mutex_is_locked(&pm_mutex)); 165 WARN_ON(!mutex_is_locked(&pm_mutex));
166 if (saved_gfp_mask) { 166 if (saved_gfp_mask) {
167 gfp_allowed_mask = saved_gfp_mask; 167 gfp_allowed_mask = saved_gfp_mask;
168 saved_gfp_mask = 0; 168 saved_gfp_mask = 0;
169 } 169 }
170 } 170 }
171 171
172 void pm_restrict_gfp_mask(void) 172 void pm_restrict_gfp_mask(void)
173 { 173 {
174 WARN_ON(!mutex_is_locked(&pm_mutex)); 174 WARN_ON(!mutex_is_locked(&pm_mutex));
175 WARN_ON(saved_gfp_mask); 175 WARN_ON(saved_gfp_mask);
176 saved_gfp_mask = gfp_allowed_mask; 176 saved_gfp_mask = gfp_allowed_mask;
177 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 177 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
178 } 178 }
179 179
180 bool pm_suspended_storage(void) 180 bool pm_suspended_storage(void)
181 { 181 {
182 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 182 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
183 return false; 183 return false;
184 return true; 184 return true;
185 } 185 }
186 #endif /* CONFIG_PM_SLEEP */ 186 #endif /* CONFIG_PM_SLEEP */
187 187
188 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 188 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
189 unsigned int pageblock_order __read_mostly; 189 unsigned int pageblock_order __read_mostly;
190 #endif 190 #endif
191 191
192 static void __free_pages_ok(struct page *page, unsigned int order); 192 static void __free_pages_ok(struct page *page, unsigned int order);
193 193
194 /* 194 /*
195 * results with 256, 32 in the lowmem_reserve sysctl: 195 * results with 256, 32 in the lowmem_reserve sysctl:
196 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 196 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
197 * 1G machine -> (16M dma, 784M normal, 224M high) 197 * 1G machine -> (16M dma, 784M normal, 224M high)
198 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 198 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
199 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 199 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
200 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 200 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
201 * 201 *
202 * TBD: should special case ZONE_DMA32 machines here - in those we normally 202 * TBD: should special case ZONE_DMA32 machines here - in those we normally
203 * don't need any ZONE_NORMAL reservation 203 * don't need any ZONE_NORMAL reservation
204 */ 204 */
205 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 205 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
206 #ifdef CONFIG_ZONE_DMA 206 #ifdef CONFIG_ZONE_DMA
207 256, 207 256,
208 #endif 208 #endif
209 #ifdef CONFIG_ZONE_DMA32 209 #ifdef CONFIG_ZONE_DMA32
210 256, 210 256,
211 #endif 211 #endif
212 #ifdef CONFIG_HIGHMEM 212 #ifdef CONFIG_HIGHMEM
213 32, 213 32,
214 #endif 214 #endif
215 32, 215 32,
216 }; 216 };
217 217
218 EXPORT_SYMBOL(totalram_pages); 218 EXPORT_SYMBOL(totalram_pages);
219 219
220 static char * const zone_names[MAX_NR_ZONES] = { 220 static char * const zone_names[MAX_NR_ZONES] = {
221 #ifdef CONFIG_ZONE_DMA 221 #ifdef CONFIG_ZONE_DMA
222 "DMA", 222 "DMA",
223 #endif 223 #endif
224 #ifdef CONFIG_ZONE_DMA32 224 #ifdef CONFIG_ZONE_DMA32
225 "DMA32", 225 "DMA32",
226 #endif 226 #endif
227 "Normal", 227 "Normal",
228 #ifdef CONFIG_HIGHMEM 228 #ifdef CONFIG_HIGHMEM
229 "HighMem", 229 "HighMem",
230 #endif 230 #endif
231 "Movable", 231 "Movable",
232 #ifdef CONFIG_ZONE_DEVICE 232 #ifdef CONFIG_ZONE_DEVICE
233 "Device", 233 "Device",
234 #endif 234 #endif
235 }; 235 };
236 236
237 char * const migratetype_names[MIGRATE_TYPES] = { 237 char * const migratetype_names[MIGRATE_TYPES] = {
238 "Unmovable", 238 "Unmovable",
239 "Movable", 239 "Movable",
240 "Reclaimable", 240 "Reclaimable",
241 "HighAtomic", 241 "HighAtomic",
242 #ifdef CONFIG_CMA 242 #ifdef CONFIG_CMA
243 "CMA", 243 "CMA",
244 #endif 244 #endif
245 #ifdef CONFIG_MEMORY_ISOLATION 245 #ifdef CONFIG_MEMORY_ISOLATION
246 "Isolate", 246 "Isolate",
247 #endif 247 #endif
248 }; 248 };
249 249
250 compound_page_dtor * const compound_page_dtors[] = { 250 compound_page_dtor * const compound_page_dtors[] = {
251 NULL, 251 NULL,
252 free_compound_page, 252 free_compound_page,
253 #ifdef CONFIG_HUGETLB_PAGE 253 #ifdef CONFIG_HUGETLB_PAGE
254 free_huge_page, 254 free_huge_page,
255 #endif 255 #endif
256 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 256 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
257 free_transhuge_page, 257 free_transhuge_page,
258 #endif 258 #endif
259 }; 259 };
260 260
261 /* 261 /*
262 * Try to keep at least this much lowmem free. Do not allow normal 262 * Try to keep at least this much lowmem free. Do not allow normal
263 * allocations below this point, only high priority ones. Automatically 263 * allocations below this point, only high priority ones. Automatically
264 * tuned according to the amount of memory in the system. 264 * tuned according to the amount of memory in the system.
265 */ 265 */
266 int min_free_kbytes = 1024; 266 int min_free_kbytes = 1024;
267 int user_min_free_kbytes = -1; 267 int user_min_free_kbytes = -1;
268 int watermark_scale_factor = 10; 268 int watermark_scale_factor = 10;
269 269
270 /* 270 /*
271 * Extra memory for the system to try freeing. Used to temporarily 271 * Extra memory for the system to try freeing. Used to temporarily
272 * free memory, to make space for new workloads. Anyone can allocate 272 * free memory, to make space for new workloads. Anyone can allocate
273 * down to the min watermarks controlled by min_free_kbytes above. 273 * down to the min watermarks controlled by min_free_kbytes above.
274 */ 274 */
275 int extra_free_kbytes = 0; 275 int extra_free_kbytes = 0;
276 276
277 static unsigned long __meminitdata nr_kernel_pages; 277 static unsigned long __meminitdata nr_kernel_pages;
278 static unsigned long __meminitdata nr_all_pages; 278 static unsigned long __meminitdata nr_all_pages;
279 static unsigned long __meminitdata dma_reserve; 279 static unsigned long __meminitdata dma_reserve;
280 280
281 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 281 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
282 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 282 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
283 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 283 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
284 static unsigned long __initdata required_kernelcore; 284 static unsigned long __initdata required_kernelcore;
285 static unsigned long __initdata required_movablecore; 285 static unsigned long __initdata required_movablecore;
286 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 286 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
287 static bool mirrored_kernelcore; 287 static bool mirrored_kernelcore;
288 288
289 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 289 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
290 int movable_zone; 290 int movable_zone;
291 EXPORT_SYMBOL(movable_zone); 291 EXPORT_SYMBOL(movable_zone);
292 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 292 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
293 293
294 #if MAX_NUMNODES > 1 294 #if MAX_NUMNODES > 1
295 int nr_node_ids __read_mostly = MAX_NUMNODES; 295 int nr_node_ids __read_mostly = MAX_NUMNODES;
296 int nr_online_nodes __read_mostly = 1; 296 int nr_online_nodes __read_mostly = 1;
297 EXPORT_SYMBOL(nr_node_ids); 297 EXPORT_SYMBOL(nr_node_ids);
298 EXPORT_SYMBOL(nr_online_nodes); 298 EXPORT_SYMBOL(nr_online_nodes);
299 #endif 299 #endif
300 300
301 int page_group_by_mobility_disabled __read_mostly; 301 int page_group_by_mobility_disabled __read_mostly;
302 302
303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
304 304
305 /* 305 /*
306 * Determine how many pages need to be initialized durig early boot 306 * Determine how many pages need to be initialized durig early boot
307 * (non-deferred initialization). 307 * (non-deferred initialization).
308 * The value of first_deferred_pfn will be set later, once non-deferred pages 308 * The value of first_deferred_pfn will be set later, once non-deferred pages
309 * are initialized, but for now set it ULONG_MAX. 309 * are initialized, but for now set it ULONG_MAX.
310 */ 310 */
311 static inline void reset_deferred_meminit(pg_data_t *pgdat) 311 static inline void reset_deferred_meminit(pg_data_t *pgdat)
312 { 312 {
313 phys_addr_t start_addr, end_addr; 313 phys_addr_t start_addr, end_addr;
314 unsigned long max_pgcnt; 314 unsigned long max_pgcnt;
315 unsigned long reserved; 315 unsigned long reserved;
316 316
317 /* 317 /*
318 * Initialise at least 2G of a node but also take into account that 318 * Initialise at least 2G of a node but also take into account that
319 * two large system hashes that can take up 1GB for 0.25TB/node. 319 * two large system hashes that can take up 1GB for 0.25TB/node.
320 */ 320 */
321 max_pgcnt = max(2UL << (30 - PAGE_SHIFT), 321 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
322 (pgdat->node_spanned_pages >> 8)); 322 (pgdat->node_spanned_pages >> 8));
323 323
324 /* 324 /*
325 * Compensate the all the memblock reservations (e.g. crash kernel) 325 * Compensate the all the memblock reservations (e.g. crash kernel)
326 * from the initial estimation to make sure we will initialize enough 326 * from the initial estimation to make sure we will initialize enough
327 * memory to boot. 327 * memory to boot.
328 */ 328 */
329 start_addr = PFN_PHYS(pgdat->node_start_pfn); 329 start_addr = PFN_PHYS(pgdat->node_start_pfn);
330 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); 330 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
331 reserved = memblock_reserved_memory_within(start_addr, end_addr); 331 reserved = memblock_reserved_memory_within(start_addr, end_addr);
332 max_pgcnt += PHYS_PFN(reserved); 332 max_pgcnt += PHYS_PFN(reserved);
333 333
334 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); 334 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
335 pgdat->first_deferred_pfn = ULONG_MAX; 335 pgdat->first_deferred_pfn = ULONG_MAX;
336 } 336 }
337 337
338 /* Returns true if the struct page for the pfn is uninitialised */ 338 /* Returns true if the struct page for the pfn is uninitialised */
339 static inline bool __meminit early_page_uninitialised(unsigned long pfn) 339 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
340 { 340 {
341 int nid = early_pfn_to_nid(pfn); 341 int nid = early_pfn_to_nid(pfn);
342 342
343 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) 343 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
344 return true; 344 return true;
345 345
346 return false; 346 return false;
347 } 347 }
348 348
349 /* 349 /*
350 * Returns false when the remaining initialisation should be deferred until 350 * Returns false when the remaining initialisation should be deferred until
351 * later in the boot cycle when it can be parallelised. 351 * later in the boot cycle when it can be parallelised.
352 */ 352 */
353 static inline bool update_defer_init(pg_data_t *pgdat, 353 static inline bool update_defer_init(pg_data_t *pgdat,
354 unsigned long pfn, unsigned long zone_end, 354 unsigned long pfn, unsigned long zone_end,
355 unsigned long *nr_initialised) 355 unsigned long *nr_initialised)
356 { 356 {
357 /* Always populate low zones for address-contrained allocations */ 357 /* Always populate low zones for address-contrained allocations */
358 if (zone_end < pgdat_end_pfn(pgdat)) 358 if (zone_end < pgdat_end_pfn(pgdat))
359 return true; 359 return true;
360 (*nr_initialised)++; 360 (*nr_initialised)++;
361 if ((*nr_initialised > pgdat->static_init_pgcnt) && 361 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
362 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 362 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
363 pgdat->first_deferred_pfn = pfn; 363 pgdat->first_deferred_pfn = pfn;
364 return false; 364 return false;
365 } 365 }
366 366
367 return true; 367 return true;
368 } 368 }
369 #else 369 #else
370 static inline void reset_deferred_meminit(pg_data_t *pgdat) 370 static inline void reset_deferred_meminit(pg_data_t *pgdat)
371 { 371 {
372 } 372 }
373 373
374 static inline bool early_page_uninitialised(unsigned long pfn) 374 static inline bool early_page_uninitialised(unsigned long pfn)
375 { 375 {
376 return false; 376 return false;
377 } 377 }
378 378
379 static inline bool update_defer_init(pg_data_t *pgdat, 379 static inline bool update_defer_init(pg_data_t *pgdat,
380 unsigned long pfn, unsigned long zone_end, 380 unsigned long pfn, unsigned long zone_end,
381 unsigned long *nr_initialised) 381 unsigned long *nr_initialised)
382 { 382 {
383 return true; 383 return true;
384 } 384 }
385 #endif 385 #endif
386 386
387 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 387 /* Return a pointer to the bitmap storing bits affecting a block of pages */
388 static inline unsigned long *get_pageblock_bitmap(struct page *page, 388 static inline unsigned long *get_pageblock_bitmap(struct page *page,
389 unsigned long pfn) 389 unsigned long pfn)
390 { 390 {
391 #ifdef CONFIG_SPARSEMEM 391 #ifdef CONFIG_SPARSEMEM
392 return __pfn_to_section(pfn)->pageblock_flags; 392 return __pfn_to_section(pfn)->pageblock_flags;
393 #else 393 #else
394 return page_zone(page)->pageblock_flags; 394 return page_zone(page)->pageblock_flags;
395 #endif /* CONFIG_SPARSEMEM */ 395 #endif /* CONFIG_SPARSEMEM */
396 } 396 }
397 397
398 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) 398 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
399 { 399 {
400 #ifdef CONFIG_SPARSEMEM 400 #ifdef CONFIG_SPARSEMEM
401 pfn &= (PAGES_PER_SECTION-1); 401 pfn &= (PAGES_PER_SECTION-1);
402 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 402 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
403 #else 403 #else
404 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 404 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
405 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 405 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
406 #endif /* CONFIG_SPARSEMEM */ 406 #endif /* CONFIG_SPARSEMEM */
407 } 407 }
408 408
409 /** 409 /**
410 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 410 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
411 * @page: The page within the block of interest 411 * @page: The page within the block of interest
412 * @pfn: The target page frame number 412 * @pfn: The target page frame number
413 * @end_bitidx: The last bit of interest to retrieve 413 * @end_bitidx: The last bit of interest to retrieve
414 * @mask: mask of bits that the caller is interested in 414 * @mask: mask of bits that the caller is interested in
415 * 415 *
416 * Return: pageblock_bits flags 416 * Return: pageblock_bits flags
417 */ 417 */
418 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, 418 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
419 unsigned long pfn, 419 unsigned long pfn,
420 unsigned long end_bitidx, 420 unsigned long end_bitidx,
421 unsigned long mask) 421 unsigned long mask)
422 { 422 {
423 unsigned long *bitmap; 423 unsigned long *bitmap;
424 unsigned long bitidx, word_bitidx; 424 unsigned long bitidx, word_bitidx;
425 unsigned long word; 425 unsigned long word;
426 426
427 bitmap = get_pageblock_bitmap(page, pfn); 427 bitmap = get_pageblock_bitmap(page, pfn);
428 bitidx = pfn_to_bitidx(page, pfn); 428 bitidx = pfn_to_bitidx(page, pfn);
429 word_bitidx = bitidx / BITS_PER_LONG; 429 word_bitidx = bitidx / BITS_PER_LONG;
430 bitidx &= (BITS_PER_LONG-1); 430 bitidx &= (BITS_PER_LONG-1);
431 431
432 word = bitmap[word_bitidx]; 432 word = bitmap[word_bitidx];
433 bitidx += end_bitidx; 433 bitidx += end_bitidx;
434 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 434 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
435 } 435 }
436 436
437 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 437 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
438 unsigned long end_bitidx, 438 unsigned long end_bitidx,
439 unsigned long mask) 439 unsigned long mask)
440 { 440 {
441 return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); 441 return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
442 } 442 }
443 443
444 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) 444 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
445 { 445 {
446 return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); 446 return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
447 } 447 }
448 448
449 /** 449 /**
450 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 450 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
451 * @page: The page within the block of interest 451 * @page: The page within the block of interest
452 * @flags: The flags to set 452 * @flags: The flags to set
453 * @pfn: The target page frame number 453 * @pfn: The target page frame number
454 * @end_bitidx: The last bit of interest 454 * @end_bitidx: The last bit of interest
455 * @mask: mask of bits that the caller is interested in 455 * @mask: mask of bits that the caller is interested in
456 */ 456 */
457 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 457 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
458 unsigned long pfn, 458 unsigned long pfn,
459 unsigned long end_bitidx, 459 unsigned long end_bitidx,
460 unsigned long mask) 460 unsigned long mask)
461 { 461 {
462 unsigned long *bitmap; 462 unsigned long *bitmap;
463 unsigned long bitidx, word_bitidx; 463 unsigned long bitidx, word_bitidx;
464 unsigned long old_word, word; 464 unsigned long old_word, word;
465 465
466 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 466 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
467 467
468 bitmap = get_pageblock_bitmap(page, pfn); 468 bitmap = get_pageblock_bitmap(page, pfn);
469 bitidx = pfn_to_bitidx(page, pfn); 469 bitidx = pfn_to_bitidx(page, pfn);
470 word_bitidx = bitidx / BITS_PER_LONG; 470 word_bitidx = bitidx / BITS_PER_LONG;
471 bitidx &= (BITS_PER_LONG-1); 471 bitidx &= (BITS_PER_LONG-1);
472 472
473 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 473 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
474 474
475 bitidx += end_bitidx; 475 bitidx += end_bitidx;
476 mask <<= (BITS_PER_LONG - bitidx - 1); 476 mask <<= (BITS_PER_LONG - bitidx - 1);
477 flags <<= (BITS_PER_LONG - bitidx - 1); 477 flags <<= (BITS_PER_LONG - bitidx - 1);
478 478
479 word = READ_ONCE(bitmap[word_bitidx]); 479 word = READ_ONCE(bitmap[word_bitidx]);
480 for (;;) { 480 for (;;) {
481 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 481 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
482 if (word == old_word) 482 if (word == old_word)
483 break; 483 break;
484 word = old_word; 484 word = old_word;
485 } 485 }
486 } 486 }
487 487
488 void set_pageblock_migratetype(struct page *page, int migratetype) 488 void set_pageblock_migratetype(struct page *page, int migratetype)
489 { 489 {
490 if (unlikely(page_group_by_mobility_disabled && 490 if (unlikely(page_group_by_mobility_disabled &&
491 migratetype < MIGRATE_PCPTYPES)) 491 migratetype < MIGRATE_PCPTYPES))
492 migratetype = MIGRATE_UNMOVABLE; 492 migratetype = MIGRATE_UNMOVABLE;
493 493
494 set_pageblock_flags_group(page, (unsigned long)migratetype, 494 set_pageblock_flags_group(page, (unsigned long)migratetype,
495 PB_migrate, PB_migrate_end); 495 PB_migrate, PB_migrate_end);
496 } 496 }
497 497
498 #ifdef CONFIG_DEBUG_VM 498 #ifdef CONFIG_DEBUG_VM
499 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 499 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
500 { 500 {
501 int ret = 0; 501 int ret = 0;
502 unsigned seq; 502 unsigned seq;
503 unsigned long pfn = page_to_pfn(page); 503 unsigned long pfn = page_to_pfn(page);
504 unsigned long sp, start_pfn; 504 unsigned long sp, start_pfn;
505 505
506 do { 506 do {
507 seq = zone_span_seqbegin(zone); 507 seq = zone_span_seqbegin(zone);
508 start_pfn = zone->zone_start_pfn; 508 start_pfn = zone->zone_start_pfn;
509 sp = zone->spanned_pages; 509 sp = zone->spanned_pages;
510 if (!zone_spans_pfn(zone, pfn)) 510 if (!zone_spans_pfn(zone, pfn))
511 ret = 1; 511 ret = 1;
512 } while (zone_span_seqretry(zone, seq)); 512 } while (zone_span_seqretry(zone, seq));
513 513
514 if (ret) 514 if (ret)
515 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 515 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
516 pfn, zone_to_nid(zone), zone->name, 516 pfn, zone_to_nid(zone), zone->name,
517 start_pfn, start_pfn + sp); 517 start_pfn, start_pfn + sp);
518 518
519 return ret; 519 return ret;
520 } 520 }
521 521
522 static int page_is_consistent(struct zone *zone, struct page *page) 522 static int page_is_consistent(struct zone *zone, struct page *page)
523 { 523 {
524 if (!pfn_valid_within(page_to_pfn(page))) 524 if (!pfn_valid_within(page_to_pfn(page)))
525 return 0; 525 return 0;
526 if (zone != page_zone(page)) 526 if (zone != page_zone(page))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 /* 531 /*
532 * Temporary debugging check for pages not lying within a given zone. 532 * Temporary debugging check for pages not lying within a given zone.
533 */ 533 */
534 static int __maybe_unused bad_range(struct zone *zone, struct page *page) 534 static int __maybe_unused bad_range(struct zone *zone, struct page *page)
535 { 535 {
536 if (page_outside_zone_boundaries(zone, page)) 536 if (page_outside_zone_boundaries(zone, page))
537 return 1; 537 return 1;
538 if (!page_is_consistent(zone, page)) 538 if (!page_is_consistent(zone, page))
539 return 1; 539 return 1;
540 540
541 return 0; 541 return 0;
542 } 542 }
543 #else 543 #else
544 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) 544 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
545 { 545 {
546 return 0; 546 return 0;
547 } 547 }
548 #endif 548 #endif
549 549
550 static void bad_page(struct page *page, const char *reason, 550 static void bad_page(struct page *page, const char *reason,
551 unsigned long bad_flags) 551 unsigned long bad_flags)
552 { 552 {
553 static unsigned long resume; 553 static unsigned long resume;
554 static unsigned long nr_shown; 554 static unsigned long nr_shown;
555 static unsigned long nr_unshown; 555 static unsigned long nr_unshown;
556 556
557 /* 557 /*
558 * Allow a burst of 60 reports, then keep quiet for that minute; 558 * Allow a burst of 60 reports, then keep quiet for that minute;
559 * or allow a steady drip of one report per second. 559 * or allow a steady drip of one report per second.
560 */ 560 */
561 if (nr_shown == 60) { 561 if (nr_shown == 60) {
562 if (time_before(jiffies, resume)) { 562 if (time_before(jiffies, resume)) {
563 nr_unshown++; 563 nr_unshown++;
564 goto out; 564 goto out;
565 } 565 }
566 if (nr_unshown) { 566 if (nr_unshown) {
567 pr_alert( 567 pr_alert(
568 "BUG: Bad page state: %lu messages suppressed\n", 568 "BUG: Bad page state: %lu messages suppressed\n",
569 nr_unshown); 569 nr_unshown);
570 nr_unshown = 0; 570 nr_unshown = 0;
571 } 571 }
572 nr_shown = 0; 572 nr_shown = 0;
573 } 573 }
574 if (nr_shown++ == 0) 574 if (nr_shown++ == 0)
575 resume = jiffies + 60 * HZ; 575 resume = jiffies + 60 * HZ;
576 576
577 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", 577 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
578 current->comm, page_to_pfn(page)); 578 current->comm, page_to_pfn(page));
579 __dump_page(page, reason); 579 __dump_page(page, reason);
580 bad_flags &= page->flags; 580 bad_flags &= page->flags;
581 if (bad_flags) 581 if (bad_flags)
582 pr_alert("bad because of flags: %#lx(%pGp)\n", 582 pr_alert("bad because of flags: %#lx(%pGp)\n",
583 bad_flags, &bad_flags); 583 bad_flags, &bad_flags);
584 dump_page_owner(page); 584 dump_page_owner(page);
585 585
586 print_modules(); 586 print_modules();
587 dump_stack(); 587 dump_stack();
588 out: 588 out:
589 /* Leave bad fields for debug, except PageBuddy could make trouble */ 589 /* Leave bad fields for debug, except PageBuddy could make trouble */
590 page_mapcount_reset(page); /* remove PageBuddy */ 590 page_mapcount_reset(page); /* remove PageBuddy */
591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
592 } 592 }
593 593
594 /* 594 /*
595 * Higher-order pages are called "compound pages". They are structured thusly: 595 * Higher-order pages are called "compound pages". They are structured thusly:
596 * 596 *
597 * The first PAGE_SIZE page is called the "head page" and have PG_head set. 597 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
598 * 598 *
599 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded 599 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
600 * in bit 0 of page->compound_head. The rest of bits is pointer to head page. 600 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
601 * 601 *
602 * The first tail page's ->compound_dtor holds the offset in array of compound 602 * The first tail page's ->compound_dtor holds the offset in array of compound
603 * page destructors. See compound_page_dtors. 603 * page destructors. See compound_page_dtors.
604 * 604 *
605 * The first tail page's ->compound_order holds the order of allocation. 605 * The first tail page's ->compound_order holds the order of allocation.
606 * This usage means that zero-order pages may not be compound. 606 * This usage means that zero-order pages may not be compound.
607 */ 607 */
608 608
609 void free_compound_page(struct page *page) 609 void free_compound_page(struct page *page)
610 { 610 {
611 __free_pages_ok(page, compound_order(page)); 611 __free_pages_ok(page, compound_order(page));
612 } 612 }
613 613
614 void prep_compound_page(struct page *page, unsigned int order) 614 void prep_compound_page(struct page *page, unsigned int order)
615 { 615 {
616 int i; 616 int i;
617 int nr_pages = 1 << order; 617 int nr_pages = 1 << order;
618 618
619 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 619 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
620 set_compound_order(page, order); 620 set_compound_order(page, order);
621 __SetPageHead(page); 621 __SetPageHead(page);
622 for (i = 1; i < nr_pages; i++) { 622 for (i = 1; i < nr_pages; i++) {
623 struct page *p = page + i; 623 struct page *p = page + i;
624 set_page_count(p, 0); 624 set_page_count(p, 0);
625 p->mapping = TAIL_MAPPING; 625 p->mapping = TAIL_MAPPING;
626 set_compound_head(p, page); 626 set_compound_head(p, page);
627 } 627 }
628 atomic_set(compound_mapcount_ptr(page), -1); 628 atomic_set(compound_mapcount_ptr(page), -1);
629 } 629 }
630 630
631 #ifdef CONFIG_DEBUG_PAGEALLOC 631 #ifdef CONFIG_DEBUG_PAGEALLOC
632 unsigned int _debug_guardpage_minorder; 632 unsigned int _debug_guardpage_minorder;
633 bool _debug_pagealloc_enabled __read_mostly 633 bool _debug_pagealloc_enabled __read_mostly
634 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); 634 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
635 EXPORT_SYMBOL(_debug_pagealloc_enabled); 635 EXPORT_SYMBOL(_debug_pagealloc_enabled);
636 bool _debug_guardpage_enabled __read_mostly; 636 bool _debug_guardpage_enabled __read_mostly;
637 637
638 static int __init early_debug_pagealloc(char *buf) 638 static int __init early_debug_pagealloc(char *buf)
639 { 639 {
640 if (!buf) 640 if (!buf)
641 return -EINVAL; 641 return -EINVAL;
642 return kstrtobool(buf, &_debug_pagealloc_enabled); 642 return kstrtobool(buf, &_debug_pagealloc_enabled);
643 } 643 }
644 early_param("debug_pagealloc", early_debug_pagealloc); 644 early_param("debug_pagealloc", early_debug_pagealloc);
645 645
646 static bool need_debug_guardpage(void) 646 static bool need_debug_guardpage(void)
647 { 647 {
648 /* If we don't use debug_pagealloc, we don't need guard page */ 648 /* If we don't use debug_pagealloc, we don't need guard page */
649 if (!debug_pagealloc_enabled()) 649 if (!debug_pagealloc_enabled())
650 return false; 650 return false;
651 651
652 if (!debug_guardpage_minorder()) 652 if (!debug_guardpage_minorder())
653 return false; 653 return false;
654 654
655 return true; 655 return true;
656 } 656 }
657 657
658 static void init_debug_guardpage(void) 658 static void init_debug_guardpage(void)
659 { 659 {
660 if (!debug_pagealloc_enabled()) 660 if (!debug_pagealloc_enabled())
661 return; 661 return;
662 662
663 if (!debug_guardpage_minorder()) 663 if (!debug_guardpage_minorder())
664 return; 664 return;
665 665
666 _debug_guardpage_enabled = true; 666 _debug_guardpage_enabled = true;
667 } 667 }
668 668
669 struct page_ext_operations debug_guardpage_ops = { 669 struct page_ext_operations debug_guardpage_ops = {
670 .need = need_debug_guardpage, 670 .need = need_debug_guardpage,
671 .init = init_debug_guardpage, 671 .init = init_debug_guardpage,
672 }; 672 };
673 673
674 static int __init debug_guardpage_minorder_setup(char *buf) 674 static int __init debug_guardpage_minorder_setup(char *buf)
675 { 675 {
676 unsigned long res; 676 unsigned long res;
677 677
678 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 678 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
679 pr_err("Bad debug_guardpage_minorder value\n"); 679 pr_err("Bad debug_guardpage_minorder value\n");
680 return 0; 680 return 0;
681 } 681 }
682 _debug_guardpage_minorder = res; 682 _debug_guardpage_minorder = res;
683 pr_info("Setting debug_guardpage_minorder to %lu\n", res); 683 pr_info("Setting debug_guardpage_minorder to %lu\n", res);
684 return 0; 684 return 0;
685 } 685 }
686 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); 686 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
687 687
688 static inline bool set_page_guard(struct zone *zone, struct page *page, 688 static inline bool set_page_guard(struct zone *zone, struct page *page,
689 unsigned int order, int migratetype) 689 unsigned int order, int migratetype)
690 { 690 {
691 struct page_ext *page_ext; 691 struct page_ext *page_ext;
692 692
693 if (!debug_guardpage_enabled()) 693 if (!debug_guardpage_enabled())
694 return false; 694 return false;
695 695
696 if (order >= debug_guardpage_minorder()) 696 if (order >= debug_guardpage_minorder())
697 return false; 697 return false;
698 698
699 page_ext = lookup_page_ext(page); 699 page_ext = lookup_page_ext(page);
700 if (unlikely(!page_ext)) 700 if (unlikely(!page_ext))
701 return false; 701 return false;
702 702
703 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 703 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
704 704
705 INIT_LIST_HEAD(&page->lru); 705 INIT_LIST_HEAD(&page->lru);
706 set_page_private(page, order); 706 set_page_private(page, order);
707 /* Guard pages are not available for any usage */ 707 /* Guard pages are not available for any usage */
708 __mod_zone_freepage_state(zone, -(1 << order), migratetype); 708 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
709 709
710 return true; 710 return true;
711 } 711 }
712 712
713 static inline void clear_page_guard(struct zone *zone, struct page *page, 713 static inline void clear_page_guard(struct zone *zone, struct page *page,
714 unsigned int order, int migratetype) 714 unsigned int order, int migratetype)
715 { 715 {
716 struct page_ext *page_ext; 716 struct page_ext *page_ext;
717 717
718 if (!debug_guardpage_enabled()) 718 if (!debug_guardpage_enabled())
719 return; 719 return;
720 720
721 page_ext = lookup_page_ext(page); 721 page_ext = lookup_page_ext(page);
722 if (unlikely(!page_ext)) 722 if (unlikely(!page_ext))
723 return; 723 return;
724 724
725 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 725 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
726 726
727 set_page_private(page, 0); 727 set_page_private(page, 0);
728 if (!is_migrate_isolate(migratetype)) 728 if (!is_migrate_isolate(migratetype))
729 __mod_zone_freepage_state(zone, (1 << order), migratetype); 729 __mod_zone_freepage_state(zone, (1 << order), migratetype);
730 } 730 }
731 #else 731 #else
732 struct page_ext_operations debug_guardpage_ops; 732 struct page_ext_operations debug_guardpage_ops;
733 static inline bool set_page_guard(struct zone *zone, struct page *page, 733 static inline bool set_page_guard(struct zone *zone, struct page *page,
734 unsigned int order, int migratetype) { return false; } 734 unsigned int order, int migratetype) { return false; }
735 static inline void clear_page_guard(struct zone *zone, struct page *page, 735 static inline void clear_page_guard(struct zone *zone, struct page *page,
736 unsigned int order, int migratetype) {} 736 unsigned int order, int migratetype) {}
737 #endif 737 #endif
738 738
739 static inline void set_page_order(struct page *page, unsigned int order) 739 static inline void set_page_order(struct page *page, unsigned int order)
740 { 740 {
741 set_page_private(page, order); 741 set_page_private(page, order);
742 __SetPageBuddy(page); 742 __SetPageBuddy(page);
743 } 743 }
744 744
745 static inline void rmv_page_order(struct page *page) 745 static inline void rmv_page_order(struct page *page)
746 { 746 {
747 __ClearPageBuddy(page); 747 __ClearPageBuddy(page);
748 set_page_private(page, 0); 748 set_page_private(page, 0);
749 } 749 }
750 750
751 /* 751 /*
752 * This function checks whether a page is free && is the buddy 752 * This function checks whether a page is free && is the buddy
753 * we can do coalesce a page and its buddy if 753 * we can do coalesce a page and its buddy if
754 * (a) the buddy is not in a hole (check before calling!) && 754 * (a) the buddy is not in a hole (check before calling!) &&
755 * (b) the buddy is in the buddy system && 755 * (b) the buddy is in the buddy system &&
756 * (c) a page and its buddy have the same order && 756 * (c) a page and its buddy have the same order &&
757 * (d) a page and its buddy are in the same zone. 757 * (d) a page and its buddy are in the same zone.
758 * 758 *
759 * For recording whether a page is in the buddy system, we set ->_mapcount 759 * For recording whether a page is in the buddy system, we set ->_mapcount
760 * PAGE_BUDDY_MAPCOUNT_VALUE. 760 * PAGE_BUDDY_MAPCOUNT_VALUE.
761 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 761 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
762 * serialized by zone->lock. 762 * serialized by zone->lock.
763 * 763 *
764 * For recording page's order, we use page_private(page). 764 * For recording page's order, we use page_private(page).
765 */ 765 */
766 static inline int page_is_buddy(struct page *page, struct page *buddy, 766 static inline int page_is_buddy(struct page *page, struct page *buddy,
767 unsigned int order) 767 unsigned int order)
768 { 768 {
769 if (page_is_guard(buddy) && page_order(buddy) == order) { 769 if (page_is_guard(buddy) && page_order(buddy) == order) {
770 if (page_zone_id(page) != page_zone_id(buddy)) 770 if (page_zone_id(page) != page_zone_id(buddy))
771 return 0; 771 return 0;
772 772
773 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 773 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
774 774
775 return 1; 775 return 1;
776 } 776 }
777 777
778 if (PageBuddy(buddy) && page_order(buddy) == order) { 778 if (PageBuddy(buddy) && page_order(buddy) == order) {
779 /* 779 /*
780 * zone check is done late to avoid uselessly 780 * zone check is done late to avoid uselessly
781 * calculating zone/node ids for pages that could 781 * calculating zone/node ids for pages that could
782 * never merge. 782 * never merge.
783 */ 783 */
784 if (page_zone_id(page) != page_zone_id(buddy)) 784 if (page_zone_id(page) != page_zone_id(buddy))
785 return 0; 785 return 0;
786 786
787 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 787 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
788 788
789 return 1; 789 return 1;
790 } 790 }
791 return 0; 791 return 0;
792 } 792 }
793 793
794 /* 794 /*
795 * Freeing function for a buddy system allocator. 795 * Freeing function for a buddy system allocator.
796 * 796 *
797 * The concept of a buddy system is to maintain direct-mapped table 797 * The concept of a buddy system is to maintain direct-mapped table
798 * (containing bit values) for memory blocks of various "orders". 798 * (containing bit values) for memory blocks of various "orders".
799 * The bottom level table contains the map for the smallest allocatable 799 * The bottom level table contains the map for the smallest allocatable
800 * units of memory (here, pages), and each level above it describes 800 * units of memory (here, pages), and each level above it describes
801 * pairs of units from the levels below, hence, "buddies". 801 * pairs of units from the levels below, hence, "buddies".
802 * At a high level, all that happens here is marking the table entry 802 * At a high level, all that happens here is marking the table entry
803 * at the bottom level available, and propagating the changes upward 803 * at the bottom level available, and propagating the changes upward
804 * as necessary, plus some accounting needed to play nicely with other 804 * as necessary, plus some accounting needed to play nicely with other
805 * parts of the VM system. 805 * parts of the VM system.
806 * At each level, we keep a list of pages, which are heads of continuous 806 * At each level, we keep a list of pages, which are heads of continuous
807 * free pages of length of (1 << order) and marked with _mapcount 807 * free pages of length of (1 << order) and marked with _mapcount
808 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 808 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
809 * field. 809 * field.
810 * So when we are allocating or freeing one, we can derive the state of the 810 * So when we are allocating or freeing one, we can derive the state of the
811 * other. That is, if we allocate a small block, and both were 811 * other. That is, if we allocate a small block, and both were
812 * free, the remainder of the region must be split into blocks. 812 * free, the remainder of the region must be split into blocks.
813 * If a block is freed, and its buddy is also free, then this 813 * If a block is freed, and its buddy is also free, then this
814 * triggers coalescing into a block of larger size. 814 * triggers coalescing into a block of larger size.
815 * 815 *
816 * -- nyc 816 * -- nyc
817 */ 817 */
818 818
819 static inline void __free_one_page(struct page *page, 819 static inline void __free_one_page(struct page *page,
820 unsigned long pfn, 820 unsigned long pfn,
821 struct zone *zone, unsigned int order, 821 struct zone *zone, unsigned int order,
822 int migratetype) 822 int migratetype)
823 { 823 {
824 unsigned long combined_pfn; 824 unsigned long combined_pfn;
825 unsigned long uninitialized_var(buddy_pfn); 825 unsigned long uninitialized_var(buddy_pfn);
826 struct page *buddy; 826 struct page *buddy;
827 unsigned int max_order; 827 unsigned int max_order;
828 828
829 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 829 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
830 830
831 VM_BUG_ON(!zone_is_initialized(zone)); 831 VM_BUG_ON(!zone_is_initialized(zone));
832 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 832 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
833 833
834 VM_BUG_ON(migratetype == -1); 834 VM_BUG_ON(migratetype == -1);
835 if (likely(!is_migrate_isolate(migratetype))) 835 if (likely(!is_migrate_isolate(migratetype)))
836 __mod_zone_freepage_state(zone, 1 << order, migratetype); 836 __mod_zone_freepage_state(zone, 1 << order, migratetype);
837 837
838 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 838 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
839 VM_BUG_ON_PAGE(bad_range(zone, page), page); 839 VM_BUG_ON_PAGE(bad_range(zone, page), page);
840 840
841 continue_merging: 841 continue_merging:
842 while (order < max_order - 1) { 842 while (order < max_order - 1) {
843 buddy_pfn = __find_buddy_pfn(pfn, order); 843 buddy_pfn = __find_buddy_pfn(pfn, order);
844 buddy = page + (buddy_pfn - pfn); 844 buddy = page + (buddy_pfn - pfn);
845 845
846 if (!pfn_valid_within(buddy_pfn)) 846 if (!pfn_valid_within(buddy_pfn))
847 goto done_merging; 847 goto done_merging;
848 if (!page_is_buddy(page, buddy, order)) 848 if (!page_is_buddy(page, buddy, order))
849 goto done_merging; 849 goto done_merging;
850 /* 850 /*
851 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 851 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
852 * merge with it and move up one order. 852 * merge with it and move up one order.
853 */ 853 */
854 if (page_is_guard(buddy)) { 854 if (page_is_guard(buddy)) {
855 clear_page_guard(zone, buddy, order, migratetype); 855 clear_page_guard(zone, buddy, order, migratetype);
856 } else { 856 } else {
857 list_del(&buddy->lru); 857 list_del(&buddy->lru);
858 zone->free_area[order].nr_free--; 858 zone->free_area[order].nr_free--;
859 rmv_page_order(buddy); 859 rmv_page_order(buddy);
860 } 860 }
861 combined_pfn = buddy_pfn & pfn; 861 combined_pfn = buddy_pfn & pfn;
862 page = page + (combined_pfn - pfn); 862 page = page + (combined_pfn - pfn);
863 pfn = combined_pfn; 863 pfn = combined_pfn;
864 order++; 864 order++;
865 } 865 }
866 if (max_order < MAX_ORDER) { 866 if (max_order < MAX_ORDER) {
867 /* If we are here, it means order is >= pageblock_order. 867 /* If we are here, it means order is >= pageblock_order.
868 * We want to prevent merge between freepages on isolate 868 * We want to prevent merge between freepages on isolate
869 * pageblock and normal pageblock. Without this, pageblock 869 * pageblock and normal pageblock. Without this, pageblock
870 * isolation could cause incorrect freepage or CMA accounting. 870 * isolation could cause incorrect freepage or CMA accounting.
871 * 871 *
872 * We don't want to hit this code for the more frequent 872 * We don't want to hit this code for the more frequent
873 * low-order merging. 873 * low-order merging.
874 */ 874 */
875 if (unlikely(has_isolate_pageblock(zone))) { 875 if (unlikely(has_isolate_pageblock(zone))) {
876 int buddy_mt; 876 int buddy_mt;
877 877
878 buddy_pfn = __find_buddy_pfn(pfn, order); 878 buddy_pfn = __find_buddy_pfn(pfn, order);
879 buddy = page + (buddy_pfn - pfn); 879 buddy = page + (buddy_pfn - pfn);
880 buddy_mt = get_pageblock_migratetype(buddy); 880 buddy_mt = get_pageblock_migratetype(buddy);
881 881
882 if (migratetype != buddy_mt 882 if (migratetype != buddy_mt
883 && (is_migrate_isolate(migratetype) || 883 && (is_migrate_isolate(migratetype) ||
884 is_migrate_isolate(buddy_mt))) 884 is_migrate_isolate(buddy_mt)))
885 goto done_merging; 885 goto done_merging;
886 } 886 }
887 max_order++; 887 max_order++;
888 goto continue_merging; 888 goto continue_merging;
889 } 889 }
890 890
891 done_merging: 891 done_merging:
892 set_page_order(page, order); 892 set_page_order(page, order);
893 893
894 /* 894 /*
895 * If this is not the largest possible page, check if the buddy 895 * If this is not the largest possible page, check if the buddy
896 * of the next-highest order is free. If it is, it's possible 896 * of the next-highest order is free. If it is, it's possible
897 * that pages are being freed that will coalesce soon. In case, 897 * that pages are being freed that will coalesce soon. In case,
898 * that is happening, add the free page to the tail of the list 898 * that is happening, add the free page to the tail of the list
899 * so it's less likely to be used soon and more likely to be merged 899 * so it's less likely to be used soon and more likely to be merged
900 * as a higher order page 900 * as a higher order page
901 */ 901 */
902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { 902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
903 struct page *higher_page, *higher_buddy; 903 struct page *higher_page, *higher_buddy;
904 combined_pfn = buddy_pfn & pfn; 904 combined_pfn = buddy_pfn & pfn;
905 higher_page = page + (combined_pfn - pfn); 905 higher_page = page + (combined_pfn - pfn);
906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); 906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
907 higher_buddy = higher_page + (buddy_pfn - combined_pfn); 907 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
908 if (pfn_valid_within(buddy_pfn) && 908 if (pfn_valid_within(buddy_pfn) &&
909 page_is_buddy(higher_page, higher_buddy, order + 1)) { 909 page_is_buddy(higher_page, higher_buddy, order + 1)) {
910 list_add_tail(&page->lru, 910 list_add_tail(&page->lru,
911 &zone->free_area[order].free_list[migratetype]); 911 &zone->free_area[order].free_list[migratetype]);
912 goto out; 912 goto out;
913 } 913 }
914 } 914 }
915 915
916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
917 out: 917 out:
918 zone->free_area[order].nr_free++; 918 zone->free_area[order].nr_free++;
919 } 919 }
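
The merging above is pure PFN arithmetic: a free block of a given order has its buddy at pfn ^ (1 << order), and the merged, larger block starts at buddy_pfn & pfn. A minimal userspace sketch of that pairing (find_buddy_pfn below is a local stand-in for __find_buddy_pfn, and the sample PFN is arbitrary):

    /* Illustrative userspace model of the buddy PFN arithmetic above. */
    #include <stdio.h>

    static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
    {
            return pfn ^ (1UL << order);   /* flip the bit that selects the pair member */
    }

    int main(void)
    {
            unsigned long pfn = 0x1234;    /* arbitrary order-2 aligned PFN */
            unsigned int order = 2;
            unsigned long buddy = find_buddy_pfn(pfn, order);
            unsigned long combined = buddy & pfn;   /* start of the merged order+1 block */

            printf("pfn=%#lx buddy=%#lx merged block starts at %#lx (order %u)\n",
                   pfn, buddy, combined, order + 1);
            return 0;
    }
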
920 920
921 /* 921 /*
922 * A bad page could be due to a number of fields. Instead of multiple branches, 922 * A bad page could be due to a number of fields. Instead of multiple branches,
923 * try to check multiple fields with one check. The caller must do a detailed 923 * try to check multiple fields with one check. The caller must do a detailed
924 * check if necessary. 924 * check if necessary.
925 */ 925 */
926 static inline bool page_expected_state(struct page *page, 926 static inline bool page_expected_state(struct page *page,
927 unsigned long check_flags) 927 unsigned long check_flags)
928 { 928 {
929 if (unlikely(atomic_read(&page->_mapcount) != -1)) 929 if (unlikely(atomic_read(&page->_mapcount) != -1))
930 return false; 930 return false;
931 931
932 if (unlikely((unsigned long)page->mapping | 932 if (unlikely((unsigned long)page->mapping |
933 page_ref_count(page) | 933 page_ref_count(page) |
934 #ifdef CONFIG_MEMCG 934 #ifdef CONFIG_MEMCG
935 (unsigned long)page->mem_cgroup | 935 (unsigned long)page->mem_cgroup |
936 #endif 936 #endif
937 (page->flags & check_flags))) 937 (page->flags & check_flags)))
938 return false; 938 return false;
939 939
940 return true; 940 return true;
941 } 941 }
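
page_expected_state() folds every field that must be zero or NULL into a single bitwise OR, so the common case costs one test and one branch. A rough userspace illustration of the same trick; the struct and field names here are invented for the sketch and are not struct page:

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {                 /* hypothetical stand-in, not struct page */
            unsigned long mapping;
            int refcount;
            unsigned long flags;
    };

    /* One branch instead of three: OR together everything that must be zero. */
    static bool fields_all_clear(const struct fake_page *p, unsigned long check_flags)
    {
            return ((unsigned long)p->mapping |
                    (unsigned long)p->refcount |
                    (p->flags & check_flags)) == 0;
    }

    int main(void)
    {
            struct fake_page ok  = { 0, 0, 0 };
            struct fake_page bad = { 0, 1, 0 };

            printf("ok: %d, bad: %d\n",
                   fields_all_clear(&ok, ~0UL), fields_all_clear(&bad, ~0UL));
            return 0;
    }
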
942 942
943 static void free_pages_check_bad(struct page *page) 943 static void free_pages_check_bad(struct page *page)
944 { 944 {
945 const char *bad_reason; 945 const char *bad_reason;
946 unsigned long bad_flags; 946 unsigned long bad_flags;
947 947
948 bad_reason = NULL; 948 bad_reason = NULL;
949 bad_flags = 0; 949 bad_flags = 0;
950 950
951 if (unlikely(atomic_read(&page->_mapcount) != -1)) 951 if (unlikely(atomic_read(&page->_mapcount) != -1))
952 bad_reason = "nonzero mapcount"; 952 bad_reason = "nonzero mapcount";
953 if (unlikely(page->mapping != NULL)) 953 if (unlikely(page->mapping != NULL))
954 bad_reason = "non-NULL mapping"; 954 bad_reason = "non-NULL mapping";
955 if (unlikely(page_ref_count(page) != 0)) 955 if (unlikely(page_ref_count(page) != 0))
956 bad_reason = "nonzero _refcount"; 956 bad_reason = "nonzero _refcount";
957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { 957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
960 } 960 }
961 #ifdef CONFIG_MEMCG 961 #ifdef CONFIG_MEMCG
962 if (unlikely(page->mem_cgroup)) 962 if (unlikely(page->mem_cgroup))
963 bad_reason = "page still charged to cgroup"; 963 bad_reason = "page still charged to cgroup";
964 #endif 964 #endif
965 bad_page(page, bad_reason, bad_flags); 965 bad_page(page, bad_reason, bad_flags);
966 } 966 }
967 967
968 static inline int free_pages_check(struct page *page) 968 static inline int free_pages_check(struct page *page)
969 { 969 {
970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
971 return 0; 971 return 0;
972 972
973 /* Something has gone sideways, find it */ 973 /* Something has gone sideways, find it */
974 free_pages_check_bad(page); 974 free_pages_check_bad(page);
975 return 1; 975 return 1;
976 } 976 }
977 977
978 static int free_tail_pages_check(struct page *head_page, struct page *page) 978 static int free_tail_pages_check(struct page *head_page, struct page *page)
979 { 979 {
980 int ret = 1; 980 int ret = 1;
981 981
982 /* 982 /*
983 * We rely on page->lru.next never having bit 0 set, unless the page 983 * We rely on page->lru.next never having bit 0 set, unless the page
984 * is PageTail(). Let's make sure that's true even for poisoned ->lru. 984 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
985 */ 985 */
986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
987 987
988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) { 988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
989 ret = 0; 989 ret = 0;
990 goto out; 990 goto out;
991 } 991 }
992 switch (page - head_page) { 992 switch (page - head_page) {
993 case 1: 993 case 1:
994 /* the first tail page: ->mapping is compound_mapcount() */ 994 /* the first tail page: ->mapping is compound_mapcount() */
995 if (unlikely(compound_mapcount(page))) { 995 if (unlikely(compound_mapcount(page))) {
996 bad_page(page, "nonzero compound_mapcount", 0); 996 bad_page(page, "nonzero compound_mapcount", 0);
997 goto out; 997 goto out;
998 } 998 }
999 break; 999 break;
1000 case 2: 1000 case 2:
1001 /* 1001 /*
1002 * the second tail page: ->mapping is 1002 * the second tail page: ->mapping is
1003 * page_deferred_list().next -- ignore value. 1003 * page_deferred_list().next -- ignore value.
1004 */ 1004 */
1005 break; 1005 break;
1006 default: 1006 default:
1007 if (page->mapping != TAIL_MAPPING) { 1007 if (page->mapping != TAIL_MAPPING) {
1008 bad_page(page, "corrupted mapping in tail page", 0); 1008 bad_page(page, "corrupted mapping in tail page", 0);
1009 goto out; 1009 goto out;
1010 } 1010 }
1011 break; 1011 break;
1012 } 1012 }
1013 if (unlikely(!PageTail(page))) { 1013 if (unlikely(!PageTail(page))) {
1014 bad_page(page, "PageTail not set", 0); 1014 bad_page(page, "PageTail not set", 0);
1015 goto out; 1015 goto out;
1016 } 1016 }
1017 if (unlikely(compound_head(page) != head_page)) { 1017 if (unlikely(compound_head(page) != head_page)) {
1018 bad_page(page, "compound_head not consistent", 0); 1018 bad_page(page, "compound_head not consistent", 0);
1019 goto out; 1019 goto out;
1020 } 1020 }
1021 ret = 0; 1021 ret = 0;
1022 out: 1022 out:
1023 page->mapping = NULL; 1023 page->mapping = NULL;
1024 clear_compound_head(page); 1024 clear_compound_head(page);
1025 return ret; 1025 return ret;
1026 } 1026 }
1027 1027
1028 static __always_inline bool free_pages_prepare(struct page *page, 1028 static __always_inline bool free_pages_prepare(struct page *page,
1029 unsigned int order, bool check_free) 1029 unsigned int order, bool check_free)
1030 { 1030 {
1031 int bad = 0; 1031 int bad = 0;
1032 1032
1033 VM_BUG_ON_PAGE(PageTail(page), page); 1033 VM_BUG_ON_PAGE(PageTail(page), page);
1034 1034
1035 trace_mm_page_free(page, order); 1035 trace_mm_page_free(page, order);
1036 1036
1037 /* 1037 /*
1038 * Check tail pages before head page information is cleared to 1038 * Check tail pages before head page information is cleared to
1039 * avoid checking PageCompound for order-0 pages. 1039 * avoid checking PageCompound for order-0 pages.
1040 */ 1040 */
1041 if (unlikely(order)) { 1041 if (unlikely(order)) {
1042 bool compound = PageCompound(page); 1042 bool compound = PageCompound(page);
1043 int i; 1043 int i;
1044 1044
1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1046 1046
1047 if (compound) 1047 if (compound)
1048 ClearPageDoubleMap(page); 1048 ClearPageDoubleMap(page);
1049 for (i = 1; i < (1 << order); i++) { 1049 for (i = 1; i < (1 << order); i++) {
1050 if (compound) 1050 if (compound)
1051 bad += free_tail_pages_check(page, page + i); 1051 bad += free_tail_pages_check(page, page + i);
1052 if (unlikely(free_pages_check(page + i))) { 1052 if (unlikely(free_pages_check(page + i))) {
1053 bad++; 1053 bad++;
1054 continue; 1054 continue;
1055 } 1055 }
1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1057 } 1057 }
1058 } 1058 }
1059 if (PageMappingFlags(page)) 1059 if (PageMappingFlags(page))
1060 page->mapping = NULL; 1060 page->mapping = NULL;
1061 if (memcg_kmem_enabled() && PageKmemcg(page)) 1061 if (memcg_kmem_enabled() && PageKmemcg(page))
1062 memcg_kmem_uncharge(page, order); 1062 memcg_kmem_uncharge(page, order);
1063 if (check_free) 1063 if (check_free)
1064 bad += free_pages_check(page); 1064 bad += free_pages_check(page);
1065 if (bad) 1065 if (bad)
1066 return false; 1066 return false;
1067 1067
1068 page_cpupid_reset_last(page); 1068 page_cpupid_reset_last(page);
1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1070 reset_page_owner(page, order); 1070 reset_page_owner(page, order);
1071 1071
1072 if (!PageHighMem(page)) { 1072 if (!PageHighMem(page)) {
1073 debug_check_no_locks_freed(page_address(page), 1073 debug_check_no_locks_freed(page_address(page),
1074 PAGE_SIZE << order); 1074 PAGE_SIZE << order);
1075 debug_check_no_obj_freed(page_address(page), 1075 debug_check_no_obj_freed(page_address(page),
1076 PAGE_SIZE << order); 1076 PAGE_SIZE << order);
1077 } 1077 }
1078 arch_free_page(page, order); 1078 arch_free_page(page, order);
1079 kernel_poison_pages(page, 1 << order, 0); 1079 kernel_poison_pages(page, 1 << order, 0);
1080 kernel_map_pages(page, 1 << order, 0); 1080 kernel_map_pages(page, 1 << order, 0);
1081 kasan_free_pages(page, order); 1081 kasan_free_pages(page, order);
1082 1082
1083 return true; 1083 return true;
1084 } 1084 }
1085 1085
1086 #ifdef CONFIG_DEBUG_VM 1086 #ifdef CONFIG_DEBUG_VM
1087 static inline bool free_pcp_prepare(struct page *page) 1087 static inline bool free_pcp_prepare(struct page *page)
1088 { 1088 {
1089 return free_pages_prepare(page, 0, true); 1089 return free_pages_prepare(page, 0, true);
1090 } 1090 }
1091 1091
1092 static inline bool bulkfree_pcp_prepare(struct page *page) 1092 static inline bool bulkfree_pcp_prepare(struct page *page)
1093 { 1093 {
1094 return false; 1094 return false;
1095 } 1095 }
1096 #else 1096 #else
1097 static bool free_pcp_prepare(struct page *page) 1097 static bool free_pcp_prepare(struct page *page)
1098 { 1098 {
1099 return free_pages_prepare(page, 0, false); 1099 return free_pages_prepare(page, 0, false);
1100 } 1100 }
1101 1101
1102 static bool bulkfree_pcp_prepare(struct page *page) 1102 static bool bulkfree_pcp_prepare(struct page *page)
1103 { 1103 {
1104 return free_pages_check(page); 1104 return free_pages_check(page);
1105 } 1105 }
1106 #endif /* CONFIG_DEBUG_VM */ 1106 #endif /* CONFIG_DEBUG_VM */
1107 1107
1108 /* 1108 /*
1109 * Frees a number of pages from the PCP lists 1109 * Frees a number of pages from the PCP lists
1110 * Assumes all pages on list are in same zone, and of same order. 1110 * Assumes all pages on list are in same zone, and of same order.
1111 * count is the number of pages to free. 1111 * count is the number of pages to free.
1112 * 1112 *
1113 * If the zone was previously in an "all pages pinned" state then look to 1113 * If the zone was previously in an "all pages pinned" state then look to
1114 * see if this freeing clears that state. 1114 * see if this freeing clears that state.
1115 * 1115 *
1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are 1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are
1117 * pinned" detection logic. 1117 * pinned" detection logic.
1118 */ 1118 */
1119 static void free_pcppages_bulk(struct zone *zone, int count, 1119 static void free_pcppages_bulk(struct zone *zone, int count,
1120 struct per_cpu_pages *pcp) 1120 struct per_cpu_pages *pcp)
1121 { 1121 {
1122 int migratetype = 0; 1122 int migratetype = 0;
1123 int batch_free = 0; 1123 int batch_free = 0;
1124 bool isolated_pageblocks; 1124 bool isolated_pageblocks;
1125 1125
1126 spin_lock(&zone->lock); 1126 spin_lock(&zone->lock);
1127 isolated_pageblocks = has_isolate_pageblock(zone); 1127 isolated_pageblocks = has_isolate_pageblock(zone);
1128 1128
1129 while (count) { 1129 while (count) {
1130 struct page *page; 1130 struct page *page;
1131 struct list_head *list; 1131 struct list_head *list;
1132 1132
1133 /* 1133 /*
1134 * Remove pages from lists in a round-robin fashion. A 1134 * Remove pages from lists in a round-robin fashion. A
1135 * batch_free count is maintained that is incremented when an 1135 * batch_free count is maintained that is incremented when an
1136 * empty list is encountered. This is so more pages are freed 1136 * empty list is encountered. This is so more pages are freed
1137 * off fuller lists instead of spinning excessively around empty 1137 * off fuller lists instead of spinning excessively around empty
1138 * lists 1138 * lists
1139 */ 1139 */
1140 do { 1140 do {
1141 batch_free++; 1141 batch_free++;
1142 if (++migratetype == MIGRATE_PCPTYPES) 1142 if (++migratetype == MIGRATE_PCPTYPES)
1143 migratetype = 0; 1143 migratetype = 0;
1144 list = &pcp->lists[migratetype]; 1144 list = &pcp->lists[migratetype];
1145 } while (list_empty(list)); 1145 } while (list_empty(list));
1146 1146
1147 /* This is the only non-empty list. Free them all. */ 1147 /* This is the only non-empty list. Free them all. */
1148 if (batch_free == MIGRATE_PCPTYPES) 1148 if (batch_free == MIGRATE_PCPTYPES)
1149 batch_free = count; 1149 batch_free = count;
1150 1150
1151 do { 1151 do {
1152 int mt; /* migratetype of the to-be-freed page */ 1152 int mt; /* migratetype of the to-be-freed page */
1153 1153
1154 page = list_last_entry(list, struct page, lru); 1154 page = list_last_entry(list, struct page, lru);
1155 /* must delete as __free_one_page list manipulates */ 1155 /* must delete as __free_one_page list manipulates */
1156 list_del(&page->lru); 1156 list_del(&page->lru);
1157 1157
1158 mt = get_pcppage_migratetype(page); 1158 mt = get_pcppage_migratetype(page);
1159 /* MIGRATE_ISOLATE page should not go to pcplists */ 1159 /* MIGRATE_ISOLATE page should not go to pcplists */
1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); 1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1161 /* Pageblock could have been isolated meanwhile */ 1161 /* Pageblock could have been isolated meanwhile */
1162 if (unlikely(isolated_pageblocks)) 1162 if (unlikely(isolated_pageblocks))
1163 mt = get_pageblock_migratetype(page); 1163 mt = get_pageblock_migratetype(page);
1164 1164
1165 if (bulkfree_pcp_prepare(page)) 1165 if (bulkfree_pcp_prepare(page))
1166 continue; 1166 continue;
1167 1167
1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1169 trace_mm_page_pcpu_drain(page, 0, mt); 1169 trace_mm_page_pcpu_drain(page, 0, mt);
1170 } while (--count && --batch_free && !list_empty(list)); 1170 } while (--count && --batch_free && !list_empty(list));
1171 } 1171 }
1172 spin_unlock(&zone->lock); 1172 spin_unlock(&zone->lock);
1173 } 1173 }
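
The round-robin draining above can be modelled outside the kernel: every time an empty list is skipped, batch_free grows, so the next non-empty list gives up a larger batch and fuller lists drain faster. A small sketch under that reading, with made-up list lengths and NTYPES standing in for MIGRATE_PCPTYPES:

    #include <stdio.h>

    #define NTYPES 3   /* stand-in for MIGRATE_PCPTYPES */

    int main(void)
    {
            int lists[NTYPES] = { 5, 0, 12 };   /* pages queued per migratetype */
            int count = 10;                     /* pages we want to free */
            int migratetype = 0, batch_free = 0;

            while (count) {
                    /* find the next non-empty list, growing the batch as we skip empties */
                    do {
                            batch_free++;
                            if (++migratetype == NTYPES)
                                    migratetype = 0;
                    } while (lists[migratetype] == 0);

                    if (batch_free == NTYPES)   /* only one list left: take everything */
                            batch_free = count;

                    do {
                            lists[migratetype]--;
                            printf("freed one page from list %d\n", migratetype);
                    } while (--count && --batch_free && lists[migratetype]);
            }
            return 0;
    }
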
1174 1174
1175 static void free_one_page(struct zone *zone, 1175 static void free_one_page(struct zone *zone,
1176 struct page *page, unsigned long pfn, 1176 struct page *page, unsigned long pfn,
1177 unsigned int order, 1177 unsigned int order,
1178 int migratetype) 1178 int migratetype)
1179 { 1179 {
1180 spin_lock(&zone->lock); 1180 spin_lock(&zone->lock);
1181 if (unlikely(has_isolate_pageblock(zone) || 1181 if (unlikely(has_isolate_pageblock(zone) ||
1182 is_migrate_isolate(migratetype))) { 1182 is_migrate_isolate(migratetype))) {
1183 migratetype = get_pfnblock_migratetype(page, pfn); 1183 migratetype = get_pfnblock_migratetype(page, pfn);
1184 } 1184 }
1185 __free_one_page(page, pfn, zone, order, migratetype); 1185 __free_one_page(page, pfn, zone, order, migratetype);
1186 spin_unlock(&zone->lock); 1186 spin_unlock(&zone->lock);
1187 } 1187 }
1188 1188
1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1190 unsigned long zone, int nid) 1190 unsigned long zone, int nid)
1191 { 1191 {
1192 set_page_links(page, zone, nid, pfn); 1192 set_page_links(page, zone, nid, pfn);
1193 init_page_count(page); 1193 init_page_count(page);
1194 page_mapcount_reset(page); 1194 page_mapcount_reset(page);
1195 page_cpupid_reset_last(page); 1195 page_cpupid_reset_last(page);
1196 1196
1197 INIT_LIST_HEAD(&page->lru); 1197 INIT_LIST_HEAD(&page->lru);
1198 #ifdef WANT_PAGE_VIRTUAL 1198 #ifdef WANT_PAGE_VIRTUAL
1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1200 if (!is_highmem_idx(zone)) 1200 if (!is_highmem_idx(zone))
1201 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1201 set_page_address(page, __va(pfn << PAGE_SHIFT));
1202 #endif 1202 #endif
1203 } 1203 }
1204 1204
1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, 1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1206 int nid) 1206 int nid)
1207 { 1207 {
1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); 1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
1209 } 1209 }
1210 1210
1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1212 static void __meminit init_reserved_page(unsigned long pfn) 1212 static void __meminit init_reserved_page(unsigned long pfn)
1213 { 1213 {
1214 pg_data_t *pgdat; 1214 pg_data_t *pgdat;
1215 int nid, zid; 1215 int nid, zid;
1216 1216
1217 if (!early_page_uninitialised(pfn)) 1217 if (!early_page_uninitialised(pfn))
1218 return; 1218 return;
1219 1219
1220 nid = early_pfn_to_nid(pfn); 1220 nid = early_pfn_to_nid(pfn);
1221 pgdat = NODE_DATA(nid); 1221 pgdat = NODE_DATA(nid);
1222 1222
1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1224 struct zone *zone = &pgdat->node_zones[zid]; 1224 struct zone *zone = &pgdat->node_zones[zid];
1225 1225
1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) 1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1227 break; 1227 break;
1228 } 1228 }
1229 __init_single_pfn(pfn, zid, nid); 1229 __init_single_pfn(pfn, zid, nid);
1230 } 1230 }
1231 #else 1231 #else
1232 static inline void init_reserved_page(unsigned long pfn) 1232 static inline void init_reserved_page(unsigned long pfn)
1233 { 1233 {
1234 } 1234 }
1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1236 1236
1237 /* 1237 /*
1238 * Initialised pages do not have PageReserved set. This function is 1238 * Initialised pages do not have PageReserved set. This function is
1239 * called for each range allocated by the bootmem allocator and 1239 * called for each range allocated by the bootmem allocator and
1240 * marks the pages PageReserved. The remaining valid pages are later 1240 * marks the pages PageReserved. The remaining valid pages are later
1241 * sent to the buddy page allocator. 1241 * sent to the buddy page allocator.
1242 */ 1242 */
1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1244 { 1244 {
1245 unsigned long start_pfn = PFN_DOWN(start); 1245 unsigned long start_pfn = PFN_DOWN(start);
1246 unsigned long end_pfn = PFN_UP(end); 1246 unsigned long end_pfn = PFN_UP(end);
1247 1247
1248 for (; start_pfn < end_pfn; start_pfn++) { 1248 for (; start_pfn < end_pfn; start_pfn++) {
1249 if (pfn_valid(start_pfn)) { 1249 if (pfn_valid(start_pfn)) {
1250 struct page *page = pfn_to_page(start_pfn); 1250 struct page *page = pfn_to_page(start_pfn);
1251 1251
1252 init_reserved_page(start_pfn); 1252 init_reserved_page(start_pfn);
1253 1253
1254 /* Avoid false-positive PageTail() */ 1254 /* Avoid false-positive PageTail() */
1255 INIT_LIST_HEAD(&page->lru); 1255 INIT_LIST_HEAD(&page->lru);
1256 1256
1257 SetPageReserved(page); 1257 SetPageReserved(page);
1258 } 1258 }
1259 } 1259 }
1260 } 1260 }
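
PFN_DOWN()/PFN_UP() round the physical range outward to whole page frames, so every frame touched by [start, end) ends up reserved. A quick userspace model of just the rounding, assuming 4 KiB pages and a made-up address range:

    #include <stdio.h>

    #define PAGE_SHIFT 12                          /* assume 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

    int main(void)
    {
            unsigned long start = 0x1000800;       /* made-up bootmem range */
            unsigned long end   = 0x1003400;

            /* Every page frame touched by [start, end) gets reserved. */
            for (unsigned long pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++)
                    printf("reserve pfn %#lx\n", pfn);
            return 0;
    }
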
1261 1261
1262 static void __free_pages_ok(struct page *page, unsigned int order) 1262 static void __free_pages_ok(struct page *page, unsigned int order)
1263 { 1263 {
1264 unsigned long flags; 1264 unsigned long flags;
1265 int migratetype; 1265 int migratetype;
1266 unsigned long pfn = page_to_pfn(page); 1266 unsigned long pfn = page_to_pfn(page);
1267 1267
1268 if (!free_pages_prepare(page, order, true)) 1268 if (!free_pages_prepare(page, order, true))
1269 return; 1269 return;
1270 1270
1271 migratetype = get_pfnblock_migratetype(page, pfn); 1271 migratetype = get_pfnblock_migratetype(page, pfn);
1272 local_irq_save(flags); 1272 local_irq_save(flags);
1273 __count_vm_events(PGFREE, 1 << order); 1273 __count_vm_events(PGFREE, 1 << order);
1274 free_one_page(page_zone(page), page, pfn, order, migratetype); 1274 free_one_page(page_zone(page), page, pfn, order, migratetype);
1275 local_irq_restore(flags); 1275 local_irq_restore(flags);
1276 } 1276 }
1277 1277
1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1279 { 1279 {
1280 unsigned int nr_pages = 1 << order; 1280 unsigned int nr_pages = 1 << order;
1281 struct page *p = page; 1281 struct page *p = page;
1282 unsigned int loop; 1282 unsigned int loop;
1283 1283
1284 prefetchw(p); 1284 prefetchw(p);
1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1286 prefetchw(p + 1); 1286 prefetchw(p + 1);
1287 __ClearPageReserved(p); 1287 __ClearPageReserved(p);
1288 set_page_count(p, 0); 1288 set_page_count(p, 0);
1289 } 1289 }
1290 __ClearPageReserved(p); 1290 __ClearPageReserved(p);
1291 set_page_count(p, 0); 1291 set_page_count(p, 0);
1292 1292
1293 page_zone(page)->managed_pages += nr_pages; 1293 page_zone(page)->managed_pages += nr_pages;
1294 set_page_refcounted(page); 1294 set_page_refcounted(page);
1295 __free_pages(page, order); 1295 __free_pages(page, order);
1296 } 1296 }
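
__free_pages_boot_core() uses a prefetch-one-ahead loop: each iteration works on the current page while already prefetching the next, and the final element is handled after the loop. The shape of that idiom, sketched with GCC/Clang's __builtin_prefetch standing in for prefetchw():

    #include <stdio.h>

    int main(void)
    {
            int data[8] = { 0 };
            int *p = data;
            unsigned int n = sizeof(data) / sizeof(data[0]);

            __builtin_prefetch(p, 1);                  /* prefetch for write */
            for (unsigned int i = 0; i < n - 1; i++, p++) {
                    __builtin_prefetch(p + 1, 1);      /* start fetching the next element */
                    *p = 0;                            /* work on the current one */
            }
            *p = 0;                                    /* last element, nothing left to prefetch */

            printf("initialised %u elements\n", n);
            return 0;
    }
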
1297 1297
1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ 1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1300 1300
1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1302 1302
1303 int __meminit early_pfn_to_nid(unsigned long pfn) 1303 int __meminit early_pfn_to_nid(unsigned long pfn)
1304 { 1304 {
1305 static DEFINE_SPINLOCK(early_pfn_lock); 1305 static DEFINE_SPINLOCK(early_pfn_lock);
1306 int nid; 1306 int nid;
1307 1307
1308 spin_lock(&early_pfn_lock); 1308 spin_lock(&early_pfn_lock);
1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1310 if (nid < 0) 1310 if (nid < 0)
1311 nid = first_online_node; 1311 nid = first_online_node;
1312 spin_unlock(&early_pfn_lock); 1312 spin_unlock(&early_pfn_lock);
1313 1313
1314 return nid; 1314 return nid;
1315 } 1315 }
1316 #endif 1316 #endif
1317 1317
1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1319 static inline bool __meminit __maybe_unused 1319 static inline bool __meminit __maybe_unused
1320 meminit_pfn_in_nid(unsigned long pfn, int node, 1320 meminit_pfn_in_nid(unsigned long pfn, int node,
1321 struct mminit_pfnnid_cache *state) 1321 struct mminit_pfnnid_cache *state)
1322 { 1322 {
1323 int nid; 1323 int nid;
1324 1324
1325 nid = __early_pfn_to_nid(pfn, state); 1325 nid = __early_pfn_to_nid(pfn, state);
1326 if (nid >= 0 && nid != node) 1326 if (nid >= 0 && nid != node)
1327 return false; 1327 return false;
1328 return true; 1328 return true;
1329 } 1329 }
1330 1330
1331 /* Only safe to use early in boot when initialisation is single-threaded */ 1331 /* Only safe to use early in boot when initialisation is single-threaded */
1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1333 { 1333 {
1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); 1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1335 } 1335 }
1336 1336
1337 #else 1337 #else
1338 1338
1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1340 { 1340 {
1341 return true; 1341 return true;
1342 } 1342 }
1343 static inline bool __meminit __maybe_unused 1343 static inline bool __meminit __maybe_unused
1344 meminit_pfn_in_nid(unsigned long pfn, int node, 1344 meminit_pfn_in_nid(unsigned long pfn, int node,
1345 struct mminit_pfnnid_cache *state) 1345 struct mminit_pfnnid_cache *state)
1346 { 1346 {
1347 return true; 1347 return true;
1348 } 1348 }
1349 #endif 1349 #endif
1350 1350
1351 1351
1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn, 1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1353 unsigned int order) 1353 unsigned int order)
1354 { 1354 {
1355 if (early_page_uninitialised(pfn)) 1355 if (early_page_uninitialised(pfn))
1356 return; 1356 return;
1357 return __free_pages_boot_core(page, order); 1357 return __free_pages_boot_core(page, order);
1358 } 1358 }
1359 1359
1360 /* 1360 /*
1361 * Check that the whole of (or a subset of) a pageblock given by the interval of 1361 * Check that the whole of (or a subset of) a pageblock given by the interval of
1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it 1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1363 * with the migration or free compaction scanner. The scanners then need to 1363 * with the migration or free compaction scanner. The scanners then need to
1364 * use only pfn_valid_within() check for arches that allow holes within 1364 * use only pfn_valid_within() check for arches that allow holes within
1365 * pageblocks. 1365 * pageblocks.
1366 * 1366 *
1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1368 * 1368 *
1369 * It's possible on some configurations to have a setup like node0 node1 node0 1369 * It's possible on some configurations to have a setup like node0 node1 node0
1370 * i.e. it's possible that all pages within a zone's range of pages do not 1370 * i.e. it's possible that all pages within a zone's range of pages do not
1371 * belong to a single zone. We assume that a border between node0 and node1 1371 * belong to a single zone. We assume that a border between node0 and node1
1372 * can occur within a single pageblock, but not a node0 node1 node0 1372 * can occur within a single pageblock, but not a node0 node1 node0
1373 * interleaving within a single pageblock. It is therefore sufficient to check 1373 * interleaving within a single pageblock. It is therefore sufficient to check
1374 * the first and last page of a pageblock and avoid checking each individual 1374 * the first and last page of a pageblock and avoid checking each individual
1375 * page in a pageblock. 1375 * page in a pageblock.
1376 */ 1376 */
1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1378 unsigned long end_pfn, struct zone *zone) 1378 unsigned long end_pfn, struct zone *zone)
1379 { 1379 {
1380 struct page *start_page; 1380 struct page *start_page;
1381 struct page *end_page; 1381 struct page *end_page;
1382 1382
1383 /* end_pfn is one past the range we are checking */ 1383 /* end_pfn is one past the range we are checking */
1384 end_pfn--; 1384 end_pfn--;
1385 1385
1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1387 return NULL; 1387 return NULL;
1388 1388
1389 start_page = pfn_to_online_page(start_pfn); 1389 start_page = pfn_to_online_page(start_pfn);
1390 if (!start_page) 1390 if (!start_page)
1391 return NULL; 1391 return NULL;
1392 1392
1393 if (page_zone(start_page) != zone) 1393 if (page_zone(start_page) != zone)
1394 return NULL; 1394 return NULL;
1395 1395
1396 end_page = pfn_to_page(end_pfn); 1396 end_page = pfn_to_page(end_pfn);
1397 1397
1398 /* This gives shorter code than deriving page_zone(end_page) */ 1398 /* This gives shorter code than deriving page_zone(end_page) */
1399 if (page_zone_id(start_page) != page_zone_id(end_page)) 1399 if (page_zone_id(start_page) != page_zone_id(end_page))
1400 return NULL; 1400 return NULL;
1401 1401
1402 return start_page; 1402 return start_page;
1403 } 1403 }
1404 1404
1405 void set_zone_contiguous(struct zone *zone) 1405 void set_zone_contiguous(struct zone *zone)
1406 { 1406 {
1407 unsigned long block_start_pfn = zone->zone_start_pfn; 1407 unsigned long block_start_pfn = zone->zone_start_pfn;
1408 unsigned long block_end_pfn; 1408 unsigned long block_end_pfn;
1409 1409
1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1411 for (; block_start_pfn < zone_end_pfn(zone); 1411 for (; block_start_pfn < zone_end_pfn(zone);
1412 block_start_pfn = block_end_pfn, 1412 block_start_pfn = block_end_pfn,
1413 block_end_pfn += pageblock_nr_pages) { 1413 block_end_pfn += pageblock_nr_pages) {
1414 1414
1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1416 1416
1417 if (!__pageblock_pfn_to_page(block_start_pfn, 1417 if (!__pageblock_pfn_to_page(block_start_pfn,
1418 block_end_pfn, zone)) 1418 block_end_pfn, zone))
1419 return; 1419 return;
1420 } 1420 }
1421 1421
1422 /* No hole was found, so the whole zone is contiguous */ 1422 /* No hole was found, so the whole zone is contiguous */
1423 zone->contiguous = true; 1423 zone->contiguous = true;
1424 } 1424 }
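
set_zone_contiguous() walks the zone one pageblock at a time: the end of the first step is aligned up to a pageblock boundary and every step is clamped at the zone end. The iteration pattern on its own, with made-up zone bounds and an assumed pageblock size of 512 pages:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            const unsigned long pageblock_nr_pages = 512;     /* assumed block size */
            unsigned long zone_start = 1000, zone_end = 3000; /* made-up, unaligned bounds */

            unsigned long block_start = zone_start;
            unsigned long block_end = ALIGN(block_start + 1, pageblock_nr_pages);

            for (; block_start < zone_end;
                 block_start = block_end, block_end += pageblock_nr_pages) {
                    if (block_end > zone_end)
                            block_end = zone_end;
                    /* here the kernel would validate the block with
                     * __pageblock_pfn_to_page() and bail out on failure */
                    printf("check pageblock [%lu, %lu)\n", block_start, block_end);
            }
            return 0;
    }
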
1425 1425
1426 void clear_zone_contiguous(struct zone *zone) 1426 void clear_zone_contiguous(struct zone *zone)
1427 { 1427 {
1428 zone->contiguous = false; 1428 zone->contiguous = false;
1429 } 1429 }
1430 1430
1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1432 static void __init deferred_free_range(struct page *page, 1432 static void __init deferred_free_range(struct page *page,
1433 unsigned long pfn, int nr_pages) 1433 unsigned long pfn, int nr_pages)
1434 { 1434 {
1435 int i; 1435 int i;
1436 1436
1437 if (!page) 1437 if (!page)
1438 return; 1438 return;
1439 1439
1440 /* Free a large naturally-aligned chunk if possible */ 1440 /* Free a large naturally-aligned chunk if possible */
1441 if (nr_pages == pageblock_nr_pages && 1441 if (nr_pages == pageblock_nr_pages &&
1442 (pfn & (pageblock_nr_pages - 1)) == 0) { 1442 (pfn & (pageblock_nr_pages - 1)) == 0) {
1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1444 __free_pages_boot_core(page, pageblock_order); 1444 __free_pages_boot_core(page, pageblock_order);
1445 return; 1445 return;
1446 } 1446 }
1447 1447
1448 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1448 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1449 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1449 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1451 __free_pages_boot_core(page, 0); 1451 __free_pages_boot_core(page, 0);
1452 } 1452 }
1453 } 1453 }
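
The "naturally aligned" test above is a mask check: a pageblock-sized run can be freed as a single high-order page only if its first PFN is a multiple of pageblock_nr_pages. A tiny sketch of that check, with the block size assumed to be 512 pages:

    #include <stdbool.h>
    #include <stdio.h>

    static bool can_free_as_pageblock(unsigned long pfn, int nr_pages,
                                      unsigned long pageblock_nr_pages)
    {
            return nr_pages == (int)pageblock_nr_pages &&
                   (pfn & (pageblock_nr_pages - 1)) == 0;
    }

    int main(void)
    {
            const unsigned long pageblock_nr_pages = 512;    /* assumed block size */

            printf("%d\n", can_free_as_pageblock(0x40000, 512, pageblock_nr_pages)); /* 1 */
            printf("%d\n", can_free_as_pageblock(0x40010, 512, pageblock_nr_pages)); /* 0 */
            printf("%d\n", can_free_as_pageblock(0x40000, 100, pageblock_nr_pages)); /* 0 */
            return 0;
    }
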
1454 1454
1455 /* Completion tracking for deferred_init_memmap() threads */ 1455 /* Completion tracking for deferred_init_memmap() threads */
1456 static atomic_t pgdat_init_n_undone __initdata; 1456 static atomic_t pgdat_init_n_undone __initdata;
1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1458 1458
1459 static inline void __init pgdat_init_report_one_done(void) 1459 static inline void __init pgdat_init_report_one_done(void)
1460 { 1460 {
1461 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1461 if (atomic_dec_and_test(&pgdat_init_n_undone))
1462 complete(&pgdat_init_all_done_comp); 1462 complete(&pgdat_init_all_done_comp);
1463 } 1463 }
1464 1464
1465 /* Initialise remaining memory on a node */ 1465 /* Initialise remaining memory on a node */
1466 static int __init deferred_init_memmap(void *data) 1466 static int __init deferred_init_memmap(void *data)
1467 { 1467 {
1468 pg_data_t *pgdat = data; 1468 pg_data_t *pgdat = data;
1469 int nid = pgdat->node_id; 1469 int nid = pgdat->node_id;
1470 struct mminit_pfnnid_cache nid_init_state = { }; 1470 struct mminit_pfnnid_cache nid_init_state = { };
1471 unsigned long start = jiffies; 1471 unsigned long start = jiffies;
1472 unsigned long nr_pages = 0; 1472 unsigned long nr_pages = 0;
1473 unsigned long walk_start, walk_end; 1473 unsigned long walk_start, walk_end;
1474 int i, zid; 1474 int i, zid;
1475 struct zone *zone; 1475 struct zone *zone;
1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1478 1478
1479 if (first_init_pfn == ULONG_MAX) { 1479 if (first_init_pfn == ULONG_MAX) {
1480 pgdat_init_report_one_done(); 1480 pgdat_init_report_one_done();
1481 return 0; 1481 return 0;
1482 } 1482 }
1483 1483
1484 /* Bind memory initialisation thread to a local node if possible */ 1484 /* Bind memory initialisation thread to a local node if possible */
1485 if (!cpumask_empty(cpumask)) 1485 if (!cpumask_empty(cpumask))
1486 set_cpus_allowed_ptr(current, cpumask); 1486 set_cpus_allowed_ptr(current, cpumask);
1487 1487
1488 /* Sanity check boundaries */ 1488 /* Sanity check boundaries */
1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1491 pgdat->first_deferred_pfn = ULONG_MAX; 1491 pgdat->first_deferred_pfn = ULONG_MAX;
1492 1492
1493 /* Only the highest zone is deferred so find it */ 1493 /* Only the highest zone is deferred so find it */
1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1495 zone = pgdat->node_zones + zid; 1495 zone = pgdat->node_zones + zid;
1496 if (first_init_pfn < zone_end_pfn(zone)) 1496 if (first_init_pfn < zone_end_pfn(zone))
1497 break; 1497 break;
1498 } 1498 }
1499 1499
1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
1501 unsigned long pfn, end_pfn; 1501 unsigned long pfn, end_pfn;
1502 struct page *page = NULL; 1502 struct page *page = NULL;
1503 struct page *free_base_page = NULL; 1503 struct page *free_base_page = NULL;
1504 unsigned long free_base_pfn = 0; 1504 unsigned long free_base_pfn = 0;
1505 int nr_to_free = 0; 1505 int nr_to_free = 0;
1506 1506
1507 end_pfn = min(walk_end, zone_end_pfn(zone)); 1507 end_pfn = min(walk_end, zone_end_pfn(zone));
1508 pfn = first_init_pfn; 1508 pfn = first_init_pfn;
1509 if (pfn < walk_start) 1509 if (pfn < walk_start)
1510 pfn = walk_start; 1510 pfn = walk_start;
1511 if (pfn < zone->zone_start_pfn) 1511 if (pfn < zone->zone_start_pfn)
1512 pfn = zone->zone_start_pfn; 1512 pfn = zone->zone_start_pfn;
1513 1513
1514 for (; pfn < end_pfn; pfn++) { 1514 for (; pfn < end_pfn; pfn++) {
1515 if (!pfn_valid_within(pfn)) 1515 if (!pfn_valid_within(pfn))
1516 goto free_range; 1516 goto free_range;
1517 1517
1518 /* 1518 /*
1519 * Ensure pfn_valid is checked every 1519 * Ensure pfn_valid is checked every
1520 * pageblock_nr_pages for memory holes 1520 * pageblock_nr_pages for memory holes
1521 */ 1521 */
1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) { 1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1523 if (!pfn_valid(pfn)) { 1523 if (!pfn_valid(pfn)) {
1524 page = NULL; 1524 page = NULL;
1525 goto free_range; 1525 goto free_range;
1526 } 1526 }
1527 } 1527 }
1528 1528
1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { 1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 page = NULL; 1530 page = NULL;
1531 goto free_range; 1531 goto free_range;
1532 } 1532 }
1533 1533
1534 /* Minimise pfn page lookups and scheduler checks */ 1534 /* Minimise pfn page lookups and scheduler checks */
1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { 1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1536 page++; 1536 page++;
1537 } else { 1537 } else {
1538 nr_pages += nr_to_free; 1538 nr_pages += nr_to_free;
1539 deferred_free_range(free_base_page, 1539 deferred_free_range(free_base_page,
1540 free_base_pfn, nr_to_free); 1540 free_base_pfn, nr_to_free);
1541 free_base_page = NULL; 1541 free_base_page = NULL;
1542 free_base_pfn = nr_to_free = 0; 1542 free_base_pfn = nr_to_free = 0;
1543 1543
1544 page = pfn_to_page(pfn); 1544 page = pfn_to_page(pfn);
1545 cond_resched(); 1545 cond_resched();
1546 } 1546 }
1547 1547
1548 if (page->flags) { 1548 if (page->flags) {
1549 VM_BUG_ON(page_zone(page) != zone); 1549 VM_BUG_ON(page_zone(page) != zone);
1550 goto free_range; 1550 goto free_range;
1551 } 1551 }
1552 1552
1553 __init_single_page(page, pfn, zid, nid); 1553 __init_single_page(page, pfn, zid, nid);
1554 if (!free_base_page) { 1554 if (!free_base_page) {
1555 free_base_page = page; 1555 free_base_page = page;
1556 free_base_pfn = pfn; 1556 free_base_pfn = pfn;
1557 nr_to_free = 0; 1557 nr_to_free = 0;
1558 } 1558 }
1559 nr_to_free++; 1559 nr_to_free++;
1560 1560
1561 /* Where possible, batch up pages for a single free */ 1561 /* Where possible, batch up pages for a single free */
1562 continue; 1562 continue;
1563 free_range: 1563 free_range:
1564 /* Free the current block of pages to allocator */ 1564 /* Free the current block of pages to allocator */
1565 nr_pages += nr_to_free; 1565 nr_pages += nr_to_free;
1566 deferred_free_range(free_base_page, free_base_pfn, 1566 deferred_free_range(free_base_page, free_base_pfn,
1567 nr_to_free); 1567 nr_to_free);
1568 free_base_page = NULL; 1568 free_base_page = NULL;
1569 free_base_pfn = nr_to_free = 0; 1569 free_base_pfn = nr_to_free = 0;
1570 } 1570 }
1571 /* Free the last block of pages to allocator */ 1571 /* Free the last block of pages to allocator */
1572 nr_pages += nr_to_free; 1572 nr_pages += nr_to_free;
1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free); 1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1574 1574
1575 first_init_pfn = max(end_pfn, first_init_pfn); 1575 first_init_pfn = max(end_pfn, first_init_pfn);
1576 } 1576 }
1577 1577
1578 /* Sanity check that the next zone really is unpopulated */ 1578 /* Sanity check that the next zone really is unpopulated */
1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1580 1580
1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, 1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1582 jiffies_to_msecs(jiffies - start)); 1582 jiffies_to_msecs(jiffies - start));
1583 1583
1584 pgdat_init_report_one_done(); 1584 pgdat_init_report_one_done();
1585 return 0; 1585 return 0;
1586 } 1586 }
1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1588 1588
1589 void __init page_alloc_init_late(void) 1589 void __init page_alloc_init_late(void)
1590 { 1590 {
1591 struct zone *zone; 1591 struct zone *zone;
1592 1592
1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1594 int nid; 1594 int nid;
1595 1595
1596 /* There will be num_node_state(N_MEMORY) threads */ 1596 /* There will be num_node_state(N_MEMORY) threads */
1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1598 for_each_node_state(nid, N_MEMORY) { 1598 for_each_node_state(nid, N_MEMORY) {
1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1600 } 1600 }
1601 1601
1602 /* Block until all are initialised */ 1602 /* Block until all are initialised */
1603 wait_for_completion(&pgdat_init_all_done_comp); 1603 wait_for_completion(&pgdat_init_all_done_comp);
1604 1604
1605 /* Reinit limits that are based on free pages after the kernel is up */ 1605 /* Reinit limits that are based on free pages after the kernel is up */
1606 files_maxfiles_init(); 1606 files_maxfiles_init();
1607 #endif 1607 #endif
1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
1609 /* Discard memblock private memory */ 1609 /* Discard memblock private memory */
1610 memblock_discard(); 1610 memblock_discard();
1611 #endif 1611 #endif
1612 1612
1613 for_each_populated_zone(zone) 1613 for_each_populated_zone(zone)
1614 set_zone_contiguous(zone); 1614 set_zone_contiguous(zone);
1615 } 1615 }
1616 1616
1617 #ifdef CONFIG_CMA 1617 #ifdef CONFIG_CMA
1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1619 void __init init_cma_reserved_pageblock(struct page *page) 1619 void __init init_cma_reserved_pageblock(struct page *page)
1620 { 1620 {
1621 unsigned i = pageblock_nr_pages; 1621 unsigned i = pageblock_nr_pages;
1622 struct page *p = page; 1622 struct page *p = page;
1623 1623
1624 do { 1624 do {
1625 __ClearPageReserved(p); 1625 __ClearPageReserved(p);
1626 set_page_count(p, 0); 1626 set_page_count(p, 0);
1627 } while (++p, --i); 1627 } while (++p, --i);
1628 1628
1629 set_pageblock_migratetype(page, MIGRATE_CMA); 1629 set_pageblock_migratetype(page, MIGRATE_CMA);
1630 1630
1631 if (pageblock_order >= MAX_ORDER) { 1631 if (pageblock_order >= MAX_ORDER) {
1632 i = pageblock_nr_pages; 1632 i = pageblock_nr_pages;
1633 p = page; 1633 p = page;
1634 do { 1634 do {
1635 set_page_refcounted(p); 1635 set_page_refcounted(p);
1636 __free_pages(p, MAX_ORDER - 1); 1636 __free_pages(p, MAX_ORDER - 1);
1637 p += MAX_ORDER_NR_PAGES; 1637 p += MAX_ORDER_NR_PAGES;
1638 } while (i -= MAX_ORDER_NR_PAGES); 1638 } while (i -= MAX_ORDER_NR_PAGES);
1639 } else { 1639 } else {
1640 set_page_refcounted(page); 1640 set_page_refcounted(page);
1641 __free_pages(page, pageblock_order); 1641 __free_pages(page, pageblock_order);
1642 } 1642 }
1643 1643
1644 adjust_managed_page_count(page, pageblock_nr_pages); 1644 adjust_managed_page_count(page, pageblock_nr_pages);
1645 } 1645 }
1646 #endif 1646 #endif
1647 1647
1648 /* 1648 /*
1649 * The order of subdivision here is critical for the IO subsystem. 1649 * The order of subdivision here is critical for the IO subsystem.
1650 * Please do not alter this order without good reasons and regression 1650 * Please do not alter this order without good reasons and regression
1651 * testing. Specifically, as large blocks of memory are subdivided, 1651 * testing. Specifically, as large blocks of memory are subdivided,
1652 * the order in which smaller blocks are delivered depends on the order 1652 * the order in which smaller blocks are delivered depends on the order
1653 * they're subdivided in this function. This is the primary factor 1653 * they're subdivided in this function. This is the primary factor
1654 * influencing the order in which pages are delivered to the IO 1654 * influencing the order in which pages are delivered to the IO
1655 * subsystem according to empirical testing, and this is also justified 1655 * subsystem according to empirical testing, and this is also justified
1656 * by considering the behavior of a buddy system containing a single 1656 * by considering the behavior of a buddy system containing a single
1657 * large block of memory acted on by a series of small allocations. 1657 * large block of memory acted on by a series of small allocations.
1658 * This behavior is a critical factor in sglist merging's success. 1658 * This behavior is a critical factor in sglist merging's success.
1659 * 1659 *
1660 * -- nyc 1660 * -- nyc
1661 */ 1661 */
1662 static inline void expand(struct zone *zone, struct page *page, 1662 static inline void expand(struct zone *zone, struct page *page,
1663 int low, int high, struct free_area *area, 1663 int low, int high, struct free_area *area,
1664 int migratetype) 1664 int migratetype)
1665 { 1665 {
1666 unsigned long size = 1 << high; 1666 unsigned long size = 1 << high;
1667 1667
1668 while (high > low) { 1668 while (high > low) {
1669 area--; 1669 area--;
1670 high--; 1670 high--;
1671 size >>= 1; 1671 size >>= 1;
1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1673 1673
1674 /* 1674 /*
1675 * Mark as guard pages (or page); this allows the block to 1675 * Mark as guard pages (or page); this allows the block to
1676 * merge back into the allocator when the buddy is freed. 1676 * merge back into the allocator when the buddy is freed.
1677 * The corresponding page table entries are not touched; 1677 * The corresponding page table entries are not touched;
1678 * the pages stay not present in the virtual address space. 1678 * the pages stay not present in the virtual address space.
1679 */ 1679 */
1680 if (set_page_guard(zone, &page[size], high, migratetype)) 1680 if (set_page_guard(zone, &page[size], high, migratetype))
1681 continue; 1681 continue;
1682 1682
1683 list_add(&page[size].lru, &area->free_list[migratetype]); 1683 list_add(&page[size].lru, &area->free_list[migratetype]);
1684 area->nr_free++; 1684 area->nr_free++;
1685 set_page_order(&page[size], high); 1685 set_page_order(&page[size], high);
1686 } 1686 }
1687 } 1687 }
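
expand() repeatedly halves the block it was handed: at each step the upper half (page[size]) goes back on the free list at the reduced order and the lower half is split further, until only a block of the requested order remains. A userspace sketch of just that splitting arithmetic, with an arbitrary base PFN:

    #include <stdio.h>

    int main(void)
    {
            unsigned long base_pfn = 0x8000;   /* start of a free block, arbitrary */
            int low = 1, high = 4;             /* want order 1 out of an order-4 block */
            unsigned long size = 1UL << high;

            while (high > low) {
                    high--;
                    size >>= 1;
                    /* the upper half goes back to the order-'high' free list */
                    printf("return pfn %#lx as an order-%d block\n",
                           base_pfn + size, high);
            }
            printf("allocate pfn %#lx as an order-%d block\n", base_pfn, low);
            return 0;
    }
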
1688 1688
1689 static void check_new_page_bad(struct page *page) 1689 static void check_new_page_bad(struct page *page)
1690 { 1690 {
1691 const char *bad_reason = NULL; 1691 const char *bad_reason = NULL;
1692 unsigned long bad_flags = 0; 1692 unsigned long bad_flags = 0;
1693 1693
1694 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1694 if (unlikely(atomic_read(&page->_mapcount) != -1))
1695 bad_reason = "nonzero mapcount"; 1695 bad_reason = "nonzero mapcount";
1696 if (unlikely(page->mapping != NULL)) 1696 if (unlikely(page->mapping != NULL))
1697 bad_reason = "non-NULL mapping"; 1697 bad_reason = "non-NULL mapping";
1698 if (unlikely(page_ref_count(page) != 0)) 1698 if (unlikely(page_ref_count(page) != 0))
1699 bad_reason = "nonzero _count"; 1699 bad_reason = "nonzero _count";
1700 if (unlikely(page->flags & __PG_HWPOISON)) { 1700 if (unlikely(page->flags & __PG_HWPOISON)) {
1701 bad_reason = "HWPoisoned (hardware-corrupted)"; 1701 bad_reason = "HWPoisoned (hardware-corrupted)";
1702 bad_flags = __PG_HWPOISON; 1702 bad_flags = __PG_HWPOISON;
1703 /* Don't complain about hwpoisoned pages */ 1703 /* Don't complain about hwpoisoned pages */
1704 page_mapcount_reset(page); /* remove PageBuddy */ 1704 page_mapcount_reset(page); /* remove PageBuddy */
1705 return; 1705 return;
1706 } 1706 }
1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { 1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1710 } 1710 }
1711 #ifdef CONFIG_MEMCG 1711 #ifdef CONFIG_MEMCG
1712 if (unlikely(page->mem_cgroup)) 1712 if (unlikely(page->mem_cgroup))
1713 bad_reason = "page still charged to cgroup"; 1713 bad_reason = "page still charged to cgroup";
1714 #endif 1714 #endif
1715 bad_page(page, bad_reason, bad_flags); 1715 bad_page(page, bad_reason, bad_flags);
1716 } 1716 }
1717 1717
1718 /* 1718 /*
1719 * This page is about to be returned from the page allocator 1719 * This page is about to be returned from the page allocator
1720 */ 1720 */
1721 static inline int check_new_page(struct page *page) 1721 static inline int check_new_page(struct page *page)
1722 { 1722 {
1723 if (likely(page_expected_state(page, 1723 if (likely(page_expected_state(page,
1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1725 return 0; 1725 return 0;
1726 1726
1727 check_new_page_bad(page); 1727 check_new_page_bad(page);
1728 return 1; 1728 return 1;
1729 } 1729 }
1730 1730
1731 static inline bool free_pages_prezeroed(void) 1731 static inline bool free_pages_prezeroed(void)
1732 { 1732 {
1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && 1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1734 page_poisoning_enabled(); 1734 page_poisoning_enabled();
1735 } 1735 }
1736 1736
1737 #ifdef CONFIG_DEBUG_VM 1737 #ifdef CONFIG_DEBUG_VM
1738 static bool check_pcp_refill(struct page *page) 1738 static bool check_pcp_refill(struct page *page)
1739 { 1739 {
1740 return false; 1740 return false;
1741 } 1741 }
1742 1742
1743 static bool check_new_pcp(struct page *page) 1743 static bool check_new_pcp(struct page *page)
1744 { 1744 {
1745 return check_new_page(page); 1745 return check_new_page(page);
1746 } 1746 }
1747 #else 1747 #else
1748 static bool check_pcp_refill(struct page *page) 1748 static bool check_pcp_refill(struct page *page)
1749 { 1749 {
1750 return check_new_page(page); 1750 return check_new_page(page);
1751 } 1751 }
1752 static bool check_new_pcp(struct page *page) 1752 static bool check_new_pcp(struct page *page)
1753 { 1753 {
1754 return false; 1754 return false;
1755 } 1755 }
1756 #endif /* CONFIG_DEBUG_VM */ 1756 #endif /* CONFIG_DEBUG_VM */
1757 1757
1758 static bool check_new_pages(struct page *page, unsigned int order) 1758 static bool check_new_pages(struct page *page, unsigned int order)
1759 { 1759 {
1760 int i; 1760 int i;
1761 for (i = 0; i < (1 << order); i++) { 1761 for (i = 0; i < (1 << order); i++) {
1762 struct page *p = page + i; 1762 struct page *p = page + i;
1763 1763
1764 if (unlikely(check_new_page(p))) 1764 if (unlikely(check_new_page(p)))
1765 return true; 1765 return true;
1766 } 1766 }
1767 1767
1768 return false; 1768 return false;
1769 } 1769 }
1770 1770
1771 inline void post_alloc_hook(struct page *page, unsigned int order, 1771 inline void post_alloc_hook(struct page *page, unsigned int order,
1772 gfp_t gfp_flags) 1772 gfp_t gfp_flags)
1773 { 1773 {
1774 set_page_private(page, 0); 1774 set_page_private(page, 0);
1775 set_page_refcounted(page); 1775 set_page_refcounted(page);
1776 1776
1777 arch_alloc_page(page, order); 1777 arch_alloc_page(page, order);
1778 kernel_map_pages(page, 1 << order, 1); 1778 kernel_map_pages(page, 1 << order, 1);
1779 kernel_poison_pages(page, 1 << order, 1); 1779 kernel_poison_pages(page, 1 << order, 1);
1780 kasan_alloc_pages(page, order); 1780 kasan_alloc_pages(page, order);
1781 set_page_owner(page, order, gfp_flags); 1781 set_page_owner(page, order, gfp_flags);
1782 } 1782 }
1783 1783
1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1785 unsigned int alloc_flags) 1785 unsigned int alloc_flags)
1786 { 1786 {
1787 int i; 1787 int i;
1788 1788
1789 post_alloc_hook(page, order, gfp_flags); 1789 post_alloc_hook(page, order, gfp_flags);
1790 1790
1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) 1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
1792 for (i = 0; i < (1 << order); i++) 1792 for (i = 0; i < (1 << order); i++)
1793 clear_highpage(page + i); 1793 clear_highpage(page + i);
1794 1794
1795 if (order && (gfp_flags & __GFP_COMP)) 1795 if (order && (gfp_flags & __GFP_COMP))
1796 prep_compound_page(page, order); 1796 prep_compound_page(page, order);
1797 1797
1798 /* 1798 /*
1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1800 * allocate the page. The expectation is that the caller is taking 1800 * allocate the page. The expectation is that the caller is taking
1801 * steps that will free more memory. The caller should avoid the page 1801 * steps that will free more memory. The caller should avoid the page
1802 * being used for !PFMEMALLOC purposes. 1802 * being used for !PFMEMALLOC purposes.
1803 */ 1803 */
1804 if (alloc_flags & ALLOC_NO_WATERMARKS) 1804 if (alloc_flags & ALLOC_NO_WATERMARKS)
1805 set_page_pfmemalloc(page); 1805 set_page_pfmemalloc(page);
1806 else 1806 else
1807 clear_page_pfmemalloc(page); 1807 clear_page_pfmemalloc(page);
1808 } 1808 }
1809 1809
1810 /* 1810 /*
1811 * Go through the free lists for the given migratetype and remove 1811 * Go through the free lists for the given migratetype and remove
1812 * the smallest available page from the freelists 1812 * the smallest available page from the freelists
1813 */ 1813 */
1814 static inline 1814 static inline
1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1816 int migratetype) 1816 int migratetype)
1817 { 1817 {
1818 unsigned int current_order; 1818 unsigned int current_order;
1819 struct free_area *area; 1819 struct free_area *area;
1820 struct page *page; 1820 struct page *page;
1821 1821
1822 /* Find a page of the appropriate size in the preferred list */ 1822 /* Find a page of the appropriate size in the preferred list */
1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1824 area = &(zone->free_area[current_order]); 1824 area = &(zone->free_area[current_order]);
1825 page = list_first_entry_or_null(&area->free_list[migratetype], 1825 page = list_first_entry_or_null(&area->free_list[migratetype],
1826 struct page, lru); 1826 struct page, lru);
1827 if (!page) 1827 if (!page)
1828 continue; 1828 continue;
1829 list_del(&page->lru); 1829 list_del(&page->lru);
1830 rmv_page_order(page); 1830 rmv_page_order(page);
1831 area->nr_free--; 1831 area->nr_free--;
1832 expand(zone, page, order, current_order, area, migratetype); 1832 expand(zone, page, order, current_order, area, migratetype);
1833 set_pcppage_migratetype(page, migratetype); 1833 set_pcppage_migratetype(page, migratetype);
1834 return page; 1834 return page;
1835 } 1835 }
1836 1836
1837 return NULL; 1837 return NULL;
1838 } 1838 }
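
__rmqueue_smallest() is a smallest-fit search: start at the requested order, walk upward until some free list is non-empty, take a block from it, and let expand() return the surplus halves. Modelled with a plain array of per-order counts (the counts are invented):

    #include <stdio.h>

    #define MAX_ORDER 11

    int main(void)
    {
            int nr_free[MAX_ORDER] = { 0, 0, 0, 3, 0, 1 };  /* free blocks per order */
            unsigned int order = 1;                          /* requested order */

            for (unsigned int current_order = order;
                 current_order < MAX_ORDER; current_order++) {
                    if (!nr_free[current_order])
                            continue;
                    nr_free[current_order]--;
                    printf("took an order-%u block for an order-%u request; "
                           "expand() would return orders %u..%u to the free lists\n",
                           current_order, order, order, current_order - 1);
                    return 0;
            }
            printf("no block large enough\n");
            return 0;
    }
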
1839 1839
1840 1840
1841 /* 1841 /*
1842 * This array describes the order in which lists are fallen back to when 1842 * This array describes the order in which lists are fallen back to when
1843 * the free lists for the desired migrate type are depleted 1843 * the free lists for the desired migrate type are depleted
1844 */ 1844 */
1845 static int fallbacks[MIGRATE_TYPES][4] = { 1845 static int fallbacks[MIGRATE_TYPES][4] = {
1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
1849 #ifdef CONFIG_CMA 1849 #ifdef CONFIG_CMA
1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
1851 #endif 1851 #endif
1852 #ifdef CONFIG_MEMORY_ISOLATION 1852 #ifdef CONFIG_MEMORY_ISOLATION
1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
1854 #endif 1854 #endif
1855 }; 1855 };
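
The fallback table is read left to right: when an allocation's preferred free lists are empty, the other migratetypes are tried in the listed order, with MIGRATE_TYPES acting as a terminator. A sketch of such a table walk; the enum values and free-block counts below are local to the sketch:

    #include <stdio.h>

    enum { UNMOVABLE, RECLAIMABLE, MOVABLE, NR_REAL_TYPES, TERMINATOR = NR_REAL_TYPES };

    static const int fallbacks[NR_REAL_TYPES][4] = {
            [UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   TERMINATOR },
            [RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   TERMINATOR },
            [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, TERMINATOR },
    };

    int main(void)
    {
            int want = MOVABLE;
            int free_blocks[NR_REAL_TYPES] = { 4, 0, 0 };   /* only UNMOVABLE has free blocks */

            for (int i = 0; fallbacks[want][i] != TERMINATOR; i++) {
                    int mt = fallbacks[want][i];
                    if (free_blocks[mt]) {
                            printf("MOVABLE request falls back to migratetype %d\n", mt);
                            return 0;
                    }
            }
            printf("no fallback available\n");
            return 0;
    }
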
1856 1856
1857 #ifdef CONFIG_CMA 1857 #ifdef CONFIG_CMA
1858 static struct page *__rmqueue_cma_fallback(struct zone *zone, 1858 static struct page *__rmqueue_cma_fallback(struct zone *zone,
1859 unsigned int order) 1859 unsigned int order)
1860 { 1860 {
1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1862 } 1862 }
1863 #else 1863 #else
1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1865 unsigned int order) { return NULL; } 1865 unsigned int order) { return NULL; }
1866 #endif 1866 #endif
1867 1867
1868 /* 1868 /*
1869 * Move the free pages in a range to the free lists of the requested type. 1869 * Move the free pages in a range to the free lists of the requested type.
1870 * Note that start_page and end_page are not aligned on a pageblock 1870 * Note that start_page and end_page are not aligned on a pageblock
1871 * boundary. If alignment is required, use move_freepages_block() 1871 * boundary. If alignment is required, use move_freepages_block()
1872 */ 1872 */
1873 static int move_freepages(struct zone *zone, 1873 static int move_freepages(struct zone *zone,
1874 struct page *start_page, struct page *end_page, 1874 struct page *start_page, struct page *end_page,
1875 int migratetype, int *num_movable) 1875 int migratetype, int *num_movable)
1876 { 1876 {
1877 struct page *page; 1877 struct page *page;
1878 unsigned int order; 1878 unsigned int order;
1879 int pages_moved = 0; 1879 int pages_moved = 0;
1880 1880
1881 #ifndef CONFIG_HOLES_IN_ZONE 1881 #ifndef CONFIG_HOLES_IN_ZONE
1882 /* 1882 /*
1883 * page_zone is not safe to call in this context when 1883 * page_zone is not safe to call in this context when
1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1885 * anyway as we check zone boundaries in move_freepages_block(). 1885 * anyway as we check zone boundaries in move_freepages_block().
1886 * Remove at a later date when no bug reports exist related to 1886 * Remove at a later date when no bug reports exist related to
1887 * grouping pages by mobility 1887 * grouping pages by mobility
1888 */ 1888 */
1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); 1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1890 #endif 1890 #endif
1891 1891
1892 if (num_movable) 1892 if (num_movable)
1893 *num_movable = 0; 1893 *num_movable = 0;
1894 1894
1895 for (page = start_page; page <= end_page;) { 1895 for (page = start_page; page <= end_page;) {
1896 if (!pfn_valid_within(page_to_pfn(page))) { 1896 if (!pfn_valid_within(page_to_pfn(page))) {
1897 page++; 1897 page++;
1898 continue; 1898 continue;
1899 } 1899 }
1900 1900
1901 /* Make sure we are not inadvertently changing nodes */ 1901 /* Make sure we are not inadvertently changing nodes */
1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1903 1903
1904 if (!PageBuddy(page)) { 1904 if (!PageBuddy(page)) {
1905 /* 1905 /*
1906 * We assume that pages that could be isolated for 1906 * We assume that pages that could be isolated for
1907 * migration are movable. But we don't actually try 1907 * migration are movable. But we don't actually try
1908 * isolating, as that would be expensive. 1908 * isolating, as that would be expensive.
1909 */ 1909 */
1910 if (num_movable && 1910 if (num_movable &&
1911 (PageLRU(page) || __PageMovable(page))) 1911 (PageLRU(page) || __PageMovable(page)))
1912 (*num_movable)++; 1912 (*num_movable)++;
1913 1913
1914 page++; 1914 page++;
1915 continue; 1915 continue;
1916 } 1916 }
1917 1917
1918 order = page_order(page); 1918 order = page_order(page);
1919 list_move(&page->lru, 1919 list_move(&page->lru,
1920 &zone->free_area[order].free_list[migratetype]); 1920 &zone->free_area[order].free_list[migratetype]);
1921 page += 1 << order; 1921 page += 1 << order;
1922 pages_moved += 1 << order; 1922 pages_moved += 1 << order;
1923 } 1923 }
1924 1924
1925 return pages_moved; 1925 return pages_moved;
1926 } 1926 }
1927 1927
1928 int move_freepages_block(struct zone *zone, struct page *page, 1928 int move_freepages_block(struct zone *zone, struct page *page,
1929 int migratetype, int *num_movable) 1929 int migratetype, int *num_movable)
1930 { 1930 {
1931 unsigned long start_pfn, end_pfn; 1931 unsigned long start_pfn, end_pfn;
1932 struct page *start_page, *end_page; 1932 struct page *start_page, *end_page;
1933 1933
1934 start_pfn = page_to_pfn(page); 1934 start_pfn = page_to_pfn(page);
1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1936 start_page = pfn_to_page(start_pfn); 1936 start_page = pfn_to_page(start_pfn);
1937 end_page = start_page + pageblock_nr_pages - 1; 1937 end_page = start_page + pageblock_nr_pages - 1;
1938 end_pfn = start_pfn + pageblock_nr_pages - 1; 1938 end_pfn = start_pfn + pageblock_nr_pages - 1;
1939 1939
1940 /* Do not cross zone boundaries */ 1940 /* Do not cross zone boundaries */
1941 if (!zone_spans_pfn(zone, start_pfn)) 1941 if (!zone_spans_pfn(zone, start_pfn))
1942 start_page = page; 1942 start_page = page;
1943 if (!zone_spans_pfn(zone, end_pfn)) 1943 if (!zone_spans_pfn(zone, end_pfn))
1944 return 0; 1944 return 0;
1945 1945
1946 return move_freepages(zone, start_page, end_page, migratetype, 1946 return move_freepages(zone, start_page, end_page, migratetype,
1947 num_movable); 1947 num_movable);
1948 } 1948 }
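As a standalone sketch of the alignment arithmetic used above, assuming 512-page (order-9) pageblocks; the DEMO_ constant is an assumption for this example only.

#include <stdio.h>

#define DEMO_PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
	unsigned long pfn = 1000;
	unsigned long start_pfn = pfn & ~(DEMO_PAGEBLOCK_NR_PAGES - 1);
	unsigned long end_pfn = start_pfn + DEMO_PAGEBLOCK_NR_PAGES - 1;

	/* pfn 1000 rounds down to 512; the pageblock spans pfns 512..1023 */
	printf("pfn %lu -> pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}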
1949 1949
1950 static void change_pageblock_range(struct page *pageblock_page, 1950 static void change_pageblock_range(struct page *pageblock_page,
1951 int start_order, int migratetype) 1951 int start_order, int migratetype)
1952 { 1952 {
1953 int nr_pageblocks = 1 << (start_order - pageblock_order); 1953 int nr_pageblocks = 1 << (start_order - pageblock_order);
1954 1954
1955 while (nr_pageblocks--) { 1955 while (nr_pageblocks--) {
1956 set_pageblock_migratetype(pageblock_page, migratetype); 1956 set_pageblock_migratetype(pageblock_page, migratetype);
1957 pageblock_page += pageblock_nr_pages; 1957 pageblock_page += pageblock_nr_pages;
1958 } 1958 }
1959 } 1959 }
1960 1960
1961 /* 1961 /*
1962 * When we are falling back to another migratetype during allocation, try to 1962 * When we are falling back to another migratetype during allocation, try to
1963 * steal extra free pages from the same pageblocks to satisfy further 1963 * steal extra free pages from the same pageblocks to satisfy further
1964 * allocations, instead of polluting multiple pageblocks. 1964 * allocations, instead of polluting multiple pageblocks.
1965 * 1965 *
1966 * If we are stealing a relatively large buddy page, it is likely there will 1966 * If we are stealing a relatively large buddy page, it is likely there will
1967 * be more free pages in the pageblock, so try to steal them all. For 1967 * be more free pages in the pageblock, so try to steal them all. For
1968 * reclaimable and unmovable allocations, we steal regardless of page size, 1968 * reclaimable and unmovable allocations, we steal regardless of page size,
1969 * as fragmentation caused by those allocations polluting movable pageblocks 1969 * as fragmentation caused by those allocations polluting movable pageblocks
1970 * is worse than movable allocations stealing from unmovable and reclaimable 1970 * is worse than movable allocations stealing from unmovable and reclaimable
1971 * pageblocks. 1971 * pageblocks.
1972 */ 1972 */
1973 static bool can_steal_fallback(unsigned int order, int start_mt) 1973 static bool can_steal_fallback(unsigned int order, int start_mt)
1974 { 1974 {
1975 /* 1975 /*
1976 * This order check is intentionally kept even though the next 1976 * This order check is intentionally kept even though the next
1977 * check uses a more relaxed order. The reason is that we can 1977 * check uses a more relaxed order. The reason is that we can
1978 * actually steal the whole pageblock if this condition is met, 1978 * actually steal the whole pageblock if this condition is met,
1979 * but the check below doesn't guarantee it and is just a heuristic, 1979 * but the check below doesn't guarantee it and is just a heuristic,
1980 * so it could be changed at any time. 1980 * so it could be changed at any time.
1981 */ 1981 */
1982 if (order >= pageblock_order) 1982 if (order >= pageblock_order)
1983 return true; 1983 return true;
1984 1984
1985 if (order >= pageblock_order / 2 || 1985 if (order >= pageblock_order / 2 ||
1986 start_mt == MIGRATE_RECLAIMABLE || 1986 start_mt == MIGRATE_RECLAIMABLE ||
1987 start_mt == MIGRATE_UNMOVABLE || 1987 start_mt == MIGRATE_UNMOVABLE ||
1988 page_group_by_mobility_disabled) 1988 page_group_by_mobility_disabled)
1989 return true; 1989 return true;
1990 1990
1991 return false; 1991 return false;
1992 } 1992 }
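As a worked example, assuming pageblock_order is 9 (a common configuration): any request of order 9 or above may steal the whole pageblock; a movable request of order 4 or above also qualifies because order >= 9/2; an order-2 movable request does not (unless page_group_by_mobility_disabled is set), while an order-2 unmovable or reclaimable request still does.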
1993 1993
1994 /* 1994 /*
1995 * This function implements actual steal behaviour. If order is large enough, 1995 * This function implements actual steal behaviour. If order is large enough,
1996 * we can steal whole pageblock. If not, we first move freepages in this 1996 * we can steal whole pageblock. If not, we first move freepages in this
1997 * pageblock to our migratetype and determine how many already-allocated pages 1997 * pageblock to our migratetype and determine how many already-allocated pages
1998 * there are in the pageblock with a compatible migratetype. If at least half 1998 * there are in the pageblock with a compatible migratetype. If at least half
1999 * of pages are free or compatible, we can change migratetype of the pageblock 1999 * of pages are free or compatible, we can change migratetype of the pageblock
2000 * itself, so pages freed in the future will be put on the correct free list. 2000 * itself, so pages freed in the future will be put on the correct free list.
2001 */ 2001 */
2002 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2002 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2003 int start_type, bool whole_block) 2003 int start_type, bool whole_block)
2004 { 2004 {
2005 unsigned int current_order = page_order(page); 2005 unsigned int current_order = page_order(page);
2006 struct free_area *area; 2006 struct free_area *area;
2007 int free_pages, movable_pages, alike_pages; 2007 int free_pages, movable_pages, alike_pages;
2008 int old_block_type; 2008 int old_block_type;
2009 2009
2010 old_block_type = get_pageblock_migratetype(page); 2010 old_block_type = get_pageblock_migratetype(page);
2011 2011
2012 /* 2012 /*
2013 * This can happen due to races and we want to prevent broken 2013 * This can happen due to races and we want to prevent broken
2014 * highatomic accounting. 2014 * highatomic accounting.
2015 */ 2015 */
2016 if (is_migrate_highatomic(old_block_type)) 2016 if (is_migrate_highatomic(old_block_type))
2017 goto single_page; 2017 goto single_page;
2018 2018
2019 /* Take ownership for orders >= pageblock_order */ 2019 /* Take ownership for orders >= pageblock_order */
2020 if (current_order >= pageblock_order) { 2020 if (current_order >= pageblock_order) {
2021 change_pageblock_range(page, current_order, start_type); 2021 change_pageblock_range(page, current_order, start_type);
2022 goto single_page; 2022 goto single_page;
2023 } 2023 }
2024 2024
2025 /* We are not allowed to try stealing from the whole block */ 2025 /* We are not allowed to try stealing from the whole block */
2026 if (!whole_block) 2026 if (!whole_block)
2027 goto single_page; 2027 goto single_page;
2028 2028
2029 free_pages = move_freepages_block(zone, page, start_type, 2029 free_pages = move_freepages_block(zone, page, start_type,
2030 &movable_pages); 2030 &movable_pages);
2031 /* 2031 /*
2032 * Determine how many pages are compatible with our allocation. 2032 * Determine how many pages are compatible with our allocation.
2033 * For movable allocation, it's the number of movable pages which 2033 * For movable allocation, it's the number of movable pages which
2034 * we just obtained. For other types it's a bit more tricky. 2034 * we just obtained. For other types it's a bit more tricky.
2035 */ 2035 */
2036 if (start_type == MIGRATE_MOVABLE) { 2036 if (start_type == MIGRATE_MOVABLE) {
2037 alike_pages = movable_pages; 2037 alike_pages = movable_pages;
2038 } else { 2038 } else {
2039 /* 2039 /*
2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2041 * to MOVABLE pageblock, consider all non-movable pages as 2041 * to MOVABLE pageblock, consider all non-movable pages as
2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2043 * vice versa, be conservative since we can't distinguish the 2043 * vice versa, be conservative since we can't distinguish the
2044 * exact migratetype of non-movable pages. 2044 * exact migratetype of non-movable pages.
2045 */ 2045 */
2046 if (old_block_type == MIGRATE_MOVABLE) 2046 if (old_block_type == MIGRATE_MOVABLE)
2047 alike_pages = pageblock_nr_pages 2047 alike_pages = pageblock_nr_pages
2048 - (free_pages + movable_pages); 2048 - (free_pages + movable_pages);
2049 else 2049 else
2050 alike_pages = 0; 2050 alike_pages = 0;
2051 } 2051 }
2052 2052
2053 /* moving whole block can fail due to zone boundary conditions */ 2053 /* moving whole block can fail due to zone boundary conditions */
2054 if (!free_pages) 2054 if (!free_pages)
2055 goto single_page; 2055 goto single_page;
2056 2056
2057 /* 2057 /*
2058 * If a sufficient number of pages in the block are either free or of 2058 * If a sufficient number of pages in the block are either free or of
2059 * comparable migratability as our allocation, claim the whole block. 2059 * comparable migratability as our allocation, claim the whole block.
2060 */ 2060 */
2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2062 page_group_by_mobility_disabled) 2062 page_group_by_mobility_disabled)
2063 set_pageblock_migratetype(page, start_type); 2063 set_pageblock_migratetype(page, start_type);
2064 2064
2065 return; 2065 return;
2066 2066
2067 single_page: 2067 single_page:
2068 area = &zone->free_area[current_order]; 2068 area = &zone->free_area[current_order];
2069 list_move(&page->lru, &area->free_list[start_type]); 2069 list_move(&page->lru, &area->free_list[start_type]);
2070 } 2070 }
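As a worked example, assuming 512-page (order-9) pageblocks: the claim threshold 1 << (pageblock_order - 1) is 256, so the pageblock's migratetype is changed only when at least 256 of its 512 pages are free or counted as compatible; otherwise only the single buddy page is moved to the new free list.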
2071 2071
2072 /* 2072 /*
2073 * Check whether there is a suitable fallback freepage with requested order. 2073 * Check whether there is a suitable fallback freepage with requested order.
2074 * If only_stealable is true, this function returns fallback_mt only if 2074 * If only_stealable is true, this function returns fallback_mt only if
2075 * we can steal all the other freepages together. This would help to reduce 2075 * we can steal all the other freepages together. This would help to reduce
2076 * fragmentation due to mixed migratetype pages in one pageblock. 2076 * fragmentation due to mixed migratetype pages in one pageblock.
2077 */ 2077 */
2078 int find_suitable_fallback(struct free_area *area, unsigned int order, 2078 int find_suitable_fallback(struct free_area *area, unsigned int order,
2079 int migratetype, bool only_stealable, bool *can_steal) 2079 int migratetype, bool only_stealable, bool *can_steal)
2080 { 2080 {
2081 int i; 2081 int i;
2082 int fallback_mt; 2082 int fallback_mt;
2083 2083
2084 if (area->nr_free == 0) 2084 if (area->nr_free == 0)
2085 return -1; 2085 return -1;
2086 2086
2087 *can_steal = false; 2087 *can_steal = false;
2088 for (i = 0;; i++) { 2088 for (i = 0;; i++) {
2089 fallback_mt = fallbacks[migratetype][i]; 2089 fallback_mt = fallbacks[migratetype][i];
2090 if (fallback_mt == MIGRATE_TYPES) 2090 if (fallback_mt == MIGRATE_TYPES)
2091 break; 2091 break;
2092 2092
2093 if (list_empty(&area->free_list[fallback_mt])) 2093 if (list_empty(&area->free_list[fallback_mt]))
2094 continue; 2094 continue;
2095 2095
2096 if (can_steal_fallback(order, migratetype)) 2096 if (can_steal_fallback(order, migratetype))
2097 *can_steal = true; 2097 *can_steal = true;
2098 2098
2099 if (!only_stealable) 2099 if (!only_stealable)
2100 return fallback_mt; 2100 return fallback_mt;
2101 2101
2102 if (*can_steal) 2102 if (*can_steal)
2103 return fallback_mt; 2103 return fallback_mt;
2104 } 2104 }
2105 2105
2106 return -1; 2106 return -1;
2107 } 2107 }
2108 2108
2109 /* 2109 /*
2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2111 * there are no empty page blocks that contain a page with a suitable order 2111 * there are no empty page blocks that contain a page with a suitable order
2112 */ 2112 */
2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2114 unsigned int alloc_order) 2114 unsigned int alloc_order)
2115 { 2115 {
2116 int mt; 2116 int mt;
2117 unsigned long max_managed, flags; 2117 unsigned long max_managed, flags;
2118 2118
2119 /* 2119 /*
2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2121 * Check is race-prone but harmless. 2121 * Check is race-prone but harmless.
2122 */ 2122 */
2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; 2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2124 if (zone->nr_reserved_highatomic >= max_managed) 2124 if (zone->nr_reserved_highatomic >= max_managed)
2125 return; 2125 return;
2126 2126
2127 spin_lock_irqsave(&zone->lock, flags); 2127 spin_lock_irqsave(&zone->lock, flags);
2128 2128
2129 /* Recheck the nr_reserved_highatomic limit under the lock */ 2129 /* Recheck the nr_reserved_highatomic limit under the lock */
2130 if (zone->nr_reserved_highatomic >= max_managed) 2130 if (zone->nr_reserved_highatomic >= max_managed)
2131 goto out_unlock; 2131 goto out_unlock;
2132 2132
2133 /* Yoink! */ 2133 /* Yoink! */
2134 mt = get_pageblock_migratetype(page); 2134 mt = get_pageblock_migratetype(page);
2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) 2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2136 && !is_migrate_cma(mt)) { 2136 && !is_migrate_cma(mt)) {
2137 zone->nr_reserved_highatomic += pageblock_nr_pages; 2137 zone->nr_reserved_highatomic += pageblock_nr_pages;
2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2140 } 2140 }
2141 2141
2142 out_unlock: 2142 out_unlock:
2143 spin_unlock_irqrestore(&zone->lock, flags); 2143 spin_unlock_irqrestore(&zone->lock, flags);
2144 } 2144 }
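As a worked example, for a hypothetical zone with 1,000,000 managed pages and 512-page pageblocks, max_managed is 1,000,000/100 + 512 = 10,512 pages, i.e. roughly 20 pageblocks at most can be reserved as MIGRATE_HIGHATOMIC.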
2145 2145
2146 /* 2146 /*
2147 * Used when an allocation is about to fail under memory pressure. This 2147 * Used when an allocation is about to fail under memory pressure. This
2148 * potentially hurts the reliability of high-order allocations when under 2148 * potentially hurts the reliability of high-order allocations when under
2149 * intense memory pressure but failed atomic allocations should be easier 2149 * intense memory pressure but failed atomic allocations should be easier
2150 * to recover from than an OOM. 2150 * to recover from than an OOM.
2151 * 2151 *
2152 * If @force is true, try to unreserve a pageblock even though highatomic 2152 * If @force is true, try to unreserve a pageblock even though highatomic
2153 * pageblock is exhausted. 2153 * pageblock is exhausted.
2154 */ 2154 */
2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2156 bool force) 2156 bool force)
2157 { 2157 {
2158 struct zonelist *zonelist = ac->zonelist; 2158 struct zonelist *zonelist = ac->zonelist;
2159 unsigned long flags; 2159 unsigned long flags;
2160 struct zoneref *z; 2160 struct zoneref *z;
2161 struct zone *zone; 2161 struct zone *zone;
2162 struct page *page; 2162 struct page *page;
2163 int order; 2163 int order;
2164 bool ret; 2164 bool ret;
2165 2165
2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, 2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2167 ac->nodemask) { 2167 ac->nodemask) {
2168 /* 2168 /*
2169 * Preserve at least one pageblock unless memory pressure 2169 * Preserve at least one pageblock unless memory pressure
2170 * is really high. 2170 * is really high.
2171 */ 2171 */
2172 if (!force && zone->nr_reserved_highatomic <= 2172 if (!force && zone->nr_reserved_highatomic <=
2173 pageblock_nr_pages) 2173 pageblock_nr_pages)
2174 continue; 2174 continue;
2175 2175
2176 spin_lock_irqsave(&zone->lock, flags); 2176 spin_lock_irqsave(&zone->lock, flags);
2177 for (order = 0; order < MAX_ORDER; order++) { 2177 for (order = 0; order < MAX_ORDER; order++) {
2178 struct free_area *area = &(zone->free_area[order]); 2178 struct free_area *area = &(zone->free_area[order]);
2179 2179
2180 page = list_first_entry_or_null( 2180 page = list_first_entry_or_null(
2181 &area->free_list[MIGRATE_HIGHATOMIC], 2181 &area->free_list[MIGRATE_HIGHATOMIC],
2182 struct page, lru); 2182 struct page, lru);
2183 if (!page) 2183 if (!page)
2184 continue; 2184 continue;
2185 2185
2186 /* 2186 /*
2187 * In the page freeing path, the migratetype change is racy, so 2187 * In the page freeing path, the migratetype change is racy, so
2188 * we can encounter several free pages in a pageblock 2188 * we can encounter several free pages in a pageblock
2189 * in this loop although we changed the pageblock type 2189 * in this loop although we changed the pageblock type
2190 * from highatomic to ac->migratetype. So we should 2190 * from highatomic to ac->migratetype. So we should
2191 * adjust the count only once. 2191 * adjust the count only once.
2192 */ 2192 */
2193 if (is_migrate_highatomic_page(page)) { 2193 if (is_migrate_highatomic_page(page)) {
2194 /* 2194 /*
2195 * It should never happen but changes to 2195 * It should never happen but changes to
2196 * locking could inadvertently allow a per-cpu 2196 * locking could inadvertently allow a per-cpu
2197 * drain to add pages to MIGRATE_HIGHATOMIC 2197 * drain to add pages to MIGRATE_HIGHATOMIC
2198 * while unreserving so be safe and watch for 2198 * while unreserving so be safe and watch for
2199 * underflows. 2199 * underflows.
2200 */ 2200 */
2201 zone->nr_reserved_highatomic -= min( 2201 zone->nr_reserved_highatomic -= min(
2202 pageblock_nr_pages, 2202 pageblock_nr_pages,
2203 zone->nr_reserved_highatomic); 2203 zone->nr_reserved_highatomic);
2204 } 2204 }
2205 2205
2206 /* 2206 /*
2207 * Convert to ac->migratetype and avoid the normal 2207 * Convert to ac->migratetype and avoid the normal
2208 * pageblock stealing heuristics. Minimally, the caller 2208 * pageblock stealing heuristics. Minimally, the caller
2209 * is doing the work and needs the pages. More 2209 * is doing the work and needs the pages. More
2210 * importantly, if the block was always converted to 2210 * importantly, if the block was always converted to
2211 * MIGRATE_UNMOVABLE or another type then the number 2211 * MIGRATE_UNMOVABLE or another type then the number
2212 * of pageblocks that cannot be completely freed 2212 * of pageblocks that cannot be completely freed
2213 * may increase. 2213 * may increase.
2214 */ 2214 */
2215 set_pageblock_migratetype(page, ac->migratetype); 2215 set_pageblock_migratetype(page, ac->migratetype);
2216 ret = move_freepages_block(zone, page, ac->migratetype, 2216 ret = move_freepages_block(zone, page, ac->migratetype,
2217 NULL); 2217 NULL);
2218 if (ret) { 2218 if (ret) {
2219 spin_unlock_irqrestore(&zone->lock, flags); 2219 spin_unlock_irqrestore(&zone->lock, flags);
2220 return ret; 2220 return ret;
2221 } 2221 }
2222 } 2222 }
2223 spin_unlock_irqrestore(&zone->lock, flags); 2223 spin_unlock_irqrestore(&zone->lock, flags);
2224 } 2224 }
2225 2225
2226 return false; 2226 return false;
2227 } 2227 }
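As a worked example of the underflow guard above: if only 200 reserved pages remain accounted in nr_reserved_highatomic while pageblock_nr_pages is 512, the min() clamps the subtraction to 200, so the counter drops to zero rather than wrapping around.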
2228 2228
2229 /* 2229 /*
2230 * Try finding a free buddy page on the fallback list and put it on the free 2230 * Try finding a free buddy page on the fallback list and put it on the free
2231 * list of requested migratetype, possibly along with other pages from the same 2231 * list of requested migratetype, possibly along with other pages from the same
2232 * block, depending on fragmentation avoidance heuristics. Returns true if 2232 * block, depending on fragmentation avoidance heuristics. Returns true if
2233 * fallback was found so that __rmqueue_smallest() can grab it. 2233 * fallback was found so that __rmqueue_smallest() can grab it.
2234 * 2234 *
2235 * The use of signed ints for order and current_order is a deliberate 2235 * The use of signed ints for order and current_order is a deliberate
2236 * deviation from the rest of this file, to make the for loop 2236 * deviation from the rest of this file, to make the for loop
2237 * condition simpler. 2237 * condition simpler.
2238 */ 2238 */
2239 static inline bool 2239 static inline bool
2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2241 { 2241 {
2242 struct free_area *area; 2242 struct free_area *area;
2243 int current_order; 2243 int current_order;
2244 struct page *page; 2244 struct page *page;
2245 int fallback_mt; 2245 int fallback_mt;
2246 bool can_steal; 2246 bool can_steal;
2247 2247
2248 /* 2248 /*
2249 * Find the largest available free page in the other list. This roughly 2249 * Find the largest available free page in the other list. This roughly
2250 * approximates finding the pageblock with the most free pages, which 2250 * approximates finding the pageblock with the most free pages, which
2251 * would be too costly to do exactly. 2251 * would be too costly to do exactly.
2252 */ 2252 */
2253 for (current_order = MAX_ORDER - 1; current_order >= order; 2253 for (current_order = MAX_ORDER - 1; current_order >= order;
2254 --current_order) { 2254 --current_order) {
2255 area = &(zone->free_area[current_order]); 2255 area = &(zone->free_area[current_order]);
2256 fallback_mt = find_suitable_fallback(area, current_order, 2256 fallback_mt = find_suitable_fallback(area, current_order,
2257 start_migratetype, false, &can_steal); 2257 start_migratetype, false, &can_steal);
2258 if (fallback_mt == -1) 2258 if (fallback_mt == -1)
2259 continue; 2259 continue;
2260 2260
2261 /* 2261 /*
2262 * We cannot steal all free pages from the pageblock and the 2262 * We cannot steal all free pages from the pageblock and the
2263 * requested migratetype is movable. In that case it's better to 2263 * requested migratetype is movable. In that case it's better to
2264 * steal and split the smallest available page instead of the 2264 * steal and split the smallest available page instead of the
2265 * largest available page, because even if the next movable 2265 * largest available page, because even if the next movable
2266 * allocation falls back into a different pageblock than this 2266 * allocation falls back into a different pageblock than this
2267 * one, it won't cause permanent fragmentation. 2267 * one, it won't cause permanent fragmentation.
2268 */ 2268 */
2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2270 && current_order > order) 2270 && current_order > order)
2271 goto find_smallest; 2271 goto find_smallest;
2272 2272
2273 goto do_steal; 2273 goto do_steal;
2274 } 2274 }
2275 2275
2276 return false; 2276 return false;
2277 2277
2278 find_smallest: 2278 find_smallest:
2279 for (current_order = order; current_order < MAX_ORDER; 2279 for (current_order = order; current_order < MAX_ORDER;
2280 current_order++) { 2280 current_order++) {
2281 area = &(zone->free_area[current_order]); 2281 area = &(zone->free_area[current_order]);
2282 fallback_mt = find_suitable_fallback(area, current_order, 2282 fallback_mt = find_suitable_fallback(area, current_order,
2283 start_migratetype, false, &can_steal); 2283 start_migratetype, false, &can_steal);
2284 if (fallback_mt != -1) 2284 if (fallback_mt != -1)
2285 break; 2285 break;
2286 } 2286 }
2287 2287
2288 /* 2288 /*
2289 * This should not happen - we already found a suitable fallback 2289 * This should not happen - we already found a suitable fallback
2290 * when looking for the largest page. 2290 * when looking for the largest page.
2291 */ 2291 */
2292 VM_BUG_ON(current_order == MAX_ORDER); 2292 VM_BUG_ON(current_order == MAX_ORDER);
2293 2293
2294 do_steal: 2294 do_steal:
2295 page = list_first_entry(&area->free_list[fallback_mt], 2295 page = list_first_entry(&area->free_list[fallback_mt],
2296 struct page, lru); 2296 struct page, lru);
2297 2297
2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal); 2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2299 2299
2300 trace_mm_page_alloc_extfrag(page, order, current_order, 2300 trace_mm_page_alloc_extfrag(page, order, current_order,
2301 start_migratetype, fallback_mt); 2301 start_migratetype, fallback_mt);
2302 2302
2303 return true; 2303 return true;
2304 2304
2305 } 2305 }
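To illustrate the two search directions above: the search always starts at MAX_ORDER - 1 and walks downwards so that stealing pulls in as much of the foreign pageblock as possible; only a movable request that cannot steal the whole block jumps to find_smallest and walks upwards from the requested order, splitting the smallest suitable page instead.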
2306 2306
2307 /* 2307 /*
2308 * Do the hard work of removing an element from the buddy allocator. 2308 * Do the hard work of removing an element from the buddy allocator.
2309 * Call me with the zone->lock already held. 2309 * Call me with the zone->lock already held.
2310 */ 2310 */
2311 static struct page *__rmqueue(struct zone *zone, unsigned int order, 2311 static struct page *__rmqueue(struct zone *zone, unsigned int order,
2312 int migratetype) 2312 int migratetype)
2313 { 2313 {
2314 struct page *page; 2314 struct page *page;
2315 2315
2316 retry: 2316 retry:
2317 page = __rmqueue_smallest(zone, order, migratetype); 2317 page = __rmqueue_smallest(zone, order, migratetype);
2318 if (unlikely(!page)) { 2318 if (unlikely(!page)) {
2319 if (migratetype == MIGRATE_MOVABLE) 2319 if (migratetype == MIGRATE_MOVABLE)
2320 page = __rmqueue_cma_fallback(zone, order); 2320 page = __rmqueue_cma_fallback(zone, order);
2321 2321
2322 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2322 if (!page && __rmqueue_fallback(zone, order, migratetype))
2323 goto retry; 2323 goto retry;
2324 } 2324 }
2325 2325
2326 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2326 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2327 return page; 2327 return page;
2328 } 2328 }
2329 2329
2330 /* 2330 /*
2331 * Obtain a specified number of elements from the buddy allocator, all under 2331 * Obtain a specified number of elements from the buddy allocator, all under
2332 * a single hold of the lock, for efficiency. Add them to the supplied list. 2332 * a single hold of the lock, for efficiency. Add them to the supplied list.
2333 * Returns the number of new pages which were placed at *list. 2333 * Returns the number of new pages which were placed at *list.
2334 */ 2334 */
2335 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2335 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2336 unsigned long count, struct list_head *list, 2336 unsigned long count, struct list_head *list,
2337 int migratetype, bool cold) 2337 int migratetype, bool cold)
2338 { 2338 {
2339 int i, alloced = 0; 2339 int i, alloced = 0;
2340 2340
2341 spin_lock(&zone->lock); 2341 spin_lock(&zone->lock);
2342 for (i = 0; i < count; ++i) { 2342 for (i = 0; i < count; ++i) {
2343 struct page *page = __rmqueue(zone, order, migratetype); 2343 struct page *page = __rmqueue(zone, order, migratetype);
2344 if (unlikely(page == NULL)) 2344 if (unlikely(page == NULL))
2345 break; 2345 break;
2346 2346
2347 if (unlikely(check_pcp_refill(page))) 2347 if (unlikely(check_pcp_refill(page)))
2348 continue; 2348 continue;
2349 2349
2350 /* 2350 /*
2351 * Split buddy pages returned by expand() are received here 2351 * Split buddy pages returned by expand() are received here
2352 * in physical page order. The page is added to the caller's 2352 * in physical page order. The page is added to the caller's
2353 * list and the list head then moves forward. From the caller's 2353 * list and the list head then moves forward. From the caller's
2354 * perspective, the linked list is ordered by page number in 2354 * perspective, the linked list is ordered by page number in
2355 * some conditions. This is useful for IO devices that can 2355 * some conditions. This is useful for IO devices that can
2356 * merge IO requests if the physical pages are ordered 2356 * merge IO requests if the physical pages are ordered
2357 * properly. 2357 * properly.
2358 */ 2358 */
2359 if (likely(!cold)) 2359 if (likely(!cold))
2360 list_add(&page->lru, list); 2360 list_add(&page->lru, list);
2361 else 2361 else
2362 list_add_tail(&page->lru, list); 2362 list_add_tail(&page->lru, list);
2363 list = &page->lru; 2363 list = &page->lru;
2364 alloced++; 2364 alloced++;
2365 if (is_migrate_cma(get_pcppage_migratetype(page))) 2365 if (is_migrate_cma(get_pcppage_migratetype(page)))
2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2367 -(1 << order)); 2367 -(1 << order));
2368 } 2368 }
2369 2369
2370 /* 2370 /*
2371 * i pages were removed from the buddy list even if some leak due 2371 * i pages were removed from the buddy list even if some leak due
2372 * to check_pcp_refill failing so adjust NR_FREE_PAGES based 2372 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2373 * on i. Do not confuse with 'alloced' which is the number of 2373 * on i. Do not confuse with 'alloced' which is the number of
2374 * pages added to the pcp list. 2374 * pages added to the pcp list.
2375 */ 2375 */
2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2377 spin_unlock(&zone->lock); 2377 spin_unlock(&zone->lock);
2378 return alloced; 2378 return alloced;
2379 } 2379 }
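As a worked example of the accounting above: if count is 31 and two of the removed pages fail check_pcp_refill(), 'alloced' ends up at 29, yet NR_FREE_PAGES is still decreased by the full 31 << order because all 31 pages left the buddy lists.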
2380 2380
2381 #ifdef CONFIG_NUMA 2381 #ifdef CONFIG_NUMA
2382 /* 2382 /*
2383 * Called from the vmstat counter updater to drain pagesets of this 2383 * Called from the vmstat counter updater to drain pagesets of this
2384 * currently executing processor on remote nodes after they have 2384 * currently executing processor on remote nodes after they have
2385 * expired. 2385 * expired.
2386 * 2386 *
2387 * Note that this function must be called with the thread pinned to 2387 * Note that this function must be called with the thread pinned to
2388 * a single processor. 2388 * a single processor.
2389 */ 2389 */
2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2391 { 2391 {
2392 unsigned long flags; 2392 unsigned long flags;
2393 int to_drain, batch; 2393 int to_drain, batch;
2394 2394
2395 local_irq_save(flags); 2395 local_irq_save(flags);
2396 batch = READ_ONCE(pcp->batch); 2396 batch = READ_ONCE(pcp->batch);
2397 to_drain = min(pcp->count, batch); 2397 to_drain = min(pcp->count, batch);
2398 if (to_drain > 0) { 2398 if (to_drain > 0) {
2399 free_pcppages_bulk(zone, to_drain, pcp); 2399 free_pcppages_bulk(zone, to_drain, pcp);
2400 pcp->count -= to_drain; 2400 pcp->count -= to_drain;
2401 } 2401 }
2402 local_irq_restore(flags); 2402 local_irq_restore(flags);
2403 } 2403 }
2404 #endif 2404 #endif
2405 2405
2406 /* 2406 /*
2407 * Drain pcplists of the indicated processor and zone. 2407 * Drain pcplists of the indicated processor and zone.
2408 * 2408 *
2409 * The processor must either be the current processor and the 2409 * The processor must either be the current processor and the
2410 * thread pinned to the current processor or a processor that 2410 * thread pinned to the current processor or a processor that
2411 * is not online. 2411 * is not online.
2412 */ 2412 */
2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2414 { 2414 {
2415 unsigned long flags; 2415 unsigned long flags;
2416 struct per_cpu_pageset *pset; 2416 struct per_cpu_pageset *pset;
2417 struct per_cpu_pages *pcp; 2417 struct per_cpu_pages *pcp;
2418 2418
2419 local_irq_save(flags); 2419 local_irq_save(flags);
2420 pset = per_cpu_ptr(zone->pageset, cpu); 2420 pset = per_cpu_ptr(zone->pageset, cpu);
2421 2421
2422 pcp = &pset->pcp; 2422 pcp = &pset->pcp;
2423 if (pcp->count) { 2423 if (pcp->count) {
2424 free_pcppages_bulk(zone, pcp->count, pcp); 2424 free_pcppages_bulk(zone, pcp->count, pcp);
2425 pcp->count = 0; 2425 pcp->count = 0;
2426 } 2426 }
2427 local_irq_restore(flags); 2427 local_irq_restore(flags);
2428 } 2428 }
2429 2429
2430 /* 2430 /*
2431 * Drain pcplists of all zones on the indicated processor. 2431 * Drain pcplists of all zones on the indicated processor.
2432 * 2432 *
2433 * The processor must either be the current processor and the 2433 * The processor must either be the current processor and the
2434 * thread pinned to the current processor or a processor that 2434 * thread pinned to the current processor or a processor that
2435 * is not online. 2435 * is not online.
2436 */ 2436 */
2437 static void drain_pages(unsigned int cpu) 2437 static void drain_pages(unsigned int cpu)
2438 { 2438 {
2439 struct zone *zone; 2439 struct zone *zone;
2440 2440
2441 for_each_populated_zone(zone) { 2441 for_each_populated_zone(zone) {
2442 drain_pages_zone(cpu, zone); 2442 drain_pages_zone(cpu, zone);
2443 } 2443 }
2444 } 2444 }
2445 2445
2446 /* 2446 /*
2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2448 * 2448 *
2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2450 * the single zone's pages. 2450 * the single zone's pages.
2451 */ 2451 */
2452 void drain_local_pages(struct zone *zone) 2452 void drain_local_pages(struct zone *zone)
2453 { 2453 {
2454 int cpu = smp_processor_id(); 2454 int cpu = smp_processor_id();
2455 2455
2456 if (zone) 2456 if (zone)
2457 drain_pages_zone(cpu, zone); 2457 drain_pages_zone(cpu, zone);
2458 else 2458 else
2459 drain_pages(cpu); 2459 drain_pages(cpu);
2460 } 2460 }
2461 2461
2462 static void drain_local_pages_wq(struct work_struct *work) 2462 static void drain_local_pages_wq(struct work_struct *work)
2463 { 2463 {
2464 /* 2464 /*
2465 * drain_all_pages doesn't use proper cpu hotplug protection so 2465 * drain_all_pages doesn't use proper cpu hotplug protection so
2466 * we can race with cpu offline when the WQ can move this from 2466 * we can race with cpu offline when the WQ can move this from
2467 * a cpu pinned worker to an unbound one. We can operate on a different 2467 * a cpu pinned worker to an unbound one. We can operate on a different
2468 * cpu, which is all right, but we also have to make sure not to move to 2468 * cpu, which is all right, but we also have to make sure not to move to
2469 * a different one. 2469 * a different one.
2470 */ 2470 */
2471 preempt_disable(); 2471 preempt_disable();
2472 drain_local_pages(NULL); 2472 drain_local_pages(NULL);
2473 preempt_enable(); 2473 preempt_enable();
2474 } 2474 }
2475 2475
2476 /* 2476 /*
2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2478 * 2478 *
2479 * When zone parameter is non-NULL, spill just the single zone's pages. 2479 * When zone parameter is non-NULL, spill just the single zone's pages.
2480 * 2480 *
2481 * Note that this can be extremely slow as the draining happens in a workqueue. 2481 * Note that this can be extremely slow as the draining happens in a workqueue.
2482 */ 2482 */
2483 void drain_all_pages(struct zone *zone) 2483 void drain_all_pages(struct zone *zone)
2484 { 2484 {
2485 int cpu; 2485 int cpu;
2486 2486
2487 /* 2487 /*
2488 * Allocate in the BSS so we won't require allocation in 2488 * Allocate in the BSS so we won't require allocation in
2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2490 */ 2490 */
2491 static cpumask_t cpus_with_pcps; 2491 static cpumask_t cpus_with_pcps;
2492 2492
2493 /* 2493 /*
2494 * Make sure nobody triggers this path before mm_percpu_wq is fully 2494 * Make sure nobody triggers this path before mm_percpu_wq is fully
2495 * initialized. 2495 * initialized.
2496 */ 2496 */
2497 if (WARN_ON_ONCE(!mm_percpu_wq)) 2497 if (WARN_ON_ONCE(!mm_percpu_wq))
2498 return; 2498 return;
2499 2499
2500 /* 2500 /*
2501 * Do not drain if one is already in progress unless it's specific to 2501 * Do not drain if one is already in progress unless it's specific to
2502 * a zone. Such callers are primarily CMA and memory hotplug and need 2502 * a zone. Such callers are primarily CMA and memory hotplug and need
2503 * the drain to be complete when the call returns. 2503 * the drain to be complete when the call returns.
2504 */ 2504 */
2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2506 if (!zone) 2506 if (!zone)
2507 return; 2507 return;
2508 mutex_lock(&pcpu_drain_mutex); 2508 mutex_lock(&pcpu_drain_mutex);
2509 } 2509 }
2510 2510
2511 /* 2511 /*
2512 * We don't care about racing with the CPU hotplug event, 2512 * We don't care about racing with the CPU hotplug event,
2513 * as the offline notification will cause the notified 2513 * as the offline notification will cause the notified
2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask 2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask
2515 * disables preemption as part of its processing 2515 * disables preemption as part of its processing
2516 */ 2516 */
2517 for_each_online_cpu(cpu) { 2517 for_each_online_cpu(cpu) {
2518 struct per_cpu_pageset *pcp; 2518 struct per_cpu_pageset *pcp;
2519 struct zone *z; 2519 struct zone *z;
2520 bool has_pcps = false; 2520 bool has_pcps = false;
2521 2521
2522 if (zone) { 2522 if (zone) {
2523 pcp = per_cpu_ptr(zone->pageset, cpu); 2523 pcp = per_cpu_ptr(zone->pageset, cpu);
2524 if (pcp->pcp.count) 2524 if (pcp->pcp.count)
2525 has_pcps = true; 2525 has_pcps = true;
2526 } else { 2526 } else {
2527 for_each_populated_zone(z) { 2527 for_each_populated_zone(z) {
2528 pcp = per_cpu_ptr(z->pageset, cpu); 2528 pcp = per_cpu_ptr(z->pageset, cpu);
2529 if (pcp->pcp.count) { 2529 if (pcp->pcp.count) {
2530 has_pcps = true; 2530 has_pcps = true;
2531 break; 2531 break;
2532 } 2532 }
2533 } 2533 }
2534 } 2534 }
2535 2535
2536 if (has_pcps) 2536 if (has_pcps)
2537 cpumask_set_cpu(cpu, &cpus_with_pcps); 2537 cpumask_set_cpu(cpu, &cpus_with_pcps);
2538 else 2538 else
2539 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2539 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2540 } 2540 }
2541 2541
2542 for_each_cpu(cpu, &cpus_with_pcps) { 2542 for_each_cpu(cpu, &cpus_with_pcps) {
2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2544 INIT_WORK(work, drain_local_pages_wq); 2544 INIT_WORK(work, drain_local_pages_wq);
2545 queue_work_on(cpu, mm_percpu_wq, work); 2545 queue_work_on(cpu, mm_percpu_wq, work);
2546 } 2546 }
2547 for_each_cpu(cpu, &cpus_with_pcps) 2547 for_each_cpu(cpu, &cpus_with_pcps)
2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2549 2549
2550 mutex_unlock(&pcpu_drain_mutex); 2550 mutex_unlock(&pcpu_drain_mutex);
2551 } 2551 }
2552 2552
2553 #ifdef CONFIG_HIBERNATION 2553 #ifdef CONFIG_HIBERNATION
2554 2554
2555 /* 2555 /*
2556 * Touch the watchdog for every WD_PAGE_COUNT pages. 2556 * Touch the watchdog for every WD_PAGE_COUNT pages.
2557 */ 2557 */
2558 #define WD_PAGE_COUNT (128*1024) 2558 #define WD_PAGE_COUNT (128*1024)
2559 2559
2560 void mark_free_pages(struct zone *zone) 2560 void mark_free_pages(struct zone *zone)
2561 { 2561 {
2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2563 unsigned long flags; 2563 unsigned long flags;
2564 unsigned int order, t; 2564 unsigned int order, t;
2565 struct page *page; 2565 struct page *page;
2566 2566
2567 if (zone_is_empty(zone)) 2567 if (zone_is_empty(zone))
2568 return; 2568 return;
2569 2569
2570 spin_lock_irqsave(&zone->lock, flags); 2570 spin_lock_irqsave(&zone->lock, flags);
2571 2571
2572 max_zone_pfn = zone_end_pfn(zone); 2572 max_zone_pfn = zone_end_pfn(zone);
2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2574 if (pfn_valid(pfn)) { 2574 if (pfn_valid(pfn)) {
2575 page = pfn_to_page(pfn); 2575 page = pfn_to_page(pfn);
2576 2576
2577 if (!--page_count) { 2577 if (!--page_count) {
2578 touch_nmi_watchdog(); 2578 touch_nmi_watchdog();
2579 page_count = WD_PAGE_COUNT; 2579 page_count = WD_PAGE_COUNT;
2580 } 2580 }
2581 2581
2582 if (page_zone(page) != zone) 2582 if (page_zone(page) != zone)
2583 continue; 2583 continue;
2584 2584
2585 if (!swsusp_page_is_forbidden(page)) 2585 if (!swsusp_page_is_forbidden(page))
2586 swsusp_unset_page_free(page); 2586 swsusp_unset_page_free(page);
2587 } 2587 }
2588 2588
2589 for_each_migratetype_order(order, t) { 2589 for_each_migratetype_order(order, t) {
2590 list_for_each_entry(page, 2590 list_for_each_entry(page,
2591 &zone->free_area[order].free_list[t], lru) { 2591 &zone->free_area[order].free_list[t], lru) {
2592 unsigned long i; 2592 unsigned long i;
2593 2593
2594 pfn = page_to_pfn(page); 2594 pfn = page_to_pfn(page);
2595 for (i = 0; i < (1UL << order); i++) { 2595 for (i = 0; i < (1UL << order); i++) {
2596 if (!--page_count) { 2596 if (!--page_count) {
2597 touch_nmi_watchdog(); 2597 touch_nmi_watchdog();
2598 page_count = WD_PAGE_COUNT; 2598 page_count = WD_PAGE_COUNT;
2599 } 2599 }
2600 swsusp_set_page_free(pfn_to_page(pfn + i)); 2600 swsusp_set_page_free(pfn_to_page(pfn + i));
2601 } 2601 }
2602 } 2602 }
2603 } 2603 }
2604 spin_unlock_irqrestore(&zone->lock, flags); 2604 spin_unlock_irqrestore(&zone->lock, flags);
2605 } 2605 }
2606 #endif /* CONFIG_HIBERNATION */ 2606 #endif /* CONFIG_HIBERNATION */
2607 2607
2608 /* 2608 /*
2609 * Free a 0-order page 2609 * Free a 0-order page
2610 * cold == true ? free a cold page : free a hot page 2610 * cold == true ? free a cold page : free a hot page
2611 */ 2611 */
2612 void free_hot_cold_page(struct page *page, bool cold) 2612 void free_hot_cold_page(struct page *page, bool cold)
2613 { 2613 {
2614 struct zone *zone = page_zone(page); 2614 struct zone *zone = page_zone(page);
2615 struct per_cpu_pages *pcp; 2615 struct per_cpu_pages *pcp;
2616 unsigned long flags; 2616 unsigned long flags;
2617 unsigned long pfn = page_to_pfn(page); 2617 unsigned long pfn = page_to_pfn(page);
2618 int migratetype; 2618 int migratetype;
2619 2619
2620 if (!free_pcp_prepare(page)) 2620 if (!free_pcp_prepare(page))
2621 return; 2621 return;
2622 2622
2623 migratetype = get_pfnblock_migratetype(page, pfn); 2623 migratetype = get_pfnblock_migratetype(page, pfn);
2624 set_pcppage_migratetype(page, migratetype); 2624 set_pcppage_migratetype(page, migratetype);
2625 local_irq_save(flags); 2625 local_irq_save(flags);
2626 __count_vm_event(PGFREE); 2626 __count_vm_event(PGFREE);
2627 2627
2628 /* 2628 /*
2629 * We only track unmovable, reclaimable and movable on pcp lists. 2629 * We only track unmovable, reclaimable and movable on pcp lists.
2630 * Free ISOLATE pages back to the allocator because they are being 2630 * Free ISOLATE pages back to the allocator because they are being
2631 * offlined, but treat HIGHATOMIC as movable pages so we can get those 2631 * offlined, but treat HIGHATOMIC as movable pages so we can get those
2632 * areas back if necessary. Otherwise, we may have to free 2632 * areas back if necessary. Otherwise, we may have to free
2633 * excessively into the page allocator. 2633 * excessively into the page allocator.
2634 */ 2634 */
2635 if (migratetype >= MIGRATE_PCPTYPES) { 2635 if (migratetype >= MIGRATE_PCPTYPES) {
2636 if (unlikely(is_migrate_isolate(migratetype))) { 2636 if (unlikely(is_migrate_isolate(migratetype))) {
2637 free_one_page(zone, page, pfn, 0, migratetype); 2637 free_one_page(zone, page, pfn, 0, migratetype);
2638 goto out; 2638 goto out;
2639 } 2639 }
2640 migratetype = MIGRATE_MOVABLE; 2640 migratetype = MIGRATE_MOVABLE;
2641 } 2641 }
2642 2642
2643 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2643 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2644 if (!cold) 2644 if (!cold)
2645 list_add(&page->lru, &pcp->lists[migratetype]); 2645 list_add(&page->lru, &pcp->lists[migratetype]);
2646 else 2646 else
2647 list_add_tail(&page->lru, &pcp->lists[migratetype]); 2647 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2648 pcp->count++; 2648 pcp->count++;
2649 if (pcp->count >= pcp->high) { 2649 if (pcp->count >= pcp->high) {
2650 unsigned long batch = READ_ONCE(pcp->batch); 2650 unsigned long batch = READ_ONCE(pcp->batch);
2651 free_pcppages_bulk(zone, batch, pcp); 2651 free_pcppages_bulk(zone, batch, pcp);
2652 pcp->count -= batch; 2652 pcp->count -= batch;
2653 } 2653 }
2654 2654
2655 out: 2655 out:
2656 local_irq_restore(flags); 2656 local_irq_restore(flags);
2657 } 2657 }
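As a worked example of the drain above, with hypothetical values pcp->high = 90 and pcp->batch = 30: the free that takes pcp->count to 90 triggers free_pcppages_bulk() for 30 pages, leaving 60 pages on the per-cpu list.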
2658 2658
2659 /* 2659 /*
2660 * Free a list of 0-order pages 2660 * Free a list of 0-order pages
2661 */ 2661 */
2662 void free_hot_cold_page_list(struct list_head *list, bool cold) 2662 void free_hot_cold_page_list(struct list_head *list, bool cold)
2663 { 2663 {
2664 struct page *page, *next; 2664 struct page *page, *next;
2665 2665
2666 list_for_each_entry_safe(page, next, list, lru) { 2666 list_for_each_entry_safe(page, next, list, lru) {
2667 trace_mm_page_free_batched(page, cold); 2667 trace_mm_page_free_batched(page, cold);
2668 free_hot_cold_page(page, cold); 2668 free_hot_cold_page(page, cold);
2669 } 2669 }
2670 } 2670 }
2671 2671
2672 /* 2672 /*
2673 * split_page takes a non-compound higher-order page, and splits it into 2673 * split_page takes a non-compound higher-order page, and splits it into
2674 * n (1<<order) sub-pages: page[0..n-1]. 2674 * n (1<<order) sub-pages: page[0..n-1].
2675 * Each sub-page must be freed individually. 2675 * Each sub-page must be freed individually.
2676 * 2676 *
2677 * Note: this is probably too low level an operation for use in drivers. 2677 * Note: this is probably too low level an operation for use in drivers.
2678 * Please consult with lkml before using this in your driver. 2678 * Please consult with lkml before using this in your driver.
2679 */ 2679 */
2680 void split_page(struct page *page, unsigned int order) 2680 void split_page(struct page *page, unsigned int order)
2681 { 2681 {
2682 int i; 2682 int i;
2683 2683
2684 VM_BUG_ON_PAGE(PageCompound(page), page); 2684 VM_BUG_ON_PAGE(PageCompound(page), page);
2685 VM_BUG_ON_PAGE(!page_count(page), page); 2685 VM_BUG_ON_PAGE(!page_count(page), page);
2686 2686
2687 for (i = 1; i < (1 << order); i++) 2687 for (i = 1; i < (1 << order); i++)
2688 set_page_refcounted(page + i); 2688 set_page_refcounted(page + i);
2689 split_page_owner(page, order); 2689 split_page_owner(page, order);
2690 } 2690 }
2691 EXPORT_SYMBOL_GPL(split_page); 2691 EXPORT_SYMBOL_GPL(split_page);
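A minimal caller sketch (kernel context assumed, not taken from this file) showing the intended pattern: allocate a higher-order block, split it, then free each order-0 sub-page individually, as the comment above requires.

	struct page *page;
	int i;

	page = alloc_pages(GFP_KERNEL, 2);	/* order-2 block: 4 pages */
	if (page) {
		split_page(page, 2);		/* page[0..3] become independent */
		for (i = 0; i < (1 << 2); i++)
			__free_page(page + i);	/* each sub-page freed on its own */
	}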
2692 2692
2693 int __isolate_free_page(struct page *page, unsigned int order) 2693 int __isolate_free_page(struct page *page, unsigned int order)
2694 { 2694 {
2695 unsigned long watermark; 2695 unsigned long watermark;
2696 struct zone *zone; 2696 struct zone *zone;
2697 int mt; 2697 int mt;
2698 2698
2699 BUG_ON(!PageBuddy(page)); 2699 BUG_ON(!PageBuddy(page));
2700 2700
2701 zone = page_zone(page); 2701 zone = page_zone(page);
2702 mt = get_pageblock_migratetype(page); 2702 mt = get_pageblock_migratetype(page);
2703 2703
2704 if (!is_migrate_isolate(mt)) { 2704 if (!is_migrate_isolate(mt)) {
2705 /* 2705 /*
2706 * Obey watermarks as if the page was being allocated. We can 2706 * Obey watermarks as if the page was being allocated. We can
2707 * emulate a high-order watermark check with a raised order-0 2707 * emulate a high-order watermark check with a raised order-0
2708 * watermark, because we already know our high-order page 2708 * watermark, because we already know our high-order page
2709 * exists. 2709 * exists.
2710 */ 2710 */
2711 watermark = min_wmark_pages(zone) + (1UL << order); 2711 watermark = min_wmark_pages(zone) + (1UL << order);
2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2713 return 0; 2713 return 0;
2714 2714
2715 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2715 __mod_zone_freepage_state(zone, -(1UL << order), mt);
2716 } 2716 }
2717 2717
2718 /* Remove page from free list */ 2718 /* Remove page from free list */
2719 list_del(&page->lru); 2719 list_del(&page->lru);
2720 zone->free_area[order].nr_free--; 2720 zone->free_area[order].nr_free--;
2721 rmv_page_order(page); 2721 rmv_page_order(page);
2722 2722
2723 /* 2723 /*
2724 * Set the pageblock if the isolated page is at least half of a 2724 * Set the pageblock if the isolated page is at least half of a
2725 * pageblock 2725 * pageblock
2726 */ 2726 */
2727 if (order >= pageblock_order - 1) { 2727 if (order >= pageblock_order - 1) {
2728 struct page *endpage = page + (1 << order) - 1; 2728 struct page *endpage = page + (1 << order) - 1;
2729 for (; page < endpage; page += pageblock_nr_pages) { 2729 for (; page < endpage; page += pageblock_nr_pages) {
2730 int mt = get_pageblock_migratetype(page); 2730 int mt = get_pageblock_migratetype(page);
2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2732 && !is_migrate_highatomic(mt)) 2732 && !is_migrate_highatomic(mt))
2733 set_pageblock_migratetype(page, 2733 set_pageblock_migratetype(page,
2734 MIGRATE_MOVABLE); 2734 MIGRATE_MOVABLE);
2735 } 2735 }
2736 } 2736 }
2737 2737
2738 2738
2739 return 1UL << order; 2739 return 1UL << order;
2740 } 2740 }
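As a worked example of the watermark emulation above: isolating an order-3 page raises the order-0 check by 1 << 3 = 8 pages above the zone's min watermark, which stands in for a high-order watermark check since the high-order page is already known to exist.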
2741 2741
2742 /* 2742 /*
2743 * Update NUMA hit/miss statistics 2743 * Update NUMA hit/miss statistics
2744 * 2744 *
2745 * Must be called with interrupts disabled. 2745 * Must be called with interrupts disabled.
2746 */ 2746 */
2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2748 { 2748 {
2749 #ifdef CONFIG_NUMA 2749 #ifdef CONFIG_NUMA
2750 enum numa_stat_item local_stat = NUMA_LOCAL; 2750 enum numa_stat_item local_stat = NUMA_LOCAL;
2751 2751
2752 if (z->node != numa_node_id()) 2752 if (z->node != numa_node_id())
2753 local_stat = NUMA_OTHER; 2753 local_stat = NUMA_OTHER;
2754 2754
2755 if (z->node == preferred_zone->node) 2755 if (z->node == preferred_zone->node)
2756 __inc_numa_state(z, NUMA_HIT); 2756 __inc_numa_state(z, NUMA_HIT);
2757 else { 2757 else {
2758 __inc_numa_state(z, NUMA_MISS); 2758 __inc_numa_state(z, NUMA_MISS);
2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN); 2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
2760 } 2760 }
2761 __inc_numa_state(z, local_stat); 2761 __inc_numa_state(z, local_stat);
2762 #endif 2762 #endif
2763 } 2763 }
2764 2764
2765 /* Remove page from the per-cpu list, caller must protect the list */ 2765 /* Remove page from the per-cpu list, caller must protect the list */
2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 bool cold, struct per_cpu_pages *pcp, 2767 bool cold, struct per_cpu_pages *pcp,
2768 struct list_head *list) 2768 struct list_head *list)
2769 { 2769 {
2770 struct page *page; 2770 struct page *page;
2771 2771
2772 do { 2772 do {
2773 if (list_empty(list)) { 2773 if (list_empty(list)) {
2774 pcp->count += rmqueue_bulk(zone, 0, 2774 pcp->count += rmqueue_bulk(zone, 0,
2775 pcp->batch, list, 2775 pcp->batch, list,
2776 migratetype, cold); 2776 migratetype, cold);
2777 if (unlikely(list_empty(list))) 2777 if (unlikely(list_empty(list)))
2778 return NULL; 2778 return NULL;
2779 } 2779 }
2780 2780
2781 if (cold) 2781 if (cold)
2782 page = list_last_entry(list, struct page, lru); 2782 page = list_last_entry(list, struct page, lru);
2783 else 2783 else
2784 page = list_first_entry(list, struct page, lru); 2784 page = list_first_entry(list, struct page, lru);
2785 2785
2786 list_del(&page->lru); 2786 list_del(&page->lru);
2787 pcp->count--; 2787 pcp->count--;
2788 } while (check_new_pcp(page)); 2788 } while (check_new_pcp(page));
2789 2789
2790 return page; 2790 return page;
2791 } 2791 }
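Note how this mirrors free_hot_cold_page() above: hot pages are added at the head and cold pages at the tail of the per-cpu list, so a cold request takes from the tail (list_last_entry) while a hot request takes the most recently freed, cache-warm page from the head.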
2792 2792
2793 /* Lock and remove page from the per-cpu list */ 2793 /* Lock and remove page from the per-cpu list */
2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2795 struct zone *zone, unsigned int order, 2795 struct zone *zone, unsigned int order,
2796 gfp_t gfp_flags, int migratetype) 2796 gfp_t gfp_flags, int migratetype)
2797 { 2797 {
2798 struct per_cpu_pages *pcp; 2798 struct per_cpu_pages *pcp;
2799 struct list_head *list; 2799 struct list_head *list;
2800 bool cold = ((gfp_flags & __GFP_COLD) != 0); 2800 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2801 struct page *page; 2801 struct page *page;
2802 unsigned long flags; 2802 unsigned long flags;
2803 2803
2804 local_irq_save(flags); 2804 local_irq_save(flags);
2805 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2805 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2806 list = &pcp->lists[migratetype]; 2806 list = &pcp->lists[migratetype];
2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
2808 if (page) { 2808 if (page) {
2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2810 zone_statistics(preferred_zone, zone); 2810 zone_statistics(preferred_zone, zone);
2811 } 2811 }
2812 local_irq_restore(flags); 2812 local_irq_restore(flags);
2813 return page; 2813 return page;
2814 } 2814 }
2815 2815
2816 /* 2816 /*
2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2818 */ 2818 */
2819 static inline 2819 static inline
2820 struct page *rmqueue(struct zone *preferred_zone, 2820 struct page *rmqueue(struct zone *preferred_zone,
2821 struct zone *zone, unsigned int order, 2821 struct zone *zone, unsigned int order,
2822 gfp_t gfp_flags, unsigned int alloc_flags, 2822 gfp_t gfp_flags, unsigned int alloc_flags,
2823 int migratetype) 2823 int migratetype)
2824 { 2824 {
2825 unsigned long flags; 2825 unsigned long flags;
2826 struct page *page; 2826 struct page *page;
2827 2827
2828 if (likely(order == 0)) { 2828 if (likely(order == 0)) {
2829 page = rmqueue_pcplist(preferred_zone, zone, order, 2829 page = rmqueue_pcplist(preferred_zone, zone, order,
2830 gfp_flags, migratetype); 2830 gfp_flags, migratetype);
2831 goto out; 2831 goto out;
2832 } 2832 }
2833 2833
2834 /* 2834 /*
2835 * We most definitely don't want callers attempting to 2835 * We most definitely don't want callers attempting to
2836 * allocate greater than order-1 page units with __GFP_NOFAIL. 2836 * allocate greater than order-1 page units with __GFP_NOFAIL.
2837 */ 2837 */
2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2839 spin_lock_irqsave(&zone->lock, flags); 2839 spin_lock_irqsave(&zone->lock, flags);
2840 2840
2841 do { 2841 do {
2842 page = NULL; 2842 page = NULL;
2843 if (alloc_flags & ALLOC_HARDER) { 2843 if (alloc_flags & ALLOC_HARDER) {
2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2845 if (page) 2845 if (page)
2846 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2846 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2847 } 2847 }
2848 if (!page) 2848 if (!page)
2849 page = __rmqueue(zone, order, migratetype); 2849 page = __rmqueue(zone, order, migratetype);
2850 } while (page && check_new_pages(page, order)); 2850 } while (page && check_new_pages(page, order));
2851 spin_unlock(&zone->lock); 2851 spin_unlock(&zone->lock);
2852 if (!page) 2852 if (!page)
2853 goto failed; 2853 goto failed;
2854 __mod_zone_freepage_state(zone, -(1 << order), 2854 __mod_zone_freepage_state(zone, -(1 << order),
2855 get_pcppage_migratetype(page)); 2855 get_pcppage_migratetype(page));
2856 2856
2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2858 zone_statistics(preferred_zone, zone); 2858 zone_statistics(preferred_zone, zone);
2859 local_irq_restore(flags); 2859 local_irq_restore(flags);
2860 2860
2861 out: 2861 out:
2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2863 return page; 2863 return page;
2864 2864
2865 failed: 2865 failed:
2866 local_irq_restore(flags); 2866 local_irq_restore(flags);
2867 return NULL; 2867 return NULL;
2868 } 2868 }
2869 2869
2870 #ifdef CONFIG_FAIL_PAGE_ALLOC 2870 #ifdef CONFIG_FAIL_PAGE_ALLOC
2871 2871
2872 static struct { 2872 static struct {
2873 struct fault_attr attr; 2873 struct fault_attr attr;
2874 2874
2875 bool ignore_gfp_highmem; 2875 bool ignore_gfp_highmem;
2876 bool ignore_gfp_reclaim; 2876 bool ignore_gfp_reclaim;
2877 u32 min_order; 2877 u32 min_order;
2878 } fail_page_alloc = { 2878 } fail_page_alloc = {
2879 .attr = FAULT_ATTR_INITIALIZER, 2879 .attr = FAULT_ATTR_INITIALIZER,
2880 .ignore_gfp_reclaim = true, 2880 .ignore_gfp_reclaim = true,
2881 .ignore_gfp_highmem = true, 2881 .ignore_gfp_highmem = true,
2882 .min_order = 1, 2882 .min_order = 1,
2883 }; 2883 };
2884 2884
2885 static int __init setup_fail_page_alloc(char *str) 2885 static int __init setup_fail_page_alloc(char *str)
2886 { 2886 {
2887 return setup_fault_attr(&fail_page_alloc.attr, str); 2887 return setup_fault_attr(&fail_page_alloc.attr, str);
2888 } 2888 }
2889 __setup("fail_page_alloc=", setup_fail_page_alloc); 2889 __setup("fail_page_alloc=", setup_fail_page_alloc);
2890 2890
2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2892 { 2892 {
2893 if (order < fail_page_alloc.min_order) 2893 if (order < fail_page_alloc.min_order)
2894 return false; 2894 return false;
2895 if (gfp_mask & __GFP_NOFAIL) 2895 if (gfp_mask & __GFP_NOFAIL)
2896 return false; 2896 return false;
2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
2898 return false; 2898 return false;
2899 if (fail_page_alloc.ignore_gfp_reclaim && 2899 if (fail_page_alloc.ignore_gfp_reclaim &&
2900 (gfp_mask & __GFP_DIRECT_RECLAIM)) 2900 (gfp_mask & __GFP_DIRECT_RECLAIM))
2901 return false; 2901 return false;
2902 2902
2903 return should_fail(&fail_page_alloc.attr, 1 << order); 2903 return should_fail(&fail_page_alloc.attr, 1 << order);
2904 } 2904 }
2905 2905
2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
2907 2907
2908 static int __init fail_page_alloc_debugfs(void) 2908 static int __init fail_page_alloc_debugfs(void)
2909 { 2909 {
2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
2911 struct dentry *dir; 2911 struct dentry *dir;
2912 2912
2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
2914 &fail_page_alloc.attr); 2914 &fail_page_alloc.attr);
2915 if (IS_ERR(dir)) 2915 if (IS_ERR(dir))
2916 return PTR_ERR(dir); 2916 return PTR_ERR(dir);
2917 2917
2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
2919 &fail_page_alloc.ignore_gfp_reclaim)) 2919 &fail_page_alloc.ignore_gfp_reclaim))
2920 goto fail; 2920 goto fail;
2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
2922 &fail_page_alloc.ignore_gfp_highmem)) 2922 &fail_page_alloc.ignore_gfp_highmem))
2923 goto fail; 2923 goto fail;
2924 if (!debugfs_create_u32("min-order", mode, dir, 2924 if (!debugfs_create_u32("min-order", mode, dir,
2925 &fail_page_alloc.min_order)) 2925 &fail_page_alloc.min_order))
2926 goto fail; 2926 goto fail;
2927 2927
2928 return 0; 2928 return 0;
2929 fail: 2929 fail:
2930 debugfs_remove_recursive(dir); 2930 debugfs_remove_recursive(dir);
2931 2931
2932 return -ENOMEM; 2932 return -ENOMEM;
2933 } 2933 }
2934 2934
2935 late_initcall(fail_page_alloc_debugfs); 2935 late_initcall(fail_page_alloc_debugfs);
2936 2936
2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2938 2938
2939 #else /* CONFIG_FAIL_PAGE_ALLOC */ 2939 #else /* CONFIG_FAIL_PAGE_ALLOC */
2940 2940
2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2942 { 2942 {
2943 return false; 2943 return false;
2944 } 2944 }
2945 2945
2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */
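For reference, the fail_page_alloc knobs above are driven through the generic fault-injection framework. A minimal usage sketch, assuming the standard fault_attr boot-parameter format ("<interval>,<probability>,<space>,<times>") and the conventional debugfs mount point, both of which come from the fault-injection framework rather than from anything defined in this file:

    fail_page_alloc=1,10,0,-1                       boot parameter: every call eligible, 10% failure probability, no limit on failures
    /sys/kernel/debug/fail_page_alloc/min-order
    /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait
    /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem

The three debugfs files correspond directly to the attributes created in fail_page_alloc_debugfs() above, and the default min_order of 1 means order-0 allocations are never failed by injection.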
2947 2947
2948 /* 2948 /*
2949 * Return true if free base pages are above 'mark'. For high-order checks it 2949 * Return true if free base pages are above 'mark'. For high-order checks it
2950 * will return true if the order-0 watermark is reached and there is at least 2950 * will return true if the order-0 watermark is reached and there is at least
2951 * one free page of a suitable size. Checking now avoids taking the zone lock 2951 * one free page of a suitable size. Checking now avoids taking the zone lock
2952 * to check in the allocation paths if no pages are free. 2952 * to check in the allocation paths if no pages are free.
2953 */ 2953 */
2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2955 int classzone_idx, unsigned int alloc_flags, 2955 int classzone_idx, unsigned int alloc_flags,
2956 long free_pages) 2956 long free_pages)
2957 { 2957 {
2958 long min = mark; 2958 long min = mark;
2959 int o; 2959 int o;
2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2961 2961
2962 /* free_pages may go negative - that's OK */ 2962 /* free_pages may go negative - that's OK */
2963 free_pages -= (1 << order) - 1; 2963 free_pages -= (1 << order) - 1;
2964 2964
2965 if (alloc_flags & ALLOC_HIGH) 2965 if (alloc_flags & ALLOC_HIGH)
2966 min -= min / 2; 2966 min -= min / 2;
2967 2967
2968 /* 2968 /*
2969 * If the caller does not have rights to ALLOC_HARDER then subtract 2969 * If the caller does not have rights to ALLOC_HARDER then subtract
2970 * the high-atomic reserves. This will over-estimate the size of the 2970 * the high-atomic reserves. This will over-estimate the size of the
2971 * atomic reserve but it avoids a search. 2971 * atomic reserve but it avoids a search.
2972 */ 2972 */
2973 if (likely(!alloc_harder)) { 2973 if (likely(!alloc_harder)) {
2974 free_pages -= z->nr_reserved_highatomic; 2974 free_pages -= z->nr_reserved_highatomic;
2975 } else { 2975 } else {
2976 /* 2976 /*
2977 * OOM victims can try even harder than normal ALLOC_HARDER 2977 * OOM victims can try even harder than normal ALLOC_HARDER
2978 * users on the grounds that they are definitely going to exit 2978 * users on the grounds that they are definitely going to exit
2979 * shortly and free memory. Any allocation they make during the 2979 * shortly and free memory. Any allocation they make during the
2980 * exit path will be small and short-lived. 2980 * exit path will be small and short-lived.
2981 */ 2981 */
2982 if (alloc_flags & ALLOC_OOM) 2982 if (alloc_flags & ALLOC_OOM)
2983 min -= min / 2; 2983 min -= min / 2;
2984 else 2984 else
2985 min -= min / 4; 2985 min -= min / 4;
2986 } 2986 }
2987 2987
2988 2988
2989 #ifdef CONFIG_CMA 2989 #ifdef CONFIG_CMA
2990 /* If allocation can't use CMA areas don't use free CMA pages */ 2990 /* If allocation can't use CMA areas don't use free CMA pages */
2991 if (!(alloc_flags & ALLOC_CMA)) 2991 if (!(alloc_flags & ALLOC_CMA))
2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2993 #endif 2993 #endif
2994 2994
2995 /* 2995 /*
2996 * Check watermarks for an order-0 allocation request. If these 2996 * Check watermarks for an order-0 allocation request. If these
2997 * are not met, then a high-order request also cannot go ahead 2997 * are not met, then a high-order request also cannot go ahead
2998 * even if a suitable page happened to be free. 2998 * even if a suitable page happened to be free.
2999 */ 2999 */
3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3001 return false; 3001 return false;
3002 3002
3003 /* If this is an order-0 request then the watermark is fine */ 3003 /* If this is an order-0 request then the watermark is fine */
3004 if (!order) 3004 if (!order)
3005 return true; 3005 return true;
3006 3006
3007 /* For a high-order request, check at least one suitable page is free */ 3007 /* For a high-order request, check at least one suitable page is free */
3008 for (o = order; o < MAX_ORDER; o++) { 3008 for (o = order; o < MAX_ORDER; o++) {
3009 struct free_area *area = &z->free_area[o]; 3009 struct free_area *area = &z->free_area[o];
3010 int mt; 3010 int mt;
3011 3011
3012 if (!area->nr_free) 3012 if (!area->nr_free)
3013 continue; 3013 continue;
3014 3014
3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3016 if (!list_empty(&area->free_list[mt])) 3016 if (!list_empty(&area->free_list[mt]))
3017 return true; 3017 return true;
3018 } 3018 }
3019 3019
3020 #ifdef CONFIG_CMA 3020 #ifdef CONFIG_CMA
3021 if ((alloc_flags & ALLOC_CMA) && 3021 if ((alloc_flags & ALLOC_CMA) &&
3022 !list_empty(&area->free_list[MIGRATE_CMA])) { 3022 !list_empty(&area->free_list[MIGRATE_CMA])) {
3023 return true; 3023 return true;
3024 } 3024 }
3025 #endif 3025 #endif
3026 if (alloc_harder && 3026 if (alloc_harder &&
3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) 3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3028 return true; 3028 return true;
3029 } 3029 }
3030 return false; 3030 return false;
3031 } 3031 }
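To make the reductions above concrete, here is a small self-contained sketch (illustrative only, not kernel code) that mirrors how the effective minimum shrinks for the different classes of caller, together with one worked value:

	/* Sketch of the min-watermark reductions performed in __zone_watermark_ok(). */
	static long effective_min(long mark, int alloc_high, int alloc_harder, int alloc_oom)
	{
		long min = mark;

		if (alloc_high)			/* ALLOC_HIGH, i.e. __GFP_HIGH callers */
			min -= min / 2;
		if (alloc_harder || alloc_oom) {
			if (alloc_oom)		/* OOM victims dig deepest */
				min -= min / 2;
			else			/* e.g. GFP_ATOMIC */
				min -= min / 4;
		}
		/* mark = 128, GFP_ATOMIC (high + harder): 128 -> 64 -> 48 */
		return min;
	}

The high-atomic reserve subtraction is deliberately left out of the sketch because it adjusts free_pages rather than the minimum itself.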
3032 3032
3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3034 int classzone_idx, unsigned int alloc_flags) 3034 int classzone_idx, unsigned int alloc_flags)
3035 { 3035 {
3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3037 zone_page_state(z, NR_FREE_PAGES)); 3037 zone_page_state(z, NR_FREE_PAGES));
3038 } 3038 }
3039 3039
3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags) 3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3042 { 3042 {
3043 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3043 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3044 long cma_pages = 0; 3044 long cma_pages = 0;
3045 3045
3046 #ifdef CONFIG_CMA 3046 #ifdef CONFIG_CMA
3047 /* If allocation can't use CMA areas don't use free CMA pages */ 3047 /* If allocation can't use CMA areas don't use free CMA pages */
3048 if (!(alloc_flags & ALLOC_CMA)) 3048 if (!(alloc_flags & ALLOC_CMA))
3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); 3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3050 #endif 3050 #endif
3051 3051
3052 /* 3052 /*
3053 * Fast check for order-0 only. If this fails then the reserves 3053 * Fast check for order-0 only. If this fails then the reserves
3054 * need to be calculated. There is a corner case where the check 3054 * need to be calculated. There is a corner case where the check
3055 * passes but only the high-order atomic reserves are free. If 3055 * passes but only the high-order atomic reserves are free. If
3056 * the caller is !atomic then it'll uselessly search the free 3056 * the caller is !atomic then it'll uselessly search the free
3057 * list. That corner case is then slower but it is harmless. 3057 * list. That corner case is then slower but it is harmless.
3058 */ 3058 */
3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) 3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3060 return true; 3060 return true;
3061 3061
3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3063 free_pages); 3063 free_pages);
3064 } 3064 }
3065 3065
3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3067 unsigned long mark, int classzone_idx) 3067 unsigned long mark, int classzone_idx)
3068 { 3068 {
3069 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3069 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3070 3070
3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3073 3073
3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0, 3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3075 free_pages); 3075 free_pages);
3076 } 3076 }
3077 3077
3078 #ifdef CONFIG_NUMA 3078 #ifdef CONFIG_NUMA
3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3080 { 3080 {
3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3082 RECLAIM_DISTANCE; 3082 RECLAIM_DISTANCE;
3083 } 3083 }
3084 #else /* CONFIG_NUMA */ 3084 #else /* CONFIG_NUMA */
3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3086 { 3086 {
3087 return true; 3087 return true;
3088 } 3088 }
3089 #endif /* CONFIG_NUMA */ 3089 #endif /* CONFIG_NUMA */
3090 3090
3091 /* 3091 /*
3092 * get_page_from_freelist goes through the zonelist trying to allocate 3092 * get_page_from_freelist goes through the zonelist trying to allocate
3093 * a page. 3093 * a page.
3094 */ 3094 */
3095 static struct page * 3095 static struct page *
3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3097 const struct alloc_context *ac) 3097 const struct alloc_context *ac)
3098 { 3098 {
3099 struct zoneref *z = ac->preferred_zoneref; 3099 struct zoneref *z = ac->preferred_zoneref;
3100 struct zone *zone; 3100 struct zone *zone;
3101 struct pglist_data *last_pgdat_dirty_limit = NULL; 3101 struct pglist_data *last_pgdat_dirty_limit = NULL;
3102 3102
3103 /* 3103 /*
3104 * Scan zonelist, looking for a zone with enough free. 3104 * Scan zonelist, looking for a zone with enough free.
3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3106 */ 3106 */
3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3108 ac->nodemask) { 3108 ac->nodemask) {
3109 struct page *page; 3109 struct page *page;
3110 unsigned long mark; 3110 unsigned long mark;
3111 3111
3112 if (cpusets_enabled() && 3112 if (cpusets_enabled() &&
3113 (alloc_flags & ALLOC_CPUSET) && 3113 (alloc_flags & ALLOC_CPUSET) &&
3114 !__cpuset_zone_allowed(zone, gfp_mask)) 3114 !__cpuset_zone_allowed(zone, gfp_mask))
3115 continue; 3115 continue;
3116 /* 3116 /*
3117 * When allocating a page cache page for writing, we 3117 * When allocating a page cache page for writing, we
3118 * want to get it from a node that is within its dirty 3118 * want to get it from a node that is within its dirty
3119 * limit, such that no single node holds more than its 3119 * limit, such that no single node holds more than its
3120 * proportional share of globally allowed dirty pages. 3120 * proportional share of globally allowed dirty pages.
3121 * The dirty limits take into account the node's 3121 * The dirty limits take into account the node's
3122 * lowmem reserves and high watermark so that kswapd 3122 * lowmem reserves and high watermark so that kswapd
3123 * should be able to balance it without having to 3123 * should be able to balance it without having to
3124 * write pages from its LRU list. 3124 * write pages from its LRU list.
3125 * 3125 *
3126 * XXX: For now, allow allocations to potentially 3126 * XXX: For now, allow allocations to potentially
3127 * exceed the per-node dirty limit in the slowpath 3127 * exceed the per-node dirty limit in the slowpath
3128 * (spread_dirty_pages unset) before going into reclaim, 3128 * (spread_dirty_pages unset) before going into reclaim,
3129 * which is important when on a NUMA setup the allowed 3129 * which is important when on a NUMA setup the allowed
3130 * nodes are together not big enough to reach the 3130 * nodes are together not big enough to reach the
3131 * global limit. The proper fix for these situations 3131 * global limit. The proper fix for these situations
3132 * will require awareness of nodes in the 3132 * will require awareness of nodes in the
3133 * dirty-throttling and the flusher threads. 3133 * dirty-throttling and the flusher threads.
3134 */ 3134 */
3135 if (ac->spread_dirty_pages) { 3135 if (ac->spread_dirty_pages) {
3136 if (last_pgdat_dirty_limit == zone->zone_pgdat) 3136 if (last_pgdat_dirty_limit == zone->zone_pgdat)
3137 continue; 3137 continue;
3138 3138
3139 if (!node_dirty_ok(zone->zone_pgdat)) { 3139 if (!node_dirty_ok(zone->zone_pgdat)) {
3140 last_pgdat_dirty_limit = zone->zone_pgdat; 3140 last_pgdat_dirty_limit = zone->zone_pgdat;
3141 continue; 3141 continue;
3142 } 3142 }
3143 } 3143 }
3144 3144
3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
3146 if (!zone_watermark_fast(zone, order, mark, 3146 if (!zone_watermark_fast(zone, order, mark,
3147 ac_classzone_idx(ac), alloc_flags)) { 3147 ac_classzone_idx(ac), alloc_flags)) {
3148 int ret; 3148 int ret;
3149 3149
3150 /* Checked here to keep the fast path fast */ 3150 /* Checked here to keep the fast path fast */
3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3152 if (alloc_flags & ALLOC_NO_WATERMARKS) 3152 if (alloc_flags & ALLOC_NO_WATERMARKS)
3153 goto try_this_zone; 3153 goto try_this_zone;
3154 3154
3155 if (node_reclaim_mode == 0 || 3155 if (node_reclaim_mode == 0 ||
3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3157 continue; 3157 continue;
3158 3158
3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3160 switch (ret) { 3160 switch (ret) {
3161 case NODE_RECLAIM_NOSCAN: 3161 case NODE_RECLAIM_NOSCAN:
3162 /* did not scan */ 3162 /* did not scan */
3163 continue; 3163 continue;
3164 case NODE_RECLAIM_FULL: 3164 case NODE_RECLAIM_FULL:
3165 /* scanned but unreclaimable */ 3165 /* scanned but unreclaimable */
3166 continue; 3166 continue;
3167 default: 3167 default:
3168 /* did we reclaim enough */ 3168 /* did we reclaim enough */
3169 if (zone_watermark_ok(zone, order, mark, 3169 if (zone_watermark_ok(zone, order, mark,
3170 ac_classzone_idx(ac), alloc_flags)) 3170 ac_classzone_idx(ac), alloc_flags))
3171 goto try_this_zone; 3171 goto try_this_zone;
3172 3172
3173 continue; 3173 continue;
3174 } 3174 }
3175 } 3175 }
3176 3176
3177 try_this_zone: 3177 try_this_zone:
3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3179 gfp_mask, alloc_flags, ac->migratetype); 3179 gfp_mask, alloc_flags, ac->migratetype);
3180 if (page) { 3180 if (page) {
3181 prep_new_page(page, order, gfp_mask, alloc_flags); 3181 prep_new_page(page, order, gfp_mask, alloc_flags);
3182 3182
3183 /* 3183 /*
3184 * If this is a high-order atomic allocation then check 3184 * If this is a high-order atomic allocation then check
3185 * if the pageblock should be reserved for the future 3185 * if the pageblock should be reserved for the future
3186 */ 3186 */
3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3188 reserve_highatomic_pageblock(page, zone, order); 3188 reserve_highatomic_pageblock(page, zone, order);
3189 3189
3190 return page; 3190 return page;
3191 } 3191 }
3192 } 3192 }
3193 3193
3194 return NULL; 3194 return NULL;
3195 } 3195 }
3196 3196
3197 /* 3197 /*
3198 * Large machines with many possible nodes should not always dump per-node 3198 * Large machines with many possible nodes should not always dump per-node
3199 * meminfo in irq context. 3199 * meminfo in irq context.
3200 */ 3200 */
3201 static inline bool should_suppress_show_mem(void) 3201 static inline bool should_suppress_show_mem(void)
3202 { 3202 {
3203 bool ret = false; 3203 bool ret = false;
3204 3204
3205 #if NODES_SHIFT > 8 3205 #if NODES_SHIFT > 8
3206 ret = in_interrupt(); 3206 ret = in_interrupt();
3207 #endif 3207 #endif
3208 return ret; 3208 return ret;
3209 } 3209 }
3210 3210
3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3212 { 3212 {
3213 unsigned int filter = SHOW_MEM_FILTER_NODES; 3213 unsigned int filter = SHOW_MEM_FILTER_NODES;
3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3215 3215
3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) 3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3217 return; 3217 return;
3218 3218
3219 /* 3219 /*
3220 * This documents exceptions given to allocations in certain 3220 * This documents exceptions given to allocations in certain
3221 * contexts that are allowed to allocate outside current's set 3221 * contexts that are allowed to allocate outside current's set
3222 * of allowed nodes. 3222 * of allowed nodes.
3223 */ 3223 */
3224 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3224 if (!(gfp_mask & __GFP_NOMEMALLOC))
3225 if (tsk_is_oom_victim(current) || 3225 if (tsk_is_oom_victim(current) ||
3226 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3226 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3227 filter &= ~SHOW_MEM_FILTER_NODES; 3227 filter &= ~SHOW_MEM_FILTER_NODES;
3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3229 filter &= ~SHOW_MEM_FILTER_NODES; 3229 filter &= ~SHOW_MEM_FILTER_NODES;
3230 3230
3231 show_mem(filter, nodemask); 3231 show_mem(filter, nodemask);
3232 } 3232 }
3233 3233
3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 { 3235 {
3236 struct va_format vaf; 3236 struct va_format vaf;
3237 va_list args; 3237 va_list args;
3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, 3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3239 DEFAULT_RATELIMIT_BURST); 3239 DEFAULT_RATELIMIT_BURST);
3240 3240
3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3242 return; 3242 return;
3243 3243
3244 pr_warn("%s: ", current->comm); 3244 pr_warn("%s: ", current->comm);
3245 3245
3246 va_start(args, fmt); 3246 va_start(args, fmt);
3247 vaf.fmt = fmt; 3247 vaf.fmt = fmt;
3248 vaf.va = &args; 3248 vaf.va = &args;
3249 pr_cont("%pV", &vaf); 3249 pr_cont("%pV", &vaf);
3250 va_end(args); 3250 va_end(args);
3251 3251
3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); 3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3253 if (nodemask) 3253 if (nodemask)
3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); 3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3255 else 3255 else
3256 pr_cont("(null)\n"); 3256 pr_cont("(null)\n");
3257 3257
3258 cpuset_print_current_mems_allowed(); 3258 cpuset_print_current_mems_allowed();
3259 3259
3260 dump_stack(); 3260 dump_stack();
3261 warn_alloc_show_mem(gfp_mask, nodemask); 3261 warn_alloc_show_mem(gfp_mask, nodemask);
3262 } 3262 }
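Callers hand warn_alloc() the gfp mask, the nodemask and a printf-style reason that ends up in the %pV above; the allocator slow path, for instance, reports a failure roughly along these lines (an illustrative call-site sketch, not an addition to this file):

	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);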
3263 3263
3264 static inline struct page * 3264 static inline struct page *
3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3266 unsigned int alloc_flags, 3266 unsigned int alloc_flags,
3267 const struct alloc_context *ac) 3267 const struct alloc_context *ac)
3268 { 3268 {
3269 struct page *page; 3269 struct page *page;
3270 3270
3271 page = get_page_from_freelist(gfp_mask, order, 3271 page = get_page_from_freelist(gfp_mask, order,
3272 alloc_flags|ALLOC_CPUSET, ac); 3272 alloc_flags|ALLOC_CPUSET, ac);
3273 /* 3273 /*
3274 * fallback to ignore cpuset restriction if our nodes 3274 * fallback to ignore cpuset restriction if our nodes
3275 * are depleted 3275 * are depleted
3276 */ 3276 */
3277 if (!page) 3277 if (!page)
3278 page = get_page_from_freelist(gfp_mask, order, 3278 page = get_page_from_freelist(gfp_mask, order,
3279 alloc_flags, ac); 3279 alloc_flags, ac);
3280 3280
3281 return page; 3281 return page;
3282 } 3282 }
3283 3283
3284 static inline struct page * 3284 static inline struct page *
3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3286 const struct alloc_context *ac, unsigned long *did_some_progress) 3286 const struct alloc_context *ac, unsigned long *did_some_progress)
3287 { 3287 {
3288 struct oom_control oc = { 3288 struct oom_control oc = {
3289 .zonelist = ac->zonelist, 3289 .zonelist = ac->zonelist,
3290 .nodemask = ac->nodemask, 3290 .nodemask = ac->nodemask,
3291 .memcg = NULL, 3291 .memcg = NULL,
3292 .gfp_mask = gfp_mask, 3292 .gfp_mask = gfp_mask,
3293 .order = order, 3293 .order = order,
3294 }; 3294 };
3295 struct page *page; 3295 struct page *page;
3296 3296
3297 *did_some_progress = 0; 3297 *did_some_progress = 0;
3298 3298
3299 /* 3299 /*
3300 * Acquire the oom lock. If that fails, somebody else is 3300 * Acquire the oom lock. If that fails, somebody else is
3301 * making progress for us. 3301 * making progress for us.
3302 */ 3302 */
3303 if (!mutex_trylock(&oom_lock)) { 3303 if (!mutex_trylock(&oom_lock)) {
3304 *did_some_progress = 1; 3304 *did_some_progress = 1;
3305 schedule_timeout_uninterruptible(1); 3305 schedule_timeout_uninterruptible(1);
3306 return NULL; 3306 return NULL;
3307 } 3307 }
3308 3308
3309 /* 3309 /*
3310 * Go through the zonelist yet one more time, keeping a very high watermark 3310 * Go through the zonelist yet one more time, keeping a very high watermark
3311 * here; this is only to catch a parallel oom killing, and we must fail if 3311 * here; this is only to catch a parallel oom killing, and we must fail if
3312 * we're still under heavy pressure. But make sure that this reclaim 3312 * we're still under heavy pressure. But make sure that this reclaim
3313 * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 3313 * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3314 * allocation, which would never fail because oom_lock is already held. 3314 * allocation, which would never fail because oom_lock is already held.
3315 */ 3315 */
3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3317 ~__GFP_DIRECT_RECLAIM, order, 3317 ~__GFP_DIRECT_RECLAIM, order,
3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3319 if (page) 3319 if (page)
3320 goto out; 3320 goto out;
3321 3321
3322 /* Coredumps can quickly deplete all memory reserves */ 3322 /* Coredumps can quickly deplete all memory reserves */
3323 if (current->flags & PF_DUMPCORE) 3323 if (current->flags & PF_DUMPCORE)
3324 goto out; 3324 goto out;
3325 /* The OOM killer will not help higher order allocs */ 3325 /* The OOM killer will not help higher order allocs */
3326 if (order > PAGE_ALLOC_COSTLY_ORDER) 3326 if (order > PAGE_ALLOC_COSTLY_ORDER)
3327 goto out; 3327 goto out;
3328 /* 3328 /*
3329 * We have already exhausted all our reclaim opportunities without any 3329 * We have already exhausted all our reclaim opportunities without any
3330 * success so it is time to admit defeat. We will skip the OOM killer 3330 * success so it is time to admit defeat. We will skip the OOM killer
3331 * because it is very likely that the caller has a more reasonable 3331 * because it is very likely that the caller has a more reasonable
3332 * fallback than shooting a random task. 3332 * fallback than shooting a random task.
3333 */ 3333 */
3334 if (gfp_mask & __GFP_RETRY_MAYFAIL) 3334 if (gfp_mask & __GFP_RETRY_MAYFAIL)
3335 goto out; 3335 goto out;
3336 /* The OOM killer does not needlessly kill tasks for lowmem */ 3336 /* The OOM killer does not needlessly kill tasks for lowmem */
3337 if (ac->high_zoneidx < ZONE_NORMAL) 3337 if (ac->high_zoneidx < ZONE_NORMAL)
3338 goto out; 3338 goto out;
3339 if (pm_suspended_storage()) 3339 if (pm_suspended_storage())
3340 goto out; 3340 goto out;
3341 /* 3341 /*
3342 * XXX: GFP_NOFS allocations should rather fail than rely on 3342 * XXX: GFP_NOFS allocations should rather fail than rely on
3343 * other requests to make forward progress. 3343 * other requests to make forward progress.
3344 * We are in an unfortunate situation where out_of_memory cannot 3344 * We are in an unfortunate situation where out_of_memory cannot
3345 * do much for this context but let's try it to at least get 3345 * do much for this context but let's try it to at least get
3346 * access to memory reserves if the current task is killed (see 3346 * access to memory reserves if the current task is killed (see
3347 * out_of_memory). Once filesystems are ready to handle allocation 3347 * out_of_memory). Once filesystems are ready to handle allocation
3348 * failures more gracefully we should just bail out here. 3348 * failures more gracefully we should just bail out here.
3349 */ 3349 */
3350 3350
3351 /* The OOM killer may not free memory on a specific node */ 3351 /* The OOM killer may not free memory on a specific node */
3352 if (gfp_mask & __GFP_THISNODE) 3352 if (gfp_mask & __GFP_THISNODE)
3353 goto out; 3353 goto out;
3354 3354
3355 /* Exhausted what can be done so it's blamo time */ 3355 /* Exhausted what can be done so it's blamo time */
3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3357 *did_some_progress = 1; 3357 *did_some_progress = 1;
3358 3358
3359 /* 3359 /*
3360 * Help non-failing allocations by giving them access to memory 3360 * Help non-failing allocations by giving them access to memory
3361 * reserves 3361 * reserves
3362 */ 3362 */
3363 if (gfp_mask & __GFP_NOFAIL) 3363 if (gfp_mask & __GFP_NOFAIL)
3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3365 ALLOC_NO_WATERMARKS, ac); 3365 ALLOC_NO_WATERMARKS, ac);
3366 } 3366 }
3367 out: 3367 out:
3368 mutex_unlock(&oom_lock); 3368 mutex_unlock(&oom_lock);
3369 return page; 3369 return page;
3370 } 3370 }
3371 3371
3372 /* 3372 /*
3373 * Maximum number of compaction retries with progress before the OOM 3373 * Maximum number of compaction retries with progress before the OOM
3374 * killer is considered the only way to move forward. 3374 * killer is considered the only way to move forward.
3375 */ 3375 */
3376 #define MAX_COMPACT_RETRIES 16 3376 #define MAX_COMPACT_RETRIES 16
3377 3377
3378 #ifdef CONFIG_COMPACTION 3378 #ifdef CONFIG_COMPACTION
3379 /* Try memory compaction for high-order allocations before reclaim */ 3379 /* Try memory compaction for high-order allocations before reclaim */
3380 static struct page * 3380 static struct page *
3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3382 unsigned int alloc_flags, const struct alloc_context *ac, 3382 unsigned int alloc_flags, const struct alloc_context *ac,
3383 enum compact_priority prio, enum compact_result *compact_result) 3383 enum compact_priority prio, enum compact_result *compact_result)
3384 { 3384 {
3385 struct page *page; 3385 struct page *page;
3386 unsigned int noreclaim_flag; 3386 unsigned int noreclaim_flag;
3387 3387
3388 if (!order) 3388 if (!order)
3389 return NULL; 3389 return NULL;
3390 3390
3391 noreclaim_flag = memalloc_noreclaim_save(); 3391 noreclaim_flag = memalloc_noreclaim_save();
3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3393 prio); 3393 prio);
3394 memalloc_noreclaim_restore(noreclaim_flag); 3394 memalloc_noreclaim_restore(noreclaim_flag);
3395 3395
3396 if (*compact_result <= COMPACT_INACTIVE) 3396 if (*compact_result <= COMPACT_INACTIVE)
3397 return NULL; 3397 return NULL;
3398 3398
3399 /* 3399 /*
3400 * In at least one zone, compaction wasn't deferred or skipped, so let's 3400 * In at least one zone, compaction wasn't deferred or skipped, so let's
3401 * count a compaction stall 3401 * count a compaction stall
3402 */ 3402 */
3403 count_vm_event(COMPACTSTALL); 3403 count_vm_event(COMPACTSTALL);
3404 3404
3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3406 3406
3407 if (page) { 3407 if (page) {
3408 struct zone *zone = page_zone(page); 3408 struct zone *zone = page_zone(page);
3409 3409
3410 zone->compact_blockskip_flush = false; 3410 zone->compact_blockskip_flush = false;
3411 compaction_defer_reset(zone, order, true); 3411 compaction_defer_reset(zone, order, true);
3412 count_vm_event(COMPACTSUCCESS); 3412 count_vm_event(COMPACTSUCCESS);
3413 return page; 3413 return page;
3414 } 3414 }
3415 3415
3416 /* 3416 /*
3417 * It's bad if a compaction run occurs and fails. The most likely reason 3417 * It's bad if a compaction run occurs and fails. The most likely reason
3418 * is that pages exist, but not enough to satisfy watermarks. 3418 * is that pages exist, but not enough to satisfy watermarks.
3419 */ 3419 */
3420 count_vm_event(COMPACTFAIL); 3420 count_vm_event(COMPACTFAIL);
3421 3421
3422 cond_resched(); 3422 cond_resched();
3423 3423
3424 return NULL; 3424 return NULL;
3425 } 3425 }
3426 3426
3427 static inline bool 3427 static inline bool
3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3429 enum compact_result compact_result, 3429 enum compact_result compact_result,
3430 enum compact_priority *compact_priority, 3430 enum compact_priority *compact_priority,
3431 int *compaction_retries) 3431 int *compaction_retries)
3432 { 3432 {
3433 int max_retries = MAX_COMPACT_RETRIES; 3433 int max_retries = MAX_COMPACT_RETRIES;
3434 int min_priority; 3434 int min_priority;
3435 bool ret = false; 3435 bool ret = false;
3436 int retries = *compaction_retries; 3436 int retries = *compaction_retries;
3437 enum compact_priority priority = *compact_priority; 3437 enum compact_priority priority = *compact_priority;
3438 3438
3439 if (!order) 3439 if (!order)
3440 return false; 3440 return false;
3441 3441
3442 if (compaction_made_progress(compact_result)) 3442 if (compaction_made_progress(compact_result))
3443 (*compaction_retries)++; 3443 (*compaction_retries)++;
3444 3444
3445 /* 3445 /*
3446 * compaction considers all the zones as desperately out of memory 3446 * compaction considers all the zones as desperately out of memory
3447 * so it doesn't really make much sense to retry except when the 3447 * so it doesn't really make much sense to retry except when the
3448 * failure could be caused by insufficient priority 3448 * failure could be caused by insufficient priority
3449 */ 3449 */
3450 if (compaction_failed(compact_result)) 3450 if (compaction_failed(compact_result))
3451 goto check_priority; 3451 goto check_priority;
3452 3452
3453 /* 3453 /*
3454 * make sure the compaction wasn't deferred or didn't bail out early 3454 * make sure the compaction wasn't deferred or didn't bail out early
3455 * due to lock contention before we declare that we should give up. 3455 * due to lock contention before we declare that we should give up.
3456 * But do not retry if the given zonelist is not suitable for 3456 * But do not retry if the given zonelist is not suitable for
3457 * compaction. 3457 * compaction.
3458 */ 3458 */
3459 if (compaction_withdrawn(compact_result)) { 3459 if (compaction_withdrawn(compact_result)) {
3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3461 goto out; 3461 goto out;
3462 } 3462 }
3463 3463
3464 /* 3464 /*
3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL 3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3466 * costly ones because they are de facto nofail and invoke the OOM 3466 * costly ones because they are de facto nofail and invoke the OOM
3467 * killer to move on, while costly ones can fail and their users are 3467 * killer to move on, while costly ones can fail and their users are
3468 * ready to cope with that. 1/4 retries is rather arbitrary but we 3468 * ready to cope with that. 1/4 retries is rather arbitrary but we
3469 * would need much more detailed feedback from compaction to 3469 * would need much more detailed feedback from compaction to
3470 * make a better decision. 3470 * make a better decision.
3471 */ 3471 */
3472 if (order > PAGE_ALLOC_COSTLY_ORDER) 3472 if (order > PAGE_ALLOC_COSTLY_ORDER)
3473 max_retries /= 4; 3473 max_retries /= 4;
3474 if (*compaction_retries <= max_retries) { 3474 if (*compaction_retries <= max_retries) {
3475 ret = true; 3475 ret = true;
3476 goto out; 3476 goto out;
3477 } 3477 }
3478 3478
3479 /* 3479 /*
3480 * Make sure there are attempts at the highest priority if we exhausted 3480 * Make sure there are attempts at the highest priority if we exhausted
3481 * all retries or failed at the lower priorities. 3481 * all retries or failed at the lower priorities.
3482 */ 3482 */
3483 check_priority: 3483 check_priority:
3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3486 3486
3487 if (*compact_priority > min_priority) { 3487 if (*compact_priority > min_priority) {
3488 (*compact_priority)--; 3488 (*compact_priority)--;
3489 *compaction_retries = 0; 3489 *compaction_retries = 0;
3490 ret = true; 3490 ret = true;
3491 } 3491 }
3492 out: 3492 out:
3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3494 return ret; 3494 return ret;
3495 } 3495 }
3496 #else 3496 #else
3497 static inline struct page * 3497 static inline struct page *
3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3499 unsigned int alloc_flags, const struct alloc_context *ac, 3499 unsigned int alloc_flags, const struct alloc_context *ac,
3500 enum compact_priority prio, enum compact_result *compact_result) 3500 enum compact_priority prio, enum compact_result *compact_result)
3501 { 3501 {
3502 *compact_result = COMPACT_SKIPPED; 3502 *compact_result = COMPACT_SKIPPED;
3503 return NULL; 3503 return NULL;
3504 } 3504 }
3505 3505
3506 static inline bool 3506 static inline bool
3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3508 enum compact_result compact_result, 3508 enum compact_result compact_result,
3509 enum compact_priority *compact_priority, 3509 enum compact_priority *compact_priority,
3510 int *compaction_retries) 3510 int *compaction_retries)
3511 { 3511 {
3512 struct zone *zone; 3512 struct zone *zone;
3513 struct zoneref *z; 3513 struct zoneref *z;
3514 3514
3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3516 return false; 3516 return false;
3517 3517
3518 /* 3518 /*
3519 * There are setups with compaction disabled which would prefer to loop 3519 * There are setups with compaction disabled which would prefer to loop
3520 * inside the allocator rather than hit the oom killer prematurely. 3520 * inside the allocator rather than hit the oom killer prematurely.
3521 * Let's give them some hope and keep retrying while the order-0 3521 * Let's give them some hope and keep retrying while the order-0
3522 * watermarks are OK. 3522 * watermarks are OK.
3523 */ 3523 */
3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3525 ac->nodemask) { 3525 ac->nodemask) {
3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3527 ac_classzone_idx(ac), alloc_flags)) 3527 ac_classzone_idx(ac), alloc_flags))
3528 return true; 3528 return true;
3529 } 3529 }
3530 return false; 3530 return false;
3531 } 3531 }
3532 #endif /* CONFIG_COMPACTION */ 3532 #endif /* CONFIG_COMPACTION */
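A quick worked example of the retry budget used by should_compact_retry() above, assuming PAGE_ALLOC_COSTLY_ORDER is 3 as in mainline: a non-costly request gets the full MAX_COMPACT_RETRIES = 16 attempts at a given compaction priority, while a costly request (order > 3) gets 16 / 4 = 4 attempts before the priority is raised and the retry counter reset.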
3533 3533
3534 #ifdef CONFIG_LOCKDEP 3534 #ifdef CONFIG_LOCKDEP
3535 struct lockdep_map __fs_reclaim_map = 3535 struct lockdep_map __fs_reclaim_map =
3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3537 3537
3538 static bool __need_fs_reclaim(gfp_t gfp_mask) 3538 static bool __need_fs_reclaim(gfp_t gfp_mask)
3539 { 3539 {
3540 gfp_mask = current_gfp_context(gfp_mask); 3540 gfp_mask = current_gfp_context(gfp_mask);
3541 3541
3542 /* no reclaim without waiting on it */ 3542 /* no reclaim without waiting on it */
3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3544 return false; 3544 return false;
3545 3545
3546 /* this guy won't enter reclaim */ 3546 /* this guy won't enter reclaim */
3547 if (current->flags & PF_MEMALLOC) 3547 if (current->flags & PF_MEMALLOC)
3548 return false; 3548 return false;
3549 3549
3550 /* We're only interested in __GFP_FS allocations for now */ 3550 /* We're only interested in __GFP_FS allocations for now */
3551 if (!(gfp_mask & __GFP_FS)) 3551 if (!(gfp_mask & __GFP_FS))
3552 return false; 3552 return false;
3553 3553
3554 if (gfp_mask & __GFP_NOLOCKDEP) 3554 if (gfp_mask & __GFP_NOLOCKDEP)
3555 return false; 3555 return false;
3556 3556
3557 return true; 3557 return true;
3558 } 3558 }
3559 3559
3560 void fs_reclaim_acquire(gfp_t gfp_mask) 3560 void fs_reclaim_acquire(gfp_t gfp_mask)
3561 { 3561 {
3562 if (__need_fs_reclaim(gfp_mask)) 3562 if (__need_fs_reclaim(gfp_mask))
3563 lock_map_acquire(&__fs_reclaim_map); 3563 lock_map_acquire(&__fs_reclaim_map);
3564 } 3564 }
3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3566 3566
3567 void fs_reclaim_release(gfp_t gfp_mask) 3567 void fs_reclaim_release(gfp_t gfp_mask)
3568 { 3568 {
3569 if (__need_fs_reclaim(gfp_mask)) 3569 if (__need_fs_reclaim(gfp_mask))
3570 lock_map_release(&__fs_reclaim_map); 3570 lock_map_release(&__fs_reclaim_map);
3571 } 3571 }
3572 EXPORT_SYMBOL_GPL(fs_reclaim_release); 3572 EXPORT_SYMBOL_GPL(fs_reclaim_release);
3573 #endif 3573 #endif
3574 3574
3575 /* Perform direct synchronous page reclaim */ 3575 /* Perform direct synchronous page reclaim */
3576 static int 3576 static int
3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3578 const struct alloc_context *ac) 3578 const struct alloc_context *ac)
3579 { 3579 {
3580 struct reclaim_state reclaim_state; 3580 struct reclaim_state reclaim_state;
3581 int progress; 3581 int progress;
3582 unsigned int noreclaim_flag; 3582 unsigned int noreclaim_flag;
3583 3583
3584 cond_resched(); 3584 cond_resched();
3585 3585
3586 /* We now go into synchronous reclaim */ 3586 /* We now go into synchronous reclaim */
3587 cpuset_memory_pressure_bump(); 3587 cpuset_memory_pressure_bump();
3588 noreclaim_flag = memalloc_noreclaim_save(); 3588 noreclaim_flag = memalloc_noreclaim_save();
3589 fs_reclaim_acquire(gfp_mask); 3589 fs_reclaim_acquire(gfp_mask);
3590 reclaim_state.reclaimed_slab = 0; 3590 reclaim_state.reclaimed_slab = 0;
3591 current->reclaim_state = &reclaim_state; 3591 current->reclaim_state = &reclaim_state;
3592 3592
3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3594 ac->nodemask); 3594 ac->nodemask);
3595 3595
3596 current->reclaim_state = NULL; 3596 current->reclaim_state = NULL;
3597 fs_reclaim_release(gfp_mask); 3597 fs_reclaim_release(gfp_mask);
3598 memalloc_noreclaim_restore(noreclaim_flag); 3598 memalloc_noreclaim_restore(noreclaim_flag);
3599 3599
3600 cond_resched(); 3600 cond_resched();
3601 3601
3602 return progress; 3602 return progress;
3603 } 3603 }
3604 3604
3605 /* The really slow allocator path where we enter direct reclaim */ 3605 /* The really slow allocator path where we enter direct reclaim */
3606 static inline struct page * 3606 static inline struct page *
3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3608 unsigned int alloc_flags, const struct alloc_context *ac, 3608 unsigned int alloc_flags, const struct alloc_context *ac,
3609 unsigned long *did_some_progress) 3609 unsigned long *did_some_progress)
3610 { 3610 {
3611 struct page *page = NULL; 3611 struct page *page = NULL;
3612 bool drained = false; 3612 bool drained = false;
3613 3613
3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3615 if (unlikely(!(*did_some_progress))) 3615 if (unlikely(!(*did_some_progress)))
3616 return NULL; 3616 return NULL;
3617 3617
3618 retry: 3618 retry:
3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3620 3620
3621 /* 3621 /*
3622 * If an allocation failed after direct reclaim, it could be because 3622 * If an allocation failed after direct reclaim, it could be because
3623 * pages are pinned on the per-cpu lists or in high alloc reserves. 3623 * pages are pinned on the per-cpu lists or in high alloc reserves.
3624 * Shrink them and try again 3624 * Shrink them and try again
3625 */ 3625 */
3626 if (!page && !drained) { 3626 if (!page && !drained) {
3627 unreserve_highatomic_pageblock(ac, false); 3627 unreserve_highatomic_pageblock(ac, false);
3628 drain_all_pages(NULL); 3628 drain_all_pages(NULL);
3629 drained = true; 3629 drained = true;
3630 goto retry; 3630 goto retry;
3631 } 3631 }
3632 3632
3633 return page; 3633 return page;
3634 } 3634 }
3635 3635
3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) 3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3637 { 3637 {
3638 struct zoneref *z; 3638 struct zoneref *z;
3639 struct zone *zone; 3639 struct zone *zone;
3640 pg_data_t *last_pgdat = NULL; 3640 pg_data_t *last_pgdat = NULL;
3641 3641
3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3643 ac->high_zoneidx, ac->nodemask) { 3643 ac->high_zoneidx, ac->nodemask) {
3644 if (last_pgdat != zone->zone_pgdat) 3644 if (last_pgdat != zone->zone_pgdat)
3645 wakeup_kswapd(zone, order, ac->high_zoneidx); 3645 wakeup_kswapd(zone, order, ac->high_zoneidx);
3646 last_pgdat = zone->zone_pgdat; 3646 last_pgdat = zone->zone_pgdat;
3647 } 3647 }
3648 } 3648 }
3649 3649
3650 static inline unsigned int 3650 static inline unsigned int
3651 gfp_to_alloc_flags(gfp_t gfp_mask) 3651 gfp_to_alloc_flags(gfp_t gfp_mask)
3652 { 3652 {
3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3654 3654
3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3657 3657
3658 /* 3658 /*
3659 * The caller may dip into page reserves a bit more if the caller 3659 * The caller may dip into page reserves a bit more if the caller
3660 * cannot run direct reclaim, or if the caller has a realtime scheduling 3660 * cannot run direct reclaim, or if the caller has a realtime scheduling
3661 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 3661 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3663 */ 3663 */
3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3665 3665
3666 if (gfp_mask & __GFP_ATOMIC) { 3666 if (gfp_mask & __GFP_ATOMIC) {
3667 /* 3667 /*
3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3669 * if it can't schedule. 3669 * if it can't schedule.
3670 */ 3670 */
3671 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3671 if (!(gfp_mask & __GFP_NOMEMALLOC))
3672 alloc_flags |= ALLOC_HARDER; 3672 alloc_flags |= ALLOC_HARDER;
3673 /* 3673 /*
3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3675 * comment for __cpuset_node_allowed(). 3675 * comment for __cpuset_node_allowed().
3676 */ 3676 */
3677 alloc_flags &= ~ALLOC_CPUSET; 3677 alloc_flags &= ~ALLOC_CPUSET;
3678 } else if (unlikely(rt_task(current)) && !in_interrupt()) 3678 } else if (unlikely(rt_task(current)) && !in_interrupt())
3679 alloc_flags |= ALLOC_HARDER; 3679 alloc_flags |= ALLOC_HARDER;
3680 3680
3681 #ifdef CONFIG_CMA 3681 #ifdef CONFIG_CMA
3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3683 alloc_flags |= ALLOC_CMA; 3683 alloc_flags |= ALLOC_CMA;
3684 #endif 3684 #endif
3685 return alloc_flags; 3685 return alloc_flags;
3686 } 3686 }
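As a concrete trace of the mapping above (a sketch, assuming GFP_ATOMIC expands to __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM and GFP_KERNEL to __GFP_RECLAIM | __GFP_IO | __GFP_FS, as in this kernel generation):

	gfp_to_alloc_flags(GFP_ATOMIC)  -> ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER   (ALLOC_CPUSET cleared)
	gfp_to_alloc_flags(GFP_KERNEL)  -> ALLOC_WMARK_MIN | ALLOC_CPUSET

So an atomic request may dip below the normal min watermark and ignores cpuset restrictions, while a regular GFP_KERNEL request is held to the min watermark and its cpuset; on CONFIG_CMA kernels, requests whose migratetype is MIGRATE_MOVABLE (e.g. GFP_HIGHUSER_MOVABLE) additionally pick up ALLOC_CMA.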
3687 3687
3688 static bool oom_reserves_allowed(struct task_struct *tsk) 3688 static bool oom_reserves_allowed(struct task_struct *tsk)
3689 { 3689 {
3690 if (!tsk_is_oom_victim(tsk)) 3690 if (!tsk_is_oom_victim(tsk))
3691 return false; 3691 return false;
3692 3692
3693 /* 3693 /*
3694 * !MMU does not have an oom reaper, so give access to memory reserves 3694 * !MMU does not have an oom reaper, so give access to memory reserves
3695 * only to the thread with TIF_MEMDIE set 3695 * only to the thread with TIF_MEMDIE set
3696 */ 3696 */
3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3698 return false; 3698 return false;
3699 3699
3700 return true; 3700 return true;
3701 } 3701 }
3702 3702
3703 /* 3703 /*
3704 * Distinguish requests which really need access to full memory 3704 * Distinguish requests which really need access to full memory
3705 * reserves from oom victims which can live with a portion of it 3705 * reserves from oom victims which can live with a portion of it
3706 */ 3706 */
3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3708 { 3708 {
3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3710 return 0; 3710 return 0;
3711 if (gfp_mask & __GFP_MEMALLOC) 3711 if (gfp_mask & __GFP_MEMALLOC)
3712 return ALLOC_NO_WATERMARKS; 3712 return ALLOC_NO_WATERMARKS;
3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3714 return ALLOC_NO_WATERMARKS; 3714 return ALLOC_NO_WATERMARKS;
3715 if (!in_interrupt()) { 3715 if (!in_interrupt()) {
3716 if (current->flags & PF_MEMALLOC) 3716 if (current->flags & PF_MEMALLOC)
3717 return ALLOC_NO_WATERMARKS; 3717 return ALLOC_NO_WATERMARKS;
3718 else if (oom_reserves_allowed(current)) 3718 else if (oom_reserves_allowed(current))
3719 return ALLOC_OOM; 3719 return ALLOC_OOM;
3720 } 3720 }
3721 3721
3722 return 0; 3722 return 0;
3723 } 3723 }
3724 3724
3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3726 { 3726 {
3727 return !!__gfp_pfmemalloc_flags(gfp_mask); 3727 return !!__gfp_pfmemalloc_flags(gfp_mask);
3728 } 3728 }
3729 3729
3730 /* 3730 /*
3731 * Checks whether it makes sense to retry the reclaim to make a forward progress 3731 * Checks whether it makes sense to retry the reclaim to make a forward progress
3732 * for the given allocation request. 3732 * for the given allocation request.
3733 * 3733 *
3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3735 * without success, or when we couldn't even meet the watermark if we 3735 * without success, or when we couldn't even meet the watermark if we
3736 * reclaimed all remaining pages on the LRU lists. 3736 * reclaimed all remaining pages on the LRU lists.
3737 * 3737 *
3738 * Returns true if a retry is viable or false to enter the oom path. 3738 * Returns true if a retry is viable or false to enter the oom path.
3739 */ 3739 */
3740 static inline bool 3740 static inline bool
3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3742 struct alloc_context *ac, int alloc_flags, 3742 struct alloc_context *ac, int alloc_flags,
3743 bool did_some_progress, int *no_progress_loops) 3743 bool did_some_progress, int *no_progress_loops)
3744 { 3744 {
3745 struct zone *zone; 3745 struct zone *zone;
3746 struct zoneref *z; 3746 struct zoneref *z;
3747 3747
3748 /* 3748 /*
3749 * Costly allocations might have made some progress, but that doesn't 3749 * Costly allocations might have made some progress, but that doesn't
3750 * mean their order will become available due to high fragmentation, so 3750 * mean their order will become available due to high fragmentation, so
3751 * always increment the no-progress counter for them 3751 * always increment the no-progress counter for them
3752 */ 3752 */
3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3754 *no_progress_loops = 0; 3754 *no_progress_loops = 0;
3755 else 3755 else
3756 (*no_progress_loops)++; 3756 (*no_progress_loops)++;
3757 3757
3758 /* 3758 /*
3759 * Make sure we converge to OOM if we cannot make any progress 3759 * Make sure we converge to OOM if we cannot make any progress
3760 * several times in a row. 3760 * several times in a row.
3761 */ 3761 */
3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3763 /* Before OOM, exhaust highatomic_reserve */ 3763 /* Before OOM, exhaust highatomic_reserve */
3764 return unreserve_highatomic_pageblock(ac, true); 3764 return unreserve_highatomic_pageblock(ac, true);
3765 } 3765 }
3766 3766
3767 /* 3767 /*
3768 * Keep reclaiming pages while there is a chance this will lead 3768 * Keep reclaiming pages while there is a chance this will lead
3769 * somewhere. If none of the target zones can satisfy our allocation 3769 * somewhere. If none of the target zones can satisfy our allocation
3770 * request even if all reclaimable pages are considered then we are 3770 * request even if all reclaimable pages are considered then we are
3771 * screwed and have to go OOM. 3771 * screwed and have to go OOM.
3772 */ 3772 */
3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3774 ac->nodemask) { 3774 ac->nodemask) {
3775 unsigned long available; 3775 unsigned long available;
3776 unsigned long reclaimable; 3776 unsigned long reclaimable;
3777 unsigned long min_wmark = min_wmark_pages(zone); 3777 unsigned long min_wmark = min_wmark_pages(zone);
3778 bool wmark; 3778 bool wmark;
3779 3779
3780 available = reclaimable = zone_reclaimable_pages(zone); 3780 available = reclaimable = zone_reclaimable_pages(zone);
3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3782 3782
3783 /* 3783 /*
3784 * Would the allocation succeed if we reclaimed all 3784 * Would the allocation succeed if we reclaimed all
3785 * reclaimable pages? 3785 * reclaimable pages?
3786 */ 3786 */
3787 wmark = __zone_watermark_ok(zone, order, min_wmark, 3787 wmark = __zone_watermark_ok(zone, order, min_wmark,
3788 ac_classzone_idx(ac), alloc_flags, available); 3788 ac_classzone_idx(ac), alloc_flags, available);
3789 trace_reclaim_retry_zone(z, order, reclaimable, 3789 trace_reclaim_retry_zone(z, order, reclaimable,
3790 available, min_wmark, *no_progress_loops, wmark); 3790 available, min_wmark, *no_progress_loops, wmark);
3791 if (wmark) { 3791 if (wmark) {
3792 /* 3792 /*
3793 * If we didn't make any progress and have a lot of 3793 * If we didn't make any progress and have a lot of
3794 * dirty + writeback pages then we should wait for 3794 * dirty + writeback pages then we should wait for
3795 * an IO to complete to slow down the reclaim and 3795 * an IO to complete to slow down the reclaim and
3796 * prevent premature OOM 3796 * prevent premature OOM
3797 */ 3797 */
3798 if (!did_some_progress) { 3798 if (!did_some_progress) {
3799 unsigned long write_pending; 3799 unsigned long write_pending;
3800 3800
3801 write_pending = zone_page_state_snapshot(zone, 3801 write_pending = zone_page_state_snapshot(zone,
3802 NR_ZONE_WRITE_PENDING); 3802 NR_ZONE_WRITE_PENDING);
3803 3803
3804 if (2 * write_pending > reclaimable) { 3804 if (2 * write_pending > reclaimable) {
3805 congestion_wait(BLK_RW_ASYNC, HZ/10); 3805 congestion_wait(BLK_RW_ASYNC, HZ/10);
3806 return true; 3806 return true;
3807 } 3807 }
3808 } 3808 }
3809 3809
3810 /* 3810 /*
3811 * Memory allocation/reclaim might be called from a WQ 3811 * Memory allocation/reclaim might be called from a WQ
3812 * context and the current implementation of the WQ 3812 * context and the current implementation of the WQ
3813 * concurrency control doesn't recognize that 3813 * concurrency control doesn't recognize that
3814 * a particular WQ is congested if the worker thread is 3814 * a particular WQ is congested if the worker thread is
3815 * looping without ever sleeping. Therefore we have to 3815 * looping without ever sleeping. Therefore we have to
3816 * do a short sleep here rather than calling 3816 * do a short sleep here rather than calling
3817 * cond_resched(). 3817 * cond_resched().
3818 */ 3818 */
3819 if (current->flags & PF_WQ_WORKER) 3819 if (current->flags & PF_WQ_WORKER)
3820 schedule_timeout_uninterruptible(1); 3820 schedule_timeout_uninterruptible(1);
3821 else 3821 else
3822 cond_resched(); 3822 cond_resched();
3823 3823
3824 return true; 3824 return true;
3825 } 3825 }
3826 } 3826 }
3827 3827
3828 return false; 3828 return false;
3829 } 3829 }
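
/*
 * Worked example for the retry check above (made-up numbers): with
 * reclaimable = 20000 pages, NR_FREE_PAGES = 1000 and min_wmark = 500,
 * __zone_watermark_ok() is asked whether the request could succeed with
 * available = 21000 free pages. If it could, but no progress was made and
 * more than half of the reclaimable pages are dirty or under writeback,
 * the caller throttles in congestion_wait() before retrying.
 */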
3830 3830
3831 static inline bool 3831 static inline bool
3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
3833 { 3833 {
3834 /* 3834 /*
3835 * It's possible that cpuset's mems_allowed and the nodemask from 3835 * It's possible that cpuset's mems_allowed and the nodemask from
3836 * mempolicy don't intersect. This should be normally dealt with by 3836 * mempolicy don't intersect. This should be normally dealt with by
3837 * policy_nodemask(), but it's possible to race with cpuset update in 3837 * policy_nodemask(), but it's possible to race with cpuset update in
3838 * such a way the check therein was true, and then it became false 3838 * such a way the check therein was true, and then it became false
3839 * before we got our cpuset_mems_cookie here. 3839 * before we got our cpuset_mems_cookie here.
3840 * This assumes that for all allocations, ac->nodemask can come only 3840 * This assumes that for all allocations, ac->nodemask can come only
3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored 3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored
3842 * when it does not intersect with the cpuset restrictions) or the 3842 * when it does not intersect with the cpuset restrictions) or the
3843 * caller can deal with a violated nodemask. 3843 * caller can deal with a violated nodemask.
3844 */ 3844 */
3845 if (cpusets_enabled() && ac->nodemask && 3845 if (cpusets_enabled() && ac->nodemask &&
3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
3847 ac->nodemask = NULL; 3847 ac->nodemask = NULL;
3848 return true; 3848 return true;
3849 } 3849 }
3850 3850
3851 /* 3851 /*
3852 * When updating a task's mems_allowed or mempolicy nodemask, it is 3852 * When updating a task's mems_allowed or mempolicy nodemask, it is
3853 * possible to race with parallel threads in such a way that our 3853 * possible to race with parallel threads in such a way that our
3854 * allocation can fail while the mask is being updated. If we are about 3854 * allocation can fail while the mask is being updated. If we are about
3855 * to fail, check if the cpuset changed during allocation and if so, 3855 * to fail, check if the cpuset changed during allocation and if so,
3856 * retry. 3856 * retry.
3857 */ 3857 */
3858 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3858 if (read_mems_allowed_retry(cpuset_mems_cookie))
3859 return true; 3859 return true;
3860 3860
3861 return false; 3861 return false;
3862 } 3862 }
3863 3863
3864 static inline struct page * 3864 static inline struct page *
3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3866 struct alloc_context *ac) 3866 struct alloc_context *ac)
3867 { 3867 {
3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3870 struct page *page = NULL; 3870 struct page *page = NULL;
3871 unsigned int alloc_flags; 3871 unsigned int alloc_flags;
3872 unsigned long did_some_progress; 3872 unsigned long did_some_progress;
3873 enum compact_priority compact_priority; 3873 enum compact_priority compact_priority;
3874 enum compact_result compact_result; 3874 enum compact_result compact_result;
3875 int compaction_retries; 3875 int compaction_retries;
3876 int no_progress_loops; 3876 int no_progress_loops;
3877 unsigned int cpuset_mems_cookie; 3877 unsigned int cpuset_mems_cookie;
3878 int reserve_flags; 3878 int reserve_flags;
3879 3879
3880 /* 3880 /*
3881 * We also sanity check to catch abuse of atomic reserves being used by 3881 * We also sanity check to catch abuse of atomic reserves being used by
3882 * callers that are not in atomic context. 3882 * callers that are not in atomic context.
3883 */ 3883 */
3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3886 gfp_mask &= ~__GFP_ATOMIC; 3886 gfp_mask &= ~__GFP_ATOMIC;
3887 3887
3888 retry_cpuset: 3888 retry_cpuset:
3889 compaction_retries = 0; 3889 compaction_retries = 0;
3890 no_progress_loops = 0; 3890 no_progress_loops = 0;
3891 compact_priority = DEF_COMPACT_PRIORITY; 3891 compact_priority = DEF_COMPACT_PRIORITY;
3892 cpuset_mems_cookie = read_mems_allowed_begin(); 3892 cpuset_mems_cookie = read_mems_allowed_begin();
3893 3893
3894 /* 3894 /*
3895 * The fast path uses conservative alloc_flags to succeed only until 3895 * The fast path uses conservative alloc_flags to succeed only until
3896 * kswapd needs to be woken up, and to avoid the cost of setting up 3896 * kswapd needs to be woken up, and to avoid the cost of setting up
3897 * alloc_flags precisely. So we do that now. 3897 * alloc_flags precisely. So we do that now.
3898 */ 3898 */
3899 alloc_flags = gfp_to_alloc_flags(gfp_mask); 3899 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3900 3900
3901 /* 3901 /*
3902 * We need to recalculate the starting point for the zonelist iterator 3902 * We need to recalculate the starting point for the zonelist iterator
3903 * because we might have used different nodemask in the fast path, or 3903 * because we might have used different nodemask in the fast path, or
3904 * there was a cpuset modification and we are retrying - otherwise we 3904 * there was a cpuset modification and we are retrying - otherwise we
3905 * could end up iterating over non-eligible zones endlessly. 3905 * could end up iterating over non-eligible zones endlessly.
3906 */ 3906 */
3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3908 ac->high_zoneidx, ac->nodemask); 3908 ac->high_zoneidx, ac->nodemask);
3909 if (!ac->preferred_zoneref->zone) 3909 if (!ac->preferred_zoneref->zone)
3910 goto nopage; 3910 goto nopage;
3911 3911
3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3913 wake_all_kswapds(order, ac); 3913 wake_all_kswapds(order, ac);
3914 3914
3915 /* 3915 /*
3916 * The adjusted alloc_flags might result in immediate success, so try 3916 * The adjusted alloc_flags might result in immediate success, so try
3917 * that first 3917 * that first
3918 */ 3918 */
3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3920 if (page) 3920 if (page)
3921 goto got_pg; 3921 goto got_pg;
3922 3922
3923 /* 3923 /*
3924 * For costly allocations, try direct compaction first, as it's likely 3924 * For costly allocations, try direct compaction first, as it's likely
3925 * that we have enough base pages and don't need to reclaim. For non- 3925 * that we have enough base pages and don't need to reclaim. For non-
3926 * movable high-order allocations, do that as well, as compaction will 3926 * movable high-order allocations, do that as well, as compaction will
3927 * try to prevent permanent fragmentation by migrating from blocks of the 3927 * try to prevent permanent fragmentation by migrating from blocks of the
3928 * same migratetype. 3928 * same migratetype.
3929 * Don't try this for allocations that are allowed to ignore 3929 * Don't try this for allocations that are allowed to ignore
3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
3931 */ 3931 */
3932 if (can_direct_reclaim && 3932 if (can_direct_reclaim &&
3933 (costly_order || 3933 (costly_order ||
3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
3935 && !gfp_pfmemalloc_allowed(gfp_mask)) { 3935 && !gfp_pfmemalloc_allowed(gfp_mask)) {
3936 page = __alloc_pages_direct_compact(gfp_mask, order, 3936 page = __alloc_pages_direct_compact(gfp_mask, order,
3937 alloc_flags, ac, 3937 alloc_flags, ac,
3938 INIT_COMPACT_PRIORITY, 3938 INIT_COMPACT_PRIORITY,
3939 &compact_result); 3939 &compact_result);
3940 if (page) 3940 if (page)
3941 goto got_pg; 3941 goto got_pg;
3942 3942
3943 /* 3943 /*
3944 * Checks for costly allocations with __GFP_NORETRY, which 3944 * Checks for costly allocations with __GFP_NORETRY, which
3945 * includes THP page fault allocations 3945 * includes THP page fault allocations
3946 */ 3946 */
3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
3948 /* 3948 /*
3949 * If compaction is deferred for high-order allocations, 3949 * If compaction is deferred for high-order allocations,
3950 * it is because sync compaction recently failed. If 3950 * it is because sync compaction recently failed. If
3951 * this is the case and the caller requested a THP 3951 * this is the case and the caller requested a THP
3952 * allocation, we do not want to heavily disrupt the 3952 * allocation, we do not want to heavily disrupt the
3953 * system, so we fail the allocation instead of entering 3953 * system, so we fail the allocation instead of entering
3954 * direct reclaim. 3954 * direct reclaim.
3955 */ 3955 */
3956 if (compact_result == COMPACT_DEFERRED) 3956 if (compact_result == COMPACT_DEFERRED)
3957 goto nopage; 3957 goto nopage;
3958 3958
3959 /* 3959 /*
3960 * Looks like reclaim/compaction is worth trying, but 3960 * Looks like reclaim/compaction is worth trying, but
3961 * sync compaction could be very expensive, so keep 3961 * sync compaction could be very expensive, so keep
3962 * using async compaction. 3962 * using async compaction.
3963 */ 3963 */
3964 compact_priority = INIT_COMPACT_PRIORITY; 3964 compact_priority = INIT_COMPACT_PRIORITY;
3965 } 3965 }
3966 } 3966 }
3967 3967
3968 retry: 3968 retry:
3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3971 wake_all_kswapds(order, ac); 3971 wake_all_kswapds(order, ac);
3972 3972
3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3974 if (reserve_flags) 3974 if (reserve_flags)
3975 alloc_flags = reserve_flags; 3975 alloc_flags = reserve_flags;
3976 3976
3977 /* 3977 /*
3978 * Reset the zonelist iterators if memory policies can be ignored. 3978 * Reset the zonelist iterators if memory policies can be ignored.
3979 * These allocations are high priority and system rather than user 3979 * These allocations are high priority and system rather than user
3980 * oriented. 3980 * oriented.
3981 */ 3981 */
3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3984 ac->high_zoneidx, ac->nodemask); 3984 ac->high_zoneidx, ac->nodemask);
3985 } 3985 }
3986 3986
3987 /* Attempt with potentially adjusted zonelist and alloc_flags */ 3987 /* Attempt with potentially adjusted zonelist and alloc_flags */
3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3989 if (page) 3989 if (page)
3990 goto got_pg; 3990 goto got_pg;
3991 3991
3992 /* Caller is not willing to reclaim, we can't balance anything */ 3992 /* Caller is not willing to reclaim, we can't balance anything */
3993 if (!can_direct_reclaim) 3993 if (!can_direct_reclaim)
3994 goto nopage; 3994 goto nopage;
3995 3995
3996 /* Avoid recursion of direct reclaim */ 3996 /* Avoid recursion of direct reclaim */
3997 if (current->flags & PF_MEMALLOC) 3997 if (current->flags & PF_MEMALLOC)
3998 goto nopage; 3998 goto nopage;
3999 3999
4000 /* Try direct reclaim and then allocating */ 4000 /* Try direct reclaim and then allocating */
4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4002 &did_some_progress); 4002 &did_some_progress);
4003 if (page) 4003 if (page)
4004 goto got_pg; 4004 goto got_pg;
4005 4005
4006 /* Try direct compaction and then allocating */ 4006 /* Try direct compaction and then allocating */
4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4008 compact_priority, &compact_result); 4008 compact_priority, &compact_result);
4009 if (page) 4009 if (page)
4010 goto got_pg; 4010 goto got_pg;
4011 4011
4012 /* Do not loop if specifically requested */ 4012 /* Do not loop if specifically requested */
4013 if (gfp_mask & __GFP_NORETRY) 4013 if (gfp_mask & __GFP_NORETRY)
4014 goto nopage; 4014 goto nopage;
4015 4015
4016 /* 4016 /*
4017 * Do not retry costly high order allocations unless they are 4017 * Do not retry costly high order allocations unless they are
4018 * __GFP_RETRY_MAYFAIL 4018 * __GFP_RETRY_MAYFAIL
4019 */ 4019 */
4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4021 goto nopage; 4021 goto nopage;
4022 4022
4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4024 did_some_progress > 0, &no_progress_loops)) 4024 did_some_progress > 0, &no_progress_loops))
4025 goto retry; 4025 goto retry;
4026 4026
4027 /* 4027 /*
4028 * It doesn't make any sense to retry for the compaction if the order-0 4028 * It doesn't make any sense to retry for the compaction if the order-0
4029 * reclaim is not able to make any progress because the current 4029 * reclaim is not able to make any progress because the current
4030 * implementation of the compaction depends on the sufficient amount 4030 * implementation of the compaction depends on the sufficient amount
4031 * of free memory (see __compaction_suitable) 4031 * of free memory (see __compaction_suitable)
4032 */ 4032 */
4033 if (did_some_progress > 0 && 4033 if (did_some_progress > 0 &&
4034 should_compact_retry(ac, order, alloc_flags, 4034 should_compact_retry(ac, order, alloc_flags,
4035 compact_result, &compact_priority, 4035 compact_result, &compact_priority,
4036 &compaction_retries)) 4036 &compaction_retries))
4037 goto retry; 4037 goto retry;
4038 4038
4039 4039
4040 /* Deal with possible cpuset update races before we start OOM killing */ 4040 /* Deal with possible cpuset update races before we start OOM killing */
4041 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4041 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4042 goto retry_cpuset; 4042 goto retry_cpuset;
4043 4043
4044 /* Reclaim has failed us, start killing things */ 4044 /* Reclaim has failed us, start killing things */
4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4046 if (page) 4046 if (page)
4047 goto got_pg; 4047 goto got_pg;
4048 4048
4049 /* Avoid allocations with no watermarks from looping endlessly */ 4049 /* Avoid allocations with no watermarks from looping endlessly */
4050 if (tsk_is_oom_victim(current) && 4050 if (tsk_is_oom_victim(current) &&
4051 (alloc_flags == ALLOC_OOM || 4051 (alloc_flags == ALLOC_OOM ||
4052 (gfp_mask & __GFP_NOMEMALLOC))) 4052 (gfp_mask & __GFP_NOMEMALLOC)))
4053 goto nopage; 4053 goto nopage;
4054 4054
4055 /* Retry as long as the OOM killer is making progress */ 4055 /* Retry as long as the OOM killer is making progress */
4056 if (did_some_progress) { 4056 if (did_some_progress) {
4057 no_progress_loops = 0; 4057 no_progress_loops = 0;
4058 goto retry; 4058 goto retry;
4059 } 4059 }
4060 4060
4061 nopage: 4061 nopage:
4062 /* Deal with possible cpuset update races before we fail */ 4062 /* Deal with possible cpuset update races before we fail */
4063 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4063 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4064 goto retry_cpuset; 4064 goto retry_cpuset;
4065 4065
4066 /* 4066 /*
4067 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 4067 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4068 * we always retry 4068 * we always retry
4069 */ 4069 */
4070 if (gfp_mask & __GFP_NOFAIL) { 4070 if (gfp_mask & __GFP_NOFAIL) {
4071 /* 4071 /*
4072 * All existing users of the __GFP_NOFAIL are blockable, so warn 4072 * All existing users of the __GFP_NOFAIL are blockable, so warn
4073 * of any new users that actually require GFP_NOWAIT 4073 * of any new users that actually require GFP_NOWAIT
4074 */ 4074 */
4075 if (WARN_ON_ONCE(!can_direct_reclaim)) 4075 if (WARN_ON_ONCE(!can_direct_reclaim))
4076 goto fail; 4076 goto fail;
4077 4077
4078 /* 4078 /*
4079 * PF_MEMALLOC request from this context is rather bizarre 4079 * PF_MEMALLOC request from this context is rather bizarre
4080 * because we cannot reclaim anything and can only loop waiting 4080 * because we cannot reclaim anything and can only loop waiting
4081 * for somebody to do the work for us 4081 * for somebody to do the work for us
4082 */ 4082 */
4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4084 4084
4085 /* 4085 /*
4086 * Non-failing costly orders are a hard requirement which we 4086 * Non-failing costly orders are a hard requirement which we
4087 * are not really prepared for, so let's warn about these users 4087 * are not really prepared for, so let's warn about these users
4088 * so that we can identify them and convert them to something 4088 * so that we can identify them and convert them to something
4089 * else. 4089 * else.
4090 */ 4090 */
4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4092 4092
4093 /* 4093 /*
4094 * Help non-failing allocations by giving them access to memory 4094 * Help non-failing allocations by giving them access to memory
4095 * reserves but do not use ALLOC_NO_WATERMARKS because this 4095 * reserves but do not use ALLOC_NO_WATERMARKS because this
4096 * could deplete whole memory reserves which would just make 4096 * could deplete whole memory reserves which would just make
4097 * the situation worse 4097 * the situation worse
4098 */ 4098 */
4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4100 if (page) 4100 if (page)
4101 goto got_pg; 4101 goto got_pg;
4102 4102
4103 cond_resched(); 4103 cond_resched();
4104 goto retry; 4104 goto retry;
4105 } 4105 }
4106 fail: 4106 fail:
4107 warn_alloc(gfp_mask, ac->nodemask, 4107 warn_alloc(gfp_mask, ac->nodemask,
4108 "page allocation failure: order:%u", order); 4108 "page allocation failure: order:%u", order);
4109 got_pg: 4109 got_pg:
4110 return page; 4110 return page;
4111 } 4111 }
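
/*
 * Rough shape of the slow path above, for reference: adjust alloc_flags
 * via gfp_to_alloc_flags() and wake kswapd, retry the freelists,
 * optionally attempt direct compaction up front for costly or
 * non-movable high-order requests, then loop over direct reclaim and
 * direct compaction for as long as should_reclaim_retry() and
 * should_compact_retry() allow, and finally fall back to the OOM killer.
 * __GFP_NOFAIL requests keep retrying with ALLOC_HARDER access instead
 * of failing.
 */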
4112 4112
4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4114 int preferred_nid, nodemask_t *nodemask, 4114 int preferred_nid, nodemask_t *nodemask,
4115 struct alloc_context *ac, gfp_t *alloc_mask, 4115 struct alloc_context *ac, gfp_t *alloc_mask,
4116 unsigned int *alloc_flags) 4116 unsigned int *alloc_flags)
4117 { 4117 {
4118 ac->high_zoneidx = gfp_zone(gfp_mask); 4118 ac->high_zoneidx = gfp_zone(gfp_mask);
4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4120 ac->nodemask = nodemask; 4120 ac->nodemask = nodemask;
4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask); 4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4122 4122
4123 if (cpusets_enabled()) { 4123 if (cpusets_enabled()) {
4124 *alloc_mask |= __GFP_HARDWALL; 4124 *alloc_mask |= __GFP_HARDWALL;
4125 if (!ac->nodemask) 4125 if (!ac->nodemask)
4126 ac->nodemask = &cpuset_current_mems_allowed; 4126 ac->nodemask = &cpuset_current_mems_allowed;
4127 else 4127 else
4128 *alloc_flags |= ALLOC_CPUSET; 4128 *alloc_flags |= ALLOC_CPUSET;
4129 } 4129 }
4130 4130
4131 fs_reclaim_acquire(gfp_mask); 4131 fs_reclaim_acquire(gfp_mask);
4132 fs_reclaim_release(gfp_mask); 4132 fs_reclaim_release(gfp_mask);
4133 4133
4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4135 4135
4136 if (should_fail_alloc_page(gfp_mask, order)) 4136 if (should_fail_alloc_page(gfp_mask, order))
4137 return false; 4137 return false;
4138 4138
4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) 4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4140 *alloc_flags |= ALLOC_CMA; 4140 *alloc_flags |= ALLOC_CMA;
4141 4141
4142 return true; 4142 return true;
4143 } 4143 }
4144 4144
4145 /* Determine whether to spread dirty pages and what the first usable zone is */ 4145 /* Determine whether to spread dirty pages and what the first usable zone is */
4146 static inline void finalise_ac(gfp_t gfp_mask, 4146 static inline void finalise_ac(gfp_t gfp_mask,
4147 unsigned int order, struct alloc_context *ac) 4147 unsigned int order, struct alloc_context *ac)
4148 { 4148 {
4149 /* Dirty zone balancing only done in the fast path */ 4149 /* Dirty zone balancing only done in the fast path */
4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4151 4151
4152 /* 4152 /*
4153 * The preferred zone is used for statistics but crucially it is 4153 * The preferred zone is used for statistics but crucially it is
4154 * also used as the starting point for the zonelist iterator. It 4154 * also used as the starting point for the zonelist iterator. It
4155 * may get reset for allocations that ignore memory policies. 4155 * may get reset for allocations that ignore memory policies.
4156 */ 4156 */
4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4158 ac->high_zoneidx, ac->nodemask); 4158 ac->high_zoneidx, ac->nodemask);
4159 } 4159 }
4160 4160
4161 /* 4161 /*
4162 * This is the 'heart' of the zoned buddy allocator. 4162 * This is the 'heart' of the zoned buddy allocator.
4163 */ 4163 */
4164 struct page * 4164 struct page *
4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, 4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4166 nodemask_t *nodemask) 4166 nodemask_t *nodemask)
4167 { 4167 {
4168 struct page *page; 4168 struct page *page;
4169 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4169 unsigned int alloc_flags = ALLOC_WMARK_LOW;
4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ 4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4171 struct alloc_context ac = { }; 4171 struct alloc_context ac = { };
4172 4172
4173 /* 4173 /*
4174 * There are several places where we assume that the order value is sane 4174 * There are several places where we assume that the order value is sane
4175 * so bail out early if the request is out of bound. 4175 * so bail out early if the request is out of bound.
4176 */ 4176 */
4177 if (unlikely(order >= MAX_ORDER)) { 4177 if (unlikely(order >= MAX_ORDER)) {
4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
4179 return NULL; 4179 return NULL;
4180 } 4180 }
4181 4181
4182 gfp_mask &= gfp_allowed_mask; 4182 gfp_mask &= gfp_allowed_mask;
4183 alloc_mask = gfp_mask; 4183 alloc_mask = gfp_mask;
4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) 4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4185 return NULL; 4185 return NULL;
4186 4186
4187 finalise_ac(gfp_mask, order, &ac); 4187 finalise_ac(gfp_mask, order, &ac);
4188 4188
4189 /* First allocation attempt */ 4189 /* First allocation attempt */
4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4191 if (likely(page)) 4191 if (likely(page))
4192 goto out; 4192 goto out;
4193 4193
4194 /* 4194 /*
4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4196 * and GFP_NOIO, which have to be inherited for all allocation requests 4196 * and GFP_NOIO, which have to be inherited for all allocation requests
4197 * from a particular context which has been marked by 4197 * from a particular context which has been marked by
4198 * memalloc_no{fs,io}_{save,restore}. 4198 * memalloc_no{fs,io}_{save,restore}.
4199 */ 4199 */
4200 alloc_mask = current_gfp_context(gfp_mask); 4200 alloc_mask = current_gfp_context(gfp_mask);
4201 ac.spread_dirty_pages = false; 4201 ac.spread_dirty_pages = false;
4202 4202
4203 /* 4203 /*
4204 * Restore the original nodemask if it was potentially replaced with 4204 * Restore the original nodemask if it was potentially replaced with
4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4206 */ 4206 */
4207 if (unlikely(ac.nodemask != nodemask)) 4207 if (unlikely(ac.nodemask != nodemask))
4208 ac.nodemask = nodemask; 4208 ac.nodemask = nodemask;
4209 4209
4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4211 4211
4212 out: 4212 out:
4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4215 __free_pages(page, order); 4215 __free_pages(page, order);
4216 page = NULL; 4216 page = NULL;
4217 } 4217 }
4218 4218
4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4220 4220
4221 return page; 4221 return page;
4222 } 4222 }
4223 EXPORT_SYMBOL(__alloc_pages_nodemask); 4223 EXPORT_SYMBOL(__alloc_pages_nodemask);
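
/*
 * Illustrative usage sketch (not part of this file): a typical caller
 * reaches __alloc_pages_nodemask() through the alloc_pages() wrapper.
 * example_get_buffer() is a hypothetical helper, shown only to
 * demonstrate the calling convention and the matching free.
 */
static struct page *example_get_buffer(void)
{
	/* order-1 request: two physically contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (!page)
		return NULL;

	/*
	 * The caller maps it with page_address(page) and later releases
	 * it with __free_pages(page, 1).
	 */
	return page;
}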
4224 4224
4225 /* 4225 /*
4226 * Common helper functions. 4226 * Common helper functions.
4227 */ 4227 */
4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4229 { 4229 {
4230 struct page *page; 4230 struct page *page;
4231 4231
4232 /* 4232 /*
4233 * __get_free_pages() returns a 32-bit address, which cannot represent 4233 * __get_free_pages() returns a 32-bit address, which cannot represent
4234 * a highmem page 4234 * a highmem page
4235 */ 4235 */
4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
4237 4237
4238 page = alloc_pages(gfp_mask, order); 4238 page = alloc_pages(gfp_mask, order);
4239 if (!page) 4239 if (!page)
4240 return 0; 4240 return 0;
4241 return (unsigned long) page_address(page); 4241 return (unsigned long) page_address(page);
4242 } 4242 }
4243 EXPORT_SYMBOL(__get_free_pages); 4243 EXPORT_SYMBOL(__get_free_pages);
4244 4244
4245 unsigned long get_zeroed_page(gfp_t gfp_mask) 4245 unsigned long get_zeroed_page(gfp_t gfp_mask)
4246 { 4246 {
4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4248 } 4248 }
4249 EXPORT_SYMBOL(get_zeroed_page); 4249 EXPORT_SYMBOL(get_zeroed_page);
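
/*
 * Illustrative usage sketch (not part of this file): __get_free_pages()
 * and get_zeroed_page() hand back a kernel virtual address rather than a
 * struct page. example_scratch() is hypothetical.
 */
static int example_scratch(void)
{
	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (!addr)
		return -ENOMEM;

	/* ... use the zero-filled page at (void *)addr ... */

	free_page(addr);
	return 0;
}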
4250 4250
4251 void __free_pages(struct page *page, unsigned int order) 4251 void __free_pages(struct page *page, unsigned int order)
4252 { 4252 {
4253 if (put_page_testzero(page)) { 4253 if (put_page_testzero(page)) {
4254 if (order == 0) 4254 if (order == 0)
4255 free_hot_cold_page(page, false); 4255 free_hot_cold_page(page, false);
4256 else 4256 else
4257 __free_pages_ok(page, order); 4257 __free_pages_ok(page, order);
4258 } 4258 }
4259 } 4259 }
4260 4260
4261 EXPORT_SYMBOL(__free_pages); 4261 EXPORT_SYMBOL(__free_pages);
4262 4262
4263 void free_pages(unsigned long addr, unsigned int order) 4263 void free_pages(unsigned long addr, unsigned int order)
4264 { 4264 {
4265 if (addr != 0) { 4265 if (addr != 0) {
4266 VM_BUG_ON(!virt_addr_valid((void *)addr)); 4266 VM_BUG_ON(!virt_addr_valid((void *)addr));
4267 __free_pages(virt_to_page((void *)addr), order); 4267 __free_pages(virt_to_page((void *)addr), order);
4268 } 4268 }
4269 } 4269 }
4270 4270
4271 EXPORT_SYMBOL(free_pages); 4271 EXPORT_SYMBOL(free_pages);
4272 4272
4273 /* 4273 /*
4274 * Page Fragment: 4274 * Page Fragment:
4275 * An arbitrary-length arbitrary-offset area of memory which resides 4275 * An arbitrary-length arbitrary-offset area of memory which resides
4276 * within a 0 or higher order page. Multiple fragments within that page 4276 * within a 0 or higher order page. Multiple fragments within that page
4277 * are individually refcounted, in the page's reference counter. 4277 * are individually refcounted, in the page's reference counter.
4278 * 4278 *
4279 * The page_frag functions below provide a simple allocation framework for 4279 * The page_frag functions below provide a simple allocation framework for
4280 * page fragments. This is used by the network stack and network device 4280 * page fragments. This is used by the network stack and network device
4281 * drivers to provide a backing region of memory for use as either an 4281 * drivers to provide a backing region of memory for use as either an
4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4283 */ 4283 */
4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4285 gfp_t gfp_mask) 4285 gfp_t gfp_mask)
4286 { 4286 {
4287 struct page *page = NULL; 4287 struct page *page = NULL;
4288 gfp_t gfp = gfp_mask; 4288 gfp_t gfp = gfp_mask;
4289 4289
4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4292 __GFP_NOMEMALLOC; 4292 __GFP_NOMEMALLOC;
4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4294 PAGE_FRAG_CACHE_MAX_ORDER); 4294 PAGE_FRAG_CACHE_MAX_ORDER);
4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4296 #endif 4296 #endif
4297 if (unlikely(!page)) 4297 if (unlikely(!page))
4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4299 4299
4300 nc->va = page ? page_address(page) : NULL; 4300 nc->va = page ? page_address(page) : NULL;
4301 4301
4302 return page; 4302 return page;
4303 } 4303 }
4304 4304
4305 void __page_frag_cache_drain(struct page *page, unsigned int count) 4305 void __page_frag_cache_drain(struct page *page, unsigned int count)
4306 { 4306 {
4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4308 4308
4309 if (page_ref_sub_and_test(page, count)) { 4309 if (page_ref_sub_and_test(page, count)) {
4310 unsigned int order = compound_order(page); 4310 unsigned int order = compound_order(page);
4311 4311
4312 if (order == 0) 4312 if (order == 0)
4313 free_hot_cold_page(page, false); 4313 free_hot_cold_page(page, false);
4314 else 4314 else
4315 __free_pages_ok(page, order); 4315 __free_pages_ok(page, order);
4316 } 4316 }
4317 } 4317 }
4318 EXPORT_SYMBOL(__page_frag_cache_drain); 4318 EXPORT_SYMBOL(__page_frag_cache_drain);
4319 4319
4320 void *page_frag_alloc(struct page_frag_cache *nc, 4320 void *page_frag_alloc(struct page_frag_cache *nc,
4321 unsigned int fragsz, gfp_t gfp_mask) 4321 unsigned int fragsz, gfp_t gfp_mask)
4322 { 4322 {
4323 unsigned int size = PAGE_SIZE; 4323 unsigned int size = PAGE_SIZE;
4324 struct page *page; 4324 struct page *page;
4325 int offset; 4325 int offset;
4326 4326
4327 if (unlikely(!nc->va)) { 4327 if (unlikely(!nc->va)) {
4328 refill: 4328 refill:
4329 page = __page_frag_cache_refill(nc, gfp_mask); 4329 page = __page_frag_cache_refill(nc, gfp_mask);
4330 if (!page) 4330 if (!page)
4331 return NULL; 4331 return NULL;
4332 4332
4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4334 /* if size can vary use size else just use PAGE_SIZE */ 4334 /* if size can vary use size else just use PAGE_SIZE */
4335 size = nc->size; 4335 size = nc->size;
4336 #endif 4336 #endif
4337 /* Even if we own the page, we do not use atomic_set(). 4337 /* Even if we own the page, we do not use atomic_set().
4338 * This would break get_page_unless_zero() users. 4338 * This would break get_page_unless_zero() users.
4339 */ 4339 */
4340 page_ref_add(page, size - 1); 4340 page_ref_add(page, size - 1);
4341 4341
4342 /* reset page count bias and offset to start of new frag */ 4342 /* reset page count bias and offset to start of new frag */
4343 nc->pfmemalloc = page_is_pfmemalloc(page); 4343 nc->pfmemalloc = page_is_pfmemalloc(page);
4344 nc->pagecnt_bias = size; 4344 nc->pagecnt_bias = size;
4345 nc->offset = size; 4345 nc->offset = size;
4346 } 4346 }
4347 4347
4348 offset = nc->offset - fragsz; 4348 offset = nc->offset - fragsz;
4349 if (unlikely(offset < 0)) { 4349 if (unlikely(offset < 0)) {
4350 page = virt_to_page(nc->va); 4350 page = virt_to_page(nc->va);
4351 4351
4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4353 goto refill; 4353 goto refill;
4354 4354
4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4356 /* if size can vary use size else just use PAGE_SIZE */ 4356 /* if size can vary use size else just use PAGE_SIZE */
4357 size = nc->size; 4357 size = nc->size;
4358 #endif 4358 #endif
4359 /* OK, page count is 0, we can safely set it */ 4359 /* OK, page count is 0, we can safely set it */
4360 set_page_count(page, size); 4360 set_page_count(page, size);
4361 4361
4362 /* reset page count bias and offset to start of new frag */ 4362 /* reset page count bias and offset to start of new frag */
4363 nc->pagecnt_bias = size; 4363 nc->pagecnt_bias = size;
4364 offset = size - fragsz; 4364 offset = size - fragsz;
4365 } 4365 }
4366 4366
4367 nc->pagecnt_bias--; 4367 nc->pagecnt_bias--;
4368 nc->offset = offset; 4368 nc->offset = offset;
4369 4369
4370 return nc->va + offset; 4370 return nc->va + offset;
4371 } 4371 }
4372 EXPORT_SYMBOL(page_frag_alloc); 4372 EXPORT_SYMBOL(page_frag_alloc);
4373 4373
4374 /* 4374 /*
4375 * Frees a page fragment allocated out of either a compound or order 0 page. 4375 * Frees a page fragment allocated out of either a compound or order 0 page.
4376 */ 4376 */
4377 void page_frag_free(void *addr) 4377 void page_frag_free(void *addr)
4378 { 4378 {
4379 struct page *page = virt_to_head_page(addr); 4379 struct page *page = virt_to_head_page(addr);
4380 4380
4381 if (unlikely(put_page_testzero(page))) 4381 if (unlikely(put_page_testzero(page)))
4382 __free_pages_ok(page, compound_order(page)); 4382 __free_pages_ok(page, compound_order(page));
4383 } 4383 }
4384 EXPORT_SYMBOL(page_frag_free); 4384 EXPORT_SYMBOL(page_frag_free);
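
/*
 * Illustrative usage sketch (not part of this file): carving small
 * buffers out of a page_frag_cache, much as network drivers do for
 * receive buffers. 'example_cache' and example_alloc_rx_buf() are
 * hypothetical.
 */
static struct page_frag_cache example_cache;

static void *example_alloc_rx_buf(unsigned int len)
{
	void *buf = page_frag_alloc(&example_cache, len, GFP_ATOMIC);

	/* each fragment is released individually with page_frag_free(buf) */
	return buf;
}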
4385 4385
4386 static void *make_alloc_exact(unsigned long addr, unsigned int order, 4386 static void *make_alloc_exact(unsigned long addr, unsigned int order,
4387 size_t size) 4387 size_t size)
4388 { 4388 {
4389 if (addr) { 4389 if (addr) {
4390 unsigned long alloc_end = addr + (PAGE_SIZE << order); 4390 unsigned long alloc_end = addr + (PAGE_SIZE << order);
4391 unsigned long used = addr + PAGE_ALIGN(size); 4391 unsigned long used = addr + PAGE_ALIGN(size);
4392 4392
4393 split_page(virt_to_page((void *)addr), order); 4393 split_page(virt_to_page((void *)addr), order);
4394 while (used < alloc_end) { 4394 while (used < alloc_end) {
4395 free_page(used); 4395 free_page(used);
4396 used += PAGE_SIZE; 4396 used += PAGE_SIZE;
4397 } 4397 }
4398 } 4398 }
4399 return (void *)addr; 4399 return (void *)addr;
4400 } 4400 }
4401 4401
4402 /** 4402 /**
4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4404 * @size: the number of bytes to allocate 4404 * @size: the number of bytes to allocate
4405 * @gfp_mask: GFP flags for the allocation 4405 * @gfp_mask: GFP flags for the allocation
4406 * 4406 *
4407 * This function is similar to alloc_pages(), except that it allocates the 4407 * This function is similar to alloc_pages(), except that it allocates the
4408 * minimum number of pages to satisfy the request. alloc_pages() can only 4408 * minimum number of pages to satisfy the request. alloc_pages() can only
4409 * allocate memory in power-of-two pages. 4409 * allocate memory in power-of-two pages.
4410 * 4410 *
4411 * This function is also limited by MAX_ORDER. 4411 * This function is also limited by MAX_ORDER.
4412 * 4412 *
4413 * Memory allocated by this function must be released by free_pages_exact(). 4413 * Memory allocated by this function must be released by free_pages_exact().
4414 */ 4414 */
4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4416 { 4416 {
4417 unsigned int order = get_order(size); 4417 unsigned int order = get_order(size);
4418 unsigned long addr; 4418 unsigned long addr;
4419 4419
4420 addr = __get_free_pages(gfp_mask, order); 4420 addr = __get_free_pages(gfp_mask, order);
4421 return make_alloc_exact(addr, order, size); 4421 return make_alloc_exact(addr, order, size);
4422 } 4422 }
4423 EXPORT_SYMBOL(alloc_pages_exact); 4423 EXPORT_SYMBOL(alloc_pages_exact);
4424 4424
4425 /** 4425 /**
4426 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 4426 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4427 * pages on a node. 4427 * pages on a node.
4428 * @nid: the preferred node ID where memory should be allocated 4428 * @nid: the preferred node ID where memory should be allocated
4429 * @size: the number of bytes to allocate 4429 * @size: the number of bytes to allocate
4430 * @gfp_mask: GFP flags for the allocation 4430 * @gfp_mask: GFP flags for the allocation
4431 * 4431 *
4432 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4432 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4433 * back. 4433 * back.
4434 */ 4434 */
4435 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4435 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4436 { 4436 {
4437 unsigned int order = get_order(size); 4437 unsigned int order = get_order(size);
4438 struct page *p = alloc_pages_node(nid, gfp_mask, order); 4438 struct page *p = alloc_pages_node(nid, gfp_mask, order);
4439 if (!p) 4439 if (!p)
4440 return NULL; 4440 return NULL;
4441 return make_alloc_exact((unsigned long)page_address(p), order, size); 4441 return make_alloc_exact((unsigned long)page_address(p), order, size);
4442 } 4442 }
4443 4443
4444 /** 4444 /**
4445 * free_pages_exact - release memory allocated via alloc_pages_exact() 4445 * free_pages_exact - release memory allocated via alloc_pages_exact()
4446 * @virt: the value returned by alloc_pages_exact. 4446 * @virt: the value returned by alloc_pages_exact.
4447 * @size: size of allocation, same value as passed to alloc_pages_exact(). 4447 * @size: size of allocation, same value as passed to alloc_pages_exact().
4448 * 4448 *
4449 * Release the memory allocated by a previous call to alloc_pages_exact. 4449 * Release the memory allocated by a previous call to alloc_pages_exact.
4450 */ 4450 */
4451 void free_pages_exact(void *virt, size_t size) 4451 void free_pages_exact(void *virt, size_t size)
4452 { 4452 {
4453 unsigned long addr = (unsigned long)virt; 4453 unsigned long addr = (unsigned long)virt;
4454 unsigned long end = addr + PAGE_ALIGN(size); 4454 unsigned long end = addr + PAGE_ALIGN(size);
4455 4455
4456 while (addr < end) { 4456 while (addr < end) {
4457 free_page(addr); 4457 free_page(addr);
4458 addr += PAGE_SIZE; 4458 addr += PAGE_SIZE;
4459 } 4459 }
4460 } 4460 }
4461 EXPORT_SYMBOL(free_pages_exact); 4461 EXPORT_SYMBOL(free_pages_exact);
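
/*
 * Illustrative usage sketch (not part of this file): alloc_pages_exact()
 * keeps only as many pages as PAGE_ALIGN(size) requires, so a three-page
 * request carved out of an order-2 allocation gives the fourth page
 * straight back. example_exact_buffer() is hypothetical.
 */
static void *example_exact_buffer(void)
{
	void *buf = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);

	if (!buf)
		return NULL;

	/* released later with free_pages_exact(buf, 3 * PAGE_SIZE) */
	return buf;
}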
4462 4462
4463 /** 4463 /**
4464 * nr_free_zone_pages - count number of pages beyond high watermark 4464 * nr_free_zone_pages - count number of pages beyond high watermark
4465 * @offset: The zone index of the highest zone 4465 * @offset: The zone index of the highest zone
4466 * 4466 *
4467 * nr_free_zone_pages() counts the number of pages which are beyond the 4467 * nr_free_zone_pages() counts the number of pages which are beyond the
4468 * high watermark within all zones at or below a given zone index. For each 4468 * high watermark within all zones at or below a given zone index. For each
4469 * zone, the number of pages is calculated as: 4469 * zone, the number of pages is calculated as:
4470 * 4470 *
4471 * nr_free_zone_pages = managed_pages - high_pages 4471 * nr_free_zone_pages = managed_pages - high_pages
4472 */ 4472 */
4473 static unsigned long nr_free_zone_pages(int offset) 4473 static unsigned long nr_free_zone_pages(int offset)
4474 { 4474 {
4475 struct zoneref *z; 4475 struct zoneref *z;
4476 struct zone *zone; 4476 struct zone *zone;
4477 4477
4478 /* Just pick one node, since fallback list is circular */ 4478 /* Just pick one node, since fallback list is circular */
4479 unsigned long sum = 0; 4479 unsigned long sum = 0;
4480 4480
4481 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4481 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4482 4482
4483 for_each_zone_zonelist(zone, z, zonelist, offset) { 4483 for_each_zone_zonelist(zone, z, zonelist, offset) {
4484 unsigned long size = zone->managed_pages; 4484 unsigned long size = zone->managed_pages;
4485 unsigned long high = high_wmark_pages(zone); 4485 unsigned long high = high_wmark_pages(zone);
4486 if (size > high) 4486 if (size > high)
4487 sum += size - high; 4487 sum += size - high;
4488 } 4488 }
4489 4489
4490 return sum; 4490 return sum;
4491 } 4491 }
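
/*
 * Worked example for the sum above (made-up numbers): a zone with
 * managed_pages = 100000 and a high watermark of 2500 contributes
 * 100000 - 2500 = 97500 pages, while a zone whose managed_pages do not
 * exceed its high watermark contributes nothing.
 */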
4492 4492
4493 /** 4493 /**
4494 * nr_free_buffer_pages - count number of pages beyond high watermark 4494 * nr_free_buffer_pages - count number of pages beyond high watermark
4495 * 4495 *
4496 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4496 * nr_free_buffer_pages() counts the number of pages which are beyond the high
4497 * watermark within ZONE_DMA and ZONE_NORMAL. 4497 * watermark within ZONE_DMA and ZONE_NORMAL.
4498 */ 4498 */
4499 unsigned long nr_free_buffer_pages(void) 4499 unsigned long nr_free_buffer_pages(void)
4500 { 4500 {
4501 return nr_free_zone_pages(gfp_zone(GFP_USER)); 4501 return nr_free_zone_pages(gfp_zone(GFP_USER));
4502 } 4502 }
4503 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 4503 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4504 4504
4505 /** 4505 /**
4506 * nr_free_pagecache_pages - count number of pages beyond high watermark 4506 * nr_free_pagecache_pages - count number of pages beyond high watermark
4507 * 4507 *
4508 * nr_free_pagecache_pages() counts the number of pages which are beyond the 4508 * nr_free_pagecache_pages() counts the number of pages which are beyond the
4509 * high watermark within all zones. 4509 * high watermark within all zones.
4510 */ 4510 */
4511 unsigned long nr_free_pagecache_pages(void) 4511 unsigned long nr_free_pagecache_pages(void)
4512 { 4512 {
4513 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 4513 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4514 } 4514 }
4515 4515
4516 static inline void show_node(struct zone *zone) 4516 static inline void show_node(struct zone *zone)
4517 { 4517 {
4518 if (IS_ENABLED(CONFIG_NUMA)) 4518 if (IS_ENABLED(CONFIG_NUMA))
4519 printk("Node %d ", zone_to_nid(zone)); 4519 printk("Node %d ", zone_to_nid(zone));
4520 } 4520 }
4521 4521
4522 long si_mem_available(void) 4522 long si_mem_available(void)
4523 { 4523 {
4524 long available; 4524 long available;
4525 unsigned long pagecache; 4525 unsigned long pagecache;
4526 unsigned long wmark_low = 0; 4526 unsigned long wmark_low = 0;
4527 unsigned long pages[NR_LRU_LISTS]; 4527 unsigned long pages[NR_LRU_LISTS];
4528 struct zone *zone; 4528 struct zone *zone;
4529 int lru; 4529 int lru;
4530 4530
4531 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 4531 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
4532 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 4532 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4533 4533
4534 for_each_zone(zone) 4534 for_each_zone(zone)
4535 wmark_low += zone->watermark[WMARK_LOW]; 4535 wmark_low += zone->watermark[WMARK_LOW];
4536 4536
4537 /* 4537 /*
4538 * Estimate the amount of memory available for userspace allocations, 4538 * Estimate the amount of memory available for userspace allocations,
4539 * without causing swapping. 4539 * without causing swapping.
4540 */ 4540 */
4541 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; 4541 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
4542 4542
4543 /* 4543 /*
4544 * Not all the page cache can be freed, otherwise the system will 4544 * Not all the page cache can be freed, otherwise the system will
4545 * start swapping. Assume at least half of the page cache, or the 4545 * start swapping. Assume at least half of the page cache, or the
4546 * low watermark worth of cache, needs to stay. 4546 * low watermark worth of cache, needs to stay.
4547 */ 4547 */
4548 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 4548 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
4549 pagecache -= min(pagecache / 2, wmark_low); 4549 pagecache -= min(pagecache / 2, wmark_low);
4550 available += pagecache; 4550 available += pagecache;
4551 4551
4552 /* 4552 /*
4553 * Part of the reclaimable slab consists of items that are in use, 4553 * Part of the reclaimable slab consists of items that are in use,
4554 * and cannot be freed. Cap this estimate at the low watermark. 4554 * and cannot be freed. Cap this estimate at the low watermark.
4555 */ 4555 */
4556 available += global_node_page_state(NR_SLAB_RECLAIMABLE) - 4556 available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
4557 min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, 4557 min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
4558 wmark_low); 4558 wmark_low);
4559 4559
4560 /* 4560 /*
4561 * Part of the kernel memory, which can be released under memory 4561 * Part of the kernel memory, which can be released under memory
4562 * pressure. 4562 * pressure.
4563 */ 4563 */
4564 available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> 4564 available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
4565 PAGE_SHIFT; 4565 PAGE_SHIFT;
4566 4566
4567 if (available < 0) 4567 if (available < 0)
4568 available = 0; 4568 available = 0;
4569 return available; 4569 return available;
4570 } 4570 }
4571 EXPORT_SYMBOL_GPL(si_mem_available); 4571 EXPORT_SYMBOL_GPL(si_mem_available);
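
/*
 * Illustrative sketch (not part of this file): si_mem_available() is the
 * estimate behind MemAvailable in /proc/meminfo; an in-kernel user could
 * report it in kilobytes like this. example_log_available() is
 * hypothetical.
 */
static void example_log_available(void)
{
	long pages = si_mem_available();

	pr_info("estimated %ld kB usable without swapping\n",
		pages << (PAGE_SHIFT - 10));
}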
4572 4572
4573 void si_meminfo(struct sysinfo *val) 4573 void si_meminfo(struct sysinfo *val)
4574 { 4574 {
4575 val->totalram = totalram_pages; 4575 val->totalram = totalram_pages;
4576 val->sharedram = global_node_page_state(NR_SHMEM); 4576 val->sharedram = global_node_page_state(NR_SHMEM);
4577 val->freeram = global_zone_page_state(NR_FREE_PAGES); 4577 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4578 val->bufferram = nr_blockdev_pages(); 4578 val->bufferram = nr_blockdev_pages();
4579 val->totalhigh = totalhigh_pages; 4579 val->totalhigh = totalhigh_pages;
4580 val->freehigh = nr_free_highpages(); 4580 val->freehigh = nr_free_highpages();
4581 val->mem_unit = PAGE_SIZE; 4581 val->mem_unit = PAGE_SIZE;
4582 } 4582 }
4583 4583
4584 EXPORT_SYMBOL(si_meminfo); 4584 EXPORT_SYMBOL(si_meminfo);
4585 4585
4586 #ifdef CONFIG_NUMA 4586 #ifdef CONFIG_NUMA
4587 void si_meminfo_node(struct sysinfo *val, int nid) 4587 void si_meminfo_node(struct sysinfo *val, int nid)
4588 { 4588 {
4589 int zone_type; /* needs to be signed */ 4589 int zone_type; /* needs to be signed */
4590 unsigned long managed_pages = 0; 4590 unsigned long managed_pages = 0;
4591 unsigned long managed_highpages = 0; 4591 unsigned long managed_highpages = 0;
4592 unsigned long free_highpages = 0; 4592 unsigned long free_highpages = 0;
4593 pg_data_t *pgdat = NODE_DATA(nid); 4593 pg_data_t *pgdat = NODE_DATA(nid);
4594 4594
4595 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4595 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4596 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4596 managed_pages += pgdat->node_zones[zone_type].managed_pages;
4597 val->totalram = managed_pages; 4597 val->totalram = managed_pages;
4598 val->sharedram = node_page_state(pgdat, NR_SHMEM); 4598 val->sharedram = node_page_state(pgdat, NR_SHMEM);
4599 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 4599 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
4600 #ifdef CONFIG_HIGHMEM 4600 #ifdef CONFIG_HIGHMEM
4601 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 4601 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
4602 struct zone *zone = &pgdat->node_zones[zone_type]; 4602 struct zone *zone = &pgdat->node_zones[zone_type];
4603 4603
4604 if (is_highmem(zone)) { 4604 if (is_highmem(zone)) {
4605 managed_highpages += zone->managed_pages; 4605 managed_highpages += zone->managed_pages;
4606 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 4606 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4607 } 4607 }
4608 } 4608 }
4609 val->totalhigh = managed_highpages; 4609 val->totalhigh = managed_highpages;
4610 val->freehigh = free_highpages; 4610 val->freehigh = free_highpages;
4611 #else 4611 #else
4612 val->totalhigh = managed_highpages; 4612 val->totalhigh = managed_highpages;
4613 val->freehigh = free_highpages; 4613 val->freehigh = free_highpages;
4614 #endif 4614 #endif
4615 val->mem_unit = PAGE_SIZE; 4615 val->mem_unit = PAGE_SIZE;
4616 } 4616 }
4617 #endif 4617 #endif
4618 4618
4619 /* 4619 /*
4620 * Determine whether the node should be displayed or not, depending on whether 4620 * Determine whether the node should be displayed or not, depending on whether
4621 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 4621 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4622 */ 4622 */
4623 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 4623 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
4624 { 4624 {
4625 if (!(flags & SHOW_MEM_FILTER_NODES)) 4625 if (!(flags & SHOW_MEM_FILTER_NODES))
4626 return false; 4626 return false;
4627 4627
4628 /* 4628 /*
4629 * No nodemask - i.e. the implicit memory NUMA policy. Do not bother 4629 * No nodemask - i.e. the implicit memory NUMA policy. Do not bother
4630 * with the synchronization (read_mems_allowed_begin) because we do not 4630 * with the synchronization (read_mems_allowed_begin) because we do not
4631 * have to be precise here. 4631 * have to be precise here.
4632 */ 4632 */
4633 if (!nodemask) 4633 if (!nodemask)
4634 nodemask = &cpuset_current_mems_allowed; 4634 nodemask = &cpuset_current_mems_allowed;
4635 4635
4636 return !node_isset(nid, *nodemask); 4636 return !node_isset(nid, *nodemask);
4637 } 4637 }
4638 4638
4639 #define K(x) ((x) << (PAGE_SHIFT-10)) 4639 #define K(x) ((x) << (PAGE_SHIFT-10))
4640 4640
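K() converts a page count to kilobytes by shifting by PAGE_SHIFT - 10. A standalone illustration, assuming 4 KiB pages (PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumed 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))  /* pages -> KiB */

int main(void)
{
        unsigned long pages = 300;
        printf("%lu pages = %lukB\n", pages, K(pages));  /* 300 pages = 1200kB */
        return 0;
}
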
4641 static void show_migration_types(unsigned char type) 4641 static void show_migration_types(unsigned char type)
4642 { 4642 {
4643 static const char types[MIGRATE_TYPES] = { 4643 static const char types[MIGRATE_TYPES] = {
4644 [MIGRATE_UNMOVABLE] = 'U', 4644 [MIGRATE_UNMOVABLE] = 'U',
4645 [MIGRATE_MOVABLE] = 'M', 4645 [MIGRATE_MOVABLE] = 'M',
4646 [MIGRATE_RECLAIMABLE] = 'E', 4646 [MIGRATE_RECLAIMABLE] = 'E',
4647 [MIGRATE_HIGHATOMIC] = 'H', 4647 [MIGRATE_HIGHATOMIC] = 'H',
4648 #ifdef CONFIG_CMA 4648 #ifdef CONFIG_CMA
4649 [MIGRATE_CMA] = 'C', 4649 [MIGRATE_CMA] = 'C',
4650 #endif 4650 #endif
4651 #ifdef CONFIG_MEMORY_ISOLATION 4651 #ifdef CONFIG_MEMORY_ISOLATION
4652 [MIGRATE_ISOLATE] = 'I', 4652 [MIGRATE_ISOLATE] = 'I',
4653 #endif 4653 #endif
4654 }; 4654 };
4655 char tmp[MIGRATE_TYPES + 1]; 4655 char tmp[MIGRATE_TYPES + 1];
4656 char *p = tmp; 4656 char *p = tmp;
4657 int i; 4657 int i;
4658 4658
4659 for (i = 0; i < MIGRATE_TYPES; i++) { 4659 for (i = 0; i < MIGRATE_TYPES; i++) {
4660 if (type & (1 << i)) 4660 if (type & (1 << i))
4661 *p++ = types[i]; 4661 *p++ = types[i];
4662 } 4662 }
4663 4663
4664 *p = '\0'; 4664 *p = '\0';
4665 printk(KERN_CONT "(%s) ", tmp); 4665 printk(KERN_CONT "(%s) ", tmp);
4666 } 4666 }
4667 4667
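show_migration_types() turns a bitmask of migrate types into one letter per set bit. A trimmed userspace copy (the CMA and memory-isolation entries are left out for brevity) showing what a mask prints as:

#include <stdio.h>

/* Trimmed copy of the table above: only the unconditional entries. */
enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE,
       MIGRATE_HIGHATOMIC, MIGRATE_TYPES };

static void show_migration_types(unsigned char type)
{
        static const char types[MIGRATE_TYPES] = {
                [MIGRATE_UNMOVABLE]   = 'U',
                [MIGRATE_MOVABLE]     = 'M',
                [MIGRATE_RECLAIMABLE] = 'E',
                [MIGRATE_HIGHATOMIC]  = 'H',
        };
        char tmp[MIGRATE_TYPES + 1];
        char *p = tmp;
        int i;

        for (i = 0; i < MIGRATE_TYPES; i++)
                if (type & (1 << i))
                        *p++ = types[i];
        *p = '\0';
        printf("(%s) ", tmp);
}

int main(void)
{
        /* Free lists populated for unmovable and movable pages -> prints "(UM) " */
        show_migration_types((1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE));
        printf("\n");
        return 0;
}
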
4668 /* 4668 /*
4669 * Show free area list (used inside shift_scroll-lock stuff) 4669 * Show free area list (used inside shift_scroll-lock stuff)
4670 * We also calculate the percentage fragmentation. We do this by counting the 4670 * We also calculate the percentage fragmentation. We do this by counting the
4671 * memory on each free list with the exception of the first item on the list. 4671 * memory on each free list with the exception of the first item on the list.
4672 * 4672 *
4673 * Bits in @filter: 4673 * Bits in @filter:
4674 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 4674 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4675 * cpuset. 4675 * cpuset.
4676 */ 4676 */
4677 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 4677 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4678 { 4678 {
4679 unsigned long free_pcp = 0; 4679 unsigned long free_pcp = 0;
4680 int cpu; 4680 int cpu;
4681 struct zone *zone; 4681 struct zone *zone;
4682 pg_data_t *pgdat; 4682 pg_data_t *pgdat;
4683 4683
4684 for_each_populated_zone(zone) { 4684 for_each_populated_zone(zone) {
4685 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4685 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4686 continue; 4686 continue;
4687 4687
4688 for_each_online_cpu(cpu) 4688 for_each_online_cpu(cpu)
4689 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4689 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4690 } 4690 }
4691 4691
4692 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 4692 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
4693 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 4693 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
4694 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 4694 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4695 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 4695 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4696 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 4696 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4697 " free:%lu free_pcp:%lu free_cma:%lu\n", 4697 " free:%lu free_pcp:%lu free_cma:%lu\n",
4698 global_node_page_state(NR_ACTIVE_ANON), 4698 global_node_page_state(NR_ACTIVE_ANON),
4699 global_node_page_state(NR_INACTIVE_ANON), 4699 global_node_page_state(NR_INACTIVE_ANON),
4700 global_node_page_state(NR_ISOLATED_ANON), 4700 global_node_page_state(NR_ISOLATED_ANON),
4701 global_node_page_state(NR_ACTIVE_FILE), 4701 global_node_page_state(NR_ACTIVE_FILE),
4702 global_node_page_state(NR_INACTIVE_FILE), 4702 global_node_page_state(NR_INACTIVE_FILE),
4703 global_node_page_state(NR_ISOLATED_FILE), 4703 global_node_page_state(NR_ISOLATED_FILE),
4704 global_node_page_state(NR_UNEVICTABLE), 4704 global_node_page_state(NR_UNEVICTABLE),
4705 global_node_page_state(NR_FILE_DIRTY), 4705 global_node_page_state(NR_FILE_DIRTY),
4706 global_node_page_state(NR_WRITEBACK), 4706 global_node_page_state(NR_WRITEBACK),
4707 global_node_page_state(NR_UNSTABLE_NFS), 4707 global_node_page_state(NR_UNSTABLE_NFS),
4708 global_node_page_state(NR_SLAB_RECLAIMABLE), 4708 global_node_page_state(NR_SLAB_RECLAIMABLE),
4709 global_node_page_state(NR_SLAB_UNRECLAIMABLE), 4709 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
4710 global_node_page_state(NR_FILE_MAPPED), 4710 global_node_page_state(NR_FILE_MAPPED),
4711 global_node_page_state(NR_SHMEM), 4711 global_node_page_state(NR_SHMEM),
4712 global_zone_page_state(NR_PAGETABLE), 4712 global_zone_page_state(NR_PAGETABLE),
4713 global_zone_page_state(NR_BOUNCE), 4713 global_zone_page_state(NR_BOUNCE),
4714 global_zone_page_state(NR_FREE_PAGES), 4714 global_zone_page_state(NR_FREE_PAGES),
4715 free_pcp, 4715 free_pcp,
4716 global_zone_page_state(NR_FREE_CMA_PAGES)); 4716 global_zone_page_state(NR_FREE_CMA_PAGES));
4717 4717
4718 for_each_online_pgdat(pgdat) { 4718 for_each_online_pgdat(pgdat) {
4719 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 4719 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
4720 continue; 4720 continue;
4721 4721
4722 printk("Node %d" 4722 printk("Node %d"
4723 " active_anon:%lukB" 4723 " active_anon:%lukB"
4724 " inactive_anon:%lukB" 4724 " inactive_anon:%lukB"
4725 " active_file:%lukB" 4725 " active_file:%lukB"
4726 " inactive_file:%lukB" 4726 " inactive_file:%lukB"
4727 " unevictable:%lukB" 4727 " unevictable:%lukB"
4728 " isolated(anon):%lukB" 4728 " isolated(anon):%lukB"
4729 " isolated(file):%lukB" 4729 " isolated(file):%lukB"
4730 " mapped:%lukB" 4730 " mapped:%lukB"
4731 " dirty:%lukB" 4731 " dirty:%lukB"
4732 " writeback:%lukB" 4732 " writeback:%lukB"
4733 " shmem:%lukB" 4733 " shmem:%lukB"
4734 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4734 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4735 " shmem_thp: %lukB" 4735 " shmem_thp: %lukB"
4736 " shmem_pmdmapped: %lukB" 4736 " shmem_pmdmapped: %lukB"
4737 " anon_thp: %lukB" 4737 " anon_thp: %lukB"
4738 #endif 4738 #endif
4739 " writeback_tmp:%lukB" 4739 " writeback_tmp:%lukB"
4740 " unstable:%lukB" 4740 " unstable:%lukB"
4741 " all_unreclaimable? %s" 4741 " all_unreclaimable? %s"
4742 "\n", 4742 "\n",
4743 pgdat->node_id, 4743 pgdat->node_id,
4744 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 4744 K(node_page_state(pgdat, NR_ACTIVE_ANON)),
4745 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 4745 K(node_page_state(pgdat, NR_INACTIVE_ANON)),
4746 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 4746 K(node_page_state(pgdat, NR_ACTIVE_FILE)),
4747 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 4747 K(node_page_state(pgdat, NR_INACTIVE_FILE)),
4748 K(node_page_state(pgdat, NR_UNEVICTABLE)), 4748 K(node_page_state(pgdat, NR_UNEVICTABLE)),
4749 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 4749 K(node_page_state(pgdat, NR_ISOLATED_ANON)),
4750 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 4750 K(node_page_state(pgdat, NR_ISOLATED_FILE)),
4751 K(node_page_state(pgdat, NR_FILE_MAPPED)), 4751 K(node_page_state(pgdat, NR_FILE_MAPPED)),
4752 K(node_page_state(pgdat, NR_FILE_DIRTY)), 4752 K(node_page_state(pgdat, NR_FILE_DIRTY)),
4753 K(node_page_state(pgdat, NR_WRITEBACK)), 4753 K(node_page_state(pgdat, NR_WRITEBACK)),
4754 K(node_page_state(pgdat, NR_SHMEM)), 4754 K(node_page_state(pgdat, NR_SHMEM)),
4755 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4755 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4756 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), 4756 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4757 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) 4757 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
4758 * HPAGE_PMD_NR), 4758 * HPAGE_PMD_NR),
4759 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), 4759 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
4760 #endif 4760 #endif
4761 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 4761 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
4762 K(node_page_state(pgdat, NR_UNSTABLE_NFS)), 4762 K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
4763 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 4763 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
4764 "yes" : "no"); 4764 "yes" : "no");
4765 } 4765 }
4766 4766
4767 for_each_populated_zone(zone) { 4767 for_each_populated_zone(zone) {
4768 int i; 4768 int i;
4769 4769
4770 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4770 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4771 continue; 4771 continue;
4772 4772
4773 free_pcp = 0; 4773 free_pcp = 0;
4774 for_each_online_cpu(cpu) 4774 for_each_online_cpu(cpu)
4775 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4775 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4776 4776
4777 show_node(zone); 4777 show_node(zone);
4778 printk(KERN_CONT 4778 printk(KERN_CONT
4779 "%s" 4779 "%s"
4780 " free:%lukB" 4780 " free:%lukB"
4781 " min:%lukB" 4781 " min:%lukB"
4782 " low:%lukB" 4782 " low:%lukB"
4783 " high:%lukB" 4783 " high:%lukB"
4784 " active_anon:%lukB" 4784 " active_anon:%lukB"
4785 " inactive_anon:%lukB" 4785 " inactive_anon:%lukB"
4786 " active_file:%lukB" 4786 " active_file:%lukB"
4787 " inactive_file:%lukB" 4787 " inactive_file:%lukB"
4788 " unevictable:%lukB" 4788 " unevictable:%lukB"
4789 " writepending:%lukB" 4789 " writepending:%lukB"
4790 " present:%lukB" 4790 " present:%lukB"
4791 " managed:%lukB" 4791 " managed:%lukB"
4792 " mlocked:%lukB" 4792 " mlocked:%lukB"
4793 " kernel_stack:%lukB" 4793 " kernel_stack:%lukB"
4794 " pagetables:%lukB" 4794 " pagetables:%lukB"
4795 " bounce:%lukB" 4795 " bounce:%lukB"
4796 " free_pcp:%lukB" 4796 " free_pcp:%lukB"
4797 " local_pcp:%ukB" 4797 " local_pcp:%ukB"
4798 " free_cma:%lukB" 4798 " free_cma:%lukB"
4799 "\n", 4799 "\n",
4800 zone->name, 4800 zone->name,
4801 K(zone_page_state(zone, NR_FREE_PAGES)), 4801 K(zone_page_state(zone, NR_FREE_PAGES)),
4802 K(min_wmark_pages(zone)), 4802 K(min_wmark_pages(zone)),
4803 K(low_wmark_pages(zone)), 4803 K(low_wmark_pages(zone)),
4804 K(high_wmark_pages(zone)), 4804 K(high_wmark_pages(zone)),
4805 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 4805 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
4806 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 4806 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
4807 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 4807 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
4808 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 4808 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
4809 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 4809 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4810 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 4810 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4811 K(zone->present_pages), 4811 K(zone->present_pages),
4812 K(zone->managed_pages), 4812 K(zone->managed_pages),
4813 K(zone_page_state(zone, NR_MLOCK)), 4813 K(zone_page_state(zone, NR_MLOCK)),
4814 zone_page_state(zone, NR_KERNEL_STACK_KB), 4814 zone_page_state(zone, NR_KERNEL_STACK_KB),
4815 K(zone_page_state(zone, NR_PAGETABLE)), 4815 K(zone_page_state(zone, NR_PAGETABLE)),
4816 K(zone_page_state(zone, NR_BOUNCE)), 4816 K(zone_page_state(zone, NR_BOUNCE)),
4817 K(free_pcp), 4817 K(free_pcp),
4818 K(this_cpu_read(zone->pageset->pcp.count)), 4818 K(this_cpu_read(zone->pageset->pcp.count)),
4819 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 4819 K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
4820 printk("lowmem_reserve[]:"); 4820 printk("lowmem_reserve[]:");
4821 for (i = 0; i < MAX_NR_ZONES; i++) 4821 for (i = 0; i < MAX_NR_ZONES; i++)
4822 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 4822 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
4823 printk(KERN_CONT "\n"); 4823 printk(KERN_CONT "\n");
4824 } 4824 }
4825 4825
4826 for_each_populated_zone(zone) { 4826 for_each_populated_zone(zone) {
4827 unsigned int order; 4827 unsigned int order;
4828 unsigned long nr[MAX_ORDER], flags, total = 0; 4828 unsigned long nr[MAX_ORDER], flags, total = 0;
4829 unsigned char types[MAX_ORDER]; 4829 unsigned char types[MAX_ORDER];
4830 4830
4831 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4831 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4832 continue; 4832 continue;
4833 show_node(zone); 4833 show_node(zone);
4834 printk(KERN_CONT "%s: ", zone->name); 4834 printk(KERN_CONT "%s: ", zone->name);
4835 4835
4836 spin_lock_irqsave(&zone->lock, flags); 4836 spin_lock_irqsave(&zone->lock, flags);
4837 for (order = 0; order < MAX_ORDER; order++) { 4837 for (order = 0; order < MAX_ORDER; order++) {
4838 struct free_area *area = &zone->free_area[order]; 4838 struct free_area *area = &zone->free_area[order];
4839 int type; 4839 int type;
4840 4840
4841 nr[order] = area->nr_free; 4841 nr[order] = area->nr_free;
4842 total += nr[order] << order; 4842 total += nr[order] << order;
4843 4843
4844 types[order] = 0; 4844 types[order] = 0;
4845 for (type = 0; type < MIGRATE_TYPES; type++) { 4845 for (type = 0; type < MIGRATE_TYPES; type++) {
4846 if (!list_empty(&area->free_list[type])) 4846 if (!list_empty(&area->free_list[type]))
4847 types[order] |= 1 << type; 4847 types[order] |= 1 << type;
4848 } 4848 }
4849 } 4849 }
4850 spin_unlock_irqrestore(&zone->lock, flags); 4850 spin_unlock_irqrestore(&zone->lock, flags);
4851 for (order = 0; order < MAX_ORDER; order++) { 4851 for (order = 0; order < MAX_ORDER; order++) {
4852 printk(KERN_CONT "%lu*%lukB ", 4852 printk(KERN_CONT "%lu*%lukB ",
4853 nr[order], K(1UL) << order); 4853 nr[order], K(1UL) << order);
4854 if (nr[order]) 4854 if (nr[order])
4855 show_migration_types(types[order]); 4855 show_migration_types(types[order]);
4856 } 4856 }
4857 printk(KERN_CONT "= %lukB\n", K(total)); 4857 printk(KERN_CONT "= %lukB\n", K(total));
4858 } 4858 }
4859 4859
4860 hugetlb_show_meminfo(); 4860 hugetlb_show_meminfo();
4861 4861
4862 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 4862 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
4863 4863
4864 show_swap_cache_info(); 4864 show_swap_cache_info();
4865 } 4865 }
4866 4866
4867 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 4867 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4868 { 4868 {
4869 zoneref->zone = zone; 4869 zoneref->zone = zone;
4870 zoneref->zone_idx = zone_idx(zone); 4870 zoneref->zone_idx = zone_idx(zone);
4871 } 4871 }
4872 4872
4873 /* 4873 /*
4874 * Builds allocation fallback zone lists. 4874 * Builds allocation fallback zone lists.
4875 * 4875 *
4876 * Add all populated zones of a node to the zonelist. 4876 * Add all populated zones of a node to the zonelist.
4877 */ 4877 */
4878 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 4878 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4879 { 4879 {
4880 struct zone *zone; 4880 struct zone *zone;
4881 enum zone_type zone_type = MAX_NR_ZONES; 4881 enum zone_type zone_type = MAX_NR_ZONES;
4882 int nr_zones = 0; 4882 int nr_zones = 0;
4883 4883
4884 do { 4884 do {
4885 zone_type--; 4885 zone_type--;
4886 zone = pgdat->node_zones + zone_type; 4886 zone = pgdat->node_zones + zone_type;
4887 if (managed_zone(zone)) { 4887 if (managed_zone(zone)) {
4888 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 4888 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4889 check_highest_zone(zone_type); 4889 check_highest_zone(zone_type);
4890 } 4890 }
4891 } while (zone_type); 4891 } while (zone_type);
4892 4892
4893 return nr_zones; 4893 return nr_zones;
4894 } 4894 }
4895 4895
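build_zonerefs_node() walks a node's zones from the highest index downwards and appends only the managed ones, so higher zones (e.g. Normal) come before lower ones (e.g. DMA) in the fallback list. A sketch with a made-up three-zone node, one of them empty:

#include <stdio.h>

/* Hypothetical zone layout, highest index last, as in the kernel's zone enum. */
static const char *zone_names[] = { "DMA", "DMA32", "Normal" };
static int zone_managed[]       = { 1,     0,       1 };      /* DMA32 empty */
#define MAX_NR_ZONES 3

int main(void)
{
        int zone_type = MAX_NR_ZONES;
        int nr_zones = 0;

        /* Walk from the highest zone down, skipping unmanaged zones. */
        do {
                zone_type--;
                if (zone_managed[zone_type])
                        printf("zoneref %d -> %s\n", nr_zones++,
                               zone_names[zone_type]);
        } while (zone_type);

        return 0;   /* prints: zoneref 0 -> Normal, zoneref 1 -> DMA */
}
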
4896 #ifdef CONFIG_NUMA 4896 #ifdef CONFIG_NUMA
4897 4897
4898 static int __parse_numa_zonelist_order(char *s) 4898 static int __parse_numa_zonelist_order(char *s)
4899 { 4899 {
4900 /* 4900 /*
4901 * We used to support different zonelist modes, but they turned 4901 * We used to support different zonelist modes, but they turned
4902 * out to be just not useful. Let's keep the warning in place 4902 * out to be just not useful. Let's keep the warning in place
4903 * if somebody still uses the cmd line parameter so that we do 4903 * if somebody still uses the cmd line parameter so that we do
4904 * not fail it silently. 4904 * not fail it silently.
4905 */ 4905 */
4906 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 4906 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4907 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 4907 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4908 return -EINVAL; 4908 return -EINVAL;
4909 } 4909 }
4910 return 0; 4910 return 0;
4911 } 4911 }
4912 4912
4913 static __init int setup_numa_zonelist_order(char *s) 4913 static __init int setup_numa_zonelist_order(char *s)
4914 { 4914 {
4915 if (!s) 4915 if (!s)
4916 return 0; 4916 return 0;
4917 4917
4918 return __parse_numa_zonelist_order(s); 4918 return __parse_numa_zonelist_order(s);
4919 } 4919 }
4920 early_param("numa_zonelist_order", setup_numa_zonelist_order); 4920 early_param("numa_zonelist_order", setup_numa_zonelist_order);
4921 4921
4922 char numa_zonelist_order[] = "Node"; 4922 char numa_zonelist_order[] = "Node";
4923 4923
4924 /* 4924 /*
4925 * sysctl handler for numa_zonelist_order 4925 * sysctl handler for numa_zonelist_order
4926 */ 4926 */
4927 int numa_zonelist_order_handler(struct ctl_table *table, int write, 4927 int numa_zonelist_order_handler(struct ctl_table *table, int write,
4928 void __user *buffer, size_t *length, 4928 void __user *buffer, size_t *length,
4929 loff_t *ppos) 4929 loff_t *ppos)
4930 { 4930 {
4931 char *str; 4931 char *str;
4932 int ret; 4932 int ret;
4933 4933
4934 if (!write) 4934 if (!write)
4935 return proc_dostring(table, write, buffer, length, ppos); 4935 return proc_dostring(table, write, buffer, length, ppos);
4936 str = memdup_user_nul(buffer, 16); 4936 str = memdup_user_nul(buffer, 16);
4937 if (IS_ERR(str)) 4937 if (IS_ERR(str))
4938 return PTR_ERR(str); 4938 return PTR_ERR(str);
4939 4939
4940 ret = __parse_numa_zonelist_order(str); 4940 ret = __parse_numa_zonelist_order(str);
4941 kfree(str); 4941 kfree(str);
4942 return ret; 4942 return ret;
4943 } 4943 }
4944 4944
4945 4945
4946 #define MAX_NODE_LOAD (nr_online_nodes) 4946 #define MAX_NODE_LOAD (nr_online_nodes)
4947 static int node_load[MAX_NUMNODES]; 4947 static int node_load[MAX_NUMNODES];
4948 4948
4949 /** 4949 /**
4950 * find_next_best_node - find the next node that should appear in a given node's fallback list 4950 * find_next_best_node - find the next node that should appear in a given node's fallback list
4951 * @node: node whose fallback list we're appending 4951 * @node: node whose fallback list we're appending
4952 * @used_node_mask: nodemask_t of already used nodes 4952 * @used_node_mask: nodemask_t of already used nodes
4953 * 4953 *
4954 * We use a number of factors to determine which is the next node that should 4954 * We use a number of factors to determine which is the next node that should
4955 * appear on a given node's fallback list. The node should not have appeared 4955 * appear on a given node's fallback list. The node should not have appeared
4956 * already in @node's fallback list, and it should be the next closest node 4956 * already in @node's fallback list, and it should be the next closest node
4957 * according to the distance array (which contains arbitrary distance values 4957 * according to the distance array (which contains arbitrary distance values
4958 * from each node to each node in the system), and should also prefer nodes 4958 * from each node to each node in the system), and should also prefer nodes
4959 * with no CPUs, since presumably they'll have very little allocation pressure 4959 * with no CPUs, since presumably they'll have very little allocation pressure
4960 * on them otherwise. 4960 * on them otherwise.
4961 * It returns -1 if no node is found. 4961 * It returns -1 if no node is found.
4962 */ 4962 */
4963 static int find_next_best_node(int node, nodemask_t *used_node_mask) 4963 static int find_next_best_node(int node, nodemask_t *used_node_mask)
4964 { 4964 {
4965 int n, val; 4965 int n, val;
4966 int min_val = INT_MAX; 4966 int min_val = INT_MAX;
4967 int best_node = NUMA_NO_NODE; 4967 int best_node = NUMA_NO_NODE;
4968 const struct cpumask *tmp = cpumask_of_node(0); 4968 const struct cpumask *tmp = cpumask_of_node(0);
4969 4969
4970 /* Use the local node if we haven't already */ 4970 /* Use the local node if we haven't already */
4971 if (!node_isset(node, *used_node_mask)) { 4971 if (!node_isset(node, *used_node_mask)) {
4972 node_set(node, *used_node_mask); 4972 node_set(node, *used_node_mask);
4973 return node; 4973 return node;
4974 } 4974 }
4975 4975
4976 for_each_node_state(n, N_MEMORY) { 4976 for_each_node_state(n, N_MEMORY) {
4977 4977
4978 /* Don't want a node to appear more than once */ 4978 /* Don't want a node to appear more than once */
4979 if (node_isset(n, *used_node_mask)) 4979 if (node_isset(n, *used_node_mask))
4980 continue; 4980 continue;
4981 4981
4982 /* Use the distance array to find the distance */ 4982 /* Use the distance array to find the distance */
4983 val = node_distance(node, n); 4983 val = node_distance(node, n);
4984 4984
4985 /* Penalize nodes under us ("prefer the next node") */ 4985 /* Penalize nodes under us ("prefer the next node") */
4986 val += (n < node); 4986 val += (n < node);
4987 4987
4988 /* Give preference to headless and unused nodes */ 4988 /* Give preference to headless and unused nodes */
4989 tmp = cpumask_of_node(n); 4989 tmp = cpumask_of_node(n);
4990 if (!cpumask_empty(tmp)) 4990 if (!cpumask_empty(tmp))
4991 val += PENALTY_FOR_NODE_WITH_CPUS; 4991 val += PENALTY_FOR_NODE_WITH_CPUS;
4992 4992
4993 /* Slight preference for less loaded node */ 4993 /* Slight preference for less loaded node */
4994 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 4994 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
4995 val += node_load[n]; 4995 val += node_load[n];
4996 4996
4997 if (val < min_val) { 4997 if (val < min_val) {
4998 min_val = val; 4998 min_val = val;
4999 best_node = n; 4999 best_node = n;
5000 } 5000 }
5001 } 5001 }
5002 5002
5003 if (best_node >= 0) 5003 if (best_node >= 0)
5004 node_set(best_node, *used_node_mask); 5004 node_set(best_node, *used_node_mask);
5005 5005
5006 return best_node; 5006 return best_node;
5007 } 5007 }
5008 5008
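find_next_best_node() scores each candidate by NUMA distance, a +1 nudge toward higher-numbered nodes, a penalty for nodes that have CPUs, and the accumulated node_load, with distance scaled up so it dominates. A toy evaluation with an invented three-node distance table (the penalty of 1 mirrors the kernel's PENALTY_FOR_NODE_WITH_CPUS):

#include <stdio.h>
#include <limits.h>

#define MAX_NUMNODES  3
#define MAX_NODE_LOAD MAX_NUMNODES      /* stands in for nr_online_nodes */
#define PENALTY_FOR_NODE_WITH_CPUS 1    /* small penalty, as in the kernel */

/* Invented SLIT-style distances: 10 = local, 20 = one hop. */
static int distance[MAX_NUMNODES][MAX_NUMNODES] = {
        { 10, 20, 20 },
        { 20, 10, 20 },
        { 20, 20, 10 },
};
static int node_load[MAX_NUMNODES];                     /* all zero here */
static int node_has_cpus[MAX_NUMNODES] = { 1, 1, 0 };   /* node 2 is memory-only */

int main(void)
{
        int node = 0, best = -1, min_val = INT_MAX;

        for (int n = 0; n < MAX_NUMNODES; n++) {
                if (n == node)                  /* local node already used */
                        continue;
                int val = distance[node][n];
                val += (n < node);                              /* prefer the next node */
                if (node_has_cpus[n])
                        val += PENALTY_FOR_NODE_WITH_CPUS;      /* prefer headless nodes */
                val *= MAX_NODE_LOAD * MAX_NUMNODES;            /* distance dominates */
                val += node_load[n];                            /* slight load preference */
                if (val < min_val) {
                        min_val = val;
                        best = n;
                }
        }
        /* Headless node 2 wins over node 1 at equal distance. */
        printf("next best node after node %d: node %d (score %d)\n",
               node, best, min_val);
        return 0;
}
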
5009 5009
5010 /* 5010 /*
5011 * Build zonelists ordered by node and zones within node. 5011 * Build zonelists ordered by node and zones within node.
5012 * This results in maximum locality--normal zone overflows into local 5012 * This results in maximum locality--normal zone overflows into local
5013 * DMA zone, if any--but risks exhausting DMA zone. 5013 * DMA zone, if any--but risks exhausting DMA zone.
5014 */ 5014 */
5015 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 5015 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5016 unsigned nr_nodes) 5016 unsigned nr_nodes)
5017 { 5017 {
5018 struct zoneref *zonerefs; 5018 struct zoneref *zonerefs;
5019 int i; 5019 int i;
5020 5020
5021 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5021 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5022 5022
5023 for (i = 0; i < nr_nodes; i++) { 5023 for (i = 0; i < nr_nodes; i++) {
5024 int nr_zones; 5024 int nr_zones;
5025 5025
5026 pg_data_t *node = NODE_DATA(node_order[i]); 5026 pg_data_t *node = NODE_DATA(node_order[i]);
5027 5027
5028 nr_zones = build_zonerefs_node(node, zonerefs); 5028 nr_zones = build_zonerefs_node(node, zonerefs);
5029 zonerefs += nr_zones; 5029 zonerefs += nr_zones;
5030 } 5030 }
5031 zonerefs->zone = NULL; 5031 zonerefs->zone = NULL;
5032 zonerefs->zone_idx = 0; 5032 zonerefs->zone_idx = 0;
5033 } 5033 }
5034 5034
5035 /* 5035 /*
5036 * Build gfp_thisnode zonelists 5036 * Build gfp_thisnode zonelists
5037 */ 5037 */
5038 static void build_thisnode_zonelists(pg_data_t *pgdat) 5038 static void build_thisnode_zonelists(pg_data_t *pgdat)
5039 { 5039 {
5040 struct zoneref *zonerefs; 5040 struct zoneref *zonerefs;
5041 int nr_zones; 5041 int nr_zones;
5042 5042
5043 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 5043 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5044 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5044 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5045 zonerefs += nr_zones; 5045 zonerefs += nr_zones;
5046 zonerefs->zone = NULL; 5046 zonerefs->zone = NULL;
5047 zonerefs->zone_idx = 0; 5047 zonerefs->zone_idx = 0;
5048 } 5048 }
5049 5049
5050 /* 5050 /*
5051 * Build zonelists ordered by zone and nodes within zones. 5051 * Build zonelists ordered by zone and nodes within zones.
5052 * This results in conserving DMA zone[s] until all Normal memory is 5052 * This results in conserving DMA zone[s] until all Normal memory is
5053 * exhausted, but results in overflowing to remote node while memory 5053 * exhausted, but results in overflowing to remote node while memory
5054 * may still exist in local DMA zone. 5054 * may still exist in local DMA zone.
5055 */ 5055 */
5056 5056
5057 static void build_zonelists(pg_data_t *pgdat) 5057 static void build_zonelists(pg_data_t *pgdat)
5058 { 5058 {
5059 static int node_order[MAX_NUMNODES]; 5059 static int node_order[MAX_NUMNODES];
5060 int node, load, nr_nodes = 0; 5060 int node, load, nr_nodes = 0;
5061 nodemask_t used_mask; 5061 nodemask_t used_mask;
5062 int local_node, prev_node; 5062 int local_node, prev_node;
5063 5063
5064 /* NUMA-aware ordering of nodes */ 5064 /* NUMA-aware ordering of nodes */
5065 local_node = pgdat->node_id; 5065 local_node = pgdat->node_id;
5066 load = nr_online_nodes; 5066 load = nr_online_nodes;
5067 prev_node = local_node; 5067 prev_node = local_node;
5068 nodes_clear(used_mask); 5068 nodes_clear(used_mask);
5069 5069
5070 memset(node_order, 0, sizeof(node_order)); 5070 memset(node_order, 0, sizeof(node_order));
5071 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5071 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5072 /* 5072 /*
5073 * We don't want to pressure a particular node. 5073 * We don't want to pressure a particular node.
5074 * So add a penalty to the first node in the same 5074 * So add a penalty to the first node in the same
5075 * distance group to make the ordering round-robin. 5075 * distance group to make the ordering round-robin.
5076 */ 5076 */
5077 if (node_distance(local_node, node) != 5077 if (node_distance(local_node, node) !=
5078 node_distance(local_node, prev_node)) 5078 node_distance(local_node, prev_node))
5079 node_load[node] = load; 5079 node_load[node] = load;
5080 5080
5081 node_order[nr_nodes++] = node; 5081 node_order[nr_nodes++] = node;
5082 prev_node = node; 5082 prev_node = node;
5083 load--; 5083 load--;
5084 } 5084 }
5085 5085
5086 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 5086 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5087 build_thisnode_zonelists(pgdat); 5087 build_thisnode_zonelists(pgdat);
5088 } 5088 }
5089 5089
5090 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5090 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5091 /* 5091 /*
5092 * Return node id of node used for "local" allocations. 5092 * Return node id of node used for "local" allocations.
5093 * I.e., first node id of first zone in arg node's generic zonelist. 5093 * I.e., first node id of first zone in arg node's generic zonelist.
5094 * Used for initializing percpu 'numa_mem', which is used primarily 5094 * Used for initializing percpu 'numa_mem', which is used primarily
5095 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5095 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5096 */ 5096 */
5097 int local_memory_node(int node) 5097 int local_memory_node(int node)
5098 { 5098 {
5099 struct zoneref *z; 5099 struct zoneref *z;
5100 5100
5101 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5101 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5102 gfp_zone(GFP_KERNEL), 5102 gfp_zone(GFP_KERNEL),
5103 NULL); 5103 NULL);
5104 return z->zone->node; 5104 return z->zone->node;
5105 } 5105 }
5106 #endif 5106 #endif
5107 5107
5108 static void setup_min_unmapped_ratio(void); 5108 static void setup_min_unmapped_ratio(void);
5109 static void setup_min_slab_ratio(void); 5109 static void setup_min_slab_ratio(void);
5110 #else /* CONFIG_NUMA */ 5110 #else /* CONFIG_NUMA */
5111 5111
5112 static void build_zonelists(pg_data_t *pgdat) 5112 static void build_zonelists(pg_data_t *pgdat)
5113 { 5113 {
5114 int node, local_node; 5114 int node, local_node;
5115 struct zoneref *zonerefs; 5115 struct zoneref *zonerefs;
5116 int nr_zones; 5116 int nr_zones;
5117 5117
5118 local_node = pgdat->node_id; 5118 local_node = pgdat->node_id;
5119 5119
5120 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5120 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5121 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5121 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5122 zonerefs += nr_zones; 5122 zonerefs += nr_zones;
5123 5123
5124 /* 5124 /*
5125 * Now we build the zonelist so that it contains the zones 5125 * Now we build the zonelist so that it contains the zones
5126 * of all the other nodes. 5126 * of all the other nodes.
5127 * We don't want to pressure a particular node, so when 5127 * We don't want to pressure a particular node, so when
5128 * building the zones for node N, we make sure that the 5128 * building the zones for node N, we make sure that the
5129 * zones coming right after the local ones are those from 5129 * zones coming right after the local ones are those from
5130 * node N+1 (modulo N) 5130 * node N+1 (modulo N)
5131 */ 5131 */
5132 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5132 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5133 if (!node_online(node)) 5133 if (!node_online(node))
5134 continue; 5134 continue;
5135 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5135 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5136 zonerefs += nr_zones; 5136 zonerefs += nr_zones;
5137 } 5137 }
5138 for (node = 0; node < local_node; node++) { 5138 for (node = 0; node < local_node; node++) {
5139 if (!node_online(node)) 5139 if (!node_online(node))
5140 continue; 5140 continue;
5141 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5141 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5142 zonerefs += nr_zones; 5142 zonerefs += nr_zones;
5143 } 5143 }
5144 5144
5145 zonerefs->zone = NULL; 5145 zonerefs->zone = NULL;
5146 zonerefs->zone_idx = 0; 5146 zonerefs->zone_idx = 0;
5147 } 5147 }
5148 5148
5149 #endif /* CONFIG_NUMA */ 5149 #endif /* CONFIG_NUMA */
5150 5150
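In the !CONFIG_NUMA build_zonelists() above, remote nodes are appended starting at local_node + 1 and wrapping around to 0, so every node spreads its overflow onto a different neighbour first. A sketch of the resulting order for a hypothetical four-node machine, all nodes assumed online:

#include <stdio.h>

#define MAX_NUMNODES 4   /* hypothetical node count */

int main(void)
{
        int local_node = 2;
        int node;

        printf("fallback node order for node %d: %d", local_node, local_node);
        for (node = local_node + 1; node < MAX_NUMNODES; node++)
                printf(" %d", node);
        for (node = 0; node < local_node; node++)
                printf(" %d", node);
        printf("\n");           /* prints: 2 3 0 1 */
        return 0;
}
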
5151 /* 5151 /*
5152 * Boot pageset table. One per cpu which is going to be used for all 5152 * Boot pageset table. One per cpu which is going to be used for all
5153 * zones and all nodes. The parameters will be set in such a way 5153 * zones and all nodes. The parameters will be set in such a way
5154 * that an item put on a list will immediately be handed over to 5154 * that an item put on a list will immediately be handed over to
5155 * the buddy list. This is safe since pageset manipulation is done 5155 * the buddy list. This is safe since pageset manipulation is done
5156 * with interrupts disabled. 5156 * with interrupts disabled.
5157 * 5157 *
5158 * The boot_pagesets must be kept even after bootup is complete for 5158 * The boot_pagesets must be kept even after bootup is complete for
5159 * unused processors and/or zones. They do play a role for bootstrapping 5159 * unused processors and/or zones. They do play a role for bootstrapping
5160 * hotplugged processors. 5160 * hotplugged processors.
5161 * 5161 *
5162 * zoneinfo_show() and maybe other functions do 5162 * zoneinfo_show() and maybe other functions do
5163 * not check if the processor is online before following the pageset pointer. 5163 * not check if the processor is online before following the pageset pointer.
5164 * Other parts of the kernel may not check if the zone is available. 5164 * Other parts of the kernel may not check if the zone is available.
5165 */ 5165 */
5166 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 5166 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5167 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5167 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5168 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 5168 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5169 5169
5170 static void __build_all_zonelists(void *data) 5170 static void __build_all_zonelists(void *data)
5171 { 5171 {
5172 int nid; 5172 int nid;
5173 int __maybe_unused cpu; 5173 int __maybe_unused cpu;
5174 pg_data_t *self = data; 5174 pg_data_t *self = data;
5175 static DEFINE_SPINLOCK(lock); 5175 static DEFINE_SPINLOCK(lock);
5176 5176
5177 spin_lock(&lock); 5177 spin_lock(&lock);
5178 5178
5179 #ifdef CONFIG_NUMA 5179 #ifdef CONFIG_NUMA
5180 memset(node_load, 0, sizeof(node_load)); 5180 memset(node_load, 0, sizeof(node_load));
5181 #endif 5181 #endif
5182 5182
5183 /* 5183 /*
5184 * This node is hotadded and no memory is yet present. So just 5184 * This node is hotadded and no memory is yet present. So just
5185 * building zonelists is fine - no need to touch other nodes. 5185 * building zonelists is fine - no need to touch other nodes.
5186 */ 5186 */
5187 if (self && !node_online(self->node_id)) { 5187 if (self && !node_online(self->node_id)) {
5188 build_zonelists(self); 5188 build_zonelists(self);
5189 } else { 5189 } else {
5190 for_each_online_node(nid) { 5190 for_each_online_node(nid) {
5191 pg_data_t *pgdat = NODE_DATA(nid); 5191 pg_data_t *pgdat = NODE_DATA(nid);
5192 5192
5193 build_zonelists(pgdat); 5193 build_zonelists(pgdat);
5194 } 5194 }
5195 5195
5196 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5196 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5197 /* 5197 /*
5198 * We now know the "local memory node" for each node-- 5198 * We now know the "local memory node" for each node--
5199 * i.e., the node of the first zone in the generic zonelist. 5199 * i.e., the node of the first zone in the generic zonelist.
5200 * Set up numa_mem percpu variable for on-line cpus. During 5200 * Set up numa_mem percpu variable for on-line cpus. During
5201 * boot, only the boot cpu should be on-line; we'll init the 5201 * boot, only the boot cpu should be on-line; we'll init the
5202 * secondary cpus' numa_mem as they come on-line. During 5202 * secondary cpus' numa_mem as they come on-line. During
5203 * node/memory hotplug, we'll fixup all on-line cpus. 5203 * node/memory hotplug, we'll fixup all on-line cpus.
5204 */ 5204 */
5205 for_each_online_cpu(cpu) 5205 for_each_online_cpu(cpu)
5206 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5206 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5207 #endif 5207 #endif
5208 } 5208 }
5209 5209
5210 spin_unlock(&lock); 5210 spin_unlock(&lock);
5211 } 5211 }
5212 5212
5213 static noinline void __init 5213 static noinline void __init
5214 build_all_zonelists_init(void) 5214 build_all_zonelists_init(void)
5215 { 5215 {
5216 int cpu; 5216 int cpu;
5217 5217
5218 __build_all_zonelists(NULL); 5218 __build_all_zonelists(NULL);
5219 5219
5220 /* 5220 /*
5221 * Initialize the boot_pagesets that are going to be used 5221 * Initialize the boot_pagesets that are going to be used
5222 * for bootstrapping processors. The real pagesets for 5222 * for bootstrapping processors. The real pagesets for
5223 * each zone will be allocated later when the per cpu 5223 * each zone will be allocated later when the per cpu
5224 * allocator is available. 5224 * allocator is available.
5225 * 5225 *
5226 * boot_pagesets are used also for bootstrapping offline 5226 * boot_pagesets are used also for bootstrapping offline
5227 * cpus if the system is already booted because the pagesets 5227 * cpus if the system is already booted because the pagesets
5228 * are needed to initialize allocators on a specific cpu too. 5228 * are needed to initialize allocators on a specific cpu too.
5229 * E.g. the percpu allocator needs the page allocator which 5229 * E.g. the percpu allocator needs the page allocator which
5230 * needs the percpu allocator in order to allocate its pagesets 5230 * needs the percpu allocator in order to allocate its pagesets
5231 * (a chicken-egg dilemma). 5231 * (a chicken-egg dilemma).
5232 */ 5232 */
5233 for_each_possible_cpu(cpu) 5233 for_each_possible_cpu(cpu)
5234 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 5234 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5235 5235
5236 mminit_verify_zonelist(); 5236 mminit_verify_zonelist();
5237 cpuset_init_current_mems_allowed(); 5237 cpuset_init_current_mems_allowed();
5238 } 5238 }
5239 5239
5240 /* 5240 /*
5241 * unless system_state == SYSTEM_BOOTING. 5241 * unless system_state == SYSTEM_BOOTING.
5242 * 5242 *
5243 * __ref due to call of __init annotated helper build_all_zonelists_init 5243 * __ref due to call of __init annotated helper build_all_zonelists_init
5244 * [protected by SYSTEM_BOOTING]. 5244 * [protected by SYSTEM_BOOTING].
5245 */ 5245 */
5246 void __ref build_all_zonelists(pg_data_t *pgdat) 5246 void __ref build_all_zonelists(pg_data_t *pgdat)
5247 { 5247 {
5248 if (system_state == SYSTEM_BOOTING) { 5248 if (system_state == SYSTEM_BOOTING) {
5249 build_all_zonelists_init(); 5249 build_all_zonelists_init();
5250 } else { 5250 } else {
5251 __build_all_zonelists(pgdat); 5251 __build_all_zonelists(pgdat);
5252 /* cpuset refresh routine should be here */ 5252 /* cpuset refresh routine should be here */
5253 } 5253 }
5254 vm_total_pages = nr_free_pagecache_pages(); 5254 vm_total_pages = nr_free_pagecache_pages();
5255 /* 5255 /*
5256 * Disable grouping by mobility if the number of pages in the 5256 * Disable grouping by mobility if the number of pages in the
5257 * system is too low to allow the mechanism to work. It would be 5257 * system is too low to allow the mechanism to work. It would be
5258 * more accurate, but expensive to check per-zone. This check is 5258 * more accurate, but expensive to check per-zone. This check is
5259 * made on memory-hotadd so a system can start with mobility 5259 * made on memory-hotadd so a system can start with mobility
5260 * disabled and enable it later 5260 * disabled and enable it later
5261 */ 5261 */
5262 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 5262 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5263 page_group_by_mobility_disabled = 1; 5263 page_group_by_mobility_disabled = 1;
5264 else 5264 else
5265 page_group_by_mobility_disabled = 0; 5265 page_group_by_mobility_disabled = 0;
5266 5266
5267 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", 5267 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
5268 nr_online_nodes, 5268 nr_online_nodes,
5269 page_group_by_mobility_disabled ? "off" : "on", 5269 page_group_by_mobility_disabled ? "off" : "on",
5270 vm_total_pages); 5270 vm_total_pages);
5271 #ifdef CONFIG_NUMA 5271 #ifdef CONFIG_NUMA
5272 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 5272 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5273 #endif 5273 #endif
5274 } 5274 }
5275 5275
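The threshold used above is pageblock_nr_pages * MIGRATE_TYPES. Assuming 512-page pageblocks, 4 KiB pages, and both CONFIG_CMA and CONFIG_MEMORY_ISOLATION enabled (six migrate types), that works out to about 12 MiB; below that, grouping by mobility is disabled. A quick check of the arithmetic:

#include <stdio.h>

#define pageblock_nr_pages 512UL   /* assumed order-9 pageblocks */
#define MIGRATE_TYPES 6UL          /* assumed: CMA and memory isolation enabled */
#define PAGE_SIZE 4096UL           /* assumed 4 KiB pages */

int main(void)
{
        unsigned long threshold = pageblock_nr_pages * MIGRATE_TYPES;

        printf("mobility grouping needs at least %lu pages (%lu KiB)\n",
               threshold, threshold * PAGE_SIZE / 1024);   /* 3072 pages, 12288 KiB */
        return 0;
}
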
5276 /* 5276 /*
5277 * Initially all pages are reserved - free ones are freed 5277 * Initially all pages are reserved - free ones are freed
5278 * up by free_all_bootmem() once the early boot process is 5278 * up by free_all_bootmem() once the early boot process is
5279 * done. Non-atomic initialization, single-pass. 5279 * done. Non-atomic initialization, single-pass.
5280 */ 5280 */
5281 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 5281 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5282 unsigned long start_pfn, enum memmap_context context) 5282 unsigned long start_pfn, enum memmap_context context)
5283 { 5283 {
5284 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); 5284 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
5285 unsigned long end_pfn = start_pfn + size; 5285 unsigned long end_pfn = start_pfn + size;
5286 pg_data_t *pgdat = NODE_DATA(nid); 5286 pg_data_t *pgdat = NODE_DATA(nid);
5287 unsigned long pfn; 5287 unsigned long pfn;
5288 unsigned long nr_initialised = 0; 5288 unsigned long nr_initialised = 0;
5289 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5289 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5290 struct memblock_region *r = NULL, *tmp; 5290 struct memblock_region *r = NULL, *tmp;
5291 #endif 5291 #endif
5292 5292
5293 if (highest_memmap_pfn < end_pfn - 1) 5293 if (highest_memmap_pfn < end_pfn - 1)
5294 highest_memmap_pfn = end_pfn - 1; 5294 highest_memmap_pfn = end_pfn - 1;
5295 5295
5296 /* 5296 /*
5297 * Honor reservation requested by the driver for this ZONE_DEVICE 5297 * Honor reservation requested by the driver for this ZONE_DEVICE
5298 * memory 5298 * memory
5299 */ 5299 */
5300 if (altmap && start_pfn == altmap->base_pfn) 5300 if (altmap && start_pfn == altmap->base_pfn)
5301 start_pfn += altmap->reserve; 5301 start_pfn += altmap->reserve;
5302 5302
5303 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 5303 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5304 /* 5304 /*
5305 * There can be holes in boot-time mem_map[]s handed to this 5305 * There can be holes in boot-time mem_map[]s handed to this
5306 * function. They do not exist on hotplugged memory. 5306 * function. They do not exist on hotplugged memory.
5307 */ 5307 */
5308 if (context != MEMMAP_EARLY) 5308 if (context != MEMMAP_EARLY)
5309 goto not_early; 5309 goto not_early;
5310 5310
5311 if (!early_pfn_valid(pfn)) 5311 if (!early_pfn_valid(pfn))
5312 continue; 5312 continue;
5313 if (!early_pfn_in_nid(pfn, nid)) 5313 if (!early_pfn_in_nid(pfn, nid))
5314 continue; 5314 continue;
5315 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) 5315 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5316 break; 5316 break;
5317 5317
5318 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5318 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5319 /* 5319 /*
5320 * Check the memblock attributes given by firmware, which can affect 5320 * Check the memblock attributes given by firmware, which can affect
5321 * the kernel memory layout. If zone==ZONE_MOVABLE but the memory is 5321 * the kernel memory layout. If zone==ZONE_MOVABLE but the memory is
5322 * mirrored, this is an overlapping memmap init; skip it. 5322 * mirrored, this is an overlapping memmap init; skip it.
5323 */ 5323 */
5324 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 5324 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5325 if (!r || pfn >= memblock_region_memory_end_pfn(r)) { 5325 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5326 for_each_memblock(memory, tmp) 5326 for_each_memblock(memory, tmp)
5327 if (pfn < memblock_region_memory_end_pfn(tmp)) 5327 if (pfn < memblock_region_memory_end_pfn(tmp))
5328 break; 5328 break;
5329 r = tmp; 5329 r = tmp;
5330 } 5330 }
5331 if (pfn >= memblock_region_memory_base_pfn(r) && 5331 if (pfn >= memblock_region_memory_base_pfn(r) &&
5332 memblock_is_mirror(r)) { 5332 memblock_is_mirror(r)) {
5333 /* already initialized as NORMAL */ 5333 /* already initialized as NORMAL */
5334 pfn = memblock_region_memory_end_pfn(r); 5334 pfn = memblock_region_memory_end_pfn(r);
5335 continue; 5335 continue;
5336 } 5336 }
5337 } 5337 }
5338 #endif 5338 #endif
5339 5339
5340 not_early: 5340 not_early:
5341 /* 5341 /*
5342 * Mark the block movable so that blocks are reserved for 5342 * Mark the block movable so that blocks are reserved for
5343 * movable at startup. This will force kernel allocations 5343 * movable at startup. This will force kernel allocations
5344 * to reserve their blocks rather than leaking throughout 5344 * to reserve their blocks rather than leaking throughout
5345 * the address space during boot when many long-lived 5345 * the address space during boot when many long-lived
5346 * kernel allocations are made. 5346 * kernel allocations are made.
5347 * 5347 *
5348 * The pageblock bitmap covers the zone's valid pfn range, but the 5348 * The pageblock bitmap covers the zone's valid pfn range, but the
5349 * memmap may also be created for invalid pages (for alignment), 5349 * memmap may also be created for invalid pages (for alignment),
5350 * so check here that we do not call set_pageblock_migratetype() 5350 * so check here that we do not call set_pageblock_migratetype()
5351 * on a pfn outside the zone. 5351 * on a pfn outside the zone.
5352 */ 5352 */
5353 if (!(pfn & (pageblock_nr_pages - 1))) { 5353 if (!(pfn & (pageblock_nr_pages - 1))) {
5354 struct page *page = pfn_to_page(pfn); 5354 struct page *page = pfn_to_page(pfn);
5355 5355
5356 __init_single_page(page, pfn, zone, nid); 5356 __init_single_page(page, pfn, zone, nid);
5357 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5357 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5358 cond_resched(); 5358 cond_resched();
5359 } else { 5359 } else {
5360 __init_single_pfn(pfn, zone, nid); 5360 __init_single_pfn(pfn, zone, nid);
5361 } 5361 }
5362 } 5362 }
5363 } 5363 }
5364 5364
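The !(pfn & (pageblock_nr_pages - 1)) test above is true exactly once per pageblock, which is where the init loop marks the block MIGRATE_MOVABLE. A sketch of that alignment check, assuming order-9 pageblocks of 512 pages:

#include <stdio.h>

#define pageblock_nr_pages 512UL   /* assumed order-9 pageblocks */

int main(void)
{
        unsigned long start_pfn = 0x1000, end_pfn = 0x1800;

        /* Print the pfns where the zone-init loop would start a new pageblock. */
        for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn++)
                if (!(pfn & (pageblock_nr_pages - 1)))
                        printf("pageblock starts at pfn 0x%lx\n", pfn);
        return 0;   /* prints 0x1000, 0x1200, 0x1400, 0x1600 */
}
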
5365 static void __meminit zone_init_free_lists(struct zone *zone) 5365 static void __meminit zone_init_free_lists(struct zone *zone)
5366 { 5366 {
5367 unsigned int order, t; 5367 unsigned int order, t;
5368 for_each_migratetype_order(order, t) { 5368 for_each_migratetype_order(order, t) {
5369 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 5369 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5370 zone->free_area[order].nr_free = 0; 5370 zone->free_area[order].nr_free = 0;
5371 } 5371 }
5372 } 5372 }
5373 5373
5374 #ifndef __HAVE_ARCH_MEMMAP_INIT 5374 #ifndef __HAVE_ARCH_MEMMAP_INIT
5375 #define memmap_init(size, nid, zone, start_pfn) \ 5375 #define memmap_init(size, nid, zone, start_pfn) \
5376 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 5376 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
5377 #endif 5377 #endif
5378 5378
5379 static int zone_batchsize(struct zone *zone) 5379 static int zone_batchsize(struct zone *zone)
5380 { 5380 {
5381 #ifdef CONFIG_MMU 5381 #ifdef CONFIG_MMU
5382 int batch; 5382 int batch;
5383 5383
5384 /* 5384 /*
5385 * The per-cpu-pages pools are set to around 1000th of the 5385 * The per-cpu-pages pools are set to around 1000th of the
5386 * size of the zone. But no more than 1/2 of a meg. 5386 * size of the zone. But no more than 1/2 of a meg.
5387 * 5387 *
5388 * OK, so we don't know how big the cache is. So guess. 5388 * OK, so we don't know how big the cache is. So guess.
5389 */ 5389 */
5390 batch = zone->managed_pages / 1024; 5390 batch = zone->managed_pages / 1024;
5391 if (batch * PAGE_SIZE > 512 * 1024) 5391 if (batch * PAGE_SIZE > 512 * 1024)
5392 batch = (512 * 1024) / PAGE_SIZE; 5392 batch = (512 * 1024) / PAGE_SIZE;
5393 batch /= 4; /* We effectively *= 4 below */ 5393 batch /= 4; /* We effectively *= 4 below */
5394 if (batch < 1) 5394 if (batch < 1)
5395 batch = 1; 5395 batch = 1;
5396 5396
5397 /* 5397 /*
5398 * Clamp the batch to a 2^n - 1 value. Having a power 5398 * Clamp the batch to a 2^n - 1 value. Having a power
5399 * of 2 value was found to be more likely to have 5399 * of 2 value was found to be more likely to have
5400 * suboptimal cache aliasing properties in some cases. 5400 * suboptimal cache aliasing properties in some cases.
5401 * 5401 *
5402 * For example if 2 tasks are alternately allocating 5402 * For example if 2 tasks are alternately allocating
5403 * batches of pages, one task can end up with a lot 5403 * batches of pages, one task can end up with a lot
5404 * of pages of one half of the possible page colors 5404 * of pages of one half of the possible page colors
5405 * and the other with pages of the other colors. 5405 * and the other with pages of the other colors.
5406 */ 5406 */
5407 batch = rounddown_pow_of_two(batch + batch/2) - 1; 5407 batch = rounddown_pow_of_two(batch + batch/2) - 1;
5408 5408
5409 return batch; 5409 return batch;
5410 5410
5411 #else 5411 #else
5412 /* The deferral and batching of frees should be suppressed under NOMMU 5412 /* The deferral and batching of frees should be suppressed under NOMMU
5413 * conditions. 5413 * conditions.
5414 * 5414 *
5415 * The problem is that NOMMU needs to be able to allocate large chunks 5415 * The problem is that NOMMU needs to be able to allocate large chunks
5416 * of contiguous memory as there's no hardware page translation to 5416 * of contiguous memory as there's no hardware page translation to
5417 * assemble apparent contiguous memory from discontiguous pages. 5417 * assemble apparent contiguous memory from discontiguous pages.
5418 * 5418 *
5419 * Queueing large contiguous runs of pages for batching, however, 5419 * Queueing large contiguous runs of pages for batching, however,
5420 * causes the pages to actually be freed in smaller chunks. As there 5420 * causes the pages to actually be freed in smaller chunks. As there
5421 * can be a significant delay between the individual batches being 5421 * can be a significant delay between the individual batches being
5422 * recycled, this leads to the once large chunks of space being 5422 * recycled, this leads to the once large chunks of space being
5423 * fragmented and becoming unavailable for high-order allocations. 5423 * fragmented and becoming unavailable for high-order allocations.
5424 */ 5424 */
5425 return 0; 5425 return 0;
5426 #endif 5426 #endif
5427 } 5427 }
5428 5428
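Working the zone_batchsize() arithmetic through for a hypothetical 2 GiB zone with 4 KiB pages: 524288 / 1024 = 512 pages, capped to 512 KiB worth (128 pages), divided by 4 (32), then rounded to one below a power of two, giving 31. A standalone sketch (rounddown_pow_of_two() is re-implemented here as a stand-in for the kernel helper):

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* assumed 4 KiB pages */

/* Userspace stand-in for the kernel's rounddown_pow_of_two(). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;
        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long managed_pages = 524288;   /* hypothetical 2 GiB zone */
        unsigned long batch;

        batch = managed_pages / 1024;           /* ~1/1000th of the zone: 512 */
        if (batch * PAGE_SIZE > 512 * 1024)     /* cap at half a megabyte */
                batch = (512 * 1024) / PAGE_SIZE;   /* 128 */
        batch /= 4;                             /* 32; effectively *= 4 again later */
        if (batch < 1)
                batch = 1;
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;   /* 31 */

        printf("pcp batch for a 2 GiB zone: %lu pages\n", batch);
        return 0;
}
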
5429 /* 5429 /*
5430 * pcp->high and pcp->batch values are related and dependent on one another: 5430 * pcp->high and pcp->batch values are related and dependent on one another:
5431 * ->batch must never be higher than ->high. 5431 * ->batch must never be higher than ->high.
5432 * The following function updates them in a safe manner without read side 5432 * The following function updates them in a safe manner without read side
5433 * locking. 5433 * locking.
5434 * 5434 *
5435 * Any new users of pcp->batch and pcp->high should ensure they can cope with 5435 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5436 * those fields changing asynchronously (according to the above rule). 5436 * those fields changing asynchronously (according to the above rule).
5437 * 5437 *
5438 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 5438 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5439 * outside of boot time (or some other assurance that no concurrent updaters 5439 * outside of boot time (or some other assurance that no concurrent updaters
5440 * exist). 5440 * exist).
5441 */ 5441 */
5442 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 5442 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5443 unsigned long batch) 5443 unsigned long batch)
5444 { 5444 {
5445 /* start with a fail safe value for batch */ 5445 /* start with a fail safe value for batch */
5446 pcp->batch = 1; 5446 pcp->batch = 1;
5447 smp_wmb(); 5447 smp_wmb();
5448 5448
5449 /* Update high, then batch, in order */ 5449 /* Update high, then batch, in order */
5450 pcp->high = high; 5450 pcp->high = high;
5451 smp_wmb(); 5451 smp_wmb();
5452 5452
5453 pcp->batch = batch; 5453 pcp->batch = batch;
5454 } 5454 }
5455 5455
5456 /* a companion to pageset_set_high() */ 5456 /* a companion to pageset_set_high() */
5457 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 5457 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
5458 { 5458 {
5459 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 5459 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
5460 } 5460 }
5461 5461
5462 static void pageset_init(struct per_cpu_pageset *p) 5462 static void pageset_init(struct per_cpu_pageset *p)
5463 { 5463 {
5464 struct per_cpu_pages *pcp; 5464 struct per_cpu_pages *pcp;
5465 int migratetype; 5465 int migratetype;
5466 5466
5467 memset(p, 0, sizeof(*p)); 5467 memset(p, 0, sizeof(*p));
5468 5468
5469 pcp = &p->pcp; 5469 pcp = &p->pcp;
5470 pcp->count = 0; 5470 pcp->count = 0;
5471 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 5471 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5472 INIT_LIST_HEAD(&pcp->lists[migratetype]); 5472 INIT_LIST_HEAD(&pcp->lists[migratetype]);
5473 } 5473 }
5474 5474
5475 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 5475 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
5476 { 5476 {
5477 pageset_init(p); 5477 pageset_init(p);
5478 pageset_set_batch(p, batch); 5478 pageset_set_batch(p, batch);
5479 } 5479 }
5480 5480
5481 /* 5481 /*
5482 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 5482 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
5483 * to the value high for the pageset p. 5483 * to the value high for the pageset p.
5484 */ 5484 */
5485 static void pageset_set_high(struct per_cpu_pageset *p, 5485 static void pageset_set_high(struct per_cpu_pageset *p,
5486 unsigned long high) 5486 unsigned long high)
5487 { 5487 {
5488 unsigned long batch = max(1UL, high / 4); 5488 unsigned long batch = max(1UL, high / 4);
5489 if ((high / 4) > (PAGE_SHIFT * 8)) 5489 if ((high / 4) > (PAGE_SHIFT * 8))
5490 batch = PAGE_SHIFT * 8; 5490 batch = PAGE_SHIFT * 8;
5491 5491
5492 pageset_update(&p->pcp, high, batch); 5492 pageset_update(&p->pcp, high, batch);
5493 } 5493 }
5494 5494
5495 static void pageset_set_high_and_batch(struct zone *zone, 5495 static void pageset_set_high_and_batch(struct zone *zone,
5496 struct per_cpu_pageset *pcp) 5496 struct per_cpu_pageset *pcp)
5497 { 5497 {
5498 if (percpu_pagelist_fraction) 5498 if (percpu_pagelist_fraction)
5499 pageset_set_high(pcp, 5499 pageset_set_high(pcp,
5500 (zone->managed_pages / 5500 (zone->managed_pages /
5501 percpu_pagelist_fraction)); 5501 percpu_pagelist_fraction));
5502 else 5502 else
5503 pageset_set_batch(pcp, zone_batchsize(zone)); 5503 pageset_set_batch(pcp, zone_batchsize(zone));
5504 } 5504 }
5505 5505
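When the percpu_pagelist_fraction sysctl is set, pageset_set_high() derives high as managed_pages / fraction and batch as a quarter of that, capped at PAGE_SHIFT * 8 pages. A sketch with assumed values (a 2 GiB zone, fraction 8, 4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

int main(void)
{
        /* Hypothetical tunables: vm.percpu_pagelist_fraction = 8, 2 GiB zone. */
        unsigned long managed_pages = 524288;
        unsigned long fraction = 8;

        unsigned long high = managed_pages / fraction;        /* 65536 pages */
        unsigned long batch = max_ul(1UL, high / 4);
        if ((high / 4) > (PAGE_SHIFT * 8))
                batch = PAGE_SHIFT * 8;                       /* capped at 96 pages */

        printf("pcp high = %lu pages, batch = %lu pages\n", high, batch);
        return 0;
}
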
5506 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 5506 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5507 { 5507 {
5508 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 5508 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
5509 5509
5510 pageset_init(pcp); 5510 pageset_init(pcp);
5511 pageset_set_high_and_batch(zone, pcp); 5511 pageset_set_high_and_batch(zone, pcp);
5512 } 5512 }
5513 5513
5514 void __meminit setup_zone_pageset(struct zone *zone) 5514 void __meminit setup_zone_pageset(struct zone *zone)
5515 { 5515 {
5516 int cpu; 5516 int cpu;
5517 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5517 zone->pageset = alloc_percpu(struct per_cpu_pageset);
5518 for_each_possible_cpu(cpu) 5518 for_each_possible_cpu(cpu)
5519 zone_pageset_init(zone, cpu); 5519 zone_pageset_init(zone, cpu);
5520 } 5520 }
5521 5521
5522 /* 5522 /*
5523 * Allocate per cpu pagesets and initialize them. 5523 * Allocate per cpu pagesets and initialize them.
5524 * Before this call only boot pagesets were available. 5524 * Before this call only boot pagesets were available.
5525 */ 5525 */
5526 void __init setup_per_cpu_pageset(void) 5526 void __init setup_per_cpu_pageset(void)
5527 { 5527 {
5528 struct pglist_data *pgdat; 5528 struct pglist_data *pgdat;
5529 struct zone *zone; 5529 struct zone *zone;
5530 5530
5531 for_each_populated_zone(zone) 5531 for_each_populated_zone(zone)
5532 setup_zone_pageset(zone); 5532 setup_zone_pageset(zone);
5533 5533
5534 for_each_online_pgdat(pgdat) 5534 for_each_online_pgdat(pgdat)
5535 pgdat->per_cpu_nodestats = 5535 pgdat->per_cpu_nodestats =
5536 alloc_percpu(struct per_cpu_nodestat); 5536 alloc_percpu(struct per_cpu_nodestat);
5537 } 5537 }
5538 5538
5539 static __meminit void zone_pcp_init(struct zone *zone) 5539 static __meminit void zone_pcp_init(struct zone *zone)
5540 { 5540 {
5541 /* 5541 /*
5542 * per cpu subsystem is not up at this point. The following code 5542 * per cpu subsystem is not up at this point. The following code
5543 * relies on the ability of the linker to provide the 5543 * relies on the ability of the linker to provide the
5544 * offset of a (static) per cpu variable into the per cpu area. 5544 * offset of a (static) per cpu variable into the per cpu area.
5545 */ 5545 */
5546 zone->pageset = &boot_pageset; 5546 zone->pageset = &boot_pageset;
5547 5547
5548 if (populated_zone(zone)) 5548 if (populated_zone(zone))
5549 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 5549 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
5550 zone->name, zone->present_pages, 5550 zone->name, zone->present_pages,
5551 zone_batchsize(zone)); 5551 zone_batchsize(zone));
5552 } 5552 }
5553 5553
5554 void __meminit init_currently_empty_zone(struct zone *zone, 5554 void __meminit init_currently_empty_zone(struct zone *zone,
5555 unsigned long zone_start_pfn, 5555 unsigned long zone_start_pfn,
5556 unsigned long size) 5556 unsigned long size)
5557 { 5557 {
5558 struct pglist_data *pgdat = zone->zone_pgdat; 5558 struct pglist_data *pgdat = zone->zone_pgdat;
5559 int zone_idx = zone_idx(zone) + 1; 5559 int zone_idx = zone_idx(zone) + 1;
5560 5560
5561 if (zone_idx > pgdat->nr_zones) 5561 if (zone_idx > pgdat->nr_zones)
5562 pgdat->nr_zones = zone_idx; 5562 pgdat->nr_zones = zone_idx;
5563 5563
5564 zone->zone_start_pfn = zone_start_pfn; 5564 zone->zone_start_pfn = zone_start_pfn;
5565 5565
5566 mminit_dprintk(MMINIT_TRACE, "memmap_init", 5566 mminit_dprintk(MMINIT_TRACE, "memmap_init",
5567 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 5567 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
5568 pgdat->node_id, 5568 pgdat->node_id,
5569 (unsigned long)zone_idx(zone), 5569 (unsigned long)zone_idx(zone),
5570 zone_start_pfn, (zone_start_pfn + size)); 5570 zone_start_pfn, (zone_start_pfn + size));
5571 5571
5572 zone_init_free_lists(zone); 5572 zone_init_free_lists(zone);
5573 zone->initialized = 1; 5573 zone->initialized = 1;
5574 } 5574 }
5575 5575
5576 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5576 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5577 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 5577 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5578 5578
5579 /* 5579 /*
5580 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 5580 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5581 */ 5581 */
5582 int __meminit __early_pfn_to_nid(unsigned long pfn, 5582 int __meminit __early_pfn_to_nid(unsigned long pfn,
5583 struct mminit_pfnnid_cache *state) 5583 struct mminit_pfnnid_cache *state)
5584 { 5584 {
5585 unsigned long start_pfn, end_pfn; 5585 unsigned long start_pfn, end_pfn;
5586 int nid; 5586 int nid;
5587 5587
5588 if (state->last_start <= pfn && pfn < state->last_end) 5588 if (state->last_start <= pfn && pfn < state->last_end)
5589 return state->last_nid; 5589 return state->last_nid;
5590 5590
5591 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 5591 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5592 if (nid != -1) { 5592 if (nid != -1) {
5593 state->last_start = start_pfn; 5593 state->last_start = start_pfn;
5594 state->last_end = end_pfn; 5594 state->last_end = end_pfn;
5595 state->last_nid = nid; 5595 state->last_nid = nid;
5596 } 5596 }
5597 5597
5598 return nid; 5598 return nid;
5599 } 5599 }
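
The single-entry cache above is easy to exercise on its own; in the sketch below a hypothetical ranges[] table stands in for memblock_search_pfn_nid(), and every pfn value is made up.

/* Minimal sketch of the one-entry pfn->nid cache pattern used above. */
#include <stdio.h>

struct pfnnid_cache { unsigned long last_start, last_end; int last_nid; };

static const struct { unsigned long start, end; int nid; } ranges[] = {
        { 0x00000, 0x40000, 0 },  /* node 0: pfns [0x00000, 0x40000) */
        { 0x40000, 0x80000, 1 },  /* node 1: pfns [0x40000, 0x80000) */
};

static int pfn_to_nid(unsigned long pfn, struct pfnnid_cache *c)
{
        unsigned int i;

        if (c->last_start <= pfn && pfn < c->last_end)
                return c->last_nid;  /* cache hit, no search needed */

        for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
                if (ranges[i].start <= pfn && pfn < ranges[i].end) {
                        c->last_start = ranges[i].start;
                        c->last_end = ranges[i].end;
                        c->last_nid = ranges[i].nid;
                        return c->last_nid;
                }
        }
        return -1;
}

int main(void)
{
        struct pfnnid_cache c = { 0, 0, -1 };
        int a, b, d;

        a = pfn_to_nid(0x100, &c);    /* searches, then warms the cache */
        b = pfn_to_nid(0x200, &c);    /* served from the cached range */
        d = pfn_to_nid(0x50000, &c);  /* different range, cache refilled */
        printf("%d %d %d\n", a, b, d);  /* prints 0 0 1 */
        return 0;
}
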
5600 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 5600 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5601 5601
5602 /** 5602 /**
5603 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range 5603 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5604 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 5604 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5605 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 5605 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5606 * 5606 *
5607 * If an architecture guarantees that all ranges registered contain no holes 5607 * If an architecture guarantees that all ranges registered contain no holes
5608 * and may be freed, this this function may be used instead of calling 5608 * and may be freed, this this function may be used instead of calling
5609 * memblock_free_early_nid() manually. 5609 * memblock_free_early_nid() manually.
5610 */ 5610 */
5611 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 5611 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5612 { 5612 {
5613 unsigned long start_pfn, end_pfn; 5613 unsigned long start_pfn, end_pfn;
5614 int i, this_nid; 5614 int i, this_nid;
5615 5615
5616 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 5616 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5617 start_pfn = min(start_pfn, max_low_pfn); 5617 start_pfn = min(start_pfn, max_low_pfn);
5618 end_pfn = min(end_pfn, max_low_pfn); 5618 end_pfn = min(end_pfn, max_low_pfn);
5619 5619
5620 if (start_pfn < end_pfn) 5620 if (start_pfn < end_pfn)
5621 memblock_free_early_nid(PFN_PHYS(start_pfn), 5621 memblock_free_early_nid(PFN_PHYS(start_pfn),
5622 (end_pfn - start_pfn) << PAGE_SHIFT, 5622 (end_pfn - start_pfn) << PAGE_SHIFT,
5623 this_nid); 5623 this_nid);
5624 } 5624 }
5625 } 5625 }
5626 5626
5627 /** 5627 /**
5628 * sparse_memory_present_with_active_regions - Call memory_present for each active range 5628 * sparse_memory_present_with_active_regions - Call memory_present for each active range
5629 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 5629 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
5630 * 5630 *
5631 * If an architecture guarantees that all ranges registered contain no holes and may 5631 * If an architecture guarantees that all ranges registered contain no holes and may
5632 * be freed, this function may be used instead of calling memory_present() manually. 5632 * be freed, this function may be used instead of calling memory_present() manually.
5633 */ 5633 */
5634 void __init sparse_memory_present_with_active_regions(int nid) 5634 void __init sparse_memory_present_with_active_regions(int nid)
5635 { 5635 {
5636 unsigned long start_pfn, end_pfn; 5636 unsigned long start_pfn, end_pfn;
5637 int i, this_nid; 5637 int i, this_nid;
5638 5638
5639 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 5639 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
5640 memory_present(this_nid, start_pfn, end_pfn); 5640 memory_present(this_nid, start_pfn, end_pfn);
5641 } 5641 }
5642 5642
5643 /** 5643 /**
5644 * get_pfn_range_for_nid - Return the start and end page frames for a node 5644 * get_pfn_range_for_nid - Return the start and end page frames for a node
5645 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 5645 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
5646 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 5646 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
5647 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 5647 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
5648 * 5648 *
5649 * It returns the start and end page frame of a node based on information 5649 * It returns the start and end page frame of a node based on information
5650 * provided by memblock_set_node(). If called for a node 5650 * provided by memblock_set_node(). If called for a node
5651 * with no available memory, a warning is printed and the start and end 5651 * with no available memory, a warning is printed and the start and end
5652 * PFNs will be 0. 5652 * PFNs will be 0.
5653 */ 5653 */
5654 void __meminit get_pfn_range_for_nid(unsigned int nid, 5654 void __meminit get_pfn_range_for_nid(unsigned int nid,
5655 unsigned long *start_pfn, unsigned long *end_pfn) 5655 unsigned long *start_pfn, unsigned long *end_pfn)
5656 { 5656 {
5657 unsigned long this_start_pfn, this_end_pfn; 5657 unsigned long this_start_pfn, this_end_pfn;
5658 int i; 5658 int i;
5659 5659
5660 *start_pfn = -1UL; 5660 *start_pfn = -1UL;
5661 *end_pfn = 0; 5661 *end_pfn = 0;
5662 5662
5663 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 5663 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
5664 *start_pfn = min(*start_pfn, this_start_pfn); 5664 *start_pfn = min(*start_pfn, this_start_pfn);
5665 *end_pfn = max(*end_pfn, this_end_pfn); 5665 *end_pfn = max(*end_pfn, this_end_pfn);
5666 } 5666 }
5667 5667
5668 if (*start_pfn == -1UL) 5668 if (*start_pfn == -1UL)
5669 *start_pfn = 0; 5669 *start_pfn = 0;
5670 } 5670 }
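
As a hypothetical illustration of the loop above: a node registered as two memblock ranges covering pfns [0x1000, 0x2000) and [0x8000, 0x9000) ends up with *start_pfn = 0x1000 and *end_pfn = 0x9000. The span deliberately includes the hole between the ranges, which is why spanned_pages and present_pages are tracked separately later in this file.
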
5671 5671
5672 /* 5672 /*
5673 * This finds a zone that can be used for ZONE_MOVABLE pages. The 5673 * This finds a zone that can be used for ZONE_MOVABLE pages. The
5674 * assumption is made that zones within a node are ordered in monotonically 5674 * assumption is made that zones within a node are ordered in monotonically
5675 * increasing memory addresses so that the "highest" populated zone is used 5675 * increasing memory addresses so that the "highest" populated zone is used
5676 */ 5676 */
5677 static void __init find_usable_zone_for_movable(void) 5677 static void __init find_usable_zone_for_movable(void)
5678 { 5678 {
5679 int zone_index; 5679 int zone_index;
5680 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 5680 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
5681 if (zone_index == ZONE_MOVABLE) 5681 if (zone_index == ZONE_MOVABLE)
5682 continue; 5682 continue;
5683 5683
5684 if (arch_zone_highest_possible_pfn[zone_index] > 5684 if (arch_zone_highest_possible_pfn[zone_index] >
5685 arch_zone_lowest_possible_pfn[zone_index]) 5685 arch_zone_lowest_possible_pfn[zone_index])
5686 break; 5686 break;
5687 } 5687 }
5688 5688
5689 VM_BUG_ON(zone_index == -1); 5689 VM_BUG_ON(zone_index == -1);
5690 movable_zone = zone_index; 5690 movable_zone = zone_index;
5691 } 5691 }
5692 5692
5693 /* 5693 /*
5694 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 5694 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
5695 * because it is sized independent of architecture. Unlike the other zones, 5695 * because it is sized independent of architecture. Unlike the other zones,
5696 * the starting point for ZONE_MOVABLE is not fixed. It may be different 5696 * the starting point for ZONE_MOVABLE is not fixed. It may be different
5697 * in each node depending on the size of each node and how evenly kernelcore 5697 * in each node depending on the size of each node and how evenly kernelcore
5698 * is distributed. This helper function adjusts the zone ranges 5698 * is distributed. This helper function adjusts the zone ranges
5699 * provided by the architecture for a given node by using the end of the 5699 * provided by the architecture for a given node by using the end of the
5700 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 5700 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5701 * zones within a node are in order of monotonically increasing memory addresses 5701 * zones within a node are in order of monotonically increasing memory addresses
5702 */ 5702 */
5703 static void __meminit adjust_zone_range_for_zone_movable(int nid, 5703 static void __meminit adjust_zone_range_for_zone_movable(int nid,
5704 unsigned long zone_type, 5704 unsigned long zone_type,
5705 unsigned long node_start_pfn, 5705 unsigned long node_start_pfn,
5706 unsigned long node_end_pfn, 5706 unsigned long node_end_pfn,
5707 unsigned long *zone_start_pfn, 5707 unsigned long *zone_start_pfn,
5708 unsigned long *zone_end_pfn) 5708 unsigned long *zone_end_pfn)
5709 { 5709 {
5710 /* Only adjust if ZONE_MOVABLE is on this node */ 5710 /* Only adjust if ZONE_MOVABLE is on this node */
5711 if (zone_movable_pfn[nid]) { 5711 if (zone_movable_pfn[nid]) {
5712 /* Size ZONE_MOVABLE */ 5712 /* Size ZONE_MOVABLE */
5713 if (zone_type == ZONE_MOVABLE) { 5713 if (zone_type == ZONE_MOVABLE) {
5714 *zone_start_pfn = zone_movable_pfn[nid]; 5714 *zone_start_pfn = zone_movable_pfn[nid];
5715 *zone_end_pfn = min(node_end_pfn, 5715 *zone_end_pfn = min(node_end_pfn,
5716 arch_zone_highest_possible_pfn[movable_zone]); 5716 arch_zone_highest_possible_pfn[movable_zone]);
5717 5717
5718 /* Adjust for ZONE_MOVABLE starting within this range */ 5718 /* Adjust for ZONE_MOVABLE starting within this range */
5719 } else if (!mirrored_kernelcore && 5719 } else if (!mirrored_kernelcore &&
5720 *zone_start_pfn < zone_movable_pfn[nid] && 5720 *zone_start_pfn < zone_movable_pfn[nid] &&
5721 *zone_end_pfn > zone_movable_pfn[nid]) { 5721 *zone_end_pfn > zone_movable_pfn[nid]) {
5722 *zone_end_pfn = zone_movable_pfn[nid]; 5722 *zone_end_pfn = zone_movable_pfn[nid];
5723 5723
5724 /* Check if this whole range is within ZONE_MOVABLE */ 5724 /* Check if this whole range is within ZONE_MOVABLE */
5725 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5725 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
5726 *zone_start_pfn = *zone_end_pfn; 5726 *zone_start_pfn = *zone_end_pfn;
5727 } 5727 }
5728 } 5728 }
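
A hedged worked example of the three branches above, assuming kernelcore=mirror is not in use and all pfns are made up: with zone_movable_pfn[nid] = 0x20000 on a node spanning [0x10000, 0x30000), the ZONE_MOVABLE request becomes [0x20000, 0x30000) (still capped by the highest usable zone), a lower zone straddling the boundary is trimmed to end at 0x20000, and a zone that would start at or above 0x20000 collapses to an empty range.
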
5729 5729
5730 /* 5730 /*
5731 * Return the number of pages a zone spans in a node, including holes 5731 * Return the number of pages a zone spans in a node, including holes
5732 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 5732 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
5733 */ 5733 */
5734 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 5734 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5735 unsigned long zone_type, 5735 unsigned long zone_type,
5736 unsigned long node_start_pfn, 5736 unsigned long node_start_pfn,
5737 unsigned long node_end_pfn, 5737 unsigned long node_end_pfn,
5738 unsigned long *zone_start_pfn, 5738 unsigned long *zone_start_pfn,
5739 unsigned long *zone_end_pfn, 5739 unsigned long *zone_end_pfn,
5740 unsigned long *ignored) 5740 unsigned long *ignored)
5741 { 5741 {
5742 /* When hotadding a new node from cpu_up(), the node should be empty */ 5742 /* When hotadding a new node from cpu_up(), the node should be empty */
5743 if (!node_start_pfn && !node_end_pfn) 5743 if (!node_start_pfn && !node_end_pfn)
5744 return 0; 5744 return 0;
5745 5745
5746 /* Get the start and end of the zone */ 5746 /* Get the start and end of the zone */
5747 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5747 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
5748 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5748 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
5749 adjust_zone_range_for_zone_movable(nid, zone_type, 5749 adjust_zone_range_for_zone_movable(nid, zone_type,
5750 node_start_pfn, node_end_pfn, 5750 node_start_pfn, node_end_pfn,
5751 zone_start_pfn, zone_end_pfn); 5751 zone_start_pfn, zone_end_pfn);
5752 5752
5753 /* Check that this node has pages within the zone's required range */ 5753 /* Check that this node has pages within the zone's required range */
5754 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 5754 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
5755 return 0; 5755 return 0;
5756 5756
5757 /* Move the zone boundaries inside the node if necessary */ 5757 /* Move the zone boundaries inside the node if necessary */
5758 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 5758 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
5759 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 5759 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
5760 5760
5761 /* Return the spanned pages */ 5761 /* Return the spanned pages */
5762 return *zone_end_pfn - *zone_start_pfn; 5762 return *zone_end_pfn - *zone_start_pfn;
5763 } 5763 }
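
For example (numbers hypothetical, and ignoring any ZONE_MOVABLE adjustment): if the architectural bounds for a zone are pfns [0x8000, 0x100000) and the node covers [0x40000, 0xc0000), the boundaries are clamped to [0x40000, 0xc0000) and the zone spans 0x80000 pages; a zone whose architectural range lies entirely outside the node returns 0 instead.
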
5764 5764
5765 /* 5765 /*
5766 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 5766 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
5767 * then all holes in the requested range will be accounted for. 5767 * then all holes in the requested range will be accounted for.
5768 */ 5768 */
5769 unsigned long __meminit __absent_pages_in_range(int nid, 5769 unsigned long __meminit __absent_pages_in_range(int nid,
5770 unsigned long range_start_pfn, 5770 unsigned long range_start_pfn,
5771 unsigned long range_end_pfn) 5771 unsigned long range_end_pfn)
5772 { 5772 {
5773 unsigned long nr_absent = range_end_pfn - range_start_pfn; 5773 unsigned long nr_absent = range_end_pfn - range_start_pfn;
5774 unsigned long start_pfn, end_pfn; 5774 unsigned long start_pfn, end_pfn;
5775 int i; 5775 int i;
5776 5776
5777 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5777 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5778 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 5778 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
5779 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 5779 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
5780 nr_absent -= end_pfn - start_pfn; 5780 nr_absent -= end_pfn - start_pfn;
5781 } 5781 }
5782 return nr_absent; 5782 return nr_absent;
5783 } 5783 }
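
A toy version of the hole accounting above, using two made-up present-memory ranges in place of for_each_mem_pfn_range():

/* Start from the full range and subtract every clamped chunk of present memory. */
#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* present memory: [0x0000, 0x8000) and [0xc000, 0x10000) */
        unsigned long starts[] = { 0x0000, 0xc000 };
        unsigned long ends[]   = { 0x8000, 0x10000 };
        unsigned long range_start = 0x0000, range_end = 0x10000;
        unsigned long nr_absent = range_end - range_start;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = clamp_ul(starts[i], range_start, range_end);
                unsigned long e = clamp_ul(ends[i], range_start, range_end);

                nr_absent -= e - s;
        }

        /* only the hole [0x8000, 0xc000) remains: prints absent = 0x4000 pages */
        printf("absent = %#lx pages\n", nr_absent);
        return 0;
}
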
5784 5784
5785 /** 5785 /**
5786 * absent_pages_in_range - Return number of page frames in holes within a range 5786 * absent_pages_in_range - Return number of page frames in holes within a range
5787 * @start_pfn: The start PFN to start searching for holes 5787 * @start_pfn: The start PFN to start searching for holes
5788 * @end_pfn: The end PFN to stop searching for holes 5788 * @end_pfn: The end PFN to stop searching for holes
5789 * 5789 *
5790 * It returns the number of page frames in memory holes within a range. 5790 * It returns the number of page frames in memory holes within a range.
5791 */ 5791 */
5792 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 5792 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
5793 unsigned long end_pfn) 5793 unsigned long end_pfn)
5794 { 5794 {
5795 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 5795 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
5796 } 5796 }
5797 5797
5798 /* Return the number of page frames in holes in a zone on a node */ 5798 /* Return the number of page frames in holes in a zone on a node */
5799 static unsigned long __meminit zone_absent_pages_in_node(int nid, 5799 static unsigned long __meminit zone_absent_pages_in_node(int nid,
5800 unsigned long zone_type, 5800 unsigned long zone_type,
5801 unsigned long node_start_pfn, 5801 unsigned long node_start_pfn,
5802 unsigned long node_end_pfn, 5802 unsigned long node_end_pfn,
5803 unsigned long *ignored) 5803 unsigned long *ignored)
5804 { 5804 {
5805 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5805 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5806 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5806 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5807 unsigned long zone_start_pfn, zone_end_pfn; 5807 unsigned long zone_start_pfn, zone_end_pfn;
5808 unsigned long nr_absent; 5808 unsigned long nr_absent;
5809 5809
5810 /* When hotadding a new node from cpu_up(), the node should be empty */ 5810 /* When hotadding a new node from cpu_up(), the node should be empty */
5811 if (!node_start_pfn && !node_end_pfn) 5811 if (!node_start_pfn && !node_end_pfn)
5812 return 0; 5812 return 0;
5813 5813
5814 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 5814 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
5815 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 5815 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
5816 5816
5817 adjust_zone_range_for_zone_movable(nid, zone_type, 5817 adjust_zone_range_for_zone_movable(nid, zone_type,
5818 node_start_pfn, node_end_pfn, 5818 node_start_pfn, node_end_pfn,
5819 &zone_start_pfn, &zone_end_pfn); 5819 &zone_start_pfn, &zone_end_pfn);
5820 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5820 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5821 5821
5822 /* 5822 /*
5823 * ZONE_MOVABLE handling. 5823 * ZONE_MOVABLE handling.
5824 * Treat pages that will end up in ZONE_MOVABLE but lie in ZONE_NORMAL as absent pages 5824 * Treat pages that will end up in ZONE_MOVABLE but lie in ZONE_NORMAL as absent pages
5825 * and vice versa. 5825 * and vice versa.
5826 */ 5826 */
5827 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 5827 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
5828 unsigned long start_pfn, end_pfn; 5828 unsigned long start_pfn, end_pfn;
5829 struct memblock_region *r; 5829 struct memblock_region *r;
5830 5830
5831 for_each_memblock(memory, r) { 5831 for_each_memblock(memory, r) {
5832 start_pfn = clamp(memblock_region_memory_base_pfn(r), 5832 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5833 zone_start_pfn, zone_end_pfn); 5833 zone_start_pfn, zone_end_pfn);
5834 end_pfn = clamp(memblock_region_memory_end_pfn(r), 5834 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5835 zone_start_pfn, zone_end_pfn); 5835 zone_start_pfn, zone_end_pfn);
5836 5836
5837 if (zone_type == ZONE_MOVABLE && 5837 if (zone_type == ZONE_MOVABLE &&
5838 memblock_is_mirror(r)) 5838 memblock_is_mirror(r))
5839 nr_absent += end_pfn - start_pfn; 5839 nr_absent += end_pfn - start_pfn;
5840 5840
5841 if (zone_type == ZONE_NORMAL && 5841 if (zone_type == ZONE_NORMAL &&
5842 !memblock_is_mirror(r)) 5842 !memblock_is_mirror(r))
5843 nr_absent += end_pfn - start_pfn; 5843 nr_absent += end_pfn - start_pfn;
5844 } 5844 }
5845 } 5845 }
5846 5846
5847 return nr_absent; 5847 return nr_absent;
5848 } 5848 }
5849 5849
5850 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5850 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5851 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 5851 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5852 unsigned long zone_type, 5852 unsigned long zone_type,
5853 unsigned long node_start_pfn, 5853 unsigned long node_start_pfn,
5854 unsigned long node_end_pfn, 5854 unsigned long node_end_pfn,
5855 unsigned long *zone_start_pfn, 5855 unsigned long *zone_start_pfn,
5856 unsigned long *zone_end_pfn, 5856 unsigned long *zone_end_pfn,
5857 unsigned long *zones_size) 5857 unsigned long *zones_size)
5858 { 5858 {
5859 unsigned int zone; 5859 unsigned int zone;
5860 5860
5861 *zone_start_pfn = node_start_pfn; 5861 *zone_start_pfn = node_start_pfn;
5862 for (zone = 0; zone < zone_type; zone++) 5862 for (zone = 0; zone < zone_type; zone++)
5863 *zone_start_pfn += zones_size[zone]; 5863 *zone_start_pfn += zones_size[zone];
5864 5864
5865 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; 5865 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5866 5866
5867 return zones_size[zone_type]; 5867 return zones_size[zone_type];
5868 } 5868 }
5869 5869
5870 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 5870 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
5871 unsigned long zone_type, 5871 unsigned long zone_type,
5872 unsigned long node_start_pfn, 5872 unsigned long node_start_pfn,
5873 unsigned long node_end_pfn, 5873 unsigned long node_end_pfn,
5874 unsigned long *zholes_size) 5874 unsigned long *zholes_size)
5875 { 5875 {
5876 if (!zholes_size) 5876 if (!zholes_size)
5877 return 0; 5877 return 0;
5878 5878
5879 return zholes_size[zone_type]; 5879 return zholes_size[zone_type];
5880 } 5880 }
5881 5881
5882 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5882 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5883 5883
5884 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 5884 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5885 unsigned long node_start_pfn, 5885 unsigned long node_start_pfn,
5886 unsigned long node_end_pfn, 5886 unsigned long node_end_pfn,
5887 unsigned long *zones_size, 5887 unsigned long *zones_size,
5888 unsigned long *zholes_size) 5888 unsigned long *zholes_size)
5889 { 5889 {
5890 unsigned long realtotalpages = 0, totalpages = 0; 5890 unsigned long realtotalpages = 0, totalpages = 0;
5891 enum zone_type i; 5891 enum zone_type i;
5892 5892
5893 for (i = 0; i < MAX_NR_ZONES; i++) { 5893 for (i = 0; i < MAX_NR_ZONES; i++) {
5894 struct zone *zone = pgdat->node_zones + i; 5894 struct zone *zone = pgdat->node_zones + i;
5895 unsigned long zone_start_pfn, zone_end_pfn; 5895 unsigned long zone_start_pfn, zone_end_pfn;
5896 unsigned long size, real_size; 5896 unsigned long size, real_size;
5897 5897
5898 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5898 size = zone_spanned_pages_in_node(pgdat->node_id, i,
5899 node_start_pfn, 5899 node_start_pfn,
5900 node_end_pfn, 5900 node_end_pfn,
5901 &zone_start_pfn, 5901 &zone_start_pfn,
5902 &zone_end_pfn, 5902 &zone_end_pfn,
5903 zones_size); 5903 zones_size);
5904 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5904 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5905 node_start_pfn, node_end_pfn, 5905 node_start_pfn, node_end_pfn,
5906 zholes_size); 5906 zholes_size);
5907 if (size) 5907 if (size)
5908 zone->zone_start_pfn = zone_start_pfn; 5908 zone->zone_start_pfn = zone_start_pfn;
5909 else 5909 else
5910 zone->zone_start_pfn = 0; 5910 zone->zone_start_pfn = 0;
5911 zone->spanned_pages = size; 5911 zone->spanned_pages = size;
5912 zone->present_pages = real_size; 5912 zone->present_pages = real_size;
5913 5913
5914 totalpages += size; 5914 totalpages += size;
5915 realtotalpages += real_size; 5915 realtotalpages += real_size;
5916 } 5916 }
5917 5917
5918 pgdat->node_spanned_pages = totalpages; 5918 pgdat->node_spanned_pages = totalpages;
5919 pgdat->node_present_pages = realtotalpages; 5919 pgdat->node_present_pages = realtotalpages;
5920 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 5920 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
5921 realtotalpages); 5921 realtotalpages);
5922 } 5922 }
5923 5923
5924 #ifndef CONFIG_SPARSEMEM 5924 #ifndef CONFIG_SPARSEMEM
5925 /* 5925 /*
5926 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to a multiple of sizeof(unsigned long). 5926 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to a multiple of sizeof(unsigned long).
5927 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 5927 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
5928 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally 5928 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
5929 * round what is now in bits up to the nearest long in bits, then return it in 5929 * round what is now in bits up to the nearest long in bits, then return it in
5930 * bytes. 5930 * bytes.
5931 */ 5931 */
5932 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 5932 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
5933 { 5933 {
5934 unsigned long usemapsize; 5934 unsigned long usemapsize;
5935 5935
5936 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 5936 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
5937 usemapsize = roundup(zonesize, pageblock_nr_pages); 5937 usemapsize = roundup(zonesize, pageblock_nr_pages);
5938 usemapsize = usemapsize >> pageblock_order; 5938 usemapsize = usemapsize >> pageblock_order;
5939 usemapsize *= NR_PAGEBLOCK_BITS; 5939 usemapsize *= NR_PAGEBLOCK_BITS;
5940 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 5940 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
5941 5941
5942 return usemapsize / 8; 5942 return usemapsize / 8;
5943 } 5943 }
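
The same arithmetic can be checked standalone; the sketch below assumes pageblock_order = 9 (512 pages per pageblock), NR_PAGEBLOCK_BITS = 4 and a 64-bit unsigned long, all of which are configuration dependent.

#include <stdio.h>

#define PAGEBLOCK_ORDER         9
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS_GUESS 4

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_sketch(unsigned long zone_start_pfn,
                                        unsigned long zonesize)
{
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
        usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
        usemapsize >>= PAGEBLOCK_ORDER;            /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS_GUESS;     /* bits of flags */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

        return usemapsize / 8;                     /* bytes */
}

int main(void)
{
        /* a 1 GiB zone (262144 4 KiB pages) starting at pfn 256: prints 264 */
        printf("%lu bytes\n", usemap_size_sketch(256, 262144));
        return 0;
}
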
5944 5944
5945 static void __init setup_usemap(struct pglist_data *pgdat, 5945 static void __init setup_usemap(struct pglist_data *pgdat,
5946 struct zone *zone, 5946 struct zone *zone,
5947 unsigned long zone_start_pfn, 5947 unsigned long zone_start_pfn,
5948 unsigned long zonesize) 5948 unsigned long zonesize)
5949 { 5949 {
5950 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 5950 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
5951 zone->pageblock_flags = NULL; 5951 zone->pageblock_flags = NULL;
5952 if (usemapsize) 5952 if (usemapsize)
5953 zone->pageblock_flags = 5953 zone->pageblock_flags =
5954 memblock_virt_alloc_node_nopanic(usemapsize, 5954 memblock_virt_alloc_node_nopanic(usemapsize,
5955 pgdat->node_id); 5955 pgdat->node_id);
5956 } 5956 }
5957 #else 5957 #else
5958 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 5958 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
5959 unsigned long zone_start_pfn, unsigned long zonesize) {} 5959 unsigned long zone_start_pfn, unsigned long zonesize) {}
5960 #endif /* CONFIG_SPARSEMEM */ 5960 #endif /* CONFIG_SPARSEMEM */
5961 5961
5962 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 5962 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
5963 5963
5964 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 5964 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
5965 void __paginginit set_pageblock_order(void) 5965 void __paginginit set_pageblock_order(void)
5966 { 5966 {
5967 unsigned int order; 5967 unsigned int order;
5968 5968
5969 /* Check that pageblock_nr_pages has not already been setup */ 5969 /* Check that pageblock_nr_pages has not already been setup */
5970 if (pageblock_order) 5970 if (pageblock_order)
5971 return; 5971 return;
5972 5972
5973 if (HPAGE_SHIFT > PAGE_SHIFT) 5973 if (HPAGE_SHIFT > PAGE_SHIFT)
5974 order = HUGETLB_PAGE_ORDER; 5974 order = HUGETLB_PAGE_ORDER;
5975 else 5975 else
5976 order = MAX_ORDER - 1; 5976 order = MAX_ORDER - 1;
5977 5977
5978 /* 5978 /*
5979 * Assume the largest contiguous order of interest is a huge page. 5979 * Assume the largest contiguous order of interest is a huge page.
5980 * This value may be variable depending on boot parameters on IA64 and 5980 * This value may be variable depending on boot parameters on IA64 and
5981 * powerpc. 5981 * powerpc.
5982 */ 5982 */
5983 pageblock_order = order; 5983 pageblock_order = order;
5984 } 5984 }
5985 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5985 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5986 5986
5987 /* 5987 /*
5988 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 5988 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
5989 * is unused as pageblock_order is set at compile-time. See 5989 * is unused as pageblock_order is set at compile-time. See
5990 * include/linux/pageblock-flags.h for the values of pageblock_order based on 5990 * include/linux/pageblock-flags.h for the values of pageblock_order based on
5991 * the kernel config 5991 * the kernel config
5992 */ 5992 */
5993 void __paginginit set_pageblock_order(void) 5993 void __paginginit set_pageblock_order(void)
5994 { 5994 {
5995 } 5995 }
5996 5996
5997 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5997 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5998 5998
5999 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 5999 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
6000 unsigned long present_pages) 6000 unsigned long present_pages)
6001 { 6001 {
6002 unsigned long pages = spanned_pages; 6002 unsigned long pages = spanned_pages;
6003 6003
6004 /* 6004 /*
6005 * Provide a more accurate estimation if there are holes within 6005 * Provide a more accurate estimation if there are holes within
6006 * the zone and SPARSEMEM is in use. If there are holes within the 6006 * the zone and SPARSEMEM is in use. If there are holes within the
6007 * zone, each populated memory region may cost us one or two extra 6007 * zone, each populated memory region may cost us one or two extra
6008 * memmap pages due to alignment because memmap pages for each 6008 * memmap pages due to alignment because memmap pages for each
6009 * populated region may not be naturally aligned on a page boundary. 6009 * populated region may not be naturally aligned on a page boundary.
6010 * So the (present_pages >> 4) heuristic is a tradeoff for that. 6010 * So the (present_pages >> 4) heuristic is a tradeoff for that.
6011 */ 6011 */
6012 if (spanned_pages > present_pages + (present_pages >> 4) && 6012 if (spanned_pages > present_pages + (present_pages >> 4) &&
6013 IS_ENABLED(CONFIG_SPARSEMEM)) 6013 IS_ENABLED(CONFIG_SPARSEMEM))
6014 pages = present_pages; 6014 pages = present_pages;
6015 6015
6016 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 6016 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
6017 } 6017 }
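
A hedged standalone sketch of the sizing above, assuming 4 KiB pages and a 64-byte struct page (both vary by configuration); the IS_ENABLED(CONFIG_SPARSEMEM) check is dropped because the sketch runs in user space.

#include <stdio.h>

#define PAGE_SIZE_ASSUMED       4096UL
#define STRUCT_PAGE_SIZE_GUESS  64UL   /* sizeof(struct page) differs per config */

static unsigned long calc_memmap_size_sketch(unsigned long spanned_pages,
                                             unsigned long present_pages)
{
        unsigned long pages = spanned_pages;

        /* same heuristic as above: fall back to present_pages for sparse zones */
        if (spanned_pages > present_pages + (present_pages >> 4))
                pages = present_pages;

        return (pages * STRUCT_PAGE_SIZE_GUESS + PAGE_SIZE_ASSUMED - 1) /
               PAGE_SIZE_ASSUMED;
}

int main(void)
{
        /* a dense 1 GiB zone: 4096 pages of memmap */
        printf("%lu\n", calc_memmap_size_sketch(262144, 262144));
        /* a sparse zone: also 4096, because present_pages is used (16384 otherwise) */
        printf("%lu\n", calc_memmap_size_sketch(1048576, 262144));
        return 0;
}
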
6018 6018
6019 /* 6019 /*
6020 * Set up the zone data structures: 6020 * Set up the zone data structures:
6021 * - mark all pages reserved 6021 * - mark all pages reserved
6022 * - mark all memory queues empty 6022 * - mark all memory queues empty
6023 * - clear the memory bitmaps 6023 * - clear the memory bitmaps
6024 * 6024 *
6025 * NOTE: pgdat should get zeroed by caller. 6025 * NOTE: pgdat should get zeroed by caller.
6026 */ 6026 */
6027 static void __paginginit free_area_init_core(struct pglist_data *pgdat) 6027 static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6028 { 6028 {
6029 enum zone_type j; 6029 enum zone_type j;
6030 int nid = pgdat->node_id; 6030 int nid = pgdat->node_id;
6031 6031
6032 pgdat_resize_init(pgdat); 6032 pgdat_resize_init(pgdat);
6033 #ifdef CONFIG_NUMA_BALANCING 6033 #ifdef CONFIG_NUMA_BALANCING
6034 spin_lock_init(&pgdat->numabalancing_migrate_lock); 6034 spin_lock_init(&pgdat->numabalancing_migrate_lock);
6035 pgdat->numabalancing_migrate_nr_pages = 0; 6035 pgdat->numabalancing_migrate_nr_pages = 0;
6036 pgdat->numabalancing_migrate_next_window = jiffies; 6036 pgdat->numabalancing_migrate_next_window = jiffies;
6037 #endif 6037 #endif
6038 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6038 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6039 spin_lock_init(&pgdat->split_queue_lock); 6039 spin_lock_init(&pgdat->split_queue_lock);
6040 INIT_LIST_HEAD(&pgdat->split_queue); 6040 INIT_LIST_HEAD(&pgdat->split_queue);
6041 pgdat->split_queue_len = 0; 6041 pgdat->split_queue_len = 0;
6042 #endif 6042 #endif
6043 init_waitqueue_head(&pgdat->kswapd_wait); 6043 init_waitqueue_head(&pgdat->kswapd_wait);
6044 init_waitqueue_head(&pgdat->pfmemalloc_wait); 6044 init_waitqueue_head(&pgdat->pfmemalloc_wait);
6045 #ifdef CONFIG_COMPACTION 6045 #ifdef CONFIG_COMPACTION
6046 init_waitqueue_head(&pgdat->kcompactd_wait); 6046 init_waitqueue_head(&pgdat->kcompactd_wait);
6047 #endif 6047 #endif
6048 pgdat_page_ext_init(pgdat); 6048 pgdat_page_ext_init(pgdat);
6049 spin_lock_init(&pgdat->lru_lock); 6049 spin_lock_init(&pgdat->lru_lock);
6050 lruvec_init(node_lruvec(pgdat)); 6050 lruvec_init(node_lruvec(pgdat));
6051 6051
6052 pgdat->per_cpu_nodestats = &boot_nodestats; 6052 pgdat->per_cpu_nodestats = &boot_nodestats;
6053 6053
6054 for (j = 0; j < MAX_NR_ZONES; j++) { 6054 for (j = 0; j < MAX_NR_ZONES; j++) {
6055 struct zone *zone = pgdat->node_zones + j; 6055 struct zone *zone = pgdat->node_zones + j;
6056 unsigned long size, realsize, freesize, memmap_pages; 6056 unsigned long size, realsize, freesize, memmap_pages;
6057 unsigned long zone_start_pfn = zone->zone_start_pfn; 6057 unsigned long zone_start_pfn = zone->zone_start_pfn;
6058 6058
6059 size = zone->spanned_pages; 6059 size = zone->spanned_pages;
6060 realsize = freesize = zone->present_pages; 6060 realsize = freesize = zone->present_pages;
6061 6061
6062 /* 6062 /*
6063 * Adjust freesize so that it accounts for how much memory 6063 * Adjust freesize so that it accounts for how much memory
6064 * is used by this zone for memmap. This affects the watermark 6064 * is used by this zone for memmap. This affects the watermark
6065 * and per-cpu initialisations 6065 * and per-cpu initialisations
6066 */ 6066 */
6067 memmap_pages = calc_memmap_size(size, realsize); 6067 memmap_pages = calc_memmap_size(size, realsize);
6068 if (!is_highmem_idx(j)) { 6068 if (!is_highmem_idx(j)) {
6069 if (freesize >= memmap_pages) { 6069 if (freesize >= memmap_pages) {
6070 freesize -= memmap_pages; 6070 freesize -= memmap_pages;
6071 if (memmap_pages) 6071 if (memmap_pages)
6072 printk(KERN_DEBUG 6072 printk(KERN_DEBUG
6073 " %s zone: %lu pages used for memmap\n", 6073 " %s zone: %lu pages used for memmap\n",
6074 zone_names[j], memmap_pages); 6074 zone_names[j], memmap_pages);
6075 } else 6075 } else
6076 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", 6076 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
6077 zone_names[j], memmap_pages, freesize); 6077 zone_names[j], memmap_pages, freesize);
6078 } 6078 }
6079 6079
6080 /* Account for reserved pages */ 6080 /* Account for reserved pages */
6081 if (j == 0 && freesize > dma_reserve) { 6081 if (j == 0 && freesize > dma_reserve) {
6082 freesize -= dma_reserve; 6082 freesize -= dma_reserve;
6083 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 6083 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
6084 zone_names[0], dma_reserve); 6084 zone_names[0], dma_reserve);
6085 } 6085 }
6086 6086
6087 if (!is_highmem_idx(j)) 6087 if (!is_highmem_idx(j))
6088 nr_kernel_pages += freesize; 6088 nr_kernel_pages += freesize;
6089 /* Charge for highmem memmap if there are enough kernel pages */ 6089 /* Charge for highmem memmap if there are enough kernel pages */
6090 else if (nr_kernel_pages > memmap_pages * 2) 6090 else if (nr_kernel_pages > memmap_pages * 2)
6091 nr_kernel_pages -= memmap_pages; 6091 nr_kernel_pages -= memmap_pages;
6092 nr_all_pages += freesize; 6092 nr_all_pages += freesize;
6093 6093
6094 /* 6094 /*
6095 * Set an approximate value for lowmem here; it will be adjusted 6095 * Set an approximate value for lowmem here; it will be adjusted
6096 * when the bootmem allocator frees pages into the buddy system. 6096 * when the bootmem allocator frees pages into the buddy system.
6097 * And all highmem pages will be managed by the buddy system. 6097 * And all highmem pages will be managed by the buddy system.
6098 */ 6098 */
6099 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 6099 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
6100 #ifdef CONFIG_NUMA 6100 #ifdef CONFIG_NUMA
6101 zone->node = nid; 6101 zone->node = nid;
6102 #endif 6102 #endif
6103 zone->name = zone_names[j]; 6103 zone->name = zone_names[j];
6104 zone->zone_pgdat = pgdat; 6104 zone->zone_pgdat = pgdat;
6105 spin_lock_init(&zone->lock); 6105 spin_lock_init(&zone->lock);
6106 zone_seqlock_init(zone); 6106 zone_seqlock_init(zone);
6107 zone_pcp_init(zone); 6107 zone_pcp_init(zone);
6108 6108
6109 if (!size) 6109 if (!size)
6110 continue; 6110 continue;
6111 6111
6112 set_pageblock_order(); 6112 set_pageblock_order();
6113 setup_usemap(pgdat, zone, zone_start_pfn, size); 6113 setup_usemap(pgdat, zone, zone_start_pfn, size);
6114 init_currently_empty_zone(zone, zone_start_pfn, size); 6114 init_currently_empty_zone(zone, zone_start_pfn, size);
6115 memmap_init(size, nid, j, zone_start_pfn); 6115 memmap_init(size, nid, j, zone_start_pfn);
6116 } 6116 }
6117 } 6117 }
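
To make the freesize accounting above concrete (every number assumed): a non-highmem zone with 262144 present pages and 4 KiB pages pays about 262144 * 64 / 4096 = 4096 pages for its memmap if struct page is 64 bytes, leaving freesize at 258048 before any dma_reserve is subtracted; that remainder is what seeds zone->managed_pages and nr_kernel_pages here.
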
6118 6118
6119 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6119 static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6120 { 6120 {
6121 unsigned long __maybe_unused start = 0; 6121 unsigned long __maybe_unused start = 0;
6122 unsigned long __maybe_unused offset = 0; 6122 unsigned long __maybe_unused offset = 0;
6123 6123
6124 /* Skip empty nodes */ 6124 /* Skip empty nodes */
6125 if (!pgdat->node_spanned_pages) 6125 if (!pgdat->node_spanned_pages)
6126 return; 6126 return;
6127 6127
6128 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6128 #ifdef CONFIG_FLAT_NODE_MEM_MAP
6129 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6129 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6130 offset = pgdat->node_start_pfn - start; 6130 offset = pgdat->node_start_pfn - start;
6131 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6131 /* ia64 gets its own node_mem_map, before this, without bootmem */
6132 if (!pgdat->node_mem_map) { 6132 if (!pgdat->node_mem_map) {
6133 unsigned long size, end; 6133 unsigned long size, end;
6134 struct page *map; 6134 struct page *map;
6135 6135
6136 /* 6136 /*
6137 * The zone's endpoints aren't required to be MAX_ORDER 6137 * The zone's endpoints aren't required to be MAX_ORDER
6138 * aligned, but the node_mem_map endpoints must be, in order 6138 * aligned, but the node_mem_map endpoints must be, in order
6139 * for the buddy allocator to function correctly. 6139 * for the buddy allocator to function correctly.
6140 */ 6140 */
6141 end = pgdat_end_pfn(pgdat); 6141 end = pgdat_end_pfn(pgdat);
6142 end = ALIGN(end, MAX_ORDER_NR_PAGES); 6142 end = ALIGN(end, MAX_ORDER_NR_PAGES);
6143 size = (end - start) * sizeof(struct page); 6143 size = (end - start) * sizeof(struct page);
6144 map = alloc_remap(pgdat->node_id, size); 6144 map = alloc_remap(pgdat->node_id, size);
6145 if (!map) 6145 if (!map)
6146 map = memblock_virt_alloc_node_nopanic(size, 6146 map = memblock_virt_alloc_node_nopanic(size,
6147 pgdat->node_id); 6147 pgdat->node_id);
6148 pgdat->node_mem_map = map + offset; 6148 pgdat->node_mem_map = map + offset;
6149 } 6149 }
6150 #ifndef CONFIG_NEED_MULTIPLE_NODES 6150 #ifndef CONFIG_NEED_MULTIPLE_NODES
6151 /* 6151 /*
6152 * With no DISCONTIG, the global mem_map is just set as node 0's 6152 * With no DISCONTIG, the global mem_map is just set as node 0's
6153 */ 6153 */
6154 if (pgdat == NODE_DATA(0)) { 6154 if (pgdat == NODE_DATA(0)) {
6155 mem_map = NODE_DATA(0)->node_mem_map; 6155 mem_map = NODE_DATA(0)->node_mem_map;
6156 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) 6156 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
6157 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 6157 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
6158 mem_map -= offset; 6158 mem_map -= offset;
6159 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6159 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6160 } 6160 }
6161 #endif 6161 #endif
6162 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 6162 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
6163 } 6163 }
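
A small worked example of the alignment above (values assumed): with MAX_ORDER_NR_PAGES = 2048 and node_start_pfn = 0x1234, start is rounded down to 0x1000 and offset becomes 0x234, so node_mem_map ends up pointing offset entries into the MAX_ORDER-aligned map that was actually allocated.
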
6164 6164
6165 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6165 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6166 unsigned long node_start_pfn, unsigned long *zholes_size) 6166 unsigned long node_start_pfn, unsigned long *zholes_size)
6167 { 6167 {
6168 pg_data_t *pgdat = NODE_DATA(nid); 6168 pg_data_t *pgdat = NODE_DATA(nid);
6169 unsigned long start_pfn = 0; 6169 unsigned long start_pfn = 0;
6170 unsigned long end_pfn = 0; 6170 unsigned long end_pfn = 0;
6171 6171
6172 /* pg_data_t should be reset to zero when it's allocated */ 6172 /* pg_data_t should be reset to zero when it's allocated */
6173 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); 6173 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6174 6174
6175 pgdat->node_id = nid; 6175 pgdat->node_id = nid;
6176 pgdat->node_start_pfn = node_start_pfn; 6176 pgdat->node_start_pfn = node_start_pfn;
6177 pgdat->per_cpu_nodestats = NULL; 6177 pgdat->per_cpu_nodestats = NULL;
6178 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6178 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6179 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 6179 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6180 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 6180 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
6181 (u64)start_pfn << PAGE_SHIFT, 6181 (u64)start_pfn << PAGE_SHIFT,
6182 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 6182 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6183 #else 6183 #else
6184 start_pfn = node_start_pfn; 6184 start_pfn = node_start_pfn;
6185 #endif 6185 #endif
6186 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 6186 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6187 zones_size, zholes_size); 6187 zones_size, zholes_size);
6188 6188
6189 alloc_node_mem_map(pgdat); 6189 alloc_node_mem_map(pgdat);
6190 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6190 #ifdef CONFIG_FLAT_NODE_MEM_MAP
6191 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 6191 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6192 nid, (unsigned long)pgdat, 6192 nid, (unsigned long)pgdat,
6193 (unsigned long)pgdat->node_mem_map); 6193 (unsigned long)pgdat->node_mem_map);
6194 #endif 6194 #endif
6195 6195
6196 reset_deferred_meminit(pgdat); 6196 reset_deferred_meminit(pgdat);
6197 free_area_init_core(pgdat); 6197 free_area_init_core(pgdat);
6198 } 6198 }
6199 6199
6200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6201 6201
6202 #if MAX_NUMNODES > 1 6202 #if MAX_NUMNODES > 1
6203 /* 6203 /*
6204 * Figure out the number of possible node ids. 6204 * Figure out the number of possible node ids.
6205 */ 6205 */
6206 void __init setup_nr_node_ids(void) 6206 void __init setup_nr_node_ids(void)
6207 { 6207 {
6208 unsigned int highest; 6208 unsigned int highest;
6209 6209
6210 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 6210 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
6211 nr_node_ids = highest + 1; 6211 nr_node_ids = highest + 1;
6212 } 6212 }
6213 #endif 6213 #endif
6214 6214
6215 /** 6215 /**
6216 * node_map_pfn_alignment - determine the maximum internode alignment 6216 * node_map_pfn_alignment - determine the maximum internode alignment
6217 * 6217 *
6218 * This function should be called after node map is populated and sorted. 6218 * This function should be called after node map is populated and sorted.
6219 * It calculates the maximum power of two alignment which can distinguish 6219 * It calculates the maximum power of two alignment which can distinguish
6220 * all the nodes. 6220 * all the nodes.
6221 * 6221 *
6222 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 6222 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
6223 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 6223 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
6224 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 6224 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
6225 * shifted, 1GiB is enough and this function will indicate so. 6225 * shifted, 1GiB is enough and this function will indicate so.
6226 * 6226 *
6227 * This is used to test whether pfn -> nid mapping of the chosen memory 6227 * This is used to test whether pfn -> nid mapping of the chosen memory
6228 * model has fine enough granularity to avoid incorrect mapping for the 6228 * model has fine enough granularity to avoid incorrect mapping for the
6229 * populated node map. 6229 * populated node map.
6230 * 6230 *
6231 * Returns the determined alignment in pfn's. 0 if there is no alignment 6231 * Returns the determined alignment in pfn's. 0 if there is no alignment
6232 * requirement (single node). 6232 * requirement (single node).
6233 */ 6233 */
6234 unsigned long __init node_map_pfn_alignment(void) 6234 unsigned long __init node_map_pfn_alignment(void)
6235 { 6235 {
6236 unsigned long accl_mask = 0, last_end = 0; 6236 unsigned long accl_mask = 0, last_end = 0;
6237 unsigned long start, end, mask; 6237 unsigned long start, end, mask;
6238 int last_nid = -1; 6238 int last_nid = -1;
6239 int i, nid; 6239 int i, nid;
6240 6240
6241 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 6241 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6242 if (!start || last_nid < 0 || last_nid == nid) { 6242 if (!start || last_nid < 0 || last_nid == nid) {
6243 last_nid = nid; 6243 last_nid = nid;
6244 last_end = end; 6244 last_end = end;
6245 continue; 6245 continue;
6246 } 6246 }
6247 6247
6248 /* 6248 /*
6249 * Start with a mask granular enough to pin-point to the 6249 * Start with a mask granular enough to pin-point to the
6250 * start pfn and tick off bits one-by-one until it becomes 6250 * start pfn and tick off bits one-by-one until it becomes
6251 * too coarse to separate the current node from the last. 6251 * too coarse to separate the current node from the last.
6252 */ 6252 */
6253 mask = ~((1 << __ffs(start)) - 1); 6253 mask = ~((1 << __ffs(start)) - 1);
6254 while (mask && last_end <= (start & (mask << 1))) 6254 while (mask && last_end <= (start & (mask << 1)))
6255 mask <<= 1; 6255 mask <<= 1;
6256 6256
6257 /* accumulate all internode masks */ 6257 /* accumulate all internode masks */
6258 accl_mask |= mask; 6258 accl_mask |= mask;
6259 } 6259 }
6260 6260
6261 /* convert mask to number of pages */ 6261 /* convert mask to number of pages */
6262 return ~accl_mask + 1; 6262 return ~accl_mask + 1;
6263 } 6263 }
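
The bit manipulation above is easiest to follow with concrete numbers; the sketch below replays it for two hypothetical nodes, with __builtin_ctzl() standing in for __ffs().

#include <stdio.h>

int main(void)
{
        /* node 0: pfns [0, 0x40000), node 1: pfns [0x40000, 0x80000) */
        unsigned long starts[] = { 0x00000, 0x40000 };
        unsigned long ends[]   = { 0x40000, 0x80000 };
        int nids[]             = { 0, 1 };
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long start = starts[i];

                if (!start || last_nid < 0 || last_nid == nids[i]) {
                        last_nid = nids[i];
                        last_end = ends[i];
                        continue;
                }

                /* narrowest mask that pin-points start, widened while it
                 * still separates this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        /* prints 0x40000 pfns: 1 GiB with 4 KiB pages, matching the comment above */
        printf("alignment = %#lx pfns\n", ~accl_mask + 1);
        return 0;
}
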
6264 6264
6265 /* Find the lowest pfn for a node */ 6265 /* Find the lowest pfn for a node */
6266 static unsigned long __init find_min_pfn_for_node(int nid) 6266 static unsigned long __init find_min_pfn_for_node(int nid)
6267 { 6267 {
6268 unsigned long min_pfn = ULONG_MAX; 6268 unsigned long min_pfn = ULONG_MAX;
6269 unsigned long start_pfn; 6269 unsigned long start_pfn;
6270 int i; 6270 int i;
6271 6271
6272 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 6272 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6273 min_pfn = min(min_pfn, start_pfn); 6273 min_pfn = min(min_pfn, start_pfn);
6274 6274
6275 if (min_pfn == ULONG_MAX) { 6275 if (min_pfn == ULONG_MAX) {
6276 pr_warn("Could not find start_pfn for node %d\n", nid); 6276 pr_warn("Could not find start_pfn for node %d\n", nid);
6277 return 0; 6277 return 0;
6278 } 6278 }
6279 6279
6280 return min_pfn; 6280 return min_pfn;
6281 } 6281 }
6282 6282
6283 /** 6283 /**
6284 * find_min_pfn_with_active_regions - Find the minimum PFN registered 6284 * find_min_pfn_with_active_regions - Find the minimum PFN registered
6285 * 6285 *
6286 * It returns the minimum PFN based on information provided via 6286 * It returns the minimum PFN based on information provided via
6287 * memblock_set_node(). 6287 * memblock_set_node().
6288 */ 6288 */
6289 unsigned long __init find_min_pfn_with_active_regions(void) 6289 unsigned long __init find_min_pfn_with_active_regions(void)
6290 { 6290 {
6291 return find_min_pfn_for_node(MAX_NUMNODES); 6291 return find_min_pfn_for_node(MAX_NUMNODES);
6292 } 6292 }
6293 6293
6294 /* 6294 /*
6295 * early_calculate_totalpages() 6295 * early_calculate_totalpages()
6296 * Sum pages in active regions for movable zone. 6296 * Sum pages in active regions for movable zone.
6297 * Populate N_MEMORY for calculating usable_nodes. 6297 * Populate N_MEMORY for calculating usable_nodes.
6298 */ 6298 */
6299 static unsigned long __init early_calculate_totalpages(void) 6299 static unsigned long __init early_calculate_totalpages(void)
6300 { 6300 {
6301 unsigned long totalpages = 0; 6301 unsigned long totalpages = 0;
6302 unsigned long start_pfn, end_pfn; 6302 unsigned long start_pfn, end_pfn;
6303 int i, nid; 6303 int i, nid;
6304 6304
6305 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 6305 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6306 unsigned long pages = end_pfn - start_pfn; 6306 unsigned long pages = end_pfn - start_pfn;
6307 6307
6308 totalpages += pages; 6308 totalpages += pages;
6309 if (pages) 6309 if (pages)
6310 node_set_state(nid, N_MEMORY); 6310 node_set_state(nid, N_MEMORY);
6311 } 6311 }
6312 return totalpages; 6312 return totalpages;
6313 } 6313 }
6314 6314
6315 /* 6315 /*
6316 * Find the PFN the Movable zone begins in each node. Kernel memory 6316 * Find the PFN the Movable zone begins in each node. Kernel memory
6317 * is spread evenly between nodes as long as the nodes have enough 6317 * is spread evenly between nodes as long as the nodes have enough
6318 * memory. When they don't, some nodes will have more kernelcore than 6318 * memory. When they don't, some nodes will have more kernelcore than
6319 * others 6319 * others
6320 */ 6320 */
6321 static void __init find_zone_movable_pfns_for_nodes(void) 6321 static void __init find_zone_movable_pfns_for_nodes(void)
6322 { 6322 {
6323 int i, nid; 6323 int i, nid;
6324 unsigned long usable_startpfn; 6324 unsigned long usable_startpfn;
6325 unsigned long kernelcore_node, kernelcore_remaining; 6325 unsigned long kernelcore_node, kernelcore_remaining;
6326 /* save the state before borrow the nodemask */ 6326 /* save the state before borrow the nodemask */
6327 nodemask_t saved_node_state = node_states[N_MEMORY]; 6327 nodemask_t saved_node_state = node_states[N_MEMORY];
6328 unsigned long totalpages = early_calculate_totalpages(); 6328 unsigned long totalpages = early_calculate_totalpages();
6329 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 6329 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
6330 struct memblock_region *r; 6330 struct memblock_region *r;
6331 6331
6332 /* Need to find movable_zone earlier when movable_node is specified. */ 6332 /* Need to find movable_zone earlier when movable_node is specified. */
6333 find_usable_zone_for_movable(); 6333 find_usable_zone_for_movable();
6334 6334
6335 /* 6335 /*
6336 * If movable_node is specified, ignore kernelcore and movablecore 6336 * If movable_node is specified, ignore kernelcore and movablecore
6337 * options. 6337 * options.
6338 */ 6338 */
6339 if (movable_node_is_enabled()) { 6339 if (movable_node_is_enabled()) {
6340 for_each_memblock(memory, r) { 6340 for_each_memblock(memory, r) {
6341 if (!memblock_is_hotpluggable(r)) 6341 if (!memblock_is_hotpluggable(r))
6342 continue; 6342 continue;
6343 6343
6344 nid = r->nid; 6344 nid = r->nid;
6345 6345
6346 usable_startpfn = PFN_DOWN(r->base); 6346 usable_startpfn = PFN_DOWN(r->base);
6347 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6347 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6348 min(usable_startpfn, zone_movable_pfn[nid]) : 6348 min(usable_startpfn, zone_movable_pfn[nid]) :
6349 usable_startpfn; 6349 usable_startpfn;
6350 } 6350 }
6351 6351
6352 goto out2; 6352 goto out2;
6353 } 6353 }
6354 6354
6355 /* 6355 /*
6356 * If kernelcore=mirror is specified, ignore movablecore option 6356 * If kernelcore=mirror is specified, ignore movablecore option
6357 */ 6357 */
6358 if (mirrored_kernelcore) { 6358 if (mirrored_kernelcore) {
6359 bool mem_below_4gb_not_mirrored = false; 6359 bool mem_below_4gb_not_mirrored = false;
6360 6360
6361 for_each_memblock(memory, r) { 6361 for_each_memblock(memory, r) {
6362 if (memblock_is_mirror(r)) 6362 if (memblock_is_mirror(r))
6363 continue; 6363 continue;
6364 6364
6365 nid = r->nid; 6365 nid = r->nid;
6366 6366
6367 usable_startpfn = memblock_region_memory_base_pfn(r); 6367 usable_startpfn = memblock_region_memory_base_pfn(r);
6368 6368
6369 if (usable_startpfn < 0x100000) { 6369 if (usable_startpfn < 0x100000) {
6370 mem_below_4gb_not_mirrored = true; 6370 mem_below_4gb_not_mirrored = true;
6371 continue; 6371 continue;
6372 } 6372 }
6373 6373
6374 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6374 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6375 min(usable_startpfn, zone_movable_pfn[nid]) : 6375 min(usable_startpfn, zone_movable_pfn[nid]) :
6376 usable_startpfn; 6376 usable_startpfn;
6377 } 6377 }
6378 6378
6379 if (mem_below_4gb_not_mirrored) 6379 if (mem_below_4gb_not_mirrored)
6380 pr_warn("This configuration results in unmirrored kernel memory.\n"); 6380 pr_warn("This configuration results in unmirrored kernel memory.\n");
6381 6381
6382 goto out2; 6382 goto out2;
6383 } 6383 }
6384 6384
6385 /* 6385 /*
6386 * If movablecore=nn[KMG] was specified, calculate what size of 6386 * If movablecore=nn[KMG] was specified, calculate what size of
6387 * kernelcore that corresponds so that memory usable for 6387 * kernelcore that corresponds so that memory usable for
6388 * any allocation type is evenly spread. If both kernelcore 6388 * any allocation type is evenly spread. If both kernelcore
6389 * and movablecore are specified, then the value of kernelcore 6389 * and movablecore are specified, then the value of kernelcore
6390 * will be used for required_kernelcore if it's greater than 6390 * will be used for required_kernelcore if it's greater than
6391 * what movablecore would have allowed. 6391 * what movablecore would have allowed.
6392 */ 6392 */
6393 if (required_movablecore) { 6393 if (required_movablecore) {
6394 unsigned long corepages; 6394 unsigned long corepages;
6395 6395
6396 /* 6396 /*
6397 * Round up so that ZONE_MOVABLE is at least as large as what 6397 * Round up so that ZONE_MOVABLE is at least as large as what
6398 * was requested by the user. 6398 * was requested by the user.
6399 */ 6399 */
6400 required_movablecore = 6400 required_movablecore =
6401 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 6401 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
6402 required_movablecore = min(totalpages, required_movablecore); 6402 required_movablecore = min(totalpages, required_movablecore);
6403 corepages = totalpages - required_movablecore; 6403 corepages = totalpages - required_movablecore;
6404 6404
6405 required_kernelcore = max(required_kernelcore, corepages); 6405 required_kernelcore = max(required_kernelcore, corepages);
6406 } 6406 }
6407 6407
6408 /* 6408 /*
6409 * If kernelcore was not specified or kernelcore size is larger 6409 * If kernelcore was not specified or kernelcore size is larger
6410 * than totalpages, there is no ZONE_MOVABLE. 6410 * than totalpages, there is no ZONE_MOVABLE.
6411 */ 6411 */
6412 if (!required_kernelcore || required_kernelcore >= totalpages) 6412 if (!required_kernelcore || required_kernelcore >= totalpages)
6413 goto out; 6413 goto out;
6414 6414
6415 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 6415 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
6416 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 6416 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
6417 6417
6418 restart: 6418 restart:
6419 /* Spread kernelcore memory as evenly as possible throughout nodes */ 6419 /* Spread kernelcore memory as evenly as possible throughout nodes */
6420 kernelcore_node = required_kernelcore / usable_nodes; 6420 kernelcore_node = required_kernelcore / usable_nodes;
6421 for_each_node_state(nid, N_MEMORY) { 6421 for_each_node_state(nid, N_MEMORY) {
6422 unsigned long start_pfn, end_pfn; 6422 unsigned long start_pfn, end_pfn;
6423 6423
6424 /* 6424 /*
6425 * Recalculate kernelcore_node if the division per node 6425 * Recalculate kernelcore_node if the division per node
6426 * now exceeds what is necessary to satisfy the requested 6426 * now exceeds what is necessary to satisfy the requested
6427 * amount of memory for the kernel 6427 * amount of memory for the kernel
6428 */ 6428 */
6429 if (required_kernelcore < kernelcore_node) 6429 if (required_kernelcore < kernelcore_node)
6430 kernelcore_node = required_kernelcore / usable_nodes; 6430 kernelcore_node = required_kernelcore / usable_nodes;
6431 6431
6432 /* 6432 /*
6433 * As the map is walked, we track how much memory is usable 6433 * As the map is walked, we track how much memory is usable
6434 * by the kernel using kernelcore_remaining. When it is 6434 * by the kernel using kernelcore_remaining. When it is
6435 * 0, the rest of the node is usable by ZONE_MOVABLE 6435 * 0, the rest of the node is usable by ZONE_MOVABLE
6436 */ 6436 */
6437 kernelcore_remaining = kernelcore_node; 6437 kernelcore_remaining = kernelcore_node;
6438 6438
6439 /* Go through each range of PFNs within this node */ 6439 /* Go through each range of PFNs within this node */
6440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6441 unsigned long size_pages; 6441 unsigned long size_pages;
6442 6442
6443 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 6443 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
6444 if (start_pfn >= end_pfn) 6444 if (start_pfn >= end_pfn)
6445 continue; 6445 continue;
6446 6446
6447 /* Account for what is only usable for kernelcore */ 6447 /* Account for what is only usable for kernelcore */
6448 if (start_pfn < usable_startpfn) { 6448 if (start_pfn < usable_startpfn) {
6449 unsigned long kernel_pages; 6449 unsigned long kernel_pages;
6450 kernel_pages = min(end_pfn, usable_startpfn) 6450 kernel_pages = min(end_pfn, usable_startpfn)
6451 - start_pfn; 6451 - start_pfn;
6452 6452
6453 kernelcore_remaining -= min(kernel_pages, 6453 kernelcore_remaining -= min(kernel_pages,
6454 kernelcore_remaining); 6454 kernelcore_remaining);
6455 required_kernelcore -= min(kernel_pages, 6455 required_kernelcore -= min(kernel_pages,
6456 required_kernelcore); 6456 required_kernelcore);
6457 6457
6458 /* Continue if range is now fully accounted */ 6458 /* Continue if range is now fully accounted */
6459 if (end_pfn <= usable_startpfn) { 6459 if (end_pfn <= usable_startpfn) {
6460 6460
6461 /* 6461 /*
6462 * Push zone_movable_pfn to the end so 6462 * Push zone_movable_pfn to the end so
6463 * that if we have to rebalance 6463 * that if we have to rebalance
6464 * kernelcore across nodes, we will 6464 * kernelcore across nodes, we will
6465 * not double account here 6465 * not double account here
6466 */ 6466 */
6467 zone_movable_pfn[nid] = end_pfn; 6467 zone_movable_pfn[nid] = end_pfn;
6468 continue; 6468 continue;
6469 } 6469 }
6470 start_pfn = usable_startpfn; 6470 start_pfn = usable_startpfn;
6471 } 6471 }
6472 6472
6473 /* 6473 /*
6474 * The usable PFN range for ZONE_MOVABLE is from 6474 * The usable PFN range for ZONE_MOVABLE is from
6475 * start_pfn->end_pfn. Calculate size_pages as the 6475 * start_pfn->end_pfn. Calculate size_pages as the
6476 * number of pages used as kernelcore 6476 * number of pages used as kernelcore
6477 */ 6477 */
6478 size_pages = end_pfn - start_pfn; 6478 size_pages = end_pfn - start_pfn;
6479 if (size_pages > kernelcore_remaining) 6479 if (size_pages > kernelcore_remaining)
6480 size_pages = kernelcore_remaining; 6480 size_pages = kernelcore_remaining;
6481 zone_movable_pfn[nid] = start_pfn + size_pages; 6481 zone_movable_pfn[nid] = start_pfn + size_pages;
6482 6482
6483 /* 6483 /*
6484 * Some kernelcore has been met, update counts and 6484 * Some kernelcore has been met, update counts and
6485 * break if the kernelcore for this node has been 6485 * break if the kernelcore for this node has been
6486 * satisfied 6486 * satisfied
6487 */ 6487 */
6488 required_kernelcore -= min(required_kernelcore, 6488 required_kernelcore -= min(required_kernelcore,
6489 size_pages); 6489 size_pages);
6490 kernelcore_remaining -= size_pages; 6490 kernelcore_remaining -= size_pages;
6491 if (!kernelcore_remaining) 6491 if (!kernelcore_remaining)
6492 break; 6492 break;
6493 } 6493 }
6494 } 6494 }
6495 6495
6496 /* 6496 /*
6497 * If there is still required_kernelcore, we do another pass with one 6497 * If there is still required_kernelcore, we do another pass with one
6498 * less node in the count. This will push zone_movable_pfn[nid] further 6498 * less node in the count. This will push zone_movable_pfn[nid] further
6499 * along on the nodes that still have memory until kernelcore is 6499 * along on the nodes that still have memory until kernelcore is
6500 * satisfied 6500 * satisfied
6501 */ 6501 */
6502 usable_nodes--; 6502 usable_nodes--;
6503 if (usable_nodes && required_kernelcore > usable_nodes) 6503 if (usable_nodes && required_kernelcore > usable_nodes)
6504 goto restart; 6504 goto restart;
6505 6505
6506 out2: 6506 out2:
6507 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 6507 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
6508 for (nid = 0; nid < MAX_NUMNODES; nid++) 6508 for (nid = 0; nid < MAX_NUMNODES; nid++)
6509 zone_movable_pfn[nid] = 6509 zone_movable_pfn[nid] =
6510 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 6510 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
6511 6511
6512 out: 6512 out:
6513 /* restore the node_state */ 6513 /* restore the node_state */
6514 node_states[N_MEMORY] = saved_node_state; 6514 node_states[N_MEMORY] = saved_node_state;
6515 } 6515 }
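/*
 * Illustrative sketch (editor's example, not part of the file): booting a
 * 2-node machine with 8G per node and kernelcore=4G gives, with 4K pages,
 * required_kernelcore = 1048576 pages, so the first pass hands each node
 * kernelcore_node = 1048576 / 2 = 524288 pages of kernelcore.  The
 * remaining ~6G per node becomes ZONE_MOVABLE, with zone_movable_pfn[nid]
 * then rounded up to MAX_ORDER_NR_PAGES at out2.  If one node were too
 * small to take its share, the restart: pass would spread the shortfall
 * across the remaining nodes with usable_nodes reduced by one.
 */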
6516 6516
6517 /* Any regular or high memory on that node? */ 6517 /* Any regular or high memory on that node? */
6518 static void check_for_memory(pg_data_t *pgdat, int nid) 6518 static void check_for_memory(pg_data_t *pgdat, int nid)
6519 { 6519 {
6520 enum zone_type zone_type; 6520 enum zone_type zone_type;
6521 6521
6522 if (N_MEMORY == N_NORMAL_MEMORY) 6522 if (N_MEMORY == N_NORMAL_MEMORY)
6523 return; 6523 return;
6524 6524
6525 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 6525 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
6526 struct zone *zone = &pgdat->node_zones[zone_type]; 6526 struct zone *zone = &pgdat->node_zones[zone_type];
6527 if (populated_zone(zone)) { 6527 if (populated_zone(zone)) {
6528 node_set_state(nid, N_HIGH_MEMORY); 6528 node_set_state(nid, N_HIGH_MEMORY);
6529 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 6529 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
6530 zone_type <= ZONE_NORMAL) 6530 zone_type <= ZONE_NORMAL)
6531 node_set_state(nid, N_NORMAL_MEMORY); 6531 node_set_state(nid, N_NORMAL_MEMORY);
6532 break; 6532 break;
6533 } 6533 }
6534 } 6534 }
6535 } 6535 }
6536 6536
6537 /** 6537 /**
6538 * free_area_init_nodes - Initialise all pg_data_t and zone data 6538 * free_area_init_nodes - Initialise all pg_data_t and zone data
6539 * @max_zone_pfn: an array of max PFNs for each zone 6539 * @max_zone_pfn: an array of max PFNs for each zone
6540 * 6540 *
6541 * This will call free_area_init_node() for each active node in the system. 6541 * This will call free_area_init_node() for each active node in the system.
6542 * Using the page ranges provided by memblock_set_node(), the size of each 6542 * Using the page ranges provided by memblock_set_node(), the size of each
6543 * zone in each node and of its holes is calculated. If the maximum PFNs 6543 * zone in each node and of its holes is calculated. If the maximum PFNs
6544 * of two adjacent zones match, the higher zone is assumed to be empty. 6544 * of two adjacent zones match, the higher zone is assumed to be empty.
6545 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 6545 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
6546 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 6546 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
6547 * starts where the previous one ended. For example, ZONE_DMA32 starts 6547 * starts where the previous one ended. For example, ZONE_DMA32 starts
6548 * at arch_max_dma_pfn. 6548 * at arch_max_dma_pfn.
6549 */ 6549 */
6550 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 6550 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6551 { 6551 {
6552 unsigned long start_pfn, end_pfn; 6552 unsigned long start_pfn, end_pfn;
6553 int i, nid; 6553 int i, nid;
6554 6554
6555 /* Record where the zone boundaries are */ 6555 /* Record where the zone boundaries are */
6556 memset(arch_zone_lowest_possible_pfn, 0, 6556 memset(arch_zone_lowest_possible_pfn, 0,
6557 sizeof(arch_zone_lowest_possible_pfn)); 6557 sizeof(arch_zone_lowest_possible_pfn));
6558 memset(arch_zone_highest_possible_pfn, 0, 6558 memset(arch_zone_highest_possible_pfn, 0,
6559 sizeof(arch_zone_highest_possible_pfn)); 6559 sizeof(arch_zone_highest_possible_pfn));
6560 6560
6561 start_pfn = find_min_pfn_with_active_regions(); 6561 start_pfn = find_min_pfn_with_active_regions();
6562 6562
6563 for (i = 0; i < MAX_NR_ZONES; i++) { 6563 for (i = 0; i < MAX_NR_ZONES; i++) {
6564 if (i == ZONE_MOVABLE) 6564 if (i == ZONE_MOVABLE)
6565 continue; 6565 continue;
6566 6566
6567 end_pfn = max(max_zone_pfn[i], start_pfn); 6567 end_pfn = max(max_zone_pfn[i], start_pfn);
6568 arch_zone_lowest_possible_pfn[i] = start_pfn; 6568 arch_zone_lowest_possible_pfn[i] = start_pfn;
6569 arch_zone_highest_possible_pfn[i] = end_pfn; 6569 arch_zone_highest_possible_pfn[i] = end_pfn;
6570 6570
6571 start_pfn = end_pfn; 6571 start_pfn = end_pfn;
6572 } 6572 }
6573 6573
6574 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 6574 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6575 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 6575 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
6576 find_zone_movable_pfns_for_nodes(); 6576 find_zone_movable_pfns_for_nodes();
6577 6577
6578 /* Print out the zone ranges */ 6578 /* Print out the zone ranges */
6579 pr_info("Zone ranges:\n"); 6579 pr_info("Zone ranges:\n");
6580 for (i = 0; i < MAX_NR_ZONES; i++) { 6580 for (i = 0; i < MAX_NR_ZONES; i++) {
6581 if (i == ZONE_MOVABLE) 6581 if (i == ZONE_MOVABLE)
6582 continue; 6582 continue;
6583 pr_info(" %-8s ", zone_names[i]); 6583 pr_info(" %-8s ", zone_names[i]);
6584 if (arch_zone_lowest_possible_pfn[i] == 6584 if (arch_zone_lowest_possible_pfn[i] ==
6585 arch_zone_highest_possible_pfn[i]) 6585 arch_zone_highest_possible_pfn[i])
6586 pr_cont("empty\n"); 6586 pr_cont("empty\n");
6587 else 6587 else
6588 pr_cont("[mem %#018Lx-%#018Lx]\n", 6588 pr_cont("[mem %#018Lx-%#018Lx]\n",
6589 (u64)arch_zone_lowest_possible_pfn[i] 6589 (u64)arch_zone_lowest_possible_pfn[i]
6590 << PAGE_SHIFT, 6590 << PAGE_SHIFT,
6591 ((u64)arch_zone_highest_possible_pfn[i] 6591 ((u64)arch_zone_highest_possible_pfn[i]
6592 << PAGE_SHIFT) - 1); 6592 << PAGE_SHIFT) - 1);
6593 } 6593 }
6594 6594
6595 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 6595 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
6596 pr_info("Movable zone start for each node\n"); 6596 pr_info("Movable zone start for each node\n");
6597 for (i = 0; i < MAX_NUMNODES; i++) { 6597 for (i = 0; i < MAX_NUMNODES; i++) {
6598 if (zone_movable_pfn[i]) 6598 if (zone_movable_pfn[i])
6599 pr_info(" Node %d: %#018Lx\n", i, 6599 pr_info(" Node %d: %#018Lx\n", i,
6600 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 6600 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
6601 } 6601 }
6602 6602
6603 /* Print out the early node map */ 6603 /* Print out the early node map */
6604 pr_info("Early memory node ranges\n"); 6604 pr_info("Early memory node ranges\n");
6605 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 6605 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
6606 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 6606 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
6607 (u64)start_pfn << PAGE_SHIFT, 6607 (u64)start_pfn << PAGE_SHIFT,
6608 ((u64)end_pfn << PAGE_SHIFT) - 1); 6608 ((u64)end_pfn << PAGE_SHIFT) - 1);
6609 6609
6610 /* Initialise every node */ 6610 /* Initialise every node */
6611 mminit_verify_pageflags_layout(); 6611 mminit_verify_pageflags_layout();
6612 setup_nr_node_ids(); 6612 setup_nr_node_ids();
6613 for_each_online_node(nid) { 6613 for_each_online_node(nid) {
6614 pg_data_t *pgdat = NODE_DATA(nid); 6614 pg_data_t *pgdat = NODE_DATA(nid);
6615 free_area_init_node(nid, NULL, 6615 free_area_init_node(nid, NULL,
6616 find_min_pfn_for_node(nid), NULL); 6616 find_min_pfn_for_node(nid), NULL);
6617 6617
6618 /* Any memory on that node */ 6618 /* Any memory on that node */
6619 if (pgdat->node_present_pages) 6619 if (pgdat->node_present_pages)
6620 node_set_state(nid, N_MEMORY); 6620 node_set_state(nid, N_MEMORY);
6621 check_for_memory(pgdat, nid); 6621 check_for_memory(pgdat, nid);
6622 } 6622 }
6623 } 6623 }
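/*
 * Hedged usage sketch (editor's example; the zone indices and pfn limits
 * used below are assumptions standing in for whatever the architecture
 * actually provides): a caller fills max_zone_pfn[] with the highest PFN
 * each zone may reach and passes the array in, typically from its
 * zone_sizes_init()-style setup code.
 */
#if 0	/* illustrative only, not compiled */
static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

	free_area_init_nodes(max_zone_pfns);
}
#endif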
6624 6624
6625 static int __init cmdline_parse_core(char *p, unsigned long *core) 6625 static int __init cmdline_parse_core(char *p, unsigned long *core)
6626 { 6626 {
6627 unsigned long long coremem; 6627 unsigned long long coremem;
6628 if (!p) 6628 if (!p)
6629 return -EINVAL; 6629 return -EINVAL;
6630 6630
6631 coremem = memparse(p, &p); 6631 coremem = memparse(p, &p);
6632 *core = coremem >> PAGE_SHIFT; 6632 *core = coremem >> PAGE_SHIFT;
6633 6633
6634 /* Paranoid check that UL is enough for the coremem value */ 6634 /* Paranoid check that UL is enough for the coremem value */
6635 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 6635 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
6636 6636
6637 return 0; 6637 return 0;
6638 } 6638 }
6639 6639
6640 /* 6640 /*
6641 * kernelcore=size sets the amount of memory usable for allocations that 6641 * kernelcore=size sets the amount of memory usable for allocations that
6642 * cannot be reclaimed or migrated. 6642 * cannot be reclaimed or migrated.
6643 */ 6643 */
6644 static int __init cmdline_parse_kernelcore(char *p) 6644 static int __init cmdline_parse_kernelcore(char *p)
6645 { 6645 {
6646 /* parse kernelcore=mirror */ 6646 /* parse kernelcore=mirror */
6647 if (parse_option_str(p, "mirror")) { 6647 if (parse_option_str(p, "mirror")) {
6648 mirrored_kernelcore = true; 6648 mirrored_kernelcore = true;
6649 return 0; 6649 return 0;
6650 } 6650 }
6651 6651
6652 return cmdline_parse_core(p, &required_kernelcore); 6652 return cmdline_parse_core(p, &required_kernelcore);
6653 } 6653 }
6654 6654
6655 /* 6655 /*
6656 * movablecore=size sets the amount of memory usable for allocations that 6656 * movablecore=size sets the amount of memory usable for allocations that
6657 * can be reclaimed or migrated. 6657 * can be reclaimed or migrated.
6658 */ 6658 */
6659 static int __init cmdline_parse_movablecore(char *p) 6659 static int __init cmdline_parse_movablecore(char *p)
6660 { 6660 {
6661 return cmdline_parse_core(p, &required_movablecore); 6661 return cmdline_parse_core(p, &required_movablecore);
6662 } 6662 }
6663 6663
6664 early_param("kernelcore", cmdline_parse_kernelcore); 6664 early_param("kernelcore", cmdline_parse_kernelcore);
6665 early_param("movablecore", cmdline_parse_movablecore); 6665 early_param("movablecore", cmdline_parse_movablecore);
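/*
 * Worked example (editor's note): with 4K pages, booting with
 * "kernelcore=512M movablecore=2G" reaches cmdline_parse_core(), where
 * memparse() turns the size suffix into bytes and the PAGE_SHIFT shift
 * converts bytes into pages:
 *
 *	kernelcore=512M  -> required_kernelcore  = 0x20000000 >> 12 = 131072 pages
 *	movablecore=2G   -> required_movablecore = 0x80000000 >> 12 = 524288 pages
 *
 * "kernelcore=mirror" instead just sets mirrored_kernelcore and skips the
 * size parsing.
 */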
6666 6666
6667 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6667 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6668 6668
6669 void adjust_managed_page_count(struct page *page, long count) 6669 void adjust_managed_page_count(struct page *page, long count)
6670 { 6670 {
6671 spin_lock(&managed_page_count_lock); 6671 spin_lock(&managed_page_count_lock);
6672 page_zone(page)->managed_pages += count; 6672 page_zone(page)->managed_pages += count;
6673 totalram_pages += count; 6673 totalram_pages += count;
6674 #ifdef CONFIG_HIGHMEM 6674 #ifdef CONFIG_HIGHMEM
6675 if (PageHighMem(page)) 6675 if (PageHighMem(page))
6676 totalhigh_pages += count; 6676 totalhigh_pages += count;
6677 #endif 6677 #endif
6678 spin_unlock(&managed_page_count_lock); 6678 spin_unlock(&managed_page_count_lock);
6679 } 6679 }
6680 EXPORT_SYMBOL(adjust_managed_page_count); 6680 EXPORT_SYMBOL(adjust_managed_page_count);
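/*
 * Hedged usage note (editor's example): a caller that temporarily gives a
 * page back to a hypervisor, a memory balloon driver say, is expected to
 * pair
 *	adjust_managed_page_count(page, -1);
 * with a matching
 *	adjust_managed_page_count(page, 1);
 * when the page is returned, so managed_pages and totalram_pages keep
 * tracking what the allocator can actually hand out.
 */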
6681 6681
6682 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 6682 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
6683 { 6683 {
6684 void *pos; 6684 void *pos;
6685 unsigned long pages = 0; 6685 unsigned long pages = 0;
6686 6686
6687 start = (void *)PAGE_ALIGN((unsigned long)start); 6687 start = (void *)PAGE_ALIGN((unsigned long)start);
6688 end = (void *)((unsigned long)end & PAGE_MASK); 6688 end = (void *)((unsigned long)end & PAGE_MASK);
6689 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 6689 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
6690 if ((unsigned int)poison <= 0xFF) 6690 if ((unsigned int)poison <= 0xFF)
6691 memset(pos, poison, PAGE_SIZE); 6691 memset(pos, poison, PAGE_SIZE);
6692 free_reserved_page(virt_to_page(pos)); 6692 free_reserved_page(virt_to_page(pos));
6693 } 6693 }
6694 6694
6695 if (pages && s) 6695 if (pages && s)
6696 pr_info("Freeing %s memory: %ldK\n", 6696 pr_info("Freeing %s memory: %ldK\n",
6697 s, pages << (PAGE_SHIFT - 10)); 6697 s, pages << (PAGE_SHIFT - 10));
6698 6698
6699 return pages; 6699 return pages;
6700 } 6700 }
6701 EXPORT_SYMBOL(free_reserved_area); 6701 EXPORT_SYMBOL(free_reserved_area);
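/*
 * Hedged usage sketch (editor's example; __init_begin/__init_end are the
 * usual linker-script symbols, assumed here rather than defined in this
 * file): a caller releasing the init sections back to the page allocator
 * could do roughly the following.  A poison value above 0xFF (or negative)
 * would skip the memset in free_reserved_area().
 */
#if 0	/* illustrative only, not compiled */
void example_free_initmem(void)
{
	/* poison freed init memory with 0xcc so stale references stand out */
	free_reserved_area(&__init_begin, &__init_end, 0xcc, "unused kernel");
}
#endif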
6702 6702
6703 #ifdef CONFIG_HIGHMEM 6703 #ifdef CONFIG_HIGHMEM
6704 void free_highmem_page(struct page *page) 6704 void free_highmem_page(struct page *page)
6705 { 6705 {
6706 __free_reserved_page(page); 6706 __free_reserved_page(page);
6707 totalram_pages++; 6707 totalram_pages++;
6708 page_zone(page)->managed_pages++; 6708 page_zone(page)->managed_pages++;
6709 totalhigh_pages++; 6709 totalhigh_pages++;
6710 } 6710 }
6711 #endif 6711 #endif
6712 6712
6713 6713
6714 void __init mem_init_print_info(const char *str) 6714 void __init mem_init_print_info(const char *str)
6715 { 6715 {
6716 unsigned long physpages, codesize, datasize, rosize, bss_size; 6716 unsigned long physpages, codesize, datasize, rosize, bss_size;
6717 unsigned long init_code_size, init_data_size; 6717 unsigned long init_code_size, init_data_size;
6718 6718
6719 physpages = get_num_physpages(); 6719 physpages = get_num_physpages();
6720 codesize = _etext - _stext; 6720 codesize = _etext - _stext;
6721 datasize = _edata - _sdata; 6721 datasize = _edata - _sdata;
6722 rosize = __end_rodata - __start_rodata; 6722 rosize = __end_rodata - __start_rodata;
6723 bss_size = __bss_stop - __bss_start; 6723 bss_size = __bss_stop - __bss_start;
6724 init_data_size = __init_end - __init_begin; 6724 init_data_size = __init_end - __init_begin;
6725 init_code_size = _einittext - _sinittext; 6725 init_code_size = _einittext - _sinittext;
6726 6726
6727 /* 6727 /*
6728 * Detect special cases and adjust section sizes accordingly: 6728 * Detect special cases and adjust section sizes accordingly:
6729 * 1) .init.* may be embedded into .data sections 6729 * 1) .init.* may be embedded into .data sections
6730 * 2) .init.text.* may be out of [__init_begin, __init_end], 6730 * 2) .init.text.* may be out of [__init_begin, __init_end],
6731 * please refer to arch/tile/kernel/vmlinux.lds.S. 6731 * please refer to arch/tile/kernel/vmlinux.lds.S.
6732 * 3) .rodata.* may be embedded into .text or .data sections. 6732 * 3) .rodata.* may be embedded into .text or .data sections.
6733 */ 6733 */
6734 #define adj_init_size(start, end, size, pos, adj) \ 6734 #define adj_init_size(start, end, size, pos, adj) \
6735 do { \ 6735 do { \
6736 if (start <= pos && pos < end && size > adj) \ 6736 if (start <= pos && pos < end && size > adj) \
6737 size -= adj; \ 6737 size -= adj; \
6738 } while (0) 6738 } while (0)
6739 6739
6740 adj_init_size(__init_begin, __init_end, init_data_size, 6740 adj_init_size(__init_begin, __init_end, init_data_size,
6741 _sinittext, init_code_size); 6741 _sinittext, init_code_size);
6742 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 6742 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
6743 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 6743 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
6744 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 6744 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
6745 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 6745 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
6746 6746
6747 #undef adj_init_size 6747 #undef adj_init_size
6748 6748
6749 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 6749 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
6750 #ifdef CONFIG_HIGHMEM 6750 #ifdef CONFIG_HIGHMEM
6751 ", %luK highmem" 6751 ", %luK highmem"
6752 #endif 6752 #endif
6753 "%s%s)\n", 6753 "%s%s)\n",
6754 nr_free_pages() << (PAGE_SHIFT - 10), 6754 nr_free_pages() << (PAGE_SHIFT - 10),
6755 physpages << (PAGE_SHIFT - 10), 6755 physpages << (PAGE_SHIFT - 10),
6756 codesize >> 10, datasize >> 10, rosize >> 10, 6756 codesize >> 10, datasize >> 10, rosize >> 10,
6757 (init_data_size + init_code_size) >> 10, bss_size >> 10, 6757 (init_data_size + init_code_size) >> 10, bss_size >> 10,
6758 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), 6758 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
6759 totalcma_pages << (PAGE_SHIFT - 10), 6759 totalcma_pages << (PAGE_SHIFT - 10),
6760 #ifdef CONFIG_HIGHMEM 6760 #ifdef CONFIG_HIGHMEM
6761 totalhigh_pages << (PAGE_SHIFT - 10), 6761 totalhigh_pages << (PAGE_SHIFT - 10),
6762 #endif 6762 #endif
6763 str ? ", " : "", str ? str : ""); 6763 str ? ", " : "", str ? str : "");
6764 } 6764 }
6765 6765
6766 /** 6766 /**
6767 * set_dma_reserve - set the specified number of pages reserved in the first zone 6767 * set_dma_reserve - set the specified number of pages reserved in the first zone
6768 * @new_dma_reserve: The number of pages to mark reserved 6768 * @new_dma_reserve: The number of pages to mark reserved
6769 * 6769 *
6770 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 6770 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6771 * In the DMA zone, a significant percentage may be consumed by kernel image 6771 * In the DMA zone, a significant percentage may be consumed by kernel image
6772 * and other unfreeable allocations which can skew the watermarks badly. This 6772 * and other unfreeable allocations which can skew the watermarks badly. This
6773 * function may optionally be used to account for unfreeable pages in the 6773 * function may optionally be used to account for unfreeable pages in the
6774 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 6774 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
6775 * smaller per-cpu batchsize. 6775 * smaller per-cpu batchsize.
6776 */ 6776 */
6777 void __init set_dma_reserve(unsigned long new_dma_reserve) 6777 void __init set_dma_reserve(unsigned long new_dma_reserve)
6778 { 6778 {
6779 dma_reserve = new_dma_reserve; 6779 dma_reserve = new_dma_reserve;
6780 } 6780 }
6781 6781
6782 void __init free_area_init(unsigned long *zones_size) 6782 void __init free_area_init(unsigned long *zones_size)
6783 { 6783 {
6784 free_area_init_node(0, zones_size, 6784 free_area_init_node(0, zones_size,
6785 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6785 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6786 } 6786 }
6787 6787
6788 static int page_alloc_cpu_dead(unsigned int cpu) 6788 static int page_alloc_cpu_dead(unsigned int cpu)
6789 { 6789 {
6790 6790
6791 lru_add_drain_cpu(cpu); 6791 lru_add_drain_cpu(cpu);
6792 drain_pages(cpu); 6792 drain_pages(cpu);
6793 6793
6794 /* 6794 /*
6795 * Spill the event counters of the dead processor 6795 * Spill the event counters of the dead processor
6796 * into the current processor's event counters. 6796 * into the current processor's event counters.
6797 * This artificially elevates the count of the current 6797 * This artificially elevates the count of the current
6798 * processor. 6798 * processor.
6799 */ 6799 */
6800 vm_events_fold_cpu(cpu); 6800 vm_events_fold_cpu(cpu);
6801 6801
6802 /* 6802 /*
6803 * Zero the differential counters of the dead processor 6803 * Zero the differential counters of the dead processor
6804 * so that the vm statistics are consistent. 6804 * so that the vm statistics are consistent.
6805 * 6805 *
6806 * This is only okay since the processor is dead and cannot 6806 * This is only okay since the processor is dead and cannot
6807 * race with what we are doing. 6807 * race with what we are doing.
6808 */ 6808 */
6809 cpu_vm_stats_fold(cpu); 6809 cpu_vm_stats_fold(cpu);
6810 return 0; 6810 return 0;
6811 } 6811 }
6812 6812
6813 void __init page_alloc_init(void) 6813 void __init page_alloc_init(void)
6814 { 6814 {
6815 int ret; 6815 int ret;
6816 6816
6817 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, 6817 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
6818 "mm/page_alloc:dead", NULL, 6818 "mm/page_alloc:dead", NULL,
6819 page_alloc_cpu_dead); 6819 page_alloc_cpu_dead);
6820 WARN_ON(ret < 0); 6820 WARN_ON(ret < 0);
6821 } 6821 }
6822 6822
6823 /* 6823 /*
6824 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 6824 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6825 * or min_free_kbytes changes. 6825 * or min_free_kbytes changes.
6826 */ 6826 */
6827 static void calculate_totalreserve_pages(void) 6827 static void calculate_totalreserve_pages(void)
6828 { 6828 {
6829 struct pglist_data *pgdat; 6829 struct pglist_data *pgdat;
6830 unsigned long reserve_pages = 0; 6830 unsigned long reserve_pages = 0;
6831 enum zone_type i, j; 6831 enum zone_type i, j;
6832 6832
6833 for_each_online_pgdat(pgdat) { 6833 for_each_online_pgdat(pgdat) {
6834 6834
6835 pgdat->totalreserve_pages = 0; 6835 pgdat->totalreserve_pages = 0;
6836 6836
6837 for (i = 0; i < MAX_NR_ZONES; i++) { 6837 for (i = 0; i < MAX_NR_ZONES; i++) {
6838 struct zone *zone = pgdat->node_zones + i; 6838 struct zone *zone = pgdat->node_zones + i;
6839 long max = 0; 6839 long max = 0;
6840 6840
6841 /* Find valid and maximum lowmem_reserve in the zone */ 6841 /* Find valid and maximum lowmem_reserve in the zone */
6842 for (j = i; j < MAX_NR_ZONES; j++) { 6842 for (j = i; j < MAX_NR_ZONES; j++) {
6843 if (zone->lowmem_reserve[j] > max) 6843 if (zone->lowmem_reserve[j] > max)
6844 max = zone->lowmem_reserve[j]; 6844 max = zone->lowmem_reserve[j];
6845 } 6845 }
6846 6846
6847 /* we treat the high watermark as reserved pages. */ 6847 /* we treat the high watermark as reserved pages. */
6848 max += high_wmark_pages(zone); 6848 max += high_wmark_pages(zone);
6849 6849
6850 if (max > zone->managed_pages) 6850 if (max > zone->managed_pages)
6851 max = zone->managed_pages; 6851 max = zone->managed_pages;
6852 6852
6853 pgdat->totalreserve_pages += max; 6853 pgdat->totalreserve_pages += max;
6854 6854
6855 reserve_pages += max; 6855 reserve_pages += max;
6856 } 6856 }
6857 } 6857 }
6858 totalreserve_pages = reserve_pages; 6858 totalreserve_pages = reserve_pages;
6859 } 6859 }
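/*
 * Worked example (editor's note, illustrative numbers): a zone with
 * lowmem_reserve[] = { 0, 1984, 3952 } and high_wmark_pages() of 128
 * contributes max(1984, 3952) + 128 = 4080 pages to totalreserve_pages,
 * capped at the zone's managed_pages if that is smaller.
 */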
6860 6860
6861 /* 6861 /*
6862 * setup_per_zone_lowmem_reserve - called whenever 6862 * setup_per_zone_lowmem_reserve - called whenever
6863 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 6863 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
6864 * has a correct amount of reserved pages, so that an adequate number of 6864 * has a correct amount of reserved pages, so that an adequate number of
6865 * pages are left in the zone after a successful __alloc_pages(). 6865 * pages are left in the zone after a successful __alloc_pages().
6866 */ 6866 */
6867 static void setup_per_zone_lowmem_reserve(void) 6867 static void setup_per_zone_lowmem_reserve(void)
6868 { 6868 {
6869 struct pglist_data *pgdat; 6869 struct pglist_data *pgdat;
6870 enum zone_type j, idx; 6870 enum zone_type j, idx;
6871 6871
6872 for_each_online_pgdat(pgdat) { 6872 for_each_online_pgdat(pgdat) {
6873 for (j = 0; j < MAX_NR_ZONES; j++) { 6873 for (j = 0; j < MAX_NR_ZONES; j++) {
6874 struct zone *zone = pgdat->node_zones + j; 6874 struct zone *zone = pgdat->node_zones + j;
6875 unsigned long managed_pages = zone->managed_pages; 6875 unsigned long managed_pages = zone->managed_pages;
6876 6876
6877 zone->lowmem_reserve[j] = 0; 6877 zone->lowmem_reserve[j] = 0;
6878 6878
6879 idx = j; 6879 idx = j;
6880 while (idx) { 6880 while (idx) {
6881 struct zone *lower_zone; 6881 struct zone *lower_zone;
6882 6882
6883 idx--; 6883 idx--;
6884 6884
6885 if (sysctl_lowmem_reserve_ratio[idx] < 1) 6885 if (sysctl_lowmem_reserve_ratio[idx] < 1)
6886 sysctl_lowmem_reserve_ratio[idx] = 1; 6886 sysctl_lowmem_reserve_ratio[idx] = 1;
6887 6887
6888 lower_zone = pgdat->node_zones + idx; 6888 lower_zone = pgdat->node_zones + idx;
6889 lower_zone->lowmem_reserve[j] = managed_pages / 6889 lower_zone->lowmem_reserve[j] = managed_pages /
6890 sysctl_lowmem_reserve_ratio[idx]; 6890 sysctl_lowmem_reserve_ratio[idx];
6891 managed_pages += lower_zone->managed_pages; 6891 managed_pages += lower_zone->managed_pages;
6892 } 6892 }
6893 } 6893 }
6894 } 6894 }
6895 6895
6896 /* update totalreserve_pages */ 6896 /* update totalreserve_pages */
6897 calculate_totalreserve_pages(); 6897 calculate_totalreserve_pages();
6898 } 6898 }
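/*
 * Worked example (editor's note, assuming the common default
 * sysctl_lowmem_reserve_ratio of 256 for ZONE_DMA): with a ZONE_NORMAL of
 * 1048576 managed pages above it, the downward walk leaves ZONE_DMA with
 *	lowmem_reserve[ZONE_NORMAL] = 1048576 / 256 = 4096 pages
 * held back from allocations that could have been satisfied from
 * ZONE_NORMAL instead.
 */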
6899 6899
6900 static void __setup_per_zone_wmarks(void) 6900 static void __setup_per_zone_wmarks(void)
6901 { 6901 {
6902 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 6902 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
6903 unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); 6903 unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
6904 unsigned long lowmem_pages = 0; 6904 unsigned long lowmem_pages = 0;
6905 struct zone *zone; 6905 struct zone *zone;
6906 unsigned long flags; 6906 unsigned long flags;
6907 6907
6908 /* Calculate total number of !ZONE_HIGHMEM pages */ 6908 /* Calculate total number of !ZONE_HIGHMEM pages */
6909 for_each_zone(zone) { 6909 for_each_zone(zone) {
6910 if (!is_highmem(zone)) 6910 if (!is_highmem(zone))
6911 lowmem_pages += zone->managed_pages; 6911 lowmem_pages += zone->managed_pages;
6912 } 6912 }
6913 6913
6914 for_each_zone(zone) { 6914 for_each_zone(zone) {
6915 u64 min, low; 6915 u64 min, low;
6916 6916
6917 spin_lock_irqsave(&zone->lock, flags); 6917 spin_lock_irqsave(&zone->lock, flags);
6918 min = (u64)pages_min * zone->managed_pages; 6918 min = (u64)pages_min * zone->managed_pages;
6919 do_div(min, lowmem_pages); 6919 do_div(min, lowmem_pages);
6920 low = (u64)pages_low * zone->managed_pages; 6920 low = (u64)pages_low * zone->managed_pages;
6921 do_div(low, vm_total_pages); 6921 do_div(low, vm_total_pages);
6922 6922
6923 if (is_highmem(zone)) { 6923 if (is_highmem(zone)) {
6924 /* 6924 /*
6925 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 6925 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
6926 * need highmem pages, so cap pages_min to a small 6926 * need highmem pages, so cap pages_min to a small
6927 * value here. 6927 * value here.
6928 * 6928 *
6929 * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN) 6929 * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
6930 * deltas control async page reclaim, and so should 6930 * deltas control async page reclaim, and so should
6931 * not be capped for highmem. 6931 * not be capped for highmem.
6932 */ 6932 */
6933 unsigned long min_pages; 6933 unsigned long min_pages;
6934 6934
6935 min_pages = zone->managed_pages / 1024; 6935 min_pages = zone->managed_pages / 1024;
6936 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 6936 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
6937 zone->watermark[WMARK_MIN] = min_pages; 6937 zone->watermark[WMARK_MIN] = min_pages;
6938 } else { 6938 } else {
6939 /* 6939 /*
6940 * If it's a lowmem zone, reserve a number of pages 6940 * If it's a lowmem zone, reserve a number of pages
6941 * proportionate to the zone's size. 6941 * proportionate to the zone's size.
6942 */ 6942 */
6943 zone->watermark[WMARK_MIN] = min; 6943 zone->watermark[WMARK_MIN] = min;
6944 } 6944 }
6945 6945
6946 /* 6946 /*
6947 * Set the kswapd watermarks distance according to the 6947 * Set the kswapd watermarks distance according to the
6948 * scale factor in proportion to available memory, but 6948 * scale factor in proportion to available memory, but
6949 * ensure a minimum size on small systems. 6949 * ensure a minimum size on small systems.
6950 */ 6950 */
6951 min = max_t(u64, min >> 2, 6951 min = max_t(u64, min >> 2,
6952 mult_frac(zone->managed_pages, 6952 mult_frac(zone->managed_pages,
6953 watermark_scale_factor, 10000)); 6953 watermark_scale_factor, 10000));
6954 6954
6955 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + 6955 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
6956 low + min; 6956 low + min;
6957 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + 6957 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
6958 low + min * 2; 6958 low + min * 2;
6959 6959
6960 spin_unlock_irqrestore(&zone->lock, flags); 6960 spin_unlock_irqrestore(&zone->lock, flags);
6961 } 6961 }
6962 6962
6963 /* update totalreserve_pages */ 6963 /* update totalreserve_pages */
6964 calculate_totalreserve_pages(); 6964 calculate_totalreserve_pages();
6965 } 6965 }
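/*
 * Worked example (editor's note, assuming 4K pages, min_free_kbytes=4096,
 * extra_free_kbytes=0 and the default watermark_scale_factor of 10): for a
 * single lowmem zone of 1048576 managed pages,
 *	pages_min  = 4096 kB / 4 kB                        = 1024 pages -> WMARK_MIN
 *	scaled min = max(1024 >> 2, 1048576 * 10 / 10000)  = 1048 pages
 *	WMARK_LOW  = 1024 + 1048                           = 2072 pages
 *	WMARK_HIGH = 1024 + 2 * 1048                       = 3120 pages
 */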
6966 6966
6967 /** 6967 /**
6968 * setup_per_zone_wmarks - called when min_free_kbytes changes 6968 * setup_per_zone_wmarks - called when min_free_kbytes changes
6969 * or when memory is hot-{added|removed} 6969 * or when memory is hot-{added|removed}
6970 * 6970 *
6971 * Ensures that the watermark[min,low,high] values for each zone are set 6971 * Ensures that the watermark[min,low,high] values for each zone are set
6972 * correctly with respect to min_free_kbytes. 6972 * correctly with respect to min_free_kbytes.
6973 */ 6973 */
6974 void setup_per_zone_wmarks(void) 6974 void setup_per_zone_wmarks(void)
6975 { 6975 {
6976 static DEFINE_SPINLOCK(lock); 6976 static DEFINE_SPINLOCK(lock);
6977 6977
6978 spin_lock(&lock); 6978 spin_lock(&lock);
6979 __setup_per_zone_wmarks(); 6979 __setup_per_zone_wmarks();
6980 spin_unlock(&lock); 6980 spin_unlock(&lock);
6981 } 6981 }
6982 6982
6983 /* 6983 /*
6984 * Initialise min_free_kbytes. 6984 * Initialise min_free_kbytes.
6985 * 6985 *
6986 * For small machines we want it small (128k min). For large machines 6986 * For small machines we want it small (128k min). For large machines
6987 * we want it large (64MB max). But it is not linear, because network 6987 * we want it large (64MB max). But it is not linear, because network
6988 * bandwidth does not increase linearly with machine size. We use 6988 * bandwidth does not increase linearly with machine size. We use
6989 * 6989 *
6990 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 6990 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
6991 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 6991 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
6992 * 6992 *
6993 * which yields 6993 * which yields
6994 * 6994 *
6995 * 16MB: 512k 6995 * 16MB: 512k
6996 * 32MB: 724k 6996 * 32MB: 724k
6997 * 64MB: 1024k 6997 * 64MB: 1024k
6998 * 128MB: 1448k 6998 * 128MB: 1448k
6999 * 256MB: 2048k 6999 * 256MB: 2048k
7000 * 512MB: 2896k 7000 * 512MB: 2896k
7001 * 1024MB: 4096k 7001 * 1024MB: 4096k
7002 * 2048MB: 5792k 7002 * 2048MB: 5792k
7003 * 4096MB: 8192k 7003 * 4096MB: 8192k
7004 * 8192MB: 11584k 7004 * 8192MB: 11584k
7005 * 16384MB: 16384k 7005 * 16384MB: 16384k
7006 */ 7006 */
7007 int __meminit init_per_zone_wmark_min(void) 7007 int __meminit init_per_zone_wmark_min(void)
7008 { 7008 {
7009 unsigned long lowmem_kbytes; 7009 unsigned long lowmem_kbytes;
7010 int new_min_free_kbytes; 7010 int new_min_free_kbytes;
7011 7011
7012 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 7012 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
7013 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 7013 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
7014 7014
7015 if (new_min_free_kbytes > user_min_free_kbytes) { 7015 if (new_min_free_kbytes > user_min_free_kbytes) {
7016 min_free_kbytes = new_min_free_kbytes; 7016 min_free_kbytes = new_min_free_kbytes;
7017 if (min_free_kbytes < 128) 7017 if (min_free_kbytes < 128)
7018 min_free_kbytes = 128; 7018 min_free_kbytes = 128;
7019 if (min_free_kbytes > 65536) 7019 if (min_free_kbytes > 65536)
7020 min_free_kbytes = 65536; 7020 min_free_kbytes = 65536;
7021 } else { 7021 } else {
7022 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 7022 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
7023 new_min_free_kbytes, user_min_free_kbytes); 7023 new_min_free_kbytes, user_min_free_kbytes);
7024 } 7024 }
7025 setup_per_zone_wmarks(); 7025 setup_per_zone_wmarks();
7026 refresh_zone_stat_thresholds(); 7026 refresh_zone_stat_thresholds();
7027 setup_per_zone_lowmem_reserve(); 7027 setup_per_zone_lowmem_reserve();
7028 7028
7029 #ifdef CONFIG_NUMA 7029 #ifdef CONFIG_NUMA
7030 setup_min_unmapped_ratio(); 7030 setup_min_unmapped_ratio();
7031 setup_min_slab_ratio(); 7031 setup_min_slab_ratio();
7032 #endif 7032 #endif
7033 7033
7034 return 0; 7034 return 0;
7035 } 7035 }
7036 core_initcall(init_per_zone_wmark_min) 7036 core_initcall(init_per_zone_wmark_min)
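/*
 * Worked example (editor's note): with roughly 4GB of lowmem,
 * lowmem_kbytes is about 4194304, so
 *	new_min_free_kbytes = int_sqrt(4194304 * 16) = int_sqrt(67108864) = 8192
 * matching the "4096MB: 8192k" row in the table above and sitting well
 * inside the [128, 65536] clamp.
 */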
7037 7037
7038 /* 7038 /*
7039 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 7039 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
7040 * that we can call two helper functions whenever min_free_kbytes 7040 * that we can call two helper functions whenever min_free_kbytes
7041 * or extra_free_kbytes changes. 7041 * or extra_free_kbytes changes.
7042 */ 7042 */
7043 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 7043 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7044 void __user *buffer, size_t *length, loff_t *ppos) 7044 void __user *buffer, size_t *length, loff_t *ppos)
7045 { 7045 {
7046 int rc; 7046 int rc;
7047 7047
7048 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7048 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7049 if (rc) 7049 if (rc)
7050 return rc; 7050 return rc;
7051 7051
7052 if (write) { 7052 if (write) {
7053 user_min_free_kbytes = min_free_kbytes; 7053 user_min_free_kbytes = min_free_kbytes;
7054 setup_per_zone_wmarks(); 7054 setup_per_zone_wmarks();
7055 } 7055 }
7056 return 0; 7056 return 0;
7057 } 7057 }
7058 7058
7059 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 7059 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7060 void __user *buffer, size_t *length, loff_t *ppos) 7060 void __user *buffer, size_t *length, loff_t *ppos)
7061 { 7061 {
7062 int rc; 7062 int rc;
7063 7063
7064 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7064 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7065 if (rc) 7065 if (rc)
7066 return rc; 7066 return rc;
7067 7067
7068 if (write) 7068 if (write)
7069 setup_per_zone_wmarks(); 7069 setup_per_zone_wmarks();
7070 7070
7071 return 0; 7071 return 0;
7072 } 7072 }
7073 7073
7074 #ifdef CONFIG_NUMA 7074 #ifdef CONFIG_NUMA
7075 static void setup_min_unmapped_ratio(void) 7075 static void setup_min_unmapped_ratio(void)
7076 { 7076 {
7077 pg_data_t *pgdat; 7077 pg_data_t *pgdat;
7078 struct zone *zone; 7078 struct zone *zone;
7079 7079
7080 for_each_online_pgdat(pgdat) 7080 for_each_online_pgdat(pgdat)
7081 pgdat->min_unmapped_pages = 0; 7081 pgdat->min_unmapped_pages = 0;
7082 7082
7083 for_each_zone(zone) 7083 for_each_zone(zone)
7084 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * 7084 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7085 sysctl_min_unmapped_ratio) / 100; 7085 sysctl_min_unmapped_ratio) / 100;
7086 } 7086 }
7087 7087
7088 7088
7089 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 7089 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7090 void __user *buffer, size_t *length, loff_t *ppos) 7090 void __user *buffer, size_t *length, loff_t *ppos)
7091 { 7091 {
7092 int rc; 7092 int rc;
7093 7093
7094 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7094 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7095 if (rc) 7095 if (rc)
7096 return rc; 7096 return rc;
7097 7097
7098 setup_min_unmapped_ratio(); 7098 setup_min_unmapped_ratio();
7099 7099
7100 return 0; 7100 return 0;
7101 } 7101 }
7102 7102
7103 static void setup_min_slab_ratio(void) 7103 static void setup_min_slab_ratio(void)
7104 { 7104 {
7105 pg_data_t *pgdat; 7105 pg_data_t *pgdat;
7106 struct zone *zone; 7106 struct zone *zone;
7107 7107
7108 for_each_online_pgdat(pgdat) 7108 for_each_online_pgdat(pgdat)
7109 pgdat->min_slab_pages = 0; 7109 pgdat->min_slab_pages = 0;
7110 7110
7111 for_each_zone(zone) 7111 for_each_zone(zone)
7112 zone->zone_pgdat->min_slab_pages += (zone->managed_pages * 7112 zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7113 sysctl_min_slab_ratio) / 100; 7113 sysctl_min_slab_ratio) / 100;
7114 } 7114 }
7115 7115
7116 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 7116 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7117 void __user *buffer, size_t *length, loff_t *ppos) 7117 void __user *buffer, size_t *length, loff_t *ppos)
7118 { 7118 {
7119 int rc; 7119 int rc;
7120 7120
7121 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7121 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7122 if (rc) 7122 if (rc)
7123 return rc; 7123 return rc;
7124 7124
7125 setup_min_slab_ratio(); 7125 setup_min_slab_ratio();
7126 7126
7127 return 0; 7127 return 0;
7128 } 7128 }
7129 #endif 7129 #endif
7130 7130
7131 /* 7131 /*
7132 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 7132 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
7133 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 7133 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
7134 * whenever sysctl_lowmem_reserve_ratio changes. 7134 * whenever sysctl_lowmem_reserve_ratio changes.
7135 * 7135 *
7136 * The reserve ratio has no relation to the minimum watermarks. The 7136 * The reserve ratio has no relation to the minimum watermarks. The
7137 * lowmem reserve ratio only makes sense as a function of the 7137 * lowmem reserve ratio only makes sense as a function of the
7138 * boot-time zone sizes. 7138 * boot-time zone sizes.
7139 */ 7139 */
7140 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 7140 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7141 void __user *buffer, size_t *length, loff_t *ppos) 7141 void __user *buffer, size_t *length, loff_t *ppos)
7142 { 7142 {
7143 proc_dointvec_minmax(table, write, buffer, length, ppos); 7143 proc_dointvec_minmax(table, write, buffer, length, ppos);
7144 setup_per_zone_lowmem_reserve(); 7144 setup_per_zone_lowmem_reserve();
7145 return 0; 7145 return 0;
7146 } 7146 }
7147 7147
7148 /* 7148 /*
7149 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 7149 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
7150 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 7150 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
7151 * pagelist can have before it gets flushed back to the buddy allocator. 7151 * pagelist can have before it gets flushed back to the buddy allocator.
7152 */ 7152 */
7153 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, 7153 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7154 void __user *buffer, size_t *length, loff_t *ppos) 7154 void __user *buffer, size_t *length, loff_t *ppos)
7155 { 7155 {
7156 struct zone *zone; 7156 struct zone *zone;
7157 int old_percpu_pagelist_fraction; 7157 int old_percpu_pagelist_fraction;
7158 int ret; 7158 int ret;
7159 7159
7160 mutex_lock(&pcp_batch_high_lock); 7160 mutex_lock(&pcp_batch_high_lock);
7161 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 7161 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
7162 7162
7163 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 7163 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7164 if (!write || ret < 0) 7164 if (!write || ret < 0)
7165 goto out; 7165 goto out;
7166 7166
7167 /* Sanity checking to avoid pcp imbalance */ 7167 /* Sanity checking to avoid pcp imbalance */
7168 if (percpu_pagelist_fraction && 7168 if (percpu_pagelist_fraction &&
7169 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 7169 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
7170 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 7170 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
7171 ret = -EINVAL; 7171 ret = -EINVAL;
7172 goto out; 7172 goto out;
7173 } 7173 }
7174 7174
7175 /* No change? */ 7175 /* No change? */
7176 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 7176 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
7177 goto out; 7177 goto out;
7178 7178
7179 for_each_populated_zone(zone) { 7179 for_each_populated_zone(zone) {
7180 unsigned int cpu; 7180 unsigned int cpu;
7181 7181
7182 for_each_possible_cpu(cpu) 7182 for_each_possible_cpu(cpu)
7183 pageset_set_high_and_batch(zone, 7183 pageset_set_high_and_batch(zone,
7184 per_cpu_ptr(zone->pageset, cpu)); 7184 per_cpu_ptr(zone->pageset, cpu));
7185 } 7185 }
7186 out: 7186 out:
7187 mutex_unlock(&pcp_batch_high_lock); 7187 mutex_unlock(&pcp_batch_high_lock);
7188 return ret; 7188 return ret;
7189 } 7189 }
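/*
 * Hedged example (editor's note, illustrative numbers): writing 8 to
 * /proc/sys/vm/percpu_pagelist_fraction asks for a per-cpu high mark of
 * roughly zone->managed_pages / 8, so a 1048576-page zone would let about
 * 131072 pages sit on one CPU's pcp list before being drained back to the
 * buddy allocator.  Non-zero values below MIN_PERCPU_PAGELIST_FRACTION are
 * rejected with -EINVAL by the sanity check above.
 */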
7190 7190
7191 #ifdef CONFIG_NUMA 7191 #ifdef CONFIG_NUMA
7192 int hashdist = HASHDIST_DEFAULT; 7192 int hashdist = HASHDIST_DEFAULT;
7193 7193
7194 static int __init set_hashdist(char *str) 7194 static int __init set_hashdist(char *str)
7195 { 7195 {
7196 if (!str) 7196 if (!str)
7197 return 0; 7197 return 0;
7198 hashdist = simple_strtoul(str, &str, 0); 7198 hashdist = simple_strtoul(str, &str, 0);
7199 return 1; 7199 return 1;
7200 } 7200 }
7201 __setup("hashdist=", set_hashdist); 7201 __setup("hashdist=", set_hashdist);
7202 #endif 7202 #endif
7203 7203
7204 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 7204 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
7205 /* 7205 /*
7206 * Returns the number of pages that the arch has reserved but 7206 * Returns the number of pages that the arch has reserved but
7207 * which are not known to alloc_large_system_hash(). 7207 * which are not known to alloc_large_system_hash().
7208 */ 7208 */
7209 static unsigned long __init arch_reserved_kernel_pages(void) 7209 static unsigned long __init arch_reserved_kernel_pages(void)
7210 { 7210 {
7211 return 0; 7211 return 0;
7212 } 7212 }
7213 #endif 7213 #endif
7214 7214
7215 /* 7215 /*
7216 * Adaptive scale is meant to reduce sizes of hash tables on large memory 7216 * Adaptive scale is meant to reduce sizes of hash tables on large memory
7217 * machines. As memory size increases, the scale also increases, but at a 7217 * machines. As memory size increases, the scale also increases, but at a
7218 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 7218 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
7219 * quadruples the scale is increased by one, which means the size of the hash 7219 * quadruples the scale is increased by one, which means the size of the hash
7220 * table only doubles, instead of quadrupling as well. 7220 * table only doubles, instead of quadrupling as well.
7221 * Because 32-bit systems cannot have large physical memory, where this scaling 7221 * Because 32-bit systems cannot have large physical memory, where this scaling
7222 * makes sense, it is disabled on such platforms. 7222 * makes sense, it is disabled on such platforms.
7223 */ 7223 */
7224 #if __BITS_PER_LONG > 32 7224 #if __BITS_PER_LONG > 32
7225 #define ADAPT_SCALE_BASE (64ul << 30) 7225 #define ADAPT_SCALE_BASE (64ul << 30)
7226 #define ADAPT_SCALE_SHIFT 2 7226 #define ADAPT_SCALE_SHIFT 2
7227 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 7227 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
7228 #endif 7228 #endif
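/*
 * Worked example (editor's note): going from 64GB to 256GB of memory
 * quadruples numentries (with 4K pages, 16777216 -> 67108864), and the
 * adapt loop below bumps the scale by exactly one, since ADAPT_SCALE_NPAGES
 * fits under numentries only once.  That one extra unit of scale halves the
 * final bucket count, so the table doubles rather than quadruples.
 */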
7229 7229
7230 /* 7230 /*
7231 * allocate a large system hash table from bootmem 7231 * allocate a large system hash table from bootmem
7232 * - it is assumed that the hash table must contain an exact power-of-2 7232 * - it is assumed that the hash table must contain an exact power-of-2
7233 * quantity of entries 7233 * quantity of entries
7234 * - limit is the number of hash buckets, not the total allocation size 7234 * - limit is the number of hash buckets, not the total allocation size
7235 */ 7235 */
7236 void *__init alloc_large_system_hash(const char *tablename, 7236 void *__init alloc_large_system_hash(const char *tablename,
7237 unsigned long bucketsize, 7237 unsigned long bucketsize,
7238 unsigned long numentries, 7238 unsigned long numentries,
7239 int scale, 7239 int scale,
7240 int flags, 7240 int flags,
7241 unsigned int *_hash_shift, 7241 unsigned int *_hash_shift,
7242 unsigned int *_hash_mask, 7242 unsigned int *_hash_mask,
7243 unsigned long low_limit, 7243 unsigned long low_limit,
7244 unsigned long high_limit) 7244 unsigned long high_limit)
7245 { 7245 {
7246 unsigned long long max = high_limit; 7246 unsigned long long max = high_limit;
7247 unsigned long log2qty, size; 7247 unsigned long log2qty, size;
7248 void *table = NULL; 7248 void *table = NULL;
7249 gfp_t gfp_flags; 7249 gfp_t gfp_flags;
7250 7250
7251 /* allow the kernel cmdline to have a say */ 7251 /* allow the kernel cmdline to have a say */
7252 if (!numentries) { 7252 if (!numentries) {
7253 /* round applicable memory size up to nearest megabyte */ 7253 /* round applicable memory size up to nearest megabyte */
7254 numentries = nr_kernel_pages; 7254 numentries = nr_kernel_pages;
7255 numentries -= arch_reserved_kernel_pages(); 7255 numentries -= arch_reserved_kernel_pages();
7256 7256
7257 /* It isn't necessary when PAGE_SIZE >= 1MB */ 7257 /* It isn't necessary when PAGE_SIZE >= 1MB */
7258 if (PAGE_SHIFT < 20) 7258 if (PAGE_SHIFT < 20)
7259 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 7259 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
7260 7260
7261 #if __BITS_PER_LONG > 32 7261 #if __BITS_PER_LONG > 32
7262 if (!high_limit) { 7262 if (!high_limit) {
7263 unsigned long adapt; 7263 unsigned long adapt;
7264 7264
7265 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 7265 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
7266 adapt <<= ADAPT_SCALE_SHIFT) 7266 adapt <<= ADAPT_SCALE_SHIFT)
7267 scale++; 7267 scale++;
7268 } 7268 }
7269 #endif 7269 #endif
7270 7270
7271 /* limit to 1 bucket per 2^scale bytes of low memory */ 7271 /* limit to 1 bucket per 2^scale bytes of low memory */
7272 if (scale > PAGE_SHIFT) 7272 if (scale > PAGE_SHIFT)
7273 numentries >>= (scale - PAGE_SHIFT); 7273 numentries >>= (scale - PAGE_SHIFT);
7274 else 7274 else
7275 numentries <<= (PAGE_SHIFT - scale); 7275 numentries <<= (PAGE_SHIFT - scale);
7276 7276
7277 /* Make sure we've got at least a 0-order allocation. */ 7277 /* Make sure we've got at least a 0-order allocation. */
7278 if (unlikely(flags & HASH_SMALL)) { 7278 if (unlikely(flags & HASH_SMALL)) {
7279 /* Makes no sense without HASH_EARLY */ 7279 /* Makes no sense without HASH_EARLY */
7280 WARN_ON(!(flags & HASH_EARLY)); 7280 WARN_ON(!(flags & HASH_EARLY));
7281 if (!(numentries >> *_hash_shift)) { 7281 if (!(numentries >> *_hash_shift)) {
7282 numentries = 1UL << *_hash_shift; 7282 numentries = 1UL << *_hash_shift;
7283 BUG_ON(!numentries); 7283 BUG_ON(!numentries);
7284 } 7284 }
7285 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 7285 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
7286 numentries = PAGE_SIZE / bucketsize; 7286 numentries = PAGE_SIZE / bucketsize;
7287 } 7287 }
7288 numentries = roundup_pow_of_two(numentries); 7288 numentries = roundup_pow_of_two(numentries);
7289 7289
7290 /* limit allocation size to 1/16 total memory by default */ 7290 /* limit allocation size to 1/16 total memory by default */
7291 if (max == 0) { 7291 if (max == 0) {
7292 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 7292 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
7293 do_div(max, bucketsize); 7293 do_div(max, bucketsize);
7294 } 7294 }
7295 max = min(max, 0x80000000ULL); 7295 max = min(max, 0x80000000ULL);
7296 7296
7297 if (numentries < low_limit) 7297 if (numentries < low_limit)
7298 numentries = low_limit; 7298 numentries = low_limit;
7299 if (numentries > max) 7299 if (numentries > max)
7300 numentries = max; 7300 numentries = max;
7301 7301
7302 log2qty = ilog2(numentries); 7302 log2qty = ilog2(numentries);
7303 7303
7304 /* 7304 /*
7305 * The memblock allocator already returns zeroed memory, so HASH_ZERO is 7305 * The memblock allocator already returns zeroed memory, so HASH_ZERO is
7306 * currently not used when HASH_EARLY is specified. 7306 * currently not used when HASH_EARLY is specified.
7307 */ 7307 */
7308 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7308 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7309 do { 7309 do {
7310 size = bucketsize << log2qty; 7310 size = bucketsize << log2qty;
7311 if (flags & HASH_EARLY) 7311 if (flags & HASH_EARLY)
7312 table = memblock_virt_alloc_nopanic(size, 0); 7312 table = memblock_virt_alloc_nopanic(size, 0);
7313 else if (hashdist) 7313 else if (hashdist)
7314 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7314 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7315 else { 7315 else {
7316 /* 7316 /*
7317 * If bucketsize is not a power-of-two, we may free 7317 * If bucketsize is not a power-of-two, we may free
7318 * some pages at the end of the hash table, which 7318 * some pages at the end of the hash table, which
7319 * alloc_pages_exact() does automatically. 7319 * alloc_pages_exact() does automatically.
7320 */ 7320 */
7321 if (get_order(size) < MAX_ORDER) { 7321 if (get_order(size) < MAX_ORDER) {
7322 table = alloc_pages_exact(size, gfp_flags); 7322 table = alloc_pages_exact(size, gfp_flags);
7323 kmemleak_alloc(table, size, 1, gfp_flags); 7323 kmemleak_alloc(table, size, 1, gfp_flags);
7324 } 7324 }
7325 } 7325 }
7326 } while (!table && size > PAGE_SIZE && --log2qty); 7326 } while (!table && size > PAGE_SIZE && --log2qty);
7327 7327
7328 if (!table) 7328 if (!table)
7329 panic("Failed to allocate %s hash table\n", tablename); 7329 panic("Failed to allocate %s hash table\n", tablename);
7330 7330
7331 pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", 7331 pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7332 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); 7332 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
7333 7333
7334 if (_hash_shift) 7334 if (_hash_shift)
7335 *_hash_shift = log2qty; 7335 *_hash_shift = log2qty;
7336 if (_hash_mask) 7336 if (_hash_mask)
7337 *_hash_mask = (1 << log2qty) - 1; 7337 *_hash_mask = (1 << log2qty) - 1;
7338 7338
7339 return table; 7339 return table;
7340 } 7340 }
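
For illustration, a minimal sketch (not part of this diff) of how a boot-time caller might size a table with alloc_large_system_hash(); the table name, variable names, and scale value below are made-up examples, not an existing in-tree user.

static struct hlist_head *example_table __read_mostly;
static unsigned int example_shift __read_mostly;
static unsigned int example_mask __read_mostly;

static void __init example_hash_init(void)
{
	/* One bucket per 2^14 bytes of low memory, zero-filled, sized at boot. */
	example_table = alloc_large_system_hash("Example-hash",
					sizeof(struct hlist_head),
					0,		/* numentries: let memory size decide */
					14,		/* scale */
					HASH_ZERO,	/* flags */
					&example_shift,
					&example_mask,
					0,		/* low_limit */
					0);		/* high_limit: default cap of 1/16 of memory */
}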
7341 7341
7342 /* 7342 /*
7343 * This function checks whether the pageblock includes unmovable pages or not. 7343 * This function checks whether the pageblock includes unmovable pages or not.
7344 * If @count is not zero, it is okay to include up to @count unmovable pages. 7344 * If @count is not zero, it is okay to include up to @count unmovable pages.
7345 * 7345 *
7346 * PageLRU check without isolation or lru_lock could race so that 7346 * PageLRU check without isolation or lru_lock could race so that
7347 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 7347 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
7348 * check without lock_page may also miss some movable non-LRU pages in 7348 * check without lock_page may also miss some movable non-LRU pages in
7349 * a race. So this function cannot be expected to be exact. 7349 * a race. So this function cannot be expected to be exact.
7350 */ 7350 */
7351 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7351 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7352 bool skip_hwpoisoned_pages) 7352 bool skip_hwpoisoned_pages)
7353 { 7353 {
7354 unsigned long pfn, iter, found; 7354 unsigned long pfn, iter, found;
7355 int mt; 7355 int mt;
7356 7356
7357 /* 7357 /*
7358 * To avoid noisy data, lru_add_drain_all() should be called. 7358 * To avoid noisy data, lru_add_drain_all() should be called.
7359 * A ZONE_MOVABLE zone never contains unmovable pages. 7359 * A ZONE_MOVABLE zone never contains unmovable pages.
7360 */ 7360 */
7361 if (zone_idx(zone) == ZONE_MOVABLE) 7361 if (zone_idx(zone) == ZONE_MOVABLE)
7362 return false; 7362 return false;
7363 mt = get_pageblock_migratetype(page); 7363 mt = get_pageblock_migratetype(page);
7364 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7364 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
7365 return false; 7365 return false;
7366 7366
7367 pfn = page_to_pfn(page); 7367 pfn = page_to_pfn(page);
7368 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 7368 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7369 unsigned long check = pfn + iter; 7369 unsigned long check = pfn + iter;
7370 7370
7371 if (!pfn_valid_within(check)) 7371 if (!pfn_valid_within(check))
7372 continue; 7372 continue;
7373 7373
7374 page = pfn_to_page(check); 7374 page = pfn_to_page(check);
7375 7375
7376 /* 7376 /*
7377 * Hugepages are not in LRU lists, but they're movable. 7377 * Hugepages are not in LRU lists, but they're movable.
7378 * We need not scan over tail pages because we don't 7378 * We need not scan over tail pages because we don't
7379 * handle each tail page individually in migration. 7379 * handle each tail page individually in migration.
7380 */ 7380 */
7381 if (PageHuge(page)) { 7381 if (PageHuge(page)) {
7382 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 7382 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
7383 continue; 7383 continue;
7384 } 7384 }
7385 7385
7386 /* 7386 /*
7387 * We can't use page_count without pinning the page 7387 * We can't use page_count without pinning the page
7388 * because another CPU can free the compound page. 7388 * because another CPU can free the compound page.
7389 * This check already skips compound tails of THP 7389 * This check already skips compound tails of THP
7390 * because their page->_refcount is zero at all times. 7390 * because their page->_refcount is zero at all times.
7391 */ 7391 */
7392 if (!page_ref_count(page)) { 7392 if (!page_ref_count(page)) {
7393 if (PageBuddy(page)) 7393 if (PageBuddy(page))
7394 iter += (1 << page_order(page)) - 1; 7394 iter += (1 << page_order(page)) - 1;
7395 continue; 7395 continue;
7396 } 7396 }
7397 7397
7398 /* 7398 /*
7399 * A HWPoisoned page may not be in the buddy system, and 7399 * A HWPoisoned page may not be in the buddy system, and
7400 * its page_count() is not 0. 7400 * its page_count() is not 0.
7401 */ 7401 */
7402 if (skip_hwpoisoned_pages && PageHWPoison(page)) 7402 if (skip_hwpoisoned_pages && PageHWPoison(page))
7403 continue; 7403 continue;
7404 7404
7405 if (__PageMovable(page)) 7405 if (__PageMovable(page))
7406 continue; 7406 continue;
7407 7407
7408 if (!PageLRU(page)) 7408 if (!PageLRU(page))
7409 found++; 7409 found++;
7410 /* 7410 /*
7411 * If there are RECLAIMABLE pages, we need to check 7411 * If there are RECLAIMABLE pages, we need to check
7412 * them. But for now, memory offline itself doesn't call 7412 * them. But for now, memory offline itself doesn't call
7413 * shrink_node_slabs(), and this still needs to be fixed. 7413 * shrink_node_slabs(), and this still needs to be fixed.
7414 */ 7414 */
7415 /* 7415 /*
7416 * If the page is not RAM, page_count() should be 0 and 7416 * If the page is not RAM, page_count() should be 0 and
7417 * we don't need further checks. This is a _used_, non-movable page. 7417 * we don't need further checks. This is a _used_, non-movable page.
7418 * 7418 *
7419 * The problematic thing here is PG_reserved pages. PG_reserved 7419 * The problematic thing here is PG_reserved pages. PG_reserved
7420 * is set on both memory hole pages and _used_ kernel 7420 * is set on both memory hole pages and _used_ kernel
7421 * pages at boot. 7421 * pages at boot.
7422 */ 7422 */
7423 if (found > count) 7423 if (found > count)
7424 return true; 7424 return true;
7425 } 7425 }
7426 return false; 7426 return false;
7427 } 7427 }
7428 7428
7429 bool is_pageblock_removable_nolock(struct page *page) 7429 bool is_pageblock_removable_nolock(struct page *page)
7430 { 7430 {
7431 struct zone *zone; 7431 struct zone *zone;
7432 unsigned long pfn; 7432 unsigned long pfn;
7433 7433
7434 /* 7434 /*
7435 * We have to be careful here because we are iterating over memory 7435 * We have to be careful here because we are iterating over memory
7436 * sections which are not zone aware so we might end up outside of 7436 * sections which are not zone aware so we might end up outside of
7437 * the zone but still within the section. 7437 * the zone but still within the section.
7438 * We have to take the node into account as well. If the node is offline, 7438 * We have to take the node into account as well. If the node is offline,
7439 * its NODE_DATA will be NULL - see page_zone. 7439 * its NODE_DATA will be NULL - see page_zone.
7440 */ 7440 */
7441 if (!node_online(page_to_nid(page))) 7441 if (!node_online(page_to_nid(page)))
7442 return false; 7442 return false;
7443 7443
7444 zone = page_zone(page); 7444 zone = page_zone(page);
7445 pfn = page_to_pfn(page); 7445 pfn = page_to_pfn(page);
7446 if (!zone_spans_pfn(zone, pfn)) 7446 if (!zone_spans_pfn(zone, pfn))
7447 return false; 7447 return false;
7448 7448
7449 return !has_unmovable_pages(zone, page, 0, true); 7449 return !has_unmovable_pages(zone, page, 0, true);
7450 } 7450 }
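
A rough sketch of how this check is typically consumed (the helper below is hypothetical, loosely modelled on the memory hot-remove path, which probes a range pageblock by pageblock):

/* Hypothetical: true iff every valid pageblock in the range looks removable. */
static bool example_range_removable(unsigned long start_pfn,
				    unsigned long nr_pages)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
			return false;
	}
	return true;
}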
7451 7451
7452 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7452 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
7453 7453
7454 static unsigned long pfn_max_align_down(unsigned long pfn) 7454 static unsigned long pfn_max_align_down(unsigned long pfn)
7455 { 7455 {
7456 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 7456 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
7457 pageblock_nr_pages) - 1); 7457 pageblock_nr_pages) - 1);
7458 } 7458 }
7459 7459
7460 static unsigned long pfn_max_align_up(unsigned long pfn) 7460 static unsigned long pfn_max_align_up(unsigned long pfn)
7461 { 7461 {
7462 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 7462 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
7463 pageblock_nr_pages)); 7463 pageblock_nr_pages));
7464 } 7464 }
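
As a worked example, assuming 4 KiB pages with MAX_ORDER_NR_PAGES = 1024 and pageblock_nr_pages = 512 (common x86_64 defaults), the effective alignment is 1024 (0x400) PFNs:

/*
 * pfn_max_align_down(0x12345) == 0x12345 & ~0x3ff        == 0x12000
 * pfn_max_align_up(0x12388)   == ALIGN(0x12388, 0x400)   == 0x12400
 * so a request for [0x12345, 0x12388) is isolated over [0x12000, 0x12400).
 */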
7465 7465
7466 /* [start, end) must belong to a single zone. */ 7466 /* [start, end) must belong to a single zone. */
7467 static int __alloc_contig_migrate_range(struct compact_control *cc, 7467 static int __alloc_contig_migrate_range(struct compact_control *cc,
7468 unsigned long start, unsigned long end) 7468 unsigned long start, unsigned long end)
7469 { 7469 {
7470 /* This function is based on compact_zone() from compaction.c. */ 7470 /* This function is based on compact_zone() from compaction.c. */
7471 unsigned long nr_reclaimed; 7471 unsigned long nr_reclaimed;
7472 unsigned long pfn = start; 7472 unsigned long pfn = start;
7473 unsigned int tries = 0; 7473 unsigned int tries = 0;
7474 int ret = 0; 7474 int ret = 0;
7475 7475
7476 migrate_prep(); 7476 migrate_prep();
7477 7477
7478 while (pfn < end || !list_empty(&cc->migratepages)) { 7478 while (pfn < end || !list_empty(&cc->migratepages)) {
7479 if (fatal_signal_pending(current)) { 7479 if (fatal_signal_pending(current)) {
7480 ret = -EINTR; 7480 ret = -EINTR;
7481 break; 7481 break;
7482 } 7482 }
7483 7483
7484 if (list_empty(&cc->migratepages)) { 7484 if (list_empty(&cc->migratepages)) {
7485 cc->nr_migratepages = 0; 7485 cc->nr_migratepages = 0;
7486 pfn = isolate_migratepages_range(cc, pfn, end); 7486 pfn = isolate_migratepages_range(cc, pfn, end);
7487 if (!pfn) { 7487 if (!pfn) {
7488 ret = -EINTR; 7488 ret = -EINTR;
7489 break; 7489 break;
7490 } 7490 }
7491 tries = 0; 7491 tries = 0;
7492 } else if (++tries == 5) { 7492 } else if (++tries == 5) {
7493 ret = ret < 0 ? ret : -EBUSY; 7493 ret = ret < 0 ? ret : -EBUSY;
7494 break; 7494 break;
7495 } 7495 }
7496 7496
7497 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 7497 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
7498 &cc->migratepages); 7498 &cc->migratepages);
7499 cc->nr_migratepages -= nr_reclaimed; 7499 cc->nr_migratepages -= nr_reclaimed;
7500 7500
7501 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 7501 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
7502 NULL, 0, cc->mode, MR_CMA); 7502 NULL, 0, cc->mode, MR_CMA);
7503 } 7503 }
7504 if (ret < 0) { 7504 if (ret < 0) {
7505 putback_movable_pages(&cc->migratepages); 7505 putback_movable_pages(&cc->migratepages);
7506 return ret; 7506 return ret;
7507 } 7507 }
7508 return 0; 7508 return 0;
7509 } 7509 }
7510 7510
7511 /** 7511 /**
7512 * alloc_contig_range() -- tries to allocate given range of pages 7512 * alloc_contig_range() -- tries to allocate given range of pages
7513 * @start: start PFN to allocate 7513 * @start: start PFN to allocate
7514 * @end: one-past-the-last PFN to allocate 7514 * @end: one-past-the-last PFN to allocate
7515 * @migratetype: migratetype of the underlying pageblocks (either 7515 * @migratetype: migratetype of the underlying pageblocks (either
7516 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 7516 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
7517 * in range must have the same migratetype and it must 7517 * in range must have the same migratetype and it must
7518 * be either of the two. 7518 * be either of the two.
7519 * @gfp_mask: GFP mask to use during compaction 7519 * @gfp_mask: GFP mask to use during compaction
7520 * 7520 *
7521 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 7521 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7522 * aligned, however it's the caller's responsibility to guarantee that 7522 * aligned, however it's the caller's responsibility to guarantee that
7523 * we are the only thread that changes migrate type of pageblocks the 7523 * we are the only thread that changes migrate type of pageblocks the
7524 * pages fall in. 7524 * pages fall in.
7525 * 7525 *
7526 * The PFN range must belong to a single zone. 7526 * The PFN range must belong to a single zone.
7527 * 7527 *
7528 * Returns zero on success or a negative error code. On success, all 7528 * Returns zero on success or a negative error code. On success, all
7529 * pages whose PFN is in [start, end) are allocated for the caller and 7529 * pages whose PFN is in [start, end) are allocated for the caller and
7530 * need to be freed with free_contig_range(). 7530 * need to be freed with free_contig_range().
7531 */ 7531 */
7532 int alloc_contig_range(unsigned long start, unsigned long end, 7532 int alloc_contig_range(unsigned long start, unsigned long end,
7533 unsigned migratetype, gfp_t gfp_mask) 7533 unsigned migratetype, gfp_t gfp_mask)
7534 { 7534 {
7535 unsigned long outer_start, outer_end; 7535 unsigned long outer_start, outer_end;
7536 unsigned int order; 7536 unsigned int order;
7537 int ret = 0; 7537 int ret = 0;
7538 7538
7539 struct compact_control cc = { 7539 struct compact_control cc = {
7540 .nr_migratepages = 0, 7540 .nr_migratepages = 0,
7541 .order = -1, 7541 .order = -1,
7542 .zone = page_zone(pfn_to_page(start)), 7542 .zone = page_zone(pfn_to_page(start)),
7543 .mode = MIGRATE_SYNC, 7543 .mode = MIGRATE_SYNC,
7544 .ignore_skip_hint = true, 7544 .ignore_skip_hint = true,
7545 .gfp_mask = current_gfp_context(gfp_mask), 7545 .gfp_mask = current_gfp_context(gfp_mask),
7546 }; 7546 };
7547 INIT_LIST_HEAD(&cc.migratepages); 7547 INIT_LIST_HEAD(&cc.migratepages);
7548 7548
7549 /* 7549 /*
7550 * What we do here is we mark all pageblocks in range as 7550 * What we do here is we mark all pageblocks in range as
7551 * MIGRATE_ISOLATE. Because pageblock and max order pages may 7551 * MIGRATE_ISOLATE. Because pageblock and max order pages may
7552 * have different sizes, and due to the way the page allocator 7552 * have different sizes, and due to the way the page allocator
7553 * works, we align the range to the bigger of the two sizes so 7553 * works, we align the range to the bigger of the two sizes so
7554 * that the page allocator won't try to merge buddies from 7554 * that the page allocator won't try to merge buddies from
7555 * different pageblocks and change MIGRATE_ISOLATE to some 7555 * different pageblocks and change MIGRATE_ISOLATE to some
7556 * other migration type. 7556 * other migration type.
7557 * 7557 *
7558 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 7558 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
7559 * migrate the pages from an unaligned range (i.e. pages that 7559 * migrate the pages from an unaligned range (i.e. pages that
7560 * we are interested in). This will put all the pages in 7560 * we are interested in). This will put all the pages in
7561 * range back to page allocator as MIGRATE_ISOLATE. 7561 * range back to page allocator as MIGRATE_ISOLATE.
7562 * 7562 *
7563 * When this is done, we take the pages in range from the page 7563 * When this is done, we take the pages in range from the page
7564 * allocator, removing them from the buddy system. This way the 7564 * allocator, removing them from the buddy system. This way the
7565 * page allocator will never consider using them. 7565 * page allocator will never consider using them.
7566 * 7566 *
7567 * This lets us mark the pageblocks back as 7567 * This lets us mark the pageblocks back as
7568 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 7568 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
7569 * aligned range but not in the unaligned, original range are 7569 * aligned range but not in the unaligned, original range are
7570 * put back to page allocator so that buddy can use them. 7570 * put back to page allocator so that buddy can use them.
7571 */ 7571 */
7572 7572
7573 ret = start_isolate_page_range(pfn_max_align_down(start), 7573 ret = start_isolate_page_range(pfn_max_align_down(start),
7574 pfn_max_align_up(end), migratetype, 7574 pfn_max_align_up(end), migratetype,
7575 false); 7575 false);
7576 if (ret) 7576 if (ret)
7577 return ret; 7577 return ret;
7578 7578
7579 /* 7579 /*
7580 * In case of -EBUSY, we'd like to know which page causes the problem. 7580 * In case of -EBUSY, we'd like to know which page causes the problem.
7581 * So, just fall through. test_pages_isolated() has a tracepoint 7581 * So, just fall through. test_pages_isolated() has a tracepoint
7582 * which will report the busy page. 7582 * which will report the busy page.
7583 * 7583 *
7584 * It is possible that busy pages could become available before 7584 * It is possible that busy pages could become available before
7585 * the call to test_pages_isolated, and the range will actually be 7585 * the call to test_pages_isolated, and the range will actually be
7586 * allocated. So, if we fall through, be sure to clear ret so that 7586 * allocated. So, if we fall through, be sure to clear ret so that
7587 * -EBUSY is not accidentally used or returned to the caller. 7587 * -EBUSY is not accidentally used or returned to the caller.
7588 */ 7588 */
7589 ret = __alloc_contig_migrate_range(&cc, start, end); 7589 ret = __alloc_contig_migrate_range(&cc, start, end);
7590 if (ret && ret != -EBUSY) 7590 if (ret && ret != -EBUSY)
7591 goto done; 7591 goto done;
7592 ret = 0; 7592 ret = 0;
7593 7593
7594 /* 7594 /*
7595 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 7595 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
7596 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 7596 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
7597 * more, all pages in [start, end) are free in page allocator. 7597 * more, all pages in [start, end) are free in page allocator.
7598 * What we are going to do is to allocate all pages from 7598 * What we are going to do is to allocate all pages from
7599 * [start, end) (that is remove them from page allocator). 7599 * [start, end) (that is remove them from page allocator).
7600 * 7600 *
7601 * The only problem is that pages at the beginning and at the 7601 * The only problem is that pages at the beginning and at the
7602 * end of the interesting range may not be aligned with pages that 7602 * end of the interesting range may not be aligned with pages that
7603 * the page allocator holds, i.e. they can be part of higher order 7603 * the page allocator holds, i.e. they can be part of higher order
7604 * pages. Because of this, we reserve the bigger range and 7604 * pages. Because of this, we reserve the bigger range and
7605 * once this is done free the pages we are not interested in. 7605 * once this is done free the pages we are not interested in.
7606 * 7606 *
7607 * We don't have to hold zone->lock here because the pages are 7607 * We don't have to hold zone->lock here because the pages are
7608 * isolated and thus won't get removed from the buddy allocator. 7608 * isolated and thus won't get removed from the buddy allocator.
7609 */ 7609 */
7610 7610
7611 lru_add_drain_all(); 7611 lru_add_drain_all();
7612 drain_all_pages(cc.zone); 7612 drain_all_pages(cc.zone);
7613 7613
7614 order = 0; 7614 order = 0;
7615 outer_start = start; 7615 outer_start = start;
7616 while (!PageBuddy(pfn_to_page(outer_start))) { 7616 while (!PageBuddy(pfn_to_page(outer_start))) {
7617 if (++order >= MAX_ORDER) { 7617 if (++order >= MAX_ORDER) {
7618 outer_start = start; 7618 outer_start = start;
7619 break; 7619 break;
7620 } 7620 }
7621 outer_start &= ~0UL << order; 7621 outer_start &= ~0UL << order;
7622 } 7622 }
7623 7623
7624 if (outer_start != start) { 7624 if (outer_start != start) {
7625 order = page_order(pfn_to_page(outer_start)); 7625 order = page_order(pfn_to_page(outer_start));
7626 7626
7627 /* 7627 /*
7628 * The outer_start page could be a small-order buddy page that 7628 * The outer_start page could be a small-order buddy page that
7629 * doesn't include the start page. Adjust outer_start 7629 * doesn't include the start page. Adjust outer_start
7630 * in this case so the failed page is reported properly 7630 * in this case so the failed page is reported properly
7631 * by the tracepoint in test_pages_isolated(). 7631 * by the tracepoint in test_pages_isolated().
7632 */ 7632 */
7633 if (outer_start + (1UL << order) <= start) 7633 if (outer_start + (1UL << order) <= start)
7634 outer_start = start; 7634 outer_start = start;
7635 } 7635 }
7636 7636
7637 /* Make sure the range is really isolated. */ 7637 /* Make sure the range is really isolated. */
7638 if (test_pages_isolated(outer_start, end, false)) { 7638 ret = test_pages_isolated(outer_start, end, false);
7639 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", 7639 if (ret) {
7640 __func__, outer_start, end);
7641 ret = -EBUSY; 7640 ret = -EBUSY;
7642 goto done; 7641 goto done;
7643 } 7642 }
7644 7643
7645 /* Grab isolated pages from freelists. */ 7644 /* Grab isolated pages from freelists. */
7646 outer_end = isolate_freepages_range(&cc, outer_start, end); 7645 outer_end = isolate_freepages_range(&cc, outer_start, end);
7647 if (!outer_end) { 7646 if (!outer_end) {
7648 ret = -EBUSY; 7647 ret = -EBUSY;
7649 goto done; 7648 goto done;
7650 } 7649 }
7651 7650
7652 /* Free head and tail (if any) */ 7651 /* Free head and tail (if any) */
7653 if (start != outer_start) 7652 if (start != outer_start)
7654 free_contig_range(outer_start, start - outer_start); 7653 free_contig_range(outer_start, start - outer_start);
7655 if (end != outer_end) 7654 if (end != outer_end)
7656 free_contig_range(end, outer_end - end); 7655 free_contig_range(end, outer_end - end);
7657 7656
7658 done: 7657 done:
7659 undo_isolate_page_range(pfn_max_align_down(start), 7658 undo_isolate_page_range(pfn_max_align_down(start),
7660 pfn_max_align_up(end), migratetype); 7659 pfn_max_align_up(end), migratetype);
7661 return ret; 7660 return ret;
7662 } 7661 }
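
A minimal, hypothetical caller sketch showing the intended pairing of alloc_contig_range() with free_contig_range(); real CMA users normally go through cma_alloc()/cma_release() instead, and the base PFN and page count here are placeholders.

static struct page *example_alloc_contig(unsigned long base_pfn,
					 unsigned long nr_pages)
{
	/*
	 * [base_pfn, base_pfn + nr_pages) must lie within one zone and in
	 * pageblocks that are all MIGRATE_CMA (or all MIGRATE_MOVABLE).
	 */
	if (alloc_contig_range(base_pfn, base_pfn + nr_pages,
			       MIGRATE_CMA, GFP_KERNEL))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void example_free_contig(struct page *page, unsigned long nr_pages)
{
	free_contig_range(page_to_pfn(page), nr_pages);
}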
7663 7662
7664 void free_contig_range(unsigned long pfn, unsigned nr_pages) 7663 void free_contig_range(unsigned long pfn, unsigned nr_pages)
7665 { 7664 {
7666 unsigned int count = 0; 7665 unsigned int count = 0;
7667 7666
7668 for (; nr_pages--; pfn++) { 7667 for (; nr_pages--; pfn++) {
7669 struct page *page = pfn_to_page(pfn); 7668 struct page *page = pfn_to_page(pfn);
7670 7669
7671 count += page_count(page) != 1; 7670 count += page_count(page) != 1;
7672 __free_page(page); 7671 __free_page(page);
7673 } 7672 }
7674 WARN(count != 0, "%d pages are still in use!\n", count); 7673 WARN(count != 0, "%d pages are still in use!\n", count);
7675 } 7674 }
7676 #endif 7675 #endif
7677 7676
7678 #ifdef CONFIG_MEMORY_HOTPLUG 7677 #ifdef CONFIG_MEMORY_HOTPLUG
7679 /* 7678 /*
7680 * The zone indicated has a new number of managed_pages; batch sizes and percpu 7679 * The zone indicated has a new number of managed_pages; batch sizes and percpu
7681 * page high values need to be recalculated. 7680 * page high values need to be recalculated.
7682 */ 7681 */
7683 void __meminit zone_pcp_update(struct zone *zone) 7682 void __meminit zone_pcp_update(struct zone *zone)
7684 { 7683 {
7685 unsigned cpu; 7684 unsigned cpu;
7686 mutex_lock(&pcp_batch_high_lock); 7685 mutex_lock(&pcp_batch_high_lock);
7687 for_each_possible_cpu(cpu) 7686 for_each_possible_cpu(cpu)
7688 pageset_set_high_and_batch(zone, 7687 pageset_set_high_and_batch(zone,
7689 per_cpu_ptr(zone->pageset, cpu)); 7688 per_cpu_ptr(zone->pageset, cpu));
7690 mutex_unlock(&pcp_batch_high_lock); 7689 mutex_unlock(&pcp_batch_high_lock);
7691 } 7690 }
7692 #endif 7691 #endif
7693 7692
7694 void zone_pcp_reset(struct zone *zone) 7693 void zone_pcp_reset(struct zone *zone)
7695 { 7694 {
7696 unsigned long flags; 7695 unsigned long flags;
7697 int cpu; 7696 int cpu;
7698 struct per_cpu_pageset *pset; 7697 struct per_cpu_pageset *pset;
7699 7698
7700 /* avoid races with drain_pages() */ 7699 /* avoid races with drain_pages() */
7701 local_irq_save(flags); 7700 local_irq_save(flags);
7702 if (zone->pageset != &boot_pageset) { 7701 if (zone->pageset != &boot_pageset) {
7703 for_each_online_cpu(cpu) { 7702 for_each_online_cpu(cpu) {
7704 pset = per_cpu_ptr(zone->pageset, cpu); 7703 pset = per_cpu_ptr(zone->pageset, cpu);
7705 drain_zonestat(zone, pset); 7704 drain_zonestat(zone, pset);
7706 } 7705 }
7707 free_percpu(zone->pageset); 7706 free_percpu(zone->pageset);
7708 zone->pageset = &boot_pageset; 7707 zone->pageset = &boot_pageset;
7709 } 7708 }
7710 local_irq_restore(flags); 7709 local_irq_restore(flags);
7711 } 7710 }
7712 7711
7713 #ifdef CONFIG_MEMORY_HOTREMOVE 7712 #ifdef CONFIG_MEMORY_HOTREMOVE
7714 /* 7713 /*
7715 * All pages in the range must be in a single zone and isolated 7714 * All pages in the range must be in a single zone and isolated
7716 * before calling this. 7715 * before calling this.
7717 */ 7716 */
7718 void 7717 void
7719 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 7718 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
7720 { 7719 {
7721 struct page *page; 7720 struct page *page;
7722 struct zone *zone; 7721 struct zone *zone;
7723 unsigned int order, i; 7722 unsigned int order, i;
7724 unsigned long pfn; 7723 unsigned long pfn;
7725 unsigned long flags; 7724 unsigned long flags;
7726 /* find the first valid pfn */ 7725 /* find the first valid pfn */
7727 for (pfn = start_pfn; pfn < end_pfn; pfn++) 7726 for (pfn = start_pfn; pfn < end_pfn; pfn++)
7728 if (pfn_valid(pfn)) 7727 if (pfn_valid(pfn))
7729 break; 7728 break;
7730 if (pfn == end_pfn) 7729 if (pfn == end_pfn)
7731 return; 7730 return;
7732 offline_mem_sections(pfn, end_pfn); 7731 offline_mem_sections(pfn, end_pfn);
7733 zone = page_zone(pfn_to_page(pfn)); 7732 zone = page_zone(pfn_to_page(pfn));
7734 spin_lock_irqsave(&zone->lock, flags); 7733 spin_lock_irqsave(&zone->lock, flags);
7735 pfn = start_pfn; 7734 pfn = start_pfn;
7736 while (pfn < end_pfn) { 7735 while (pfn < end_pfn) {
7737 if (!pfn_valid(pfn)) { 7736 if (!pfn_valid(pfn)) {
7738 pfn++; 7737 pfn++;
7739 continue; 7738 continue;
7740 } 7739 }
7741 page = pfn_to_page(pfn); 7740 page = pfn_to_page(pfn);
7742 /* 7741 /*
7743 * A HWPoisoned page may not be in the buddy system, and 7742 * A HWPoisoned page may not be in the buddy system, and
7744 * its page_count() is not 0. 7743 * its page_count() is not 0.
7745 */ 7744 */
7746 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 7745 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
7747 pfn++; 7746 pfn++;
7748 SetPageReserved(page); 7747 SetPageReserved(page);
7749 continue; 7748 continue;
7750 } 7749 }
7751 7750
7752 BUG_ON(page_count(page)); 7751 BUG_ON(page_count(page));
7753 BUG_ON(!PageBuddy(page)); 7752 BUG_ON(!PageBuddy(page));
7754 order = page_order(page); 7753 order = page_order(page);
7755 #ifdef CONFIG_DEBUG_VM 7754 #ifdef CONFIG_DEBUG_VM
7756 pr_info("remove from free list %lx %d %lx\n", 7755 pr_info("remove from free list %lx %d %lx\n",
7757 pfn, 1 << order, end_pfn); 7756 pfn, 1 << order, end_pfn);
7758 #endif 7757 #endif
7759 list_del(&page->lru); 7758 list_del(&page->lru);
7760 rmv_page_order(page); 7759 rmv_page_order(page);
7761 zone->free_area[order].nr_free--; 7760 zone->free_area[order].nr_free--;
7762 for (i = 0; i < (1 << order); i++) 7761 for (i = 0; i < (1 << order); i++)
7763 SetPageReserved((page+i)); 7762 SetPageReserved((page+i));
7764 pfn += (1 << order); 7763 pfn += (1 << order);
7765 } 7764 }
7766 spin_unlock_irqrestore(&zone->lock, flags); 7765 spin_unlock_irqrestore(&zone->lock, flags);
7767 } 7766 }
7768 #endif 7767 #endif
7769 7768
7770 bool is_free_buddy_page(struct page *page) 7769 bool is_free_buddy_page(struct page *page)
7771 { 7770 {
7772 struct zone *zone = page_zone(page); 7771 struct zone *zone = page_zone(page);
7773 unsigned long pfn = page_to_pfn(page); 7772 unsigned long pfn = page_to_pfn(page);
7774 unsigned long flags; 7773 unsigned long flags;
7775 unsigned int order; 7774 unsigned int order;
7776 7775
7777 spin_lock_irqsave(&zone->lock, flags); 7776 spin_lock_irqsave(&zone->lock, flags);
7778 for (order = 0; order < MAX_ORDER; order++) { 7777 for (order = 0; order < MAX_ORDER; order++) {
7779 struct page *page_head = page - (pfn & ((1 << order) - 1)); 7778 struct page *page_head = page - (pfn & ((1 << order) - 1));
7780 7779
7781 if (PageBuddy(page_head) && page_order(page_head) >= order) 7780 if (PageBuddy(page_head) && page_order(page_head) >= order)
7782 break; 7781 break;
7783 } 7782 }
7784 spin_unlock_irqrestore(&zone->lock, flags); 7783 spin_unlock_irqrestore(&zone->lock, flags);
7785 7784
7786 return order < MAX_ORDER; 7785 return order < MAX_ORDER;
7787 } 7786 }
7788 7787