Commit 6be7db23181974114af002ebfe875ceaf23f24af

Authored by Eric Lee
1 parent d999f49ca0

Drop PFNs busy printk in an expected path

Showing 1 changed file (mm/page_alloc.c) with 2 additions and 3 deletions

/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION    (8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);        /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/* work_structs for global per-cpu drains */
DEFINE_MUTEX(pcpu_drain_mutex);
DEFINE_PER_CPU(struct work_struct, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
        [N_POSSIBLE] = NODE_MASK_ALL,
        [N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
        [N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
        [N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
        [N_MEMORY] = { { [0] = 1UL } },
        [N_CPU] = { { [0] = 1UL } },
#endif  /* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protect totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
        return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
        page->index = migratetype;
}

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 * guaranteed not to run in parallel with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
        WARN_ON(!mutex_is_locked(&pm_mutex));
        if (saved_gfp_mask) {
                gfp_allowed_mask = saved_gfp_mask;
                saved_gfp_mask = 0;
        }
}

void pm_restrict_gfp_mask(void)
{
        WARN_ON(!mutex_is_locked(&pm_mutex));
        WARN_ON(saved_gfp_mask);
        saved_gfp_mask = gfp_allowed_mask;
        gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
        if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                return false;
        return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *      1G machine -> (16M dma, 784M normal, 224M high)
 *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
        256,
#endif
#ifdef CONFIG_ZONE_DMA32
        256,
#endif
#ifdef CONFIG_HIGHMEM
        32,
#endif
        32,
};
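
/*
 * Illustrative note from the editor, not part of the original file: using the
 * example figures in the comment above, the ratio of 256 for ZONE_DMA means a
 * NORMAL allocation on the 1G machine keeps roughly 784M/256 ~= 3M of DMA
 * memory in reserve, the ratio of 32 for ZONE_NORMAL keeps 224M/32 = 7M in
 * reserve against HIGHMEM allocations, and ZONE_DMA keeps
 * (224M+784M)/256 ~= 4M in reserve against HIGHMEM; a smaller ratio therefore
 * means a larger reservation.
 */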

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
        "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
        "DMA32",
#endif
        "Normal",
#ifdef CONFIG_HIGHMEM
        "HighMem",
#endif
        "Movable",
#ifdef CONFIG_ZONE_DEVICE
        "Device",
#endif
};

char * const migratetype_names[MIGRATE_TYPES] = {
        "Unmovable",
        "Movable",
        "Reclaimable",
        "HighAtomic",
#ifdef CONFIG_CMA
        "CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        "Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[] = {
        NULL,
        free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
        free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        free_transhuge_page,
#endif
};

/*
 * Try to keep at least this much lowmem free.  Do not allow normal
 * allocations below this point, only high priority ones. Automatically
 * tuned according to the amount of memory in the system.
 */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;

/*
 * Extra memory for the system to try freeing. Used to temporarily
 * free memory, to make space for new workloads. Anyone can allocate
 * down to the min watermarks controlled by min_free_kbytes above.
 */
int extra_free_kbytes = 0;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
static bool mirrored_kernelcore;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

/*
 * Determine how many pages need to be initialized during early boot
 * (non-deferred initialization).
 * The value of first_deferred_pfn will be set later, once non-deferred pages
 * are initialized, but for now set it to ULONG_MAX.
 */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
        phys_addr_t start_addr, end_addr;
        unsigned long max_pgcnt;
        unsigned long reserved;

        /*
         * Initialise at least 2G of a node, but also take into account two
         * large system hashes that can take up 1GB for 0.25TB/node.
         */
        max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
                        (pgdat->node_spanned_pages >> 8));

        /*
         * Compensate for all the memblock reservations (e.g. crash kernel)
         * from the initial estimation to make sure we will initialize enough
         * memory to boot.
         */
        start_addr = PFN_PHYS(pgdat->node_start_pfn);
        end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
        reserved = memblock_reserved_memory_within(start_addr, end_addr);
        max_pgcnt += PHYS_PFN(reserved);

        pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
        pgdat->first_deferred_pfn = ULONG_MAX;
}
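
/*
 * Illustrative note from the editor, not part of the original file: with 4K
 * pages (PAGE_SHIFT == 12), 2UL << (30 - PAGE_SHIFT) is 524288 pages, i.e.
 * the "at least 2G per node" floor from the comment above, while
 * node_spanned_pages >> 8 is 1/256 of the node - the source of the
 * "1GB for 0.25TB/node" figure, since 256GB / 256 = 1GB.
 */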

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
        int nid = early_pfn_to_nid(pfn);

        if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
                return true;

        return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
{
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
        (*nr_initialised)++;
        if ((*nr_initialised > pgdat->static_init_pgcnt) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                pgdat->first_deferred_pfn = pfn;
                return false;
        }

        return true;
}
#else
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
        return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
{
        return true;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
                                                        unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
        return __pfn_to_section(pfn)->pageblock_flags;
#else
        return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
        pfn &= (PAGES_PER_SECTION-1);
        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
        pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
                                        unsigned long pfn,
                                        unsigned long end_bitidx,
                                        unsigned long mask)
{
        unsigned long *bitmap;
        unsigned long bitidx, word_bitidx;
        unsigned long word;

        bitmap = get_pageblock_bitmap(page, pfn);
        bitidx = pfn_to_bitidx(page, pfn);
        word_bitidx = bitidx / BITS_PER_LONG;
        bitidx &= (BITS_PER_LONG-1);

        word = bitmap[word_bitidx];
        bitidx += end_bitidx;
        return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
                                        unsigned long end_bitidx,
                                        unsigned long mask)
{
        return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
}

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
        return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
}
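
/*
 * Illustrative note from the editor, not part of the original file: each
 * pageblock is described by NR_PAGEBLOCK_BITS bits (4, per the BUILD_BUG_ON
 * in set_pfnblock_flags_mask() below), so pfn_to_bitidx() maps a pfn to
 * (pfn >> pageblock_order) * 4; e.g. the second pageblock of a section starts
 * at bit index 4, and get_pfnblock_migratetype() above selects the
 * PB_migrate..PB_migrate_end bits of that group via MIGRATETYPE_MASK.
 */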

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
                                        unsigned long pfn,
                                        unsigned long end_bitidx,
                                        unsigned long mask)
{
        unsigned long *bitmap;
        unsigned long bitidx, word_bitidx;
        unsigned long old_word, word;

        BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);

        bitmap = get_pageblock_bitmap(page, pfn);
        bitidx = pfn_to_bitidx(page, pfn);
        word_bitidx = bitidx / BITS_PER_LONG;
        bitidx &= (BITS_PER_LONG-1);

        VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

        bitidx += end_bitidx;
        mask <<= (BITS_PER_LONG - bitidx - 1);
        flags <<= (BITS_PER_LONG - bitidx - 1);

        word = READ_ONCE(bitmap[word_bitidx]);
        for (;;) {
                old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
                if (word == old_word)
                        break;
                word = old_word;
        }
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
        if (unlikely(page_group_by_mobility_disabled &&
                     migratetype < MIGRATE_PCPTYPES))
                migratetype = MIGRATE_UNMOVABLE;

        set_pageblock_flags_group(page, (unsigned long)migratetype,
                                        PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
        int ret = 0;
        unsigned seq;
        unsigned long pfn = page_to_pfn(page);
        unsigned long sp, start_pfn;

        do {
                seq = zone_span_seqbegin(zone);
                start_pfn = zone->zone_start_pfn;
                sp = zone->spanned_pages;
                if (!zone_spans_pfn(zone, pfn))
                        ret = 1;
        } while (zone_span_seqretry(zone, seq));

        if (ret)
                pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
                        pfn, zone_to_nid(zone), zone->name,
                        start_pfn, start_pfn + sp);

        return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
        if (!pfn_valid_within(page_to_pfn(page)))
                return 0;
        if (zone != page_zone(page))
                return 0;

        return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
        if (page_outside_zone_boundaries(zone, page))
                return 1;
        if (!page_is_consistent(zone, page))
                return 1;

        return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
        return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
                unsigned long bad_flags)
{
        static unsigned long resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (nr_shown == 60) {
                if (time_before(jiffies, resume)) {
                        nr_unshown++;
                        goto out;
                }
                if (nr_unshown) {
                        pr_alert(
                              "BUG: Bad page state: %lu messages suppressed\n",
                                nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (nr_shown++ == 0)
                resume = jiffies + 60 * HZ;

        pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
                current->comm, page_to_pfn(page));
        __dump_page(page, reason);
        bad_flags &= page->flags;
        if (bad_flags)
                pr_alert("bad because of flags: %#lx(%pGp)\n",
                                                bad_flags, &bad_flags);
        dump_page_owner(page);

        print_modules();
        dump_stack();
out:
        /* Leave bad fields for debug, except PageBuddy could make trouble */
        page_mapcount_reset(page);      /* remove PageBuddy */
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits form a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

void free_compound_page(struct page *page)
{
        __free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
        int i;
        int nr_pages = 1 << order;

        set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
                p->mapping = TAIL_MAPPING;
                set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
}
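
/*
 * Illustrative note from the editor, not part of the original file: for an
 * order-2 compound page, nr_pages is 4; the head page gets PG_head, and for
 * pages 1-3 the loop above has set_compound_head() record the head page in
 * ->compound_head using the bit-0 PageTail() encoding described in the
 * compound-page comment above.
 */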

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
                        = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;

static int __init early_debug_pagealloc(char *buf)
{
        if (!buf)
                return -EINVAL;
        return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static bool need_debug_guardpage(void)
{
        /* If we don't use debug_pagealloc, we don't need guard page */
        if (!debug_pagealloc_enabled())
                return false;

        if (!debug_guardpage_minorder())
                return false;

        return true;
}

static void init_debug_guardpage(void)
{
        if (!debug_pagealloc_enabled())
                return;

        if (!debug_guardpage_minorder())
                return;

        _debug_guardpage_enabled = true;
}

struct page_ext_operations debug_guardpage_ops = {
        .need = need_debug_guardpage,
        .init = init_debug_guardpage,
};

static int __init debug_guardpage_minorder_setup(char *buf)
{
        unsigned long res;

        if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
                pr_err("Bad debug_guardpage_minorder value\n");
                return 0;
        }
        _debug_guardpage_minorder = res;
        pr_info("Setting debug_guardpage_minorder to %lu\n", res);
        return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
                                unsigned int order, int migratetype)
{
        struct page_ext *page_ext;

        if (!debug_guardpage_enabled())
                return false;

        if (order >= debug_guardpage_minorder())
                return false;

        page_ext = lookup_page_ext(page);
        if (unlikely(!page_ext))
                return false;

        __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

        INIT_LIST_HEAD(&page->lru);
        set_page_private(page, order);
        /* Guard pages are not available for any usage */
        __mod_zone_freepage_state(zone, -(1 << order), migratetype);

        return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order, int migratetype)
{
        struct page_ext *page_ext;

        if (!debug_guardpage_enabled())
                return;

        page_ext = lookup_page_ext(page);
        if (unlikely(!page_ext))
                return;

        __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

        set_page_private(page, 0);
        if (!is_migrate_isolate(migratetype))
                __mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
                        unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order, int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
        set_page_private(page, order);
        __SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
        __ClearPageBuddy(page);
        set_page_private(page, 0);
}

/*
 * This function checks whether a page is free && is the buddy;
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount
 * to PAGE_BUDDY_MAPCOUNT_VALUE.
 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 * serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
                                                        unsigned int order)
{
        if (page_is_guard(buddy) && page_order(buddy) == order) {
                if (page_zone_id(page) != page_zone_id(buddy))
                        return 0;

                VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

                return 1;
        }

        if (PageBuddy(buddy) && page_order(buddy) == order) {
                /*
                 * zone check is done late to avoid uselessly
                 * calculating zone/node ids for pages that could
                 * never merge.
                 */
                if (page_zone_id(page) != page_zone_id(buddy))
                        return 0;

                VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

                return 1;
        }
        return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with _mapcount
 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
 * field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
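
/*
 * Illustrative note from the editor, not part of the original file: the buddy
 * of a (pfn, order) block is the block whose pfn differs only in bit 'order'
 * (__find_buddy_pfn(), not shown here, computes pfn ^ (1 << order)), and the
 * merged block starts at buddy_pfn & pfn, as used in __free_one_page() below.
 * For example, freeing pfn 8 at order 1 gives buddy_pfn 10; if that buddy is
 * free, the two merge into an order-2 block starting at pfn 8.
 */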

static inline void __free_one_page(struct page *page,
                unsigned long pfn,
                struct zone *zone, unsigned int order,
                int migratetype)
{
        unsigned long combined_pfn;
        unsigned long uninitialized_var(buddy_pfn);
        struct page *buddy;
        unsigned int max_order;

        max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

        VM_BUG_ON(migratetype == -1);
        if (likely(!is_migrate_isolate(migratetype)))
                __mod_zone_freepage_state(zone, 1 << order, migratetype);

        VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
        VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
        while (order < max_order - 1) {
                buddy_pfn = __find_buddy_pfn(pfn, order);
                buddy = page + (buddy_pfn - pfn);

                if (!pfn_valid_within(buddy_pfn))
                        goto done_merging;
                if (!page_is_buddy(page, buddy, order))
                        goto done_merging;
                /*
                 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
                 * merge with it and move up one order.
                 */
                if (page_is_guard(buddy)) {
                        clear_page_guard(zone, buddy, order, migratetype);
                } else {
                        list_del(&buddy->lru);
                        zone->free_area[order].nr_free--;
                        rmv_page_order(buddy);
                }
                combined_pfn = buddy_pfn & pfn;
                page = page + (combined_pfn - pfn);
                pfn = combined_pfn;
                order++;
        }
        if (max_order < MAX_ORDER) {
                /* If we are here, it means order is >= pageblock_order.
                 * We want to prevent merge between freepages on isolate
                 * pageblock and normal pageblock. Without this, pageblock
                 * isolation could cause incorrect freepage or CMA accounting.
                 *
                 * We don't want to hit this code for the more frequent
                 * low-order merging.
                 */
                if (unlikely(has_isolate_pageblock(zone))) {
                        int buddy_mt;

                        buddy_pfn = __find_buddy_pfn(pfn, order);
                        buddy = page + (buddy_pfn - pfn);
                        buddy_mt = get_pageblock_migratetype(buddy);

                        if (migratetype != buddy_mt
                                        && (is_migrate_isolate(migratetype) ||
                                                is_migrate_isolate(buddy_mt)))
                                goto done_merging;
                }
                max_order++;
                goto continue_merging;
        }

done_merging:
        set_page_order(page, order);

        /*
         * If this is not the largest possible page, check if the buddy
         * of the next-highest order is free. If it is, it's possible
         * that pages are being freed that will coalesce soon. In case
         * that is happening, add the free page to the tail of the list
         * so it's less likely to be used soon and more likely to be merged
         * as a higher order page.
         */
902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { 902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
903 struct page *higher_page, *higher_buddy; 903 struct page *higher_page, *higher_buddy;
904 combined_pfn = buddy_pfn & pfn; 904 combined_pfn = buddy_pfn & pfn;
905 higher_page = page + (combined_pfn - pfn); 905 higher_page = page + (combined_pfn - pfn);
906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); 906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
907 higher_buddy = higher_page + (buddy_pfn - combined_pfn); 907 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
908 if (pfn_valid_within(buddy_pfn) && 908 if (pfn_valid_within(buddy_pfn) &&
909 page_is_buddy(higher_page, higher_buddy, order + 1)) { 909 page_is_buddy(higher_page, higher_buddy, order + 1)) {
910 list_add_tail(&page->lru, 910 list_add_tail(&page->lru,
911 &zone->free_area[order].free_list[migratetype]); 911 &zone->free_area[order].free_list[migratetype]);
912 goto out; 912 goto out;
913 } 913 }
914 } 914 }
915 915
916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
917 out: 917 out:
918 zone->free_area[order].nr_free++; 918 zone->free_area[order].nr_free++;
919 } 919 }
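/*
 * A minimal, self-contained sketch (not part of page_alloc.c) of the buddy
 * arithmetic the merge loop above relies on: buddies differ only in the bit
 * selected by the order, and the merged block starts at the lower of the two
 * PFNs (buddy_pfn & pfn). find_buddy_pfn() below is a stand-in for the
 * kernel's __find_buddy_pfn(); the PFN values are made up for illustration.
 */
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip the bit selected by the order */
}

int main(void)
{
	unsigned long pfn = 12, buddy, combined;
	unsigned int order = 2;

	buddy = find_buddy_pfn(pfn, order);	/* 12 ^ 4 = 8 */
	combined = buddy & pfn;			/* 8 & 12 = 8 */
	printf("pfn %lu, order %u: buddy %lu, merged block starts at pfn %lu\n",
	       pfn, order, buddy, combined);
	return 0;
}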
920 920
921 /* 921 /*
922 * A bad page could be due to a number of fields. Instead of multiple branches, 922 * A bad page could be due to a number of fields. Instead of multiple branches,
923 * try to check multiple fields with one check. The caller must do a detailed 923 * try to check multiple fields with one check. The caller must do a detailed
924 * check if necessary. 924 * check if necessary.
925 */ 925 */
926 static inline bool page_expected_state(struct page *page, 926 static inline bool page_expected_state(struct page *page,
927 unsigned long check_flags) 927 unsigned long check_flags)
928 { 928 {
929 if (unlikely(atomic_read(&page->_mapcount) != -1)) 929 if (unlikely(atomic_read(&page->_mapcount) != -1))
930 return false; 930 return false;
931 931
932 if (unlikely((unsigned long)page->mapping | 932 if (unlikely((unsigned long)page->mapping |
933 page_ref_count(page) | 933 page_ref_count(page) |
934 #ifdef CONFIG_MEMCG 934 #ifdef CONFIG_MEMCG
935 (unsigned long)page->mem_cgroup | 935 (unsigned long)page->mem_cgroup |
936 #endif 936 #endif
937 (page->flags & check_flags))) 937 (page->flags & check_flags)))
938 return false; 938 return false;
939 939
940 return true; 940 return true;
941 } 941 }
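/*
 * A small sketch (not part of page_alloc.c) of the trick used by
 * page_expected_state() above: several fields that must all be zero/NULL on a
 * free page are OR-ed together so a single branch covers them. The struct and
 * field names below are invented for the example only.
 */
#include <stdio.h>

struct toy_page {
	unsigned long flags;
	void *mapping;
	int refcount;
};

static int toy_expected_state(const struct toy_page *p, unsigned long check_flags)
{
	/* One test instead of three: any non-zero field makes the OR non-zero. */
	return !((unsigned long)p->mapping |
		 (unsigned long)p->refcount |
		 (p->flags & check_flags));
}

int main(void)
{
	struct toy_page ok = { 0, NULL, 0 };
	struct toy_page bad = { 0x4, NULL, 0 };	/* a "check" flag left set */

	/* Prints "ok: 1, bad: 0". */
	printf("ok: %d, bad: %d\n", toy_expected_state(&ok, 0x4),
	       toy_expected_state(&bad, 0x4));
	return 0;
}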
942 942
943 static void free_pages_check_bad(struct page *page) 943 static void free_pages_check_bad(struct page *page)
944 { 944 {
945 const char *bad_reason; 945 const char *bad_reason;
946 unsigned long bad_flags; 946 unsigned long bad_flags;
947 947
948 bad_reason = NULL; 948 bad_reason = NULL;
949 bad_flags = 0; 949 bad_flags = 0;
950 950
951 if (unlikely(atomic_read(&page->_mapcount) != -1)) 951 if (unlikely(atomic_read(&page->_mapcount) != -1))
952 bad_reason = "nonzero mapcount"; 952 bad_reason = "nonzero mapcount";
953 if (unlikely(page->mapping != NULL)) 953 if (unlikely(page->mapping != NULL))
954 bad_reason = "non-NULL mapping"; 954 bad_reason = "non-NULL mapping";
955 if (unlikely(page_ref_count(page) != 0)) 955 if (unlikely(page_ref_count(page) != 0))
956 bad_reason = "nonzero _refcount"; 956 bad_reason = "nonzero _refcount";
957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { 957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
960 } 960 }
961 #ifdef CONFIG_MEMCG 961 #ifdef CONFIG_MEMCG
962 if (unlikely(page->mem_cgroup)) 962 if (unlikely(page->mem_cgroup))
963 bad_reason = "page still charged to cgroup"; 963 bad_reason = "page still charged to cgroup";
964 #endif 964 #endif
965 bad_page(page, bad_reason, bad_flags); 965 bad_page(page, bad_reason, bad_flags);
966 } 966 }
967 967
968 static inline int free_pages_check(struct page *page) 968 static inline int free_pages_check(struct page *page)
969 { 969 {
970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
971 return 0; 971 return 0;
972 972
973 /* Something has gone sideways, find it */ 973 /* Something has gone sideways, find it */
974 free_pages_check_bad(page); 974 free_pages_check_bad(page);
975 return 1; 975 return 1;
976 } 976 }
977 977
978 static int free_tail_pages_check(struct page *head_page, struct page *page) 978 static int free_tail_pages_check(struct page *head_page, struct page *page)
979 { 979 {
980 int ret = 1; 980 int ret = 1;
981 981
982 /* 982 /*
983 * We rely on page->lru.next never having bit 0 set, unless the page 983 * We rely on page->lru.next never having bit 0 set, unless the page
984 * is PageTail(). Let's make sure that's true even for poisoned ->lru. 984 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
985 */ 985 */
986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
987 987
988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) { 988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
989 ret = 0; 989 ret = 0;
990 goto out; 990 goto out;
991 } 991 }
992 switch (page - head_page) { 992 switch (page - head_page) {
993 case 1: 993 case 1:
994 /* the first tail page: ->mapping is compound_mapcount() */ 994 /* the first tail page: ->mapping is compound_mapcount() */
995 if (unlikely(compound_mapcount(page))) { 995 if (unlikely(compound_mapcount(page))) {
996 bad_page(page, "nonzero compound_mapcount", 0); 996 bad_page(page, "nonzero compound_mapcount", 0);
997 goto out; 997 goto out;
998 } 998 }
999 break; 999 break;
1000 case 2: 1000 case 2:
1001 /* 1001 /*
1002 * the second tail page: ->mapping is 1002 * the second tail page: ->mapping is
1003 * page_deferred_list().next -- ignore value. 1003 * page_deferred_list().next -- ignore value.
1004 */ 1004 */
1005 break; 1005 break;
1006 default: 1006 default:
1007 if (page->mapping != TAIL_MAPPING) { 1007 if (page->mapping != TAIL_MAPPING) {
1008 bad_page(page, "corrupted mapping in tail page", 0); 1008 bad_page(page, "corrupted mapping in tail page", 0);
1009 goto out; 1009 goto out;
1010 } 1010 }
1011 break; 1011 break;
1012 } 1012 }
1013 if (unlikely(!PageTail(page))) { 1013 if (unlikely(!PageTail(page))) {
1014 bad_page(page, "PageTail not set", 0); 1014 bad_page(page, "PageTail not set", 0);
1015 goto out; 1015 goto out;
1016 } 1016 }
1017 if (unlikely(compound_head(page) != head_page)) { 1017 if (unlikely(compound_head(page) != head_page)) {
1018 bad_page(page, "compound_head not consistent", 0); 1018 bad_page(page, "compound_head not consistent", 0);
1019 goto out; 1019 goto out;
1020 } 1020 }
1021 ret = 0; 1021 ret = 0;
1022 out: 1022 out:
1023 page->mapping = NULL; 1023 page->mapping = NULL;
1024 clear_compound_head(page); 1024 clear_compound_head(page);
1025 return ret; 1025 return ret;
1026 } 1026 }
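/*
 * A simplified sketch (not part of page_alloc.c) of why the BUILD_BUG_ON on
 * LIST_POISON1 & 1 above matters: a tail page stores its head pointer with
 * bit 0 set in the word that overlays lru.next, and that bit alone is what
 * the PageTail()/compound_head() style tests look at. The helpers below use
 * plain pointers instead of struct page and are stand-ins, not kernel APIs.
 */
#include <stdio.h>
#include <stdint.h>

static uintptr_t encode_tail(const void *head)
{
	return (uintptr_t)head | 1UL;		/* bit 0 set => "I am a tail" */
}

static int is_tail(uintptr_t word)
{
	return word & 1UL;
}

static const void *decode_head(uintptr_t word)
{
	return (const void *)(word & ~1UL);	/* clear bit 0 to recover head */
}

int main(void)
{
	static const int head_object = 42;	/* stands in for the head page */
	uintptr_t word = encode_tail(&head_object);

	/* Prints "tail? 1, head recovered? 1". */
	printf("tail? %d, head recovered? %d\n",
	       is_tail(word), decode_head(word) == (const void *)&head_object);
	return 0;
}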
1027 1027
1028 static __always_inline bool free_pages_prepare(struct page *page, 1028 static __always_inline bool free_pages_prepare(struct page *page,
1029 unsigned int order, bool check_free) 1029 unsigned int order, bool check_free)
1030 { 1030 {
1031 int bad = 0; 1031 int bad = 0;
1032 1032
1033 VM_BUG_ON_PAGE(PageTail(page), page); 1033 VM_BUG_ON_PAGE(PageTail(page), page);
1034 1034
1035 trace_mm_page_free(page, order); 1035 trace_mm_page_free(page, order);
1036 1036
1037 /* 1037 /*
1038 * Check tail pages before head page information is cleared to 1038 * Check tail pages before head page information is cleared to
1039 * avoid checking PageCompound for order-0 pages. 1039 * avoid checking PageCompound for order-0 pages.
1040 */ 1040 */
1041 if (unlikely(order)) { 1041 if (unlikely(order)) {
1042 bool compound = PageCompound(page); 1042 bool compound = PageCompound(page);
1043 int i; 1043 int i;
1044 1044
1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1046 1046
1047 if (compound) 1047 if (compound)
1048 ClearPageDoubleMap(page); 1048 ClearPageDoubleMap(page);
1049 for (i = 1; i < (1 << order); i++) { 1049 for (i = 1; i < (1 << order); i++) {
1050 if (compound) 1050 if (compound)
1051 bad += free_tail_pages_check(page, page + i); 1051 bad += free_tail_pages_check(page, page + i);
1052 if (unlikely(free_pages_check(page + i))) { 1052 if (unlikely(free_pages_check(page + i))) {
1053 bad++; 1053 bad++;
1054 continue; 1054 continue;
1055 } 1055 }
1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1057 } 1057 }
1058 } 1058 }
1059 if (PageMappingFlags(page)) 1059 if (PageMappingFlags(page))
1060 page->mapping = NULL; 1060 page->mapping = NULL;
1061 if (memcg_kmem_enabled() && PageKmemcg(page)) 1061 if (memcg_kmem_enabled() && PageKmemcg(page))
1062 memcg_kmem_uncharge(page, order); 1062 memcg_kmem_uncharge(page, order);
1063 if (check_free) 1063 if (check_free)
1064 bad += free_pages_check(page); 1064 bad += free_pages_check(page);
1065 if (bad) 1065 if (bad)
1066 return false; 1066 return false;
1067 1067
1068 page_cpupid_reset_last(page); 1068 page_cpupid_reset_last(page);
1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1070 reset_page_owner(page, order); 1070 reset_page_owner(page, order);
1071 1071
1072 if (!PageHighMem(page)) { 1072 if (!PageHighMem(page)) {
1073 debug_check_no_locks_freed(page_address(page), 1073 debug_check_no_locks_freed(page_address(page),
1074 PAGE_SIZE << order); 1074 PAGE_SIZE << order);
1075 debug_check_no_obj_freed(page_address(page), 1075 debug_check_no_obj_freed(page_address(page),
1076 PAGE_SIZE << order); 1076 PAGE_SIZE << order);
1077 } 1077 }
1078 arch_free_page(page, order); 1078 arch_free_page(page, order);
1079 kernel_poison_pages(page, 1 << order, 0); 1079 kernel_poison_pages(page, 1 << order, 0);
1080 kernel_map_pages(page, 1 << order, 0); 1080 kernel_map_pages(page, 1 << order, 0);
1081 kasan_free_pages(page, order); 1081 kasan_free_pages(page, order);
1082 1082
1083 return true; 1083 return true;
1084 } 1084 }
1085 1085
1086 #ifdef CONFIG_DEBUG_VM 1086 #ifdef CONFIG_DEBUG_VM
1087 static inline bool free_pcp_prepare(struct page *page) 1087 static inline bool free_pcp_prepare(struct page *page)
1088 { 1088 {
1089 return free_pages_prepare(page, 0, true); 1089 return free_pages_prepare(page, 0, true);
1090 } 1090 }
1091 1091
1092 static inline bool bulkfree_pcp_prepare(struct page *page) 1092 static inline bool bulkfree_pcp_prepare(struct page *page)
1093 { 1093 {
1094 return false; 1094 return false;
1095 } 1095 }
1096 #else 1096 #else
1097 static bool free_pcp_prepare(struct page *page) 1097 static bool free_pcp_prepare(struct page *page)
1098 { 1098 {
1099 return free_pages_prepare(page, 0, false); 1099 return free_pages_prepare(page, 0, false);
1100 } 1100 }
1101 1101
1102 static bool bulkfree_pcp_prepare(struct page *page) 1102 static bool bulkfree_pcp_prepare(struct page *page)
1103 { 1103 {
1104 return free_pages_check(page); 1104 return free_pages_check(page);
1105 } 1105 }
1106 #endif /* CONFIG_DEBUG_VM */ 1106 #endif /* CONFIG_DEBUG_VM */
1107 1107
1108 /* 1108 /*
1109 * Frees a number of pages from the PCP lists 1109 * Frees a number of pages from the PCP lists
1110 * Assumes all pages on list are in same zone, and of same order. 1110 * Assumes all pages on list are in same zone, and of same order.
1111 * count is the number of pages to free. 1111 * count is the number of pages to free.
1112 * 1112 *
1113 * If the zone was previously in an "all pages pinned" state then look to 1113 * If the zone was previously in an "all pages pinned" state then look to
1114 * see if this freeing clears that state. 1114 * see if this freeing clears that state.
1115 * 1115 *
1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are 1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are
1117 * pinned" detection logic. 1117 * pinned" detection logic.
1118 */ 1118 */
1119 static void free_pcppages_bulk(struct zone *zone, int count, 1119 static void free_pcppages_bulk(struct zone *zone, int count,
1120 struct per_cpu_pages *pcp) 1120 struct per_cpu_pages *pcp)
1121 { 1121 {
1122 int migratetype = 0; 1122 int migratetype = 0;
1123 int batch_free = 0; 1123 int batch_free = 0;
1124 bool isolated_pageblocks; 1124 bool isolated_pageblocks;
1125 1125
1126 spin_lock(&zone->lock); 1126 spin_lock(&zone->lock);
1127 isolated_pageblocks = has_isolate_pageblock(zone); 1127 isolated_pageblocks = has_isolate_pageblock(zone);
1128 1128
1129 while (count) { 1129 while (count) {
1130 struct page *page; 1130 struct page *page;
1131 struct list_head *list; 1131 struct list_head *list;
1132 1132
1133 /* 1133 /*
1134 * Remove pages from lists in a round-robin fashion. A 1134 * Remove pages from lists in a round-robin fashion. A
1135 * batch_free count is maintained that is incremented when an 1135 * batch_free count is maintained that is incremented when an
1136 * empty list is encountered. This is so more pages are freed 1136 * empty list is encountered. This is so more pages are freed
1137 * off fuller lists instead of spinning excessively around empty 1137 * off fuller lists instead of spinning excessively around empty
1138 * lists 1138 * lists
1139 */ 1139 */
1140 do { 1140 do {
1141 batch_free++; 1141 batch_free++;
1142 if (++migratetype == MIGRATE_PCPTYPES) 1142 if (++migratetype == MIGRATE_PCPTYPES)
1143 migratetype = 0; 1143 migratetype = 0;
1144 list = &pcp->lists[migratetype]; 1144 list = &pcp->lists[migratetype];
1145 } while (list_empty(list)); 1145 } while (list_empty(list));
1146 1146
1147 /* This is the only non-empty list. Free them all. */ 1147 /* This is the only non-empty list. Free them all. */
1148 if (batch_free == MIGRATE_PCPTYPES) 1148 if (batch_free == MIGRATE_PCPTYPES)
1149 batch_free = count; 1149 batch_free = count;
1150 1150
1151 do { 1151 do {
1152 int mt; /* migratetype of the to-be-freed page */ 1152 int mt; /* migratetype of the to-be-freed page */
1153 1153
1154 page = list_last_entry(list, struct page, lru); 1154 page = list_last_entry(list, struct page, lru);
1155 /* must delete as __free_one_page list manipulates */ 1155 /* must delete as __free_one_page list manipulates */
1156 list_del(&page->lru); 1156 list_del(&page->lru);
1157 1157
1158 mt = get_pcppage_migratetype(page); 1158 mt = get_pcppage_migratetype(page);
1159 /* MIGRATE_ISOLATE page should not go to pcplists */ 1159 /* MIGRATE_ISOLATE page should not go to pcplists */
1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); 1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1161 /* Pageblock could have been isolated meanwhile */ 1161 /* Pageblock could have been isolated meanwhile */
1162 if (unlikely(isolated_pageblocks)) 1162 if (unlikely(isolated_pageblocks))
1163 mt = get_pageblock_migratetype(page); 1163 mt = get_pageblock_migratetype(page);
1164 1164
1165 if (bulkfree_pcp_prepare(page)) 1165 if (bulkfree_pcp_prepare(page))
1166 continue; 1166 continue;
1167 1167
1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1169 trace_mm_page_pcpu_drain(page, 0, mt); 1169 trace_mm_page_pcpu_drain(page, 0, mt);
1170 } while (--count && --batch_free && !list_empty(list)); 1170 } while (--count && --batch_free && !list_empty(list));
1171 } 1171 }
1172 spin_unlock(&zone->lock); 1172 spin_unlock(&zone->lock);
1173 } 1173 }
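/*
 * The round-robin policy of free_pcppages_bulk() above, reduced to counters
 * in a standalone sketch (not part of page_alloc.c): rotate through the
 * per-migratetype lists, and every empty list visited bumps batch_free so
 * that fuller lists give up proportionally more pages per pass. List contents
 * are plain integers here, and NR_TYPES is an assumed stand-in for
 * MIGRATE_PCPTYPES; count must not exceed the total number of queued pages.
 */
#include <stdio.h>

#define NR_TYPES 3

int main(void)
{
	int list[NR_TYPES] = { 5, 0, 2 };	/* pages queued per pcp list */
	int count = 6;				/* pages we want to free */
	int mt = 0;

	while (count) {
		int batch_free = 0;

		/* Find the next non-empty list, counting the empty ones seen. */
		do {
			batch_free++;
			mt = (mt + 1) % NR_TYPES;
		} while (list[mt] == 0);

		/* Drain up to batch_free pages from that list. */
		while (count && batch_free-- && list[mt]) {
			list[mt]--;
			count--;
			printf("freed one page from list %d\n", mt);
		}
	}
	return 0;
}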
1174 1174
1175 static void free_one_page(struct zone *zone, 1175 static void free_one_page(struct zone *zone,
1176 struct page *page, unsigned long pfn, 1176 struct page *page, unsigned long pfn,
1177 unsigned int order, 1177 unsigned int order,
1178 int migratetype) 1178 int migratetype)
1179 { 1179 {
1180 spin_lock(&zone->lock); 1180 spin_lock(&zone->lock);
1181 if (unlikely(has_isolate_pageblock(zone) || 1181 if (unlikely(has_isolate_pageblock(zone) ||
1182 is_migrate_isolate(migratetype))) { 1182 is_migrate_isolate(migratetype))) {
1183 migratetype = get_pfnblock_migratetype(page, pfn); 1183 migratetype = get_pfnblock_migratetype(page, pfn);
1184 } 1184 }
1185 __free_one_page(page, pfn, zone, order, migratetype); 1185 __free_one_page(page, pfn, zone, order, migratetype);
1186 spin_unlock(&zone->lock); 1186 spin_unlock(&zone->lock);
1187 } 1187 }
1188 1188
1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1190 unsigned long zone, int nid) 1190 unsigned long zone, int nid)
1191 { 1191 {
1192 set_page_links(page, zone, nid, pfn); 1192 set_page_links(page, zone, nid, pfn);
1193 init_page_count(page); 1193 init_page_count(page);
1194 page_mapcount_reset(page); 1194 page_mapcount_reset(page);
1195 page_cpupid_reset_last(page); 1195 page_cpupid_reset_last(page);
1196 1196
1197 INIT_LIST_HEAD(&page->lru); 1197 INIT_LIST_HEAD(&page->lru);
1198 #ifdef WANT_PAGE_VIRTUAL 1198 #ifdef WANT_PAGE_VIRTUAL
1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1200 if (!is_highmem_idx(zone)) 1200 if (!is_highmem_idx(zone))
1201 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1201 set_page_address(page, __va(pfn << PAGE_SHIFT));
1202 #endif 1202 #endif
1203 } 1203 }
1204 1204
1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, 1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1206 int nid) 1206 int nid)
1207 { 1207 {
1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); 1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
1209 } 1209 }
1210 1210
1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1212 static void __meminit init_reserved_page(unsigned long pfn) 1212 static void __meminit init_reserved_page(unsigned long pfn)
1213 { 1213 {
1214 pg_data_t *pgdat; 1214 pg_data_t *pgdat;
1215 int nid, zid; 1215 int nid, zid;
1216 1216
1217 if (!early_page_uninitialised(pfn)) 1217 if (!early_page_uninitialised(pfn))
1218 return; 1218 return;
1219 1219
1220 nid = early_pfn_to_nid(pfn); 1220 nid = early_pfn_to_nid(pfn);
1221 pgdat = NODE_DATA(nid); 1221 pgdat = NODE_DATA(nid);
1222 1222
1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1224 struct zone *zone = &pgdat->node_zones[zid]; 1224 struct zone *zone = &pgdat->node_zones[zid];
1225 1225
1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) 1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1227 break; 1227 break;
1228 } 1228 }
1229 __init_single_pfn(pfn, zid, nid); 1229 __init_single_pfn(pfn, zid, nid);
1230 } 1230 }
1231 #else 1231 #else
1232 static inline void init_reserved_page(unsigned long pfn) 1232 static inline void init_reserved_page(unsigned long pfn)
1233 { 1233 {
1234 } 1234 }
1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1236 1236
1237 /* 1237 /*
1238 * Initialised pages do not have PageReserved set. This function is 1238 * Initialised pages do not have PageReserved set. This function is
1239 * called for each range allocated by the bootmem allocator and 1239 * called for each range allocated by the bootmem allocator and
1240 * marks the pages PageReserved. The remaining valid pages are later 1240 * marks the pages PageReserved. The remaining valid pages are later
1241 * sent to the buddy page allocator. 1241 * sent to the buddy page allocator.
1242 */ 1242 */
1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1244 { 1244 {
1245 unsigned long start_pfn = PFN_DOWN(start); 1245 unsigned long start_pfn = PFN_DOWN(start);
1246 unsigned long end_pfn = PFN_UP(end); 1246 unsigned long end_pfn = PFN_UP(end);
1247 1247
1248 for (; start_pfn < end_pfn; start_pfn++) { 1248 for (; start_pfn < end_pfn; start_pfn++) {
1249 if (pfn_valid(start_pfn)) { 1249 if (pfn_valid(start_pfn)) {
1250 struct page *page = pfn_to_page(start_pfn); 1250 struct page *page = pfn_to_page(start_pfn);
1251 1251
1252 init_reserved_page(start_pfn); 1252 init_reserved_page(start_pfn);
1253 1253
1254 /* Avoid false-positive PageTail() */ 1254 /* Avoid false-positive PageTail() */
1255 INIT_LIST_HEAD(&page->lru); 1255 INIT_LIST_HEAD(&page->lru);
1256 1256
1257 SetPageReserved(page); 1257 SetPageReserved(page);
1258 } 1258 }
1259 } 1259 }
1260 } 1260 }
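/*
 * A sketch (not part of page_alloc.c) of the PFN_DOWN()/PFN_UP() rounding
 * used by reserve_bootmem_region() above, assuming 4 KiB pages: the start of
 * the byte range rounds down to its containing frame, and the end rounds up
 * so a partially covered final frame is still reserved. pfn_down()/pfn_up()
 * are local stand-ins for the kernel macros.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static unsigned long pfn_down(unsigned long phys)
{
	return phys >> PAGE_SHIFT;
}

static unsigned long pfn_up(unsigned long phys)
{
	return (phys + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long start = 0x1800;	/* middle of frame 1 */
	unsigned long end = 0x3004;	/* just into frame 3 */

	/* Prints "reserve pfns [1, 4)": frames 1, 2 and 3 get reserved. */
	printf("reserve pfns [%lu, %lu)\n", pfn_down(start), pfn_up(end));
	return 0;
}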
1261 1261
1262 static void __free_pages_ok(struct page *page, unsigned int order) 1262 static void __free_pages_ok(struct page *page, unsigned int order)
1263 { 1263 {
1264 unsigned long flags; 1264 unsigned long flags;
1265 int migratetype; 1265 int migratetype;
1266 unsigned long pfn = page_to_pfn(page); 1266 unsigned long pfn = page_to_pfn(page);
1267 1267
1268 if (!free_pages_prepare(page, order, true)) 1268 if (!free_pages_prepare(page, order, true))
1269 return; 1269 return;
1270 1270
1271 migratetype = get_pfnblock_migratetype(page, pfn); 1271 migratetype = get_pfnblock_migratetype(page, pfn);
1272 local_irq_save(flags); 1272 local_irq_save(flags);
1273 __count_vm_events(PGFREE, 1 << order); 1273 __count_vm_events(PGFREE, 1 << order);
1274 free_one_page(page_zone(page), page, pfn, order, migratetype); 1274 free_one_page(page_zone(page), page, pfn, order, migratetype);
1275 local_irq_restore(flags); 1275 local_irq_restore(flags);
1276 } 1276 }
1277 1277
1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1279 { 1279 {
1280 unsigned int nr_pages = 1 << order; 1280 unsigned int nr_pages = 1 << order;
1281 struct page *p = page; 1281 struct page *p = page;
1282 unsigned int loop; 1282 unsigned int loop;
1283 1283
1284 prefetchw(p); 1284 prefetchw(p);
1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1286 prefetchw(p + 1); 1286 prefetchw(p + 1);
1287 __ClearPageReserved(p); 1287 __ClearPageReserved(p);
1288 set_page_count(p, 0); 1288 set_page_count(p, 0);
1289 } 1289 }
1290 __ClearPageReserved(p); 1290 __ClearPageReserved(p);
1291 set_page_count(p, 0); 1291 set_page_count(p, 0);
1292 1292
1293 page_zone(page)->managed_pages += nr_pages; 1293 page_zone(page)->managed_pages += nr_pages;
1294 set_page_refcounted(page); 1294 set_page_refcounted(page);
1295 __free_pages(page, order); 1295 __free_pages(page, order);
1296 } 1296 }
1297 1297
1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ 1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1300 1300
1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1302 1302
1303 int __meminit early_pfn_to_nid(unsigned long pfn) 1303 int __meminit early_pfn_to_nid(unsigned long pfn)
1304 { 1304 {
1305 static DEFINE_SPINLOCK(early_pfn_lock); 1305 static DEFINE_SPINLOCK(early_pfn_lock);
1306 int nid; 1306 int nid;
1307 1307
1308 spin_lock(&early_pfn_lock); 1308 spin_lock(&early_pfn_lock);
1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1310 if (nid < 0) 1310 if (nid < 0)
1311 nid = first_online_node; 1311 nid = first_online_node;
1312 spin_unlock(&early_pfn_lock); 1312 spin_unlock(&early_pfn_lock);
1313 1313
1314 return nid; 1314 return nid;
1315 } 1315 }
1316 #endif 1316 #endif
1317 1317
1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1319 static inline bool __meminit __maybe_unused 1319 static inline bool __meminit __maybe_unused
1320 meminit_pfn_in_nid(unsigned long pfn, int node, 1320 meminit_pfn_in_nid(unsigned long pfn, int node,
1321 struct mminit_pfnnid_cache *state) 1321 struct mminit_pfnnid_cache *state)
1322 { 1322 {
1323 int nid; 1323 int nid;
1324 1324
1325 nid = __early_pfn_to_nid(pfn, state); 1325 nid = __early_pfn_to_nid(pfn, state);
1326 if (nid >= 0 && nid != node) 1326 if (nid >= 0 && nid != node)
1327 return false; 1327 return false;
1328 return true; 1328 return true;
1329 } 1329 }
1330 1330
1331 /* Only safe to use early in boot when initialisation is single-threaded */ 1331 /* Only safe to use early in boot when initialisation is single-threaded */
1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1333 { 1333 {
1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); 1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1335 } 1335 }
1336 1336
1337 #else 1337 #else
1338 1338
1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1340 { 1340 {
1341 return true; 1341 return true;
1342 } 1342 }
1343 static inline bool __meminit __maybe_unused 1343 static inline bool __meminit __maybe_unused
1344 meminit_pfn_in_nid(unsigned long pfn, int node, 1344 meminit_pfn_in_nid(unsigned long pfn, int node,
1345 struct mminit_pfnnid_cache *state) 1345 struct mminit_pfnnid_cache *state)
1346 { 1346 {
1347 return true; 1347 return true;
1348 } 1348 }
1349 #endif 1349 #endif
1350 1350
1351 1351
1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn, 1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1353 unsigned int order) 1353 unsigned int order)
1354 { 1354 {
1355 if (early_page_uninitialised(pfn)) 1355 if (early_page_uninitialised(pfn))
1356 return; 1356 return;
1357 return __free_pages_boot_core(page, order); 1357 return __free_pages_boot_core(page, order);
1358 } 1358 }
1359 1359
1360 /* 1360 /*
1361 * Check that the whole (or subset of) a pageblock given by the interval of 1361 * Check that the whole (or subset of) a pageblock given by the interval of
1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it 1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1363 * with the migration or free compaction scanner. The scanners then need to 1363 * with the migration or free compaction scanner. The scanners then need to
1364 * use only pfn_valid_within() check for arches that allow holes within 1364 * use only pfn_valid_within() check for arches that allow holes within
1365 * pageblocks. 1365 * pageblocks.
1366 * 1366 *
1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1368 * 1368 *
1369 * It's possible on some configurations to have a setup like node0 node1 node0 1369 * It's possible on some configurations to have a setup like node0 node1 node0
1370 * i.e. it's possible that all pages within a zone's range of pages do not 1370 * i.e. it's possible that all pages within a zone's range of pages do not
1371 * belong to a single zone. We assume that a border between node0 and node1 1371 * belong to a single zone. We assume that a border between node0 and node1
1372 * can occur within a single pageblock, but not a node0 node1 node0 1372 * can occur within a single pageblock, but not a node0 node1 node0
1373 * interleaving within a single pageblock. It is therefore sufficient to check 1373 * interleaving within a single pageblock. It is therefore sufficient to check
1374 * the first and last page of a pageblock and avoid checking each individual 1374 * the first and last page of a pageblock and avoid checking each individual
1375 * page in a pageblock. 1375 * page in a pageblock.
1376 */ 1376 */
1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1378 unsigned long end_pfn, struct zone *zone) 1378 unsigned long end_pfn, struct zone *zone)
1379 { 1379 {
1380 struct page *start_page; 1380 struct page *start_page;
1381 struct page *end_page; 1381 struct page *end_page;
1382 1382
1383 /* end_pfn is one past the range we are checking */ 1383 /* end_pfn is one past the range we are checking */
1384 end_pfn--; 1384 end_pfn--;
1385 1385
1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1387 return NULL; 1387 return NULL;
1388 1388
1389 start_page = pfn_to_online_page(start_pfn); 1389 start_page = pfn_to_online_page(start_pfn);
1390 if (!start_page) 1390 if (!start_page)
1391 return NULL; 1391 return NULL;
1392 1392
1393 if (page_zone(start_page) != zone) 1393 if (page_zone(start_page) != zone)
1394 return NULL; 1394 return NULL;
1395 1395
1396 end_page = pfn_to_page(end_pfn); 1396 end_page = pfn_to_page(end_pfn);
1397 1397
1398 /* This gives shorter code than deriving page_zone(end_page) */ 1398 /* This gives shorter code than deriving page_zone(end_page) */
1399 if (page_zone_id(start_page) != page_zone_id(end_page)) 1399 if (page_zone_id(start_page) != page_zone_id(end_page))
1400 return NULL; 1400 return NULL;
1401 1401
1402 return start_page; 1402 return start_page;
1403 } 1403 }
1404 1404
1405 void set_zone_contiguous(struct zone *zone) 1405 void set_zone_contiguous(struct zone *zone)
1406 { 1406 {
1407 unsigned long block_start_pfn = zone->zone_start_pfn; 1407 unsigned long block_start_pfn = zone->zone_start_pfn;
1408 unsigned long block_end_pfn; 1408 unsigned long block_end_pfn;
1409 1409
1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1411 for (; block_start_pfn < zone_end_pfn(zone); 1411 for (; block_start_pfn < zone_end_pfn(zone);
1412 block_start_pfn = block_end_pfn, 1412 block_start_pfn = block_end_pfn,
1413 block_end_pfn += pageblock_nr_pages) { 1413 block_end_pfn += pageblock_nr_pages) {
1414 1414
1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1416 1416
1417 if (!__pageblock_pfn_to_page(block_start_pfn, 1417 if (!__pageblock_pfn_to_page(block_start_pfn,
1418 block_end_pfn, zone)) 1418 block_end_pfn, zone))
1419 return; 1419 return;
1420 } 1420 }
1421 1421
1422 /* We confirm that there is no hole */ 1422 /* We confirm that there is no hole */
1423 zone->contiguous = true; 1423 zone->contiguous = true;
1424 } 1424 }
1425 1425
1426 void clear_zone_contiguous(struct zone *zone) 1426 void clear_zone_contiguous(struct zone *zone)
1427 { 1427 {
1428 zone->contiguous = false; 1428 zone->contiguous = false;
1429 } 1429 }
1430 1430
1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1432 static void __init deferred_free_range(struct page *page, 1432 static void __init deferred_free_range(struct page *page,
1433 unsigned long pfn, int nr_pages) 1433 unsigned long pfn, int nr_pages)
1434 { 1434 {
1435 int i; 1435 int i;
1436 1436
1437 if (!page) 1437 if (!page)
1438 return; 1438 return;
1439 1439
1440 /* Free a large naturally-aligned chunk if possible */ 1440 /* Free a large naturally-aligned chunk if possible */
1441 if (nr_pages == pageblock_nr_pages && 1441 if (nr_pages == pageblock_nr_pages &&
1442 (pfn & (pageblock_nr_pages - 1)) == 0) { 1442 (pfn & (pageblock_nr_pages - 1)) == 0) {
1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1444 __free_pages_boot_core(page, pageblock_order); 1444 __free_pages_boot_core(page, pageblock_order);
1445 return; 1445 return;
1446 } 1446 }
1447 1447
1448 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1448 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1449 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1449 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1451 __free_pages_boot_core(page, 0); 1451 __free_pages_boot_core(page, 0);
1452 } 1452 }
1453 } 1453 }
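/*
 * A sketch (not part of page_alloc.c) of the "naturally aligned" test used
 * by deferred_free_range() above: a run of pages can be handed to the buddy
 * allocator as one pageblock only if it is a full pageblock long and starts
 * on a pageblock boundary. PAGEBLOCK_NR_PAGES below is an assumed value
 * (order-9 pageblocks, as on many configurations), not the kernel constant.
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* 2 MiB worth of 4 KiB pages */

static bool whole_pageblock(unsigned long pfn, unsigned long nr_pages)
{
	return nr_pages == PAGEBLOCK_NR_PAGES &&
	       (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       whole_pageblock(1024, 512),	/* aligned, full block: 1 */
	       whole_pageblock(1040, 512),	/* misaligned start:    0 */
	       whole_pageblock(1024, 300));	/* short run:           0 */
	return 0;
}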
1454 1454
1455 /* Completion tracking for deferred_init_memmap() threads */ 1455 /* Completion tracking for deferred_init_memmap() threads */
1456 static atomic_t pgdat_init_n_undone __initdata; 1456 static atomic_t pgdat_init_n_undone __initdata;
1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1458 1458
1459 static inline void __init pgdat_init_report_one_done(void) 1459 static inline void __init pgdat_init_report_one_done(void)
1460 { 1460 {
1461 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1461 if (atomic_dec_and_test(&pgdat_init_n_undone))
1462 complete(&pgdat_init_all_done_comp); 1462 complete(&pgdat_init_all_done_comp);
1463 } 1463 }
1464 1464
1465 /* Initialise remaining memory on a node */ 1465 /* Initialise remaining memory on a node */
1466 static int __init deferred_init_memmap(void *data) 1466 static int __init deferred_init_memmap(void *data)
1467 { 1467 {
1468 pg_data_t *pgdat = data; 1468 pg_data_t *pgdat = data;
1469 int nid = pgdat->node_id; 1469 int nid = pgdat->node_id;
1470 struct mminit_pfnnid_cache nid_init_state = { }; 1470 struct mminit_pfnnid_cache nid_init_state = { };
1471 unsigned long start = jiffies; 1471 unsigned long start = jiffies;
1472 unsigned long nr_pages = 0; 1472 unsigned long nr_pages = 0;
1473 unsigned long walk_start, walk_end; 1473 unsigned long walk_start, walk_end;
1474 int i, zid; 1474 int i, zid;
1475 struct zone *zone; 1475 struct zone *zone;
1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1478 1478
1479 if (first_init_pfn == ULONG_MAX) { 1479 if (first_init_pfn == ULONG_MAX) {
1480 pgdat_init_report_one_done(); 1480 pgdat_init_report_one_done();
1481 return 0; 1481 return 0;
1482 } 1482 }
1483 1483
1484 /* Bind memory initialisation thread to a local node if possible */ 1484 /* Bind memory initialisation thread to a local node if possible */
1485 if (!cpumask_empty(cpumask)) 1485 if (!cpumask_empty(cpumask))
1486 set_cpus_allowed_ptr(current, cpumask); 1486 set_cpus_allowed_ptr(current, cpumask);
1487 1487
1488 /* Sanity check boundaries */ 1488 /* Sanity check boundaries */
1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1491 pgdat->first_deferred_pfn = ULONG_MAX; 1491 pgdat->first_deferred_pfn = ULONG_MAX;
1492 1492
1493 /* Only the highest zone is deferred so find it */ 1493 /* Only the highest zone is deferred so find it */
1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1495 zone = pgdat->node_zones + zid; 1495 zone = pgdat->node_zones + zid;
1496 if (first_init_pfn < zone_end_pfn(zone)) 1496 if (first_init_pfn < zone_end_pfn(zone))
1497 break; 1497 break;
1498 } 1498 }
1499 1499
1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
1501 unsigned long pfn, end_pfn; 1501 unsigned long pfn, end_pfn;
1502 struct page *page = NULL; 1502 struct page *page = NULL;
1503 struct page *free_base_page = NULL; 1503 struct page *free_base_page = NULL;
1504 unsigned long free_base_pfn = 0; 1504 unsigned long free_base_pfn = 0;
1505 int nr_to_free = 0; 1505 int nr_to_free = 0;
1506 1506
1507 end_pfn = min(walk_end, zone_end_pfn(zone)); 1507 end_pfn = min(walk_end, zone_end_pfn(zone));
1508 pfn = first_init_pfn; 1508 pfn = first_init_pfn;
1509 if (pfn < walk_start) 1509 if (pfn < walk_start)
1510 pfn = walk_start; 1510 pfn = walk_start;
1511 if (pfn < zone->zone_start_pfn) 1511 if (pfn < zone->zone_start_pfn)
1512 pfn = zone->zone_start_pfn; 1512 pfn = zone->zone_start_pfn;
1513 1513
1514 for (; pfn < end_pfn; pfn++) { 1514 for (; pfn < end_pfn; pfn++) {
1515 if (!pfn_valid_within(pfn)) 1515 if (!pfn_valid_within(pfn))
1516 goto free_range; 1516 goto free_range;
1517 1517
1518 /* 1518 /*
1519 * Ensure pfn_valid is checked every 1519 * Ensure pfn_valid is checked every
1520 * pageblock_nr_pages for memory holes 1520 * pageblock_nr_pages for memory holes
1521 */ 1521 */
1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) { 1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1523 if (!pfn_valid(pfn)) { 1523 if (!pfn_valid(pfn)) {
1524 page = NULL; 1524 page = NULL;
1525 goto free_range; 1525 goto free_range;
1526 } 1526 }
1527 } 1527 }
1528 1528
1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { 1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 page = NULL; 1530 page = NULL;
1531 goto free_range; 1531 goto free_range;
1532 } 1532 }
1533 1533
1534 /* Minimise pfn page lookups and scheduler checks */ 1534 /* Minimise pfn page lookups and scheduler checks */
1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { 1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1536 page++; 1536 page++;
1537 } else { 1537 } else {
1538 nr_pages += nr_to_free; 1538 nr_pages += nr_to_free;
1539 deferred_free_range(free_base_page, 1539 deferred_free_range(free_base_page,
1540 free_base_pfn, nr_to_free); 1540 free_base_pfn, nr_to_free);
1541 free_base_page = NULL; 1541 free_base_page = NULL;
1542 free_base_pfn = nr_to_free = 0; 1542 free_base_pfn = nr_to_free = 0;
1543 1543
1544 page = pfn_to_page(pfn); 1544 page = pfn_to_page(pfn);
1545 cond_resched(); 1545 cond_resched();
1546 } 1546 }
1547 1547
1548 if (page->flags) { 1548 if (page->flags) {
1549 VM_BUG_ON(page_zone(page) != zone); 1549 VM_BUG_ON(page_zone(page) != zone);
1550 goto free_range; 1550 goto free_range;
1551 } 1551 }
1552 1552
1553 __init_single_page(page, pfn, zid, nid); 1553 __init_single_page(page, pfn, zid, nid);
1554 if (!free_base_page) { 1554 if (!free_base_page) {
1555 free_base_page = page; 1555 free_base_page = page;
1556 free_base_pfn = pfn; 1556 free_base_pfn = pfn;
1557 nr_to_free = 0; 1557 nr_to_free = 0;
1558 } 1558 }
1559 nr_to_free++; 1559 nr_to_free++;
1560 1560
1561 /* Where possible, batch up pages for a single free */ 1561 /* Where possible, batch up pages for a single free */
1562 continue; 1562 continue;
1563 free_range: 1563 free_range:
1564 /* Free the current block of pages to allocator */ 1564 /* Free the current block of pages to allocator */
1565 nr_pages += nr_to_free; 1565 nr_pages += nr_to_free;
1566 deferred_free_range(free_base_page, free_base_pfn, 1566 deferred_free_range(free_base_page, free_base_pfn,
1567 nr_to_free); 1567 nr_to_free);
1568 free_base_page = NULL; 1568 free_base_page = NULL;
1569 free_base_pfn = nr_to_free = 0; 1569 free_base_pfn = nr_to_free = 0;
1570 } 1570 }
1571 /* Free the last block of pages to allocator */ 1571 /* Free the last block of pages to allocator */
1572 nr_pages += nr_to_free; 1572 nr_pages += nr_to_free;
1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free); 1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1574 1574
1575 first_init_pfn = max(end_pfn, first_init_pfn); 1575 first_init_pfn = max(end_pfn, first_init_pfn);
1576 } 1576 }
1577 1577
1578 /* Sanity check that the next zone really is unpopulated */ 1578 /* Sanity check that the next zone really is unpopulated */
1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1580 1580
1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, 1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1582 jiffies_to_msecs(jiffies - start)); 1582 jiffies_to_msecs(jiffies - start));
1583 1583
1584 pgdat_init_report_one_done(); 1584 pgdat_init_report_one_done();
1585 return 0; 1585 return 0;
1586 } 1586 }
1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1588 1588
1589 void __init page_alloc_init_late(void) 1589 void __init page_alloc_init_late(void)
1590 { 1590 {
1591 struct zone *zone; 1591 struct zone *zone;
1592 1592
1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1594 int nid; 1594 int nid;
1595 1595
1596 /* There will be num_node_state(N_MEMORY) threads */ 1596 /* There will be num_node_state(N_MEMORY) threads */
1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1598 for_each_node_state(nid, N_MEMORY) { 1598 for_each_node_state(nid, N_MEMORY) {
1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1600 } 1600 }
1601 1601
1602 /* Block until all are initialised */ 1602 /* Block until all are initialised */
1603 wait_for_completion(&pgdat_init_all_done_comp); 1603 wait_for_completion(&pgdat_init_all_done_comp);
1604 1604
1605 /* Reinit limits that are based on free pages after the kernel is up */ 1605 /* Reinit limits that are based on free pages after the kernel is up */
1606 files_maxfiles_init(); 1606 files_maxfiles_init();
1607 #endif 1607 #endif
1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
1609 /* Discard memblock private memory */ 1609 /* Discard memblock private memory */
1610 memblock_discard(); 1610 memblock_discard();
1611 #endif 1611 #endif
1612 1612
1613 for_each_populated_zone(zone) 1613 for_each_populated_zone(zone)
1614 set_zone_contiguous(zone); 1614 set_zone_contiguous(zone);
1615 } 1615 }
1616 1616
1617 #ifdef CONFIG_CMA 1617 #ifdef CONFIG_CMA
1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1619 void __init init_cma_reserved_pageblock(struct page *page) 1619 void __init init_cma_reserved_pageblock(struct page *page)
1620 { 1620 {
1621 unsigned i = pageblock_nr_pages; 1621 unsigned i = pageblock_nr_pages;
1622 struct page *p = page; 1622 struct page *p = page;
1623 1623
1624 do { 1624 do {
1625 __ClearPageReserved(p); 1625 __ClearPageReserved(p);
1626 set_page_count(p, 0); 1626 set_page_count(p, 0);
1627 } while (++p, --i); 1627 } while (++p, --i);
1628 1628
1629 set_pageblock_migratetype(page, MIGRATE_CMA); 1629 set_pageblock_migratetype(page, MIGRATE_CMA);
1630 1630
1631 if (pageblock_order >= MAX_ORDER) { 1631 if (pageblock_order >= MAX_ORDER) {
1632 i = pageblock_nr_pages; 1632 i = pageblock_nr_pages;
1633 p = page; 1633 p = page;
1634 do { 1634 do {
1635 set_page_refcounted(p); 1635 set_page_refcounted(p);
1636 __free_pages(p, MAX_ORDER - 1); 1636 __free_pages(p, MAX_ORDER - 1);
1637 p += MAX_ORDER_NR_PAGES; 1637 p += MAX_ORDER_NR_PAGES;
1638 } while (i -= MAX_ORDER_NR_PAGES); 1638 } while (i -= MAX_ORDER_NR_PAGES);
1639 } else { 1639 } else {
1640 set_page_refcounted(page); 1640 set_page_refcounted(page);
1641 __free_pages(page, pageblock_order); 1641 __free_pages(page, pageblock_order);
1642 } 1642 }
1643 1643
1644 adjust_managed_page_count(page, pageblock_nr_pages); 1644 adjust_managed_page_count(page, pageblock_nr_pages);
1645 } 1645 }
1646 #endif 1646 #endif
1647 1647
1648 /* 1648 /*
1649 * The order of subdivision here is critical for the IO subsystem. 1649 * The order of subdivision here is critical for the IO subsystem.
1650 * Please do not alter this order without good reasons and regression 1650 * Please do not alter this order without good reasons and regression
1651 * testing. Specifically, as large blocks of memory are subdivided, 1651 * testing. Specifically, as large blocks of memory are subdivided,
1652 * the order in which smaller blocks are delivered depends on the order 1652 * the order in which smaller blocks are delivered depends on the order
1653 * they're subdivided in this function. This is the primary factor 1653 * they're subdivided in this function. This is the primary factor
1654 * influencing the order in which pages are delivered to the IO 1654 * influencing the order in which pages are delivered to the IO
1655 * subsystem according to empirical testing, and this is also justified 1655 * subsystem according to empirical testing, and this is also justified
1656 * by considering the behavior of a buddy system containing a single 1656 * by considering the behavior of a buddy system containing a single
1657 * large block of memory acted on by a series of small allocations. 1657 * large block of memory acted on by a series of small allocations.
1658 * This behavior is a critical factor in sglist merging's success. 1658 * This behavior is a critical factor in sglist merging's success.
1659 * 1659 *
1660 * -- nyc 1660 * -- nyc
1661 */ 1661 */
1662 static inline void expand(struct zone *zone, struct page *page, 1662 static inline void expand(struct zone *zone, struct page *page,
1663 int low, int high, struct free_area *area, 1663 int low, int high, struct free_area *area,
1664 int migratetype) 1664 int migratetype)
1665 { 1665 {
1666 unsigned long size = 1 << high; 1666 unsigned long size = 1 << high;
1667 1667
1668 while (high > low) { 1668 while (high > low) {
1669 area--; 1669 area--;
1670 high--; 1670 high--;
1671 size >>= 1; 1671 size >>= 1;
1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1673 1673
1674 /* 1674 /*
1675 * Mark as guard pages (or page); this allows them to 1675 * Mark as guard pages (or page); this allows them to
1676 * merge back into the allocator when the buddy is freed. 1676 * merge back into the allocator when the buddy is freed.
1677 * Corresponding page table entries will not be touched, 1677 * Corresponding page table entries will not be touched,
1678 * pages will stay not present in virtual address space 1678 * pages will stay not present in virtual address space
1679 */ 1679 */
1680 if (set_page_guard(zone, &page[size], high, migratetype)) 1680 if (set_page_guard(zone, &page[size], high, migratetype))
1681 continue; 1681 continue;
1682 1682
1683 list_add(&page[size].lru, &area->free_list[migratetype]); 1683 list_add(&page[size].lru, &area->free_list[migratetype]);
1684 area->nr_free++; 1684 area->nr_free++;
1685 set_page_order(&page[size], high); 1685 set_page_order(&page[size], high);
1686 } 1686 }
1687 } 1687 }
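/*
 * What expand() above does to page counts, as a standalone sketch (not part
 * of page_alloc.c): an order-`high` block is repeatedly halved, with the
 * upper half parked on the free list of each intermediate order, until an
 * order-`low` chunk is left for the caller. Only the sizes are modelled here;
 * the real code also links pages into free lists and may turn a split-off
 * half into debug guard pages.
 */
#include <stdio.h>

int main(void)
{
	unsigned int low = 0, high = 3;	/* caller wants order 0 from an order-3 block */
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put %lu pages back on the order-%u free list\n",
		       size, high);
	}
	printf("hand %lu page(s) (order %u) to the caller\n", 1UL << low, low);
	return 0;
}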
1688 1688
1689 static void check_new_page_bad(struct page *page) 1689 static void check_new_page_bad(struct page *page)
1690 { 1690 {
1691 const char *bad_reason = NULL; 1691 const char *bad_reason = NULL;
1692 unsigned long bad_flags = 0; 1692 unsigned long bad_flags = 0;
1693 1693
1694 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1694 if (unlikely(atomic_read(&page->_mapcount) != -1))
1695 bad_reason = "nonzero mapcount"; 1695 bad_reason = "nonzero mapcount";
1696 if (unlikely(page->mapping != NULL)) 1696 if (unlikely(page->mapping != NULL))
1697 bad_reason = "non-NULL mapping"; 1697 bad_reason = "non-NULL mapping";
1698 if (unlikely(page_ref_count(page) != 0)) 1698 if (unlikely(page_ref_count(page) != 0))
1699 bad_reason = "nonzero _count"; 1699 bad_reason = "nonzero _count";
1700 if (unlikely(page->flags & __PG_HWPOISON)) { 1700 if (unlikely(page->flags & __PG_HWPOISON)) {
1701 bad_reason = "HWPoisoned (hardware-corrupted)"; 1701 bad_reason = "HWPoisoned (hardware-corrupted)";
1702 bad_flags = __PG_HWPOISON; 1702 bad_flags = __PG_HWPOISON;
1703 /* Don't complain about hwpoisoned pages */ 1703 /* Don't complain about hwpoisoned pages */
1704 page_mapcount_reset(page); /* remove PageBuddy */ 1704 page_mapcount_reset(page); /* remove PageBuddy */
1705 return; 1705 return;
1706 } 1706 }
1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { 1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1710 } 1710 }
1711 #ifdef CONFIG_MEMCG 1711 #ifdef CONFIG_MEMCG
1712 if (unlikely(page->mem_cgroup)) 1712 if (unlikely(page->mem_cgroup))
1713 bad_reason = "page still charged to cgroup"; 1713 bad_reason = "page still charged to cgroup";
1714 #endif 1714 #endif
1715 bad_page(page, bad_reason, bad_flags); 1715 bad_page(page, bad_reason, bad_flags);
1716 } 1716 }
1717 1717
1718 /* 1718 /*
1719 * This page is about to be returned from the page allocator 1719 * This page is about to be returned from the page allocator
1720 */ 1720 */
1721 static inline int check_new_page(struct page *page) 1721 static inline int check_new_page(struct page *page)
1722 { 1722 {
1723 if (likely(page_expected_state(page, 1723 if (likely(page_expected_state(page,
1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1725 return 0; 1725 return 0;
1726 1726
1727 check_new_page_bad(page); 1727 check_new_page_bad(page);
1728 return 1; 1728 return 1;
1729 } 1729 }
1730 1730
1731 static inline bool free_pages_prezeroed(void) 1731 static inline bool free_pages_prezeroed(void)
1732 { 1732 {
1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && 1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1734 page_poisoning_enabled(); 1734 page_poisoning_enabled();
1735 } 1735 }
1736 1736
1737 #ifdef CONFIG_DEBUG_VM 1737 #ifdef CONFIG_DEBUG_VM
1738 static bool check_pcp_refill(struct page *page) 1738 static bool check_pcp_refill(struct page *page)
1739 { 1739 {
1740 return false; 1740 return false;
1741 } 1741 }
1742 1742
1743 static bool check_new_pcp(struct page *page) 1743 static bool check_new_pcp(struct page *page)
1744 { 1744 {
1745 return check_new_page(page); 1745 return check_new_page(page);
1746 } 1746 }
1747 #else 1747 #else
1748 static bool check_pcp_refill(struct page *page) 1748 static bool check_pcp_refill(struct page *page)
1749 { 1749 {
1750 return check_new_page(page); 1750 return check_new_page(page);
1751 } 1751 }
1752 static bool check_new_pcp(struct page *page) 1752 static bool check_new_pcp(struct page *page)
1753 { 1753 {
1754 return false; 1754 return false;
1755 } 1755 }
1756 #endif /* CONFIG_DEBUG_VM */ 1756 #endif /* CONFIG_DEBUG_VM */
1757 1757
1758 static bool check_new_pages(struct page *page, unsigned int order) 1758 static bool check_new_pages(struct page *page, unsigned int order)
1759 { 1759 {
1760 int i; 1760 int i;
1761 for (i = 0; i < (1 << order); i++) { 1761 for (i = 0; i < (1 << order); i++) {
1762 struct page *p = page + i; 1762 struct page *p = page + i;
1763 1763
1764 if (unlikely(check_new_page(p))) 1764 if (unlikely(check_new_page(p)))
1765 return true; 1765 return true;
1766 } 1766 }
1767 1767
1768 return false; 1768 return false;
1769 } 1769 }
1770 1770
1771 inline void post_alloc_hook(struct page *page, unsigned int order, 1771 inline void post_alloc_hook(struct page *page, unsigned int order,
1772 gfp_t gfp_flags) 1772 gfp_t gfp_flags)
1773 { 1773 {
1774 set_page_private(page, 0); 1774 set_page_private(page, 0);
1775 set_page_refcounted(page); 1775 set_page_refcounted(page);
1776 1776
1777 arch_alloc_page(page, order); 1777 arch_alloc_page(page, order);
1778 kernel_map_pages(page, 1 << order, 1); 1778 kernel_map_pages(page, 1 << order, 1);
1779 kernel_poison_pages(page, 1 << order, 1); 1779 kernel_poison_pages(page, 1 << order, 1);
1780 kasan_alloc_pages(page, order); 1780 kasan_alloc_pages(page, order);
1781 set_page_owner(page, order, gfp_flags); 1781 set_page_owner(page, order, gfp_flags);
1782 } 1782 }
1783 1783
1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1785 unsigned int alloc_flags) 1785 unsigned int alloc_flags)
1786 { 1786 {
1787 int i; 1787 int i;
1788 1788
1789 post_alloc_hook(page, order, gfp_flags); 1789 post_alloc_hook(page, order, gfp_flags);
1790 1790
1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) 1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
1792 for (i = 0; i < (1 << order); i++) 1792 for (i = 0; i < (1 << order); i++)
1793 clear_highpage(page + i); 1793 clear_highpage(page + i);
1794 1794
1795 if (order && (gfp_flags & __GFP_COMP)) 1795 if (order && (gfp_flags & __GFP_COMP))
1796 prep_compound_page(page, order); 1796 prep_compound_page(page, order);
1797 1797
1798 /* 1798 /*
1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1800 * allocate the page. The expectation is that the caller is taking 1800 * allocate the page. The expectation is that the caller is taking
1801 * steps that will free more memory. The caller should avoid the page 1801 * steps that will free more memory. The caller should avoid the page
1802 * being used for !PFMEMALLOC purposes. 1802 * being used for !PFMEMALLOC purposes.
1803 */ 1803 */
1804 if (alloc_flags & ALLOC_NO_WATERMARKS) 1804 if (alloc_flags & ALLOC_NO_WATERMARKS)
1805 set_page_pfmemalloc(page); 1805 set_page_pfmemalloc(page);
1806 else 1806 else
1807 clear_page_pfmemalloc(page); 1807 clear_page_pfmemalloc(page);
1808 } 1808 }
1809 1809
1810 /* 1810 /*
1811 * Go through the free lists for the given migratetype and remove 1811 * Go through the free lists for the given migratetype and remove
1812 * the smallest available page from the freelists 1812 * the smallest available page from the freelists
1813 */ 1813 */
1814 static inline 1814 static inline
1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1816 int migratetype) 1816 int migratetype)
1817 { 1817 {
1818 unsigned int current_order; 1818 unsigned int current_order;
1819 struct free_area *area; 1819 struct free_area *area;
1820 struct page *page; 1820 struct page *page;
1821 1821
1822 /* Find a page of the appropriate size in the preferred list */ 1822 /* Find a page of the appropriate size in the preferred list */
1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1824 area = &(zone->free_area[current_order]); 1824 area = &(zone->free_area[current_order]);
1825 page = list_first_entry_or_null(&area->free_list[migratetype], 1825 page = list_first_entry_or_null(&area->free_list[migratetype],
1826 struct page, lru); 1826 struct page, lru);
1827 if (!page) 1827 if (!page)
1828 continue; 1828 continue;
1829 list_del(&page->lru); 1829 list_del(&page->lru);
1830 rmv_page_order(page); 1830 rmv_page_order(page);
1831 area->nr_free--; 1831 area->nr_free--;
1832 expand(zone, page, order, current_order, area, migratetype); 1832 expand(zone, page, order, current_order, area, migratetype);
1833 set_pcppage_migratetype(page, migratetype); 1833 set_pcppage_migratetype(page, migratetype);
1834 return page; 1834 return page;
1835 } 1835 }
1836 1836
1837 return NULL; 1837 return NULL;
1838 } 1838 }
1839 1839
1840 1840
1841 /* 1841 /*
1842 * This array describes the order lists are fallen back to when 1842 * This array describes the order lists are fallen back to when
1843 * the free lists for the desirable migrate type are depleted 1843 * the free lists for the desirable migrate type are depleted
1844 */ 1844 */
1845 static int fallbacks[MIGRATE_TYPES][4] = { 1845 static int fallbacks[MIGRATE_TYPES][4] = {
1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
1849 #ifdef CONFIG_CMA 1849 #ifdef CONFIG_CMA
1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
1851 #endif 1851 #endif
1852 #ifdef CONFIG_MEMORY_ISOLATION 1852 #ifdef CONFIG_MEMORY_ISOLATION
1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
1854 #endif 1854 #endif
1855 }; 1855 };
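/*
 * How the fallbacks[] table above is meant to be read, as a sketch (not part
 * of page_alloc.c): for a depleted migratetype, the allocator tries each
 * listed type in order until it reaches the MIGRATE_TYPES sentinel. The enum
 * below is a trimmed stand-in for the kernel's migratetype enum (CMA and
 * ISOLATE omitted); names and ordering are assumptions for this example.
 */
#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_MIGRATE_TYPES };

static const int fallbacks[NR_MIGRATE_TYPES][3] = {
	[UNMOVABLE]	= { RECLAIMABLE, MOVABLE, NR_MIGRATE_TYPES },
	[RECLAIMABLE]	= { UNMOVABLE, MOVABLE, NR_MIGRATE_TYPES },
	[MOVABLE]	= { RECLAIMABLE, UNMOVABLE, NR_MIGRATE_TYPES },
};

int main(void)
{
	static const char *const names[] = { "unmovable", "movable", "reclaimable" };
	int i;

	/* Prints "unmovable falls back to: reclaimable movable". */
	printf("unmovable falls back to:");
	for (i = 0; fallbacks[UNMOVABLE][i] != NR_MIGRATE_TYPES; i++)
		printf(" %s", names[fallbacks[UNMOVABLE][i]]);
	printf("\n");
	return 0;
}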
1856 1856
1857 #ifdef CONFIG_CMA 1857 #ifdef CONFIG_CMA
1858 static struct page *__rmqueue_cma_fallback(struct zone *zone, 1858 static struct page *__rmqueue_cma_fallback(struct zone *zone,
1859 unsigned int order) 1859 unsigned int order)
1860 { 1860 {
1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1862 } 1862 }
1863 #else 1863 #else
1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1865 unsigned int order) { return NULL; } 1865 unsigned int order) { return NULL; }
1866 #endif 1866 #endif
1867 1867
1868 /* 1868 /*
1869 * Move the free pages in a range to the free lists of the requested type. 1869 * Move the free pages in a range to the free lists of the requested type.
1870 * Note that start_page and end_page are not aligned on a pageblock 1870 * Note that start_page and end_page are not aligned on a pageblock
1871 * boundary. If alignment is required, use move_freepages_block() 1871 * boundary. If alignment is required, use move_freepages_block()
1872 */ 1872 */
1873 static int move_freepages(struct zone *zone, 1873 static int move_freepages(struct zone *zone,
1874 struct page *start_page, struct page *end_page, 1874 struct page *start_page, struct page *end_page,
1875 int migratetype, int *num_movable) 1875 int migratetype, int *num_movable)
1876 { 1876 {
1877 struct page *page; 1877 struct page *page;
1878 unsigned int order; 1878 unsigned int order;
1879 int pages_moved = 0; 1879 int pages_moved = 0;
1880 1880
1881 #ifndef CONFIG_HOLES_IN_ZONE 1881 #ifndef CONFIG_HOLES_IN_ZONE
1882 /* 1882 /*
1883 * page_zone is not safe to call in this context when 1883 * page_zone is not safe to call in this context when
1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1885 * anyway as we check zone boundaries in move_freepages_block(). 1885 * anyway as we check zone boundaries in move_freepages_block().
1886 * Remove at a later date when no bug reports exist related to 1886 * Remove at a later date when no bug reports exist related to
1887 * grouping pages by mobility 1887 * grouping pages by mobility
1888 */ 1888 */
1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); 1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1890 #endif 1890 #endif
1891 1891
1892 if (num_movable) 1892 if (num_movable)
1893 *num_movable = 0; 1893 *num_movable = 0;
1894 1894
1895 for (page = start_page; page <= end_page;) { 1895 for (page = start_page; page <= end_page;) {
1896 if (!pfn_valid_within(page_to_pfn(page))) { 1896 if (!pfn_valid_within(page_to_pfn(page))) {
1897 page++; 1897 page++;
1898 continue; 1898 continue;
1899 } 1899 }
1900 1900
1901 /* Make sure we are not inadvertently changing nodes */ 1901 /* Make sure we are not inadvertently changing nodes */
1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1903 1903
1904 if (!PageBuddy(page)) { 1904 if (!PageBuddy(page)) {
1905 /* 1905 /*
1906 * We assume that pages that could be isolated for 1906 * We assume that pages that could be isolated for
1907 * migration are movable. But we don't actually try 1907 * migration are movable. But we don't actually try
1908 * isolating, as that would be expensive. 1908 * isolating, as that would be expensive.
1909 */ 1909 */
1910 if (num_movable && 1910 if (num_movable &&
1911 (PageLRU(page) || __PageMovable(page))) 1911 (PageLRU(page) || __PageMovable(page)))
1912 (*num_movable)++; 1912 (*num_movable)++;
1913 1913
1914 page++; 1914 page++;
1915 continue; 1915 continue;
1916 } 1916 }
1917 1917
1918 order = page_order(page); 1918 order = page_order(page);
1919 list_move(&page->lru, 1919 list_move(&page->lru,
1920 &zone->free_area[order].free_list[migratetype]); 1920 &zone->free_area[order].free_list[migratetype]);
1921 page += 1 << order; 1921 page += 1 << order;
1922 pages_moved += 1 << order; 1922 pages_moved += 1 << order;
1923 } 1923 }
1924 1924
1925 return pages_moved; 1925 return pages_moved;
1926 } 1926 }
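/*
 * The value returned above counts individual pages, not buddies:
 * moving a single order-3 buddy, for example, adds 8 to pages_moved.
 */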
1927 1927
1928 int move_freepages_block(struct zone *zone, struct page *page, 1928 int move_freepages_block(struct zone *zone, struct page *page,
1929 int migratetype, int *num_movable) 1929 int migratetype, int *num_movable)
1930 { 1930 {
1931 unsigned long start_pfn, end_pfn; 1931 unsigned long start_pfn, end_pfn;
1932 struct page *start_page, *end_page; 1932 struct page *start_page, *end_page;
1933 1933
1934 start_pfn = page_to_pfn(page); 1934 start_pfn = page_to_pfn(page);
1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1936 start_page = pfn_to_page(start_pfn); 1936 start_page = pfn_to_page(start_pfn);
1937 end_page = start_page + pageblock_nr_pages - 1; 1937 end_page = start_page + pageblock_nr_pages - 1;
1938 end_pfn = start_pfn + pageblock_nr_pages - 1; 1938 end_pfn = start_pfn + pageblock_nr_pages - 1;
1939 1939
1940 /* Do not cross zone boundaries */ 1940 /* Do not cross zone boundaries */
1941 if (!zone_spans_pfn(zone, start_pfn)) 1941 if (!zone_spans_pfn(zone, start_pfn))
1942 start_page = page; 1942 start_page = page;
1943 if (!zone_spans_pfn(zone, end_pfn)) 1943 if (!zone_spans_pfn(zone, end_pfn))
1944 return 0; 1944 return 0;
1945 1945
1946 return move_freepages(zone, start_page, end_page, migratetype, 1946 return move_freepages(zone, start_page, end_page, migratetype,
1947 num_movable); 1947 num_movable);
1948 } 1948 }
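/*
 * Worked example of the alignment above, assuming pageblock_nr_pages
 * is 512 (pageblock_order == 9, a common configuration): for a page
 * with pfn 0x12345, start_pfn = 0x12345 & ~511UL = 0x12200 and
 * end_pfn = 0x12200 + 511 = 0x123ff, i.e. the enclosing pageblock.
 */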
1949 1949
1950 static void change_pageblock_range(struct page *pageblock_page, 1950 static void change_pageblock_range(struct page *pageblock_page,
1951 int start_order, int migratetype) 1951 int start_order, int migratetype)
1952 { 1952 {
1953 int nr_pageblocks = 1 << (start_order - pageblock_order); 1953 int nr_pageblocks = 1 << (start_order - pageblock_order);
1954 1954
1955 while (nr_pageblocks--) { 1955 while (nr_pageblocks--) {
1956 set_pageblock_migratetype(pageblock_page, migratetype); 1956 set_pageblock_migratetype(pageblock_page, migratetype);
1957 pageblock_page += pageblock_nr_pages; 1957 pageblock_page += pageblock_nr_pages;
1958 } 1958 }
1959 } 1959 }
1960 1960
1961 /* 1961 /*
1962 * When we are falling back to another migratetype during allocation, try to 1962 * When we are falling back to another migratetype during allocation, try to
1963 * steal extra free pages from the same pageblocks to satisfy further 1963 * steal extra free pages from the same pageblocks to satisfy further
1964 * allocations, instead of polluting multiple pageblocks. 1964 * allocations, instead of polluting multiple pageblocks.
1965 * 1965 *
1966 * If we are stealing a relatively large buddy page, it is likely there will 1966 * If we are stealing a relatively large buddy page, it is likely there will
1967 * be more free pages in the pageblock, so try to steal them all. For 1967 * be more free pages in the pageblock, so try to steal them all. For
1968 * reclaimable and unmovable allocations, we steal regardless of page size, 1968 * reclaimable and unmovable allocations, we steal regardless of page size,
1969 * as fragmentation caused by those allocations polluting movable pageblocks 1969 * as fragmentation caused by those allocations polluting movable pageblocks
1970 * is worse than movable allocations stealing from unmovable and reclaimable 1970 * is worse than movable allocations stealing from unmovable and reclaimable
1971 * pageblocks. 1971 * pageblocks.
1972 */ 1972 */
1973 static bool can_steal_fallback(unsigned int order, int start_mt) 1973 static bool can_steal_fallback(unsigned int order, int start_mt)
1974 { 1974 {
1975 /* 1975 /*
1976 * This order check is intentionally kept even though the check 1976 * This order check is intentionally kept even though the check
1977 * below is more relaxed. The reason is that if this condition is 1977 * below is more relaxed. The reason is that if this condition is
1978 * met we can actually steal the whole pageblock, whereas the check 1978 * met we can actually steal the whole pageblock, whereas the check
1979 * below does not guarantee that; it is just a heuristic and could 1979 * below does not guarantee that; it is just a heuristic and could
1980 * be changed at any time. 1980 * be changed at any time.
1981 */ 1981 */
1982 if (order >= pageblock_order) 1982 if (order >= pageblock_order)
1983 return true; 1983 return true;
1984 1984
1985 if (order >= pageblock_order / 2 || 1985 if (order >= pageblock_order / 2 ||
1986 start_mt == MIGRATE_RECLAIMABLE || 1986 start_mt == MIGRATE_RECLAIMABLE ||
1987 start_mt == MIGRATE_UNMOVABLE || 1987 start_mt == MIGRATE_UNMOVABLE ||
1988 page_group_by_mobility_disabled) 1988 page_group_by_mobility_disabled)
1989 return true; 1989 return true;
1990 1990
1991 return false; 1991 return false;
1992 } 1992 }
1993 1993
1994 /* 1994 /*
1995 * This function implements the actual steal behaviour. If the order is large enough, 1995 * This function implements the actual steal behaviour. If the order is large enough,
1996 * we can steal the whole pageblock. If not, we first move the free pages in this 1996 * we can steal the whole pageblock. If not, we first move the free pages in this
1997 * pageblock to our migratetype and determine how many already-allocated pages 1997 * pageblock to our migratetype and determine how many already-allocated pages
1998 * in the pageblock have a compatible migratetype. If at least half 1998 * in the pageblock have a compatible migratetype. If at least half
1999 * of the pages are free or compatible, we can change the migratetype of the pageblock 1999 * of the pages are free or compatible, we can change the migratetype of the pageblock
2000 * itself, so pages freed in the future will be put on the correct free list. 2000 * itself, so pages freed in the future will be put on the correct free list.
2001 */ 2001 */
2002 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2002 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2003 int start_type, bool whole_block) 2003 int start_type, bool whole_block)
2004 { 2004 {
2005 unsigned int current_order = page_order(page); 2005 unsigned int current_order = page_order(page);
2006 struct free_area *area; 2006 struct free_area *area;
2007 int free_pages, movable_pages, alike_pages; 2007 int free_pages, movable_pages, alike_pages;
2008 int old_block_type; 2008 int old_block_type;
2009 2009
2010 old_block_type = get_pageblock_migratetype(page); 2010 old_block_type = get_pageblock_migratetype(page);
2011 2011
2012 /* 2012 /*
2013 * This can happen due to races and we want to prevent broken 2013 * This can happen due to races and we want to prevent broken
2014 * highatomic accounting. 2014 * highatomic accounting.
2015 */ 2015 */
2016 if (is_migrate_highatomic(old_block_type)) 2016 if (is_migrate_highatomic(old_block_type))
2017 goto single_page; 2017 goto single_page;
2018 2018
2019 /* Take ownership for orders >= pageblock_order */ 2019 /* Take ownership for orders >= pageblock_order */
2020 if (current_order >= pageblock_order) { 2020 if (current_order >= pageblock_order) {
2021 change_pageblock_range(page, current_order, start_type); 2021 change_pageblock_range(page, current_order, start_type);
2022 goto single_page; 2022 goto single_page;
2023 } 2023 }
2024 2024
2025 /* We are not allowed to try stealing from the whole block */ 2025 /* We are not allowed to try stealing from the whole block */
2026 if (!whole_block) 2026 if (!whole_block)
2027 goto single_page; 2027 goto single_page;
2028 2028
2029 free_pages = move_freepages_block(zone, page, start_type, 2029 free_pages = move_freepages_block(zone, page, start_type,
2030 &movable_pages); 2030 &movable_pages);
2031 /* 2031 /*
2032 * Determine how many pages are compatible with our allocation. 2032 * Determine how many pages are compatible with our allocation.
2033 * For movable allocation, it's the number of movable pages which 2033 * For movable allocation, it's the number of movable pages which
2034 * we just obtained. For other types it's a bit more tricky. 2034 * we just obtained. For other types it's a bit more tricky.
2035 */ 2035 */
2036 if (start_type == MIGRATE_MOVABLE) { 2036 if (start_type == MIGRATE_MOVABLE) {
2037 alike_pages = movable_pages; 2037 alike_pages = movable_pages;
2038 } else { 2038 } else {
2039 /* 2039 /*
2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2041 * to MOVABLE pageblock, consider all non-movable pages as 2041 * to MOVABLE pageblock, consider all non-movable pages as
2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2043 * vice versa, be conservative since we can't distinguish the 2043 * vice versa, be conservative since we can't distinguish the
2044 * exact migratetype of non-movable pages. 2044 * exact migratetype of non-movable pages.
2045 */ 2045 */
2046 if (old_block_type == MIGRATE_MOVABLE) 2046 if (old_block_type == MIGRATE_MOVABLE)
2047 alike_pages = pageblock_nr_pages 2047 alike_pages = pageblock_nr_pages
2048 - (free_pages + movable_pages); 2048 - (free_pages + movable_pages);
2049 else 2049 else
2050 alike_pages = 0; 2050 alike_pages = 0;
2051 } 2051 }
2052 2052
2053 /* moving whole block can fail due to zone boundary conditions */ 2053 /* moving whole block can fail due to zone boundary conditions */
2054 if (!free_pages) 2054 if (!free_pages)
2055 goto single_page; 2055 goto single_page;
2056 2056
2057 /* 2057 /*
2058 * If a sufficient number of pages in the block are either free or of 2058 * If a sufficient number of pages in the block are either free or of
2059 * comparable migratability as our allocation, claim the whole block. 2059 * comparable migratability as our allocation, claim the whole block.
2060 */ 2060 */
2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2062 page_group_by_mobility_disabled) 2062 page_group_by_mobility_disabled)
2063 set_pageblock_migratetype(page, start_type); 2063 set_pageblock_migratetype(page, start_type);
2064 2064
2065 return; 2065 return;
2066 2066
2067 single_page: 2067 single_page:
2068 area = &zone->free_area[current_order]; 2068 area = &zone->free_area[current_order];
2069 list_move(&page->lru, &area->free_list[start_type]); 2069 list_move(&page->lru, &area->free_list[start_type]);
2070 } 2070 }
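/*
 * The single_page label above moves only this one buddy page to the
 * start_type free list. It is reached either after the pageblock
 * migratetype has already been taken over (orders >= pageblock_order)
 * or when stealing the whole block is not allowed or not possible.
 */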
2071 2071
2072 /* 2072 /*
2073 * Check whether there is a suitable fallback freepage with requested order. 2073 * Check whether there is a suitable fallback freepage with requested order.
2074 * If only_stealable is true, this function returns fallback_mt only if 2074 * If only_stealable is true, this function returns fallback_mt only if
2075 * we can steal all of the other freepages together. This helps to reduce 2075 * we can steal all of the other freepages together. This helps to reduce
2076 * fragmentation due to mixed migratetype pages in one pageblock. 2076 * fragmentation due to mixed migratetype pages in one pageblock.
2077 */ 2077 */
2078 int find_suitable_fallback(struct free_area *area, unsigned int order, 2078 int find_suitable_fallback(struct free_area *area, unsigned int order,
2079 int migratetype, bool only_stealable, bool *can_steal) 2079 int migratetype, bool only_stealable, bool *can_steal)
2080 { 2080 {
2081 int i; 2081 int i;
2082 int fallback_mt; 2082 int fallback_mt;
2083 2083
2084 if (area->nr_free == 0) 2084 if (area->nr_free == 0)
2085 return -1; 2085 return -1;
2086 2086
2087 *can_steal = false; 2087 *can_steal = false;
2088 for (i = 0;; i++) { 2088 for (i = 0;; i++) {
2089 fallback_mt = fallbacks[migratetype][i]; 2089 fallback_mt = fallbacks[migratetype][i];
2090 if (fallback_mt == MIGRATE_TYPES) 2090 if (fallback_mt == MIGRATE_TYPES)
2091 break; 2091 break;
2092 2092
2093 if (list_empty(&area->free_list[fallback_mt])) 2093 if (list_empty(&area->free_list[fallback_mt]))
2094 continue; 2094 continue;
2095 2095
2096 if (can_steal_fallback(order, migratetype)) 2096 if (can_steal_fallback(order, migratetype))
2097 *can_steal = true; 2097 *can_steal = true;
2098 2098
2099 if (!only_stealable) 2099 if (!only_stealable)
2100 return fallback_mt; 2100 return fallback_mt;
2101 2101
2102 if (*can_steal) 2102 if (*can_steal)
2103 return fallback_mt; 2103 return fallback_mt;
2104 } 2104 }
2105 2105
2106 return -1; 2106 return -1;
2107 } 2107 }
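/*
 * The return value is a fallback migratetype index, or -1 when none of
 * the fallback free lists for this order are populated. *can_steal
 * tells the caller whether stealing the surrounding pageblock is
 * worthwhile, as decided by can_steal_fallback() above.
 */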
2108 2108
2109 /* 2109 /*
2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2111 * there are no empty page blocks that contain a page with a suitable order 2111 * there are no empty page blocks that contain a page with a suitable order
2112 */ 2112 */
2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2114 unsigned int alloc_order) 2114 unsigned int alloc_order)
2115 { 2115 {
2116 int mt; 2116 int mt;
2117 unsigned long max_managed, flags; 2117 unsigned long max_managed, flags;
2118 2118
2119 /* 2119 /*
2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2121 * Check is race-prone but harmless. 2121 * Check is race-prone but harmless.
2122 */ 2122 */
2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; 2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2124 if (zone->nr_reserved_highatomic >= max_managed) 2124 if (zone->nr_reserved_highatomic >= max_managed)
2125 return; 2125 return;
2126 2126
2127 spin_lock_irqsave(&zone->lock, flags); 2127 spin_lock_irqsave(&zone->lock, flags);
2128 2128
2129 /* Recheck the nr_reserved_highatomic limit under the lock */ 2129 /* Recheck the nr_reserved_highatomic limit under the lock */
2130 if (zone->nr_reserved_highatomic >= max_managed) 2130 if (zone->nr_reserved_highatomic >= max_managed)
2131 goto out_unlock; 2131 goto out_unlock;
2132 2132
2133 /* Yoink! */ 2133 /* Yoink! */
2134 mt = get_pageblock_migratetype(page); 2134 mt = get_pageblock_migratetype(page);
2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) 2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2136 && !is_migrate_cma(mt)) { 2136 && !is_migrate_cma(mt)) {
2137 zone->nr_reserved_highatomic += pageblock_nr_pages; 2137 zone->nr_reserved_highatomic += pageblock_nr_pages;
2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2140 } 2140 }
2141 2141
2142 out_unlock: 2142 out_unlock:
2143 spin_unlock_irqrestore(&zone->lock, flags); 2143 spin_unlock_irqrestore(&zone->lock, flags);
2144 } 2144 }
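/*
 * Illustration of the limit above: for a zone with 1,000,000 managed
 * pages and pageblock_nr_pages == 512 (an assumed, configuration
 * dependent value), max_managed = 1,000,000 / 100 + 512 = 10,512
 * pages, i.e. roughly 1% of the zone plus one pageblock.
 */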
2145 2145
2146 /* 2146 /*
2147 * Used when an allocation is about to fail under memory pressure. This 2147 * Used when an allocation is about to fail under memory pressure. This
2148 * potentially hurts the reliability of high-order allocations when under 2148 * potentially hurts the reliability of high-order allocations when under
2149 * intense memory pressure but failed atomic allocations should be easier 2149 * intense memory pressure but failed atomic allocations should be easier
2150 * to recover from than an OOM. 2150 * to recover from than an OOM.
2151 * 2151 *
2152 * If @force is true, try to unreserve a pageblock even though highatomic 2152 * If @force is true, try to unreserve a pageblock even though highatomic
2153 * pageblock is exhausted. 2153 * pageblock is exhausted.
2154 */ 2154 */
2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2156 bool force) 2156 bool force)
2157 { 2157 {
2158 struct zonelist *zonelist = ac->zonelist; 2158 struct zonelist *zonelist = ac->zonelist;
2159 unsigned long flags; 2159 unsigned long flags;
2160 struct zoneref *z; 2160 struct zoneref *z;
2161 struct zone *zone; 2161 struct zone *zone;
2162 struct page *page; 2162 struct page *page;
2163 int order; 2163 int order;
2164 bool ret; 2164 bool ret;
2165 2165
2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, 2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2167 ac->nodemask) { 2167 ac->nodemask) {
2168 /* 2168 /*
2169 * Preserve at least one pageblock unless memory pressure 2169 * Preserve at least one pageblock unless memory pressure
2170 * is really high. 2170 * is really high.
2171 */ 2171 */
2172 if (!force && zone->nr_reserved_highatomic <= 2172 if (!force && zone->nr_reserved_highatomic <=
2173 pageblock_nr_pages) 2173 pageblock_nr_pages)
2174 continue; 2174 continue;
2175 2175
2176 spin_lock_irqsave(&zone->lock, flags); 2176 spin_lock_irqsave(&zone->lock, flags);
2177 for (order = 0; order < MAX_ORDER; order++) { 2177 for (order = 0; order < MAX_ORDER; order++) {
2178 struct free_area *area = &(zone->free_area[order]); 2178 struct free_area *area = &(zone->free_area[order]);
2179 2179
2180 page = list_first_entry_or_null( 2180 page = list_first_entry_or_null(
2181 &area->free_list[MIGRATE_HIGHATOMIC], 2181 &area->free_list[MIGRATE_HIGHATOMIC],
2182 struct page, lru); 2182 struct page, lru);
2183 if (!page) 2183 if (!page)
2184 continue; 2184 continue;
2185 2185
2186 /* 2186 /*
2187 * In the page freeing path, migratetype changes are racy, so 2187 * In the page freeing path, migratetype changes are racy, so
2188 * we can encounter several free pages in a pageblock 2188 * we can encounter several free pages in a pageblock
2189 * in this loop although we changed the pageblock type 2189 * in this loop although we changed the pageblock type
2190 * from highatomic to ac->migratetype. So we should 2190 * from highatomic to ac->migratetype. So we should
2191 * adjust the count only once. 2191 * adjust the count only once.
2192 */ 2192 */
2193 if (is_migrate_highatomic_page(page)) { 2193 if (is_migrate_highatomic_page(page)) {
2194 /* 2194 /*
2195 * It should never happen but changes to 2195 * It should never happen but changes to
2196 * locking could inadvertently allow a per-cpu 2196 * locking could inadvertently allow a per-cpu
2197 * drain to add pages to MIGRATE_HIGHATOMIC 2197 * drain to add pages to MIGRATE_HIGHATOMIC
2198 * while unreserving so be safe and watch for 2198 * while unreserving so be safe and watch for
2199 * underflows. 2199 * underflows.
2200 */ 2200 */
2201 zone->nr_reserved_highatomic -= min( 2201 zone->nr_reserved_highatomic -= min(
2202 pageblock_nr_pages, 2202 pageblock_nr_pages,
2203 zone->nr_reserved_highatomic); 2203 zone->nr_reserved_highatomic);
2204 } 2204 }
2205 2205
2206 /* 2206 /*
2207 * Convert to ac->migratetype and avoid the normal 2207 * Convert to ac->migratetype and avoid the normal
2208 * pageblock stealing heuristics. Minimally, the caller 2208 * pageblock stealing heuristics. Minimally, the caller
2209 * is doing the work and needs the pages. More 2209 * is doing the work and needs the pages. More
2210 * importantly, if the block was always converted to 2210 * importantly, if the block was always converted to
2211 * MIGRATE_UNMOVABLE or another type then the number 2211 * MIGRATE_UNMOVABLE or another type then the number
2212 * of pageblocks that cannot be completely freed 2212 * of pageblocks that cannot be completely freed
2213 * may increase. 2213 * may increase.
2214 */ 2214 */
2215 set_pageblock_migratetype(page, ac->migratetype); 2215 set_pageblock_migratetype(page, ac->migratetype);
2216 ret = move_freepages_block(zone, page, ac->migratetype, 2216 ret = move_freepages_block(zone, page, ac->migratetype,
2217 NULL); 2217 NULL);
2218 if (ret) { 2218 if (ret) {
2219 spin_unlock_irqrestore(&zone->lock, flags); 2219 spin_unlock_irqrestore(&zone->lock, flags);
2220 return ret; 2220 return ret;
2221 } 2221 }
2222 } 2222 }
2223 spin_unlock_irqrestore(&zone->lock, flags); 2223 spin_unlock_irqrestore(&zone->lock, flags);
2224 } 2224 }
2225 2225
2226 return false; 2226 return false;
2227 } 2227 }
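/*
 * The function above returns true as soon as one highatomic pageblock
 * has had free pages moved to ac->migratetype (the non-zero result of
 * move_freepages_block() is returned), and false when nothing could be
 * unreserved.
 */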
2228 2228
2229 /* 2229 /*
2230 * Try finding a free buddy page on the fallback list and put it on the free 2230 * Try finding a free buddy page on the fallback list and put it on the free
2231 * list of requested migratetype, possibly along with other pages from the same 2231 * list of requested migratetype, possibly along with other pages from the same
2232 * block, depending on fragmentation avoidance heuristics. Returns true if 2232 * block, depending on fragmentation avoidance heuristics. Returns true if
2233 * fallback was found so that __rmqueue_smallest() can grab it. 2233 * fallback was found so that __rmqueue_smallest() can grab it.
2234 * 2234 *
2235 * The use of signed ints for order and current_order is a deliberate 2235 * The use of signed ints for order and current_order is a deliberate
2236 * deviation from the rest of this file, to make the for loop 2236 * deviation from the rest of this file, to make the for loop
2237 * condition simpler. 2237 * condition simpler.
2238 */ 2238 */
2239 static inline bool 2239 static inline bool
2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2241 { 2241 {
2242 struct free_area *area; 2242 struct free_area *area;
2243 int current_order; 2243 int current_order;
2244 struct page *page; 2244 struct page *page;
2245 int fallback_mt; 2245 int fallback_mt;
2246 bool can_steal; 2246 bool can_steal;
2247 2247
2248 /* 2248 /*
2249 * Find the largest available free page in the other list. This roughly 2249 * Find the largest available free page in the other list. This roughly
2250 * approximates finding the pageblock with the most free pages, which 2250 * approximates finding the pageblock with the most free pages, which
2251 * would be too costly to do exactly. 2251 * would be too costly to do exactly.
2252 */ 2252 */
2253 for (current_order = MAX_ORDER - 1; current_order >= order; 2253 for (current_order = MAX_ORDER - 1; current_order >= order;
2254 --current_order) { 2254 --current_order) {
2255 area = &(zone->free_area[current_order]); 2255 area = &(zone->free_area[current_order]);
2256 fallback_mt = find_suitable_fallback(area, current_order, 2256 fallback_mt = find_suitable_fallback(area, current_order,
2257 start_migratetype, false, &can_steal); 2257 start_migratetype, false, &can_steal);
2258 if (fallback_mt == -1) 2258 if (fallback_mt == -1)
2259 continue; 2259 continue;
2260 2260
2261 /* 2261 /*
2262 * We cannot steal all free pages from the pageblock and the 2262 * We cannot steal all free pages from the pageblock and the
2263 * requested migratetype is movable. In that case it's better to 2263 * requested migratetype is movable. In that case it's better to
2264 * steal and split the smallest available page instead of the 2264 * steal and split the smallest available page instead of the
2265 * largest available page, because even if the next movable 2265 * largest available page, because even if the next movable
2266 * allocation falls back into a different pageblock than this 2266 * allocation falls back into a different pageblock than this
2267 * one, it won't cause permanent fragmentation. 2267 * one, it won't cause permanent fragmentation.
2268 */ 2268 */
2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2270 && current_order > order) 2270 && current_order > order)
2271 goto find_smallest; 2271 goto find_smallest;
2272 2272
2273 goto do_steal; 2273 goto do_steal;
2274 } 2274 }
2275 2275
2276 return false; 2276 return false;
2277 2277
2278 find_smallest: 2278 find_smallest:
2279 for (current_order = order; current_order < MAX_ORDER; 2279 for (current_order = order; current_order < MAX_ORDER;
2280 current_order++) { 2280 current_order++) {
2281 area = &(zone->free_area[current_order]); 2281 area = &(zone->free_area[current_order]);
2282 fallback_mt = find_suitable_fallback(area, current_order, 2282 fallback_mt = find_suitable_fallback(area, current_order,
2283 start_migratetype, false, &can_steal); 2283 start_migratetype, false, &can_steal);
2284 if (fallback_mt != -1) 2284 if (fallback_mt != -1)
2285 break; 2285 break;
2286 } 2286 }
2287 2287
2288 /* 2288 /*
2289 * This should not happen - we already found a suitable fallback 2289 * This should not happen - we already found a suitable fallback
2290 * when looking for the largest page. 2290 * when looking for the largest page.
2291 */ 2291 */
2292 VM_BUG_ON(current_order == MAX_ORDER); 2292 VM_BUG_ON(current_order == MAX_ORDER);
2293 2293
2294 do_steal: 2294 do_steal:
2295 page = list_first_entry(&area->free_list[fallback_mt], 2295 page = list_first_entry(&area->free_list[fallback_mt],
2296 struct page, lru); 2296 struct page, lru);
2297 2297
2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal); 2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2299 2299
2300 trace_mm_page_alloc_extfrag(page, order, current_order, 2300 trace_mm_page_alloc_extfrag(page, order, current_order,
2301 start_migratetype, fallback_mt); 2301 start_migratetype, fallback_mt);
2302 2302
2303 return true; 2303 return true;
2304 2304
2305 } 2305 }
2306 2306
2307 /* 2307 /*
2308 * Do the hard work of removing an element from the buddy allocator. 2308 * Do the hard work of removing an element from the buddy allocator.
2309 * Call me with the zone->lock already held. 2309 * Call me with the zone->lock already held.
2310 */ 2310 */
2311 static struct page *__rmqueue(struct zone *zone, unsigned int order, 2311 static struct page *__rmqueue(struct zone *zone, unsigned int order,
2312 int migratetype) 2312 int migratetype)
2313 { 2313 {
2314 struct page *page; 2314 struct page *page;
2315 2315
2316 retry: 2316 retry:
2317 page = __rmqueue_smallest(zone, order, migratetype); 2317 page = __rmqueue_smallest(zone, order, migratetype);
2318 if (unlikely(!page)) { 2318 if (unlikely(!page)) {
2319 if (migratetype == MIGRATE_MOVABLE) 2319 if (migratetype == MIGRATE_MOVABLE)
2320 page = __rmqueue_cma_fallback(zone, order); 2320 page = __rmqueue_cma_fallback(zone, order);
2321 2321
2322 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2322 if (!page && __rmqueue_fallback(zone, order, migratetype))
2323 goto retry; 2323 goto retry;
2324 } 2324 }
2325 2325
2326 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2326 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2327 return page; 2327 return page;
2328 } 2328 }
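/*
 * Note the retry above: a successful __rmqueue_fallback() does not
 * return a page itself, it only refills the free list of the requested
 * migratetype so that the second __rmqueue_smallest() pass can succeed.
 */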
2329 2329
2330 /* 2330 /*
2331 * Obtain a specified number of elements from the buddy allocator, all under 2331 * Obtain a specified number of elements from the buddy allocator, all under
2332 * a single hold of the lock, for efficiency. Add them to the supplied list. 2332 * a single hold of the lock, for efficiency. Add them to the supplied list.
2333 * Returns the number of new pages which were placed at *list. 2333 * Returns the number of new pages which were placed at *list.
2334 */ 2334 */
2335 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2335 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2336 unsigned long count, struct list_head *list, 2336 unsigned long count, struct list_head *list,
2337 int migratetype, bool cold) 2337 int migratetype, bool cold)
2338 { 2338 {
2339 int i, alloced = 0; 2339 int i, alloced = 0;
2340 2340
2341 spin_lock(&zone->lock); 2341 spin_lock(&zone->lock);
2342 for (i = 0; i < count; ++i) { 2342 for (i = 0; i < count; ++i) {
2343 struct page *page = __rmqueue(zone, order, migratetype); 2343 struct page *page = __rmqueue(zone, order, migratetype);
2344 if (unlikely(page == NULL)) 2344 if (unlikely(page == NULL))
2345 break; 2345 break;
2346 2346
2347 if (unlikely(check_pcp_refill(page))) 2347 if (unlikely(check_pcp_refill(page)))
2348 continue; 2348 continue;
2349 2349
2350 /* 2350 /*
2351 * Split buddy pages returned by expand() are received here 2351 * Split buddy pages returned by expand() are received here
2352 * in physical page order. The page is added to the caller's 2352 * in physical page order. The page is added to the caller's
2353 * list and the list head then moves forward. From the caller's 2353 * list and the list head then moves forward. From the caller's
2354 * perspective, the linked list is ordered by page number under 2354 * perspective, the linked list is ordered by page number under
2355 * some conditions. This is useful for IO devices that can 2355 * some conditions. This is useful for IO devices that can
2356 * merge IO requests if the physical pages are ordered 2356 * merge IO requests if the physical pages are ordered
2357 * properly. 2357 * properly.
2358 */ 2358 */
2359 if (likely(!cold)) 2359 if (likely(!cold))
2360 list_add(&page->lru, list); 2360 list_add(&page->lru, list);
2361 else 2361 else
2362 list_add_tail(&page->lru, list); 2362 list_add_tail(&page->lru, list);
2363 list = &page->lru; 2363 list = &page->lru;
2364 alloced++; 2364 alloced++;
2365 if (is_migrate_cma(get_pcppage_migratetype(page))) 2365 if (is_migrate_cma(get_pcppage_migratetype(page)))
2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2367 -(1 << order)); 2367 -(1 << order));
2368 } 2368 }
2369 2369
2370 /* 2370 /*
2371 * i pages were removed from the buddy list even if some leak due 2371 * i pages were removed from the buddy list even if some leak due
2372 * to check_pcp_refill failing, so adjust NR_FREE_PAGES based 2372 * to check_pcp_refill failing, so adjust NR_FREE_PAGES based
2373 * on i. Do not confuse with 'alloced' which is the number of 2373 * on i. Do not confuse with 'alloced' which is the number of
2374 * pages added to the pcp list. 2374 * pages added to the pcp list.
2375 */ 2375 */
2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2377 spin_unlock(&zone->lock); 2377 spin_unlock(&zone->lock);
2378 return alloced; 2378 return alloced;
2379 } 2379 }
2380 2380
2381 #ifdef CONFIG_NUMA 2381 #ifdef CONFIG_NUMA
2382 /* 2382 /*
2383 * Called from the vmstat counter updater to drain pagesets of this 2383 * Called from the vmstat counter updater to drain pagesets of this
2384 * currently executing processor on remote nodes after they have 2384 * currently executing processor on remote nodes after they have
2385 * expired. 2385 * expired.
2386 * 2386 *
2387 * Note that this function must be called with the thread pinned to 2387 * Note that this function must be called with the thread pinned to
2388 * a single processor. 2388 * a single processor.
2389 */ 2389 */
2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2391 { 2391 {
2392 unsigned long flags; 2392 unsigned long flags;
2393 int to_drain, batch; 2393 int to_drain, batch;
2394 2394
2395 local_irq_save(flags); 2395 local_irq_save(flags);
2396 batch = READ_ONCE(pcp->batch); 2396 batch = READ_ONCE(pcp->batch);
2397 to_drain = min(pcp->count, batch); 2397 to_drain = min(pcp->count, batch);
2398 if (to_drain > 0) { 2398 if (to_drain > 0) {
2399 free_pcppages_bulk(zone, to_drain, pcp); 2399 free_pcppages_bulk(zone, to_drain, pcp);
2400 pcp->count -= to_drain; 2400 pcp->count -= to_drain;
2401 } 2401 }
2402 local_irq_restore(flags); 2402 local_irq_restore(flags);
2403 } 2403 }
2404 #endif 2404 #endif
2405 2405
2406 /* 2406 /*
2407 * Drain pcplists of the indicated processor and zone. 2407 * Drain pcplists of the indicated processor and zone.
2408 * 2408 *
2409 * The processor must either be the current processor and the 2409 * The processor must either be the current processor and the
2410 * thread pinned to the current processor or a processor that 2410 * thread pinned to the current processor or a processor that
2411 * is not online. 2411 * is not online.
2412 */ 2412 */
2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2414 { 2414 {
2415 unsigned long flags; 2415 unsigned long flags;
2416 struct per_cpu_pageset *pset; 2416 struct per_cpu_pageset *pset;
2417 struct per_cpu_pages *pcp; 2417 struct per_cpu_pages *pcp;
2418 2418
2419 local_irq_save(flags); 2419 local_irq_save(flags);
2420 pset = per_cpu_ptr(zone->pageset, cpu); 2420 pset = per_cpu_ptr(zone->pageset, cpu);
2421 2421
2422 pcp = &pset->pcp; 2422 pcp = &pset->pcp;
2423 if (pcp->count) { 2423 if (pcp->count) {
2424 free_pcppages_bulk(zone, pcp->count, pcp); 2424 free_pcppages_bulk(zone, pcp->count, pcp);
2425 pcp->count = 0; 2425 pcp->count = 0;
2426 } 2426 }
2427 local_irq_restore(flags); 2427 local_irq_restore(flags);
2428 } 2428 }
2429 2429
2430 /* 2430 /*
2431 * Drain pcplists of all zones on the indicated processor. 2431 * Drain pcplists of all zones on the indicated processor.
2432 * 2432 *
2433 * The processor must either be the current processor and the 2433 * The processor must either be the current processor and the
2434 * thread pinned to the current processor or a processor that 2434 * thread pinned to the current processor or a processor that
2435 * is not online. 2435 * is not online.
2436 */ 2436 */
2437 static void drain_pages(unsigned int cpu) 2437 static void drain_pages(unsigned int cpu)
2438 { 2438 {
2439 struct zone *zone; 2439 struct zone *zone;
2440 2440
2441 for_each_populated_zone(zone) { 2441 for_each_populated_zone(zone) {
2442 drain_pages_zone(cpu, zone); 2442 drain_pages_zone(cpu, zone);
2443 } 2443 }
2444 } 2444 }
2445 2445
2446 /* 2446 /*
2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2448 * 2448 *
2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2450 * the single zone's pages. 2450 * the single zone's pages.
2451 */ 2451 */
2452 void drain_local_pages(struct zone *zone) 2452 void drain_local_pages(struct zone *zone)
2453 { 2453 {
2454 int cpu = smp_processor_id(); 2454 int cpu = smp_processor_id();
2455 2455
2456 if (zone) 2456 if (zone)
2457 drain_pages_zone(cpu, zone); 2457 drain_pages_zone(cpu, zone);
2458 else 2458 else
2459 drain_pages(cpu); 2459 drain_pages(cpu);
2460 } 2460 }
2461 2461
2462 static void drain_local_pages_wq(struct work_struct *work) 2462 static void drain_local_pages_wq(struct work_struct *work)
2463 { 2463 {
2464 /* 2464 /*
2465 * drain_all_pages doesn't use proper cpu hotplug protection so 2465 * drain_all_pages doesn't use proper cpu hotplug protection so
2466 * we can race with cpu offline when the WQ can move this from 2466 * we can race with cpu offline when the WQ can move this from
2467 * a CPU-pinned worker to an unbound one. We can operate on a different 2467 * a CPU-pinned worker to an unbound one. We can operate on a different
2468 * CPU, which is all right, but we also have to make sure not to move to 2468 * CPU, which is all right, but we also have to make sure not to move to
2469 * a different one. 2469 * a different one.
2470 */ 2470 */
2471 preempt_disable(); 2471 preempt_disable();
2472 drain_local_pages(NULL); 2472 drain_local_pages(NULL);
2473 preempt_enable(); 2473 preempt_enable();
2474 } 2474 }
2475 2475
2476 /* 2476 /*
2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2478 * 2478 *
2479 * When zone parameter is non-NULL, spill just the single zone's pages. 2479 * When zone parameter is non-NULL, spill just the single zone's pages.
2480 * 2480 *
2481 * Note that this can be extremely slow as the draining happens in a workqueue. 2481 * Note that this can be extremely slow as the draining happens in a workqueue.
2482 */ 2482 */
2483 void drain_all_pages(struct zone *zone) 2483 void drain_all_pages(struct zone *zone)
2484 { 2484 {
2485 int cpu; 2485 int cpu;
2486 2486
2487 /* 2487 /*
2488 * Allocate in the BSS so we won't require allocation in 2488 * Allocate in the BSS so we won't require allocation in
2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2490 */ 2490 */
2491 static cpumask_t cpus_with_pcps; 2491 static cpumask_t cpus_with_pcps;
2492 2492
2493 /* 2493 /*
2494 * Make sure nobody triggers this path before mm_percpu_wq is fully 2494 * Make sure nobody triggers this path before mm_percpu_wq is fully
2495 * initialized. 2495 * initialized.
2496 */ 2496 */
2497 if (WARN_ON_ONCE(!mm_percpu_wq)) 2497 if (WARN_ON_ONCE(!mm_percpu_wq))
2498 return; 2498 return;
2499 2499
2500 /* 2500 /*
2501 * Do not drain if one is already in progress unless it's specific to 2501 * Do not drain if one is already in progress unless it's specific to
2502 * a zone. Such callers are primarily CMA and memory hotplug and need 2502 * a zone. Such callers are primarily CMA and memory hotplug and need
2503 * the drain to be complete when the call returns. 2503 * the drain to be complete when the call returns.
2504 */ 2504 */
2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2506 if (!zone) 2506 if (!zone)
2507 return; 2507 return;
2508 mutex_lock(&pcpu_drain_mutex); 2508 mutex_lock(&pcpu_drain_mutex);
2509 } 2509 }
2510 2510
2511 /* 2511 /*
2512 * We don't care about racing with a CPU hotplug event, 2512 * We don't care about racing with a CPU hotplug event,
2513 * as the offline notification will cause the notified 2513 * as the offline notification will cause the notified
2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask 2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask
2515 * disables preemption as part of its processing. 2515 * disables preemption as part of its processing.
2516 */ 2516 */
2517 for_each_online_cpu(cpu) { 2517 for_each_online_cpu(cpu) {
2518 struct per_cpu_pageset *pcp; 2518 struct per_cpu_pageset *pcp;
2519 struct zone *z; 2519 struct zone *z;
2520 bool has_pcps = false; 2520 bool has_pcps = false;
2521 2521
2522 if (zone) { 2522 if (zone) {
2523 pcp = per_cpu_ptr(zone->pageset, cpu); 2523 pcp = per_cpu_ptr(zone->pageset, cpu);
2524 if (pcp->pcp.count) 2524 if (pcp->pcp.count)
2525 has_pcps = true; 2525 has_pcps = true;
2526 } else { 2526 } else {
2527 for_each_populated_zone(z) { 2527 for_each_populated_zone(z) {
2528 pcp = per_cpu_ptr(z->pageset, cpu); 2528 pcp = per_cpu_ptr(z->pageset, cpu);
2529 if (pcp->pcp.count) { 2529 if (pcp->pcp.count) {
2530 has_pcps = true; 2530 has_pcps = true;
2531 break; 2531 break;
2532 } 2532 }
2533 } 2533 }
2534 } 2534 }
2535 2535
2536 if (has_pcps) 2536 if (has_pcps)
2537 cpumask_set_cpu(cpu, &cpus_with_pcps); 2537 cpumask_set_cpu(cpu, &cpus_with_pcps);
2538 else 2538 else
2539 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2539 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2540 } 2540 }
2541 2541
2542 for_each_cpu(cpu, &cpus_with_pcps) { 2542 for_each_cpu(cpu, &cpus_with_pcps) {
2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2544 INIT_WORK(work, drain_local_pages_wq); 2544 INIT_WORK(work, drain_local_pages_wq);
2545 queue_work_on(cpu, mm_percpu_wq, work); 2545 queue_work_on(cpu, mm_percpu_wq, work);
2546 } 2546 }
2547 for_each_cpu(cpu, &cpus_with_pcps) 2547 for_each_cpu(cpu, &cpus_with_pcps)
2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2549 2549
2550 mutex_unlock(&pcpu_drain_mutex); 2550 mutex_unlock(&pcpu_drain_mutex);
2551 } 2551 }
2552 2552
2553 #ifdef CONFIG_HIBERNATION 2553 #ifdef CONFIG_HIBERNATION
2554 2554
2555 /* 2555 /*
2556 * Touch the watchdog for every WD_PAGE_COUNT pages. 2556 * Touch the watchdog for every WD_PAGE_COUNT pages.
2557 */ 2557 */
2558 #define WD_PAGE_COUNT (128*1024) 2558 #define WD_PAGE_COUNT (128*1024)
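/*
 * 128*1024 pages between watchdog touches corresponds to scanning
 * 512 MiB per touch with 4 KiB pages (an assumption; PAGE_SIZE is
 * architecture dependent).
 */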
2559 2559
2560 void mark_free_pages(struct zone *zone) 2560 void mark_free_pages(struct zone *zone)
2561 { 2561 {
2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2563 unsigned long flags; 2563 unsigned long flags;
2564 unsigned int order, t; 2564 unsigned int order, t;
2565 struct page *page; 2565 struct page *page;
2566 2566
2567 if (zone_is_empty(zone)) 2567 if (zone_is_empty(zone))
2568 return; 2568 return;
2569 2569
2570 spin_lock_irqsave(&zone->lock, flags); 2570 spin_lock_irqsave(&zone->lock, flags);
2571 2571
2572 max_zone_pfn = zone_end_pfn(zone); 2572 max_zone_pfn = zone_end_pfn(zone);
2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2574 if (pfn_valid(pfn)) { 2574 if (pfn_valid(pfn)) {
2575 page = pfn_to_page(pfn); 2575 page = pfn_to_page(pfn);
2576 2576
2577 if (!--page_count) { 2577 if (!--page_count) {
2578 touch_nmi_watchdog(); 2578 touch_nmi_watchdog();
2579 page_count = WD_PAGE_COUNT; 2579 page_count = WD_PAGE_COUNT;
2580 } 2580 }
2581 2581
2582 if (page_zone(page) != zone) 2582 if (page_zone(page) != zone)
2583 continue; 2583 continue;
2584 2584
2585 if (!swsusp_page_is_forbidden(page)) 2585 if (!swsusp_page_is_forbidden(page))
2586 swsusp_unset_page_free(page); 2586 swsusp_unset_page_free(page);
2587 } 2587 }
2588 2588
2589 for_each_migratetype_order(order, t) { 2589 for_each_migratetype_order(order, t) {
2590 list_for_each_entry(page, 2590 list_for_each_entry(page,
2591 &zone->free_area[order].free_list[t], lru) { 2591 &zone->free_area[order].free_list[t], lru) {
2592 unsigned long i; 2592 unsigned long i;
2593 2593
2594 pfn = page_to_pfn(page); 2594 pfn = page_to_pfn(page);
2595 for (i = 0; i < (1UL << order); i++) { 2595 for (i = 0; i < (1UL << order); i++) {
2596 if (!--page_count) { 2596 if (!--page_count) {
2597 touch_nmi_watchdog(); 2597 touch_nmi_watchdog();
2598 page_count = WD_PAGE_COUNT; 2598 page_count = WD_PAGE_COUNT;
2599 } 2599 }
2600 swsusp_set_page_free(pfn_to_page(pfn + i)); 2600 swsusp_set_page_free(pfn_to_page(pfn + i));
2601 } 2601 }
2602 } 2602 }
2603 } 2603 }
2604 spin_unlock_irqrestore(&zone->lock, flags); 2604 spin_unlock_irqrestore(&zone->lock, flags);
2605 } 2605 }
2606 #endif /* CONFIG_HIBERNATION */ 2606 #endif /* CONFIG_HIBERNATION */
2607 2607
2608 /* 2608 /*
2609 * Free a 0-order page 2609 * Free a 0-order page
2610 * cold == true ? free a cold page : free a hot page 2610 * cold == true ? free a cold page : free a hot page
2611 */ 2611 */
2612 void free_hot_cold_page(struct page *page, bool cold) 2612 void free_hot_cold_page(struct page *page, bool cold)
2613 { 2613 {
2614 struct zone *zone = page_zone(page); 2614 struct zone *zone = page_zone(page);
2615 struct per_cpu_pages *pcp; 2615 struct per_cpu_pages *pcp;
2616 unsigned long flags; 2616 unsigned long flags;
2617 unsigned long pfn = page_to_pfn(page); 2617 unsigned long pfn = page_to_pfn(page);
2618 int migratetype; 2618 int migratetype;
2619 2619
2620 if (!free_pcp_prepare(page)) 2620 if (!free_pcp_prepare(page))
2621 return; 2621 return;
2622 2622
2623 migratetype = get_pfnblock_migratetype(page, pfn); 2623 migratetype = get_pfnblock_migratetype(page, pfn);
2624 set_pcppage_migratetype(page, migratetype); 2624 set_pcppage_migratetype(page, migratetype);
2625 local_irq_save(flags); 2625 local_irq_save(flags);
2626 __count_vm_event(PGFREE); 2626 __count_vm_event(PGFREE);
2627 2627
2628 /* 2628 /*
2629 * We only track unmovable, reclaimable and movable on pcp lists. 2629 * We only track unmovable, reclaimable and movable on pcp lists.
2630 * Free ISOLATE pages back to the allocator because they are being 2630 * Free ISOLATE pages back to the allocator because they are being
2631 * offlined but treat HIGHATOMIC as movable pages so we can get those 2631 * offlined but treat HIGHATOMIC as movable pages so we can get those
2632 * areas back if necessary. Otherwise, we may have to free 2632 * areas back if necessary. Otherwise, we may have to free
2633 * excessively into the page allocator 2633 * excessively into the page allocator
2634 */ 2634 */
2635 if (migratetype >= MIGRATE_PCPTYPES) { 2635 if (migratetype >= MIGRATE_PCPTYPES) {
2636 if (unlikely(is_migrate_isolate(migratetype))) { 2636 if (unlikely(is_migrate_isolate(migratetype))) {
2637 free_one_page(zone, page, pfn, 0, migratetype); 2637 free_one_page(zone, page, pfn, 0, migratetype);
2638 goto out; 2638 goto out;
2639 } 2639 }
2640 migratetype = MIGRATE_MOVABLE; 2640 migratetype = MIGRATE_MOVABLE;
2641 } 2641 }
2642 2642
2643 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2643 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2644 if (!cold) 2644 if (!cold)
2645 list_add(&page->lru, &pcp->lists[migratetype]); 2645 list_add(&page->lru, &pcp->lists[migratetype]);
2646 else 2646 else
2647 list_add_tail(&page->lru, &pcp->lists[migratetype]); 2647 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2648 pcp->count++; 2648 pcp->count++;
2649 if (pcp->count >= pcp->high) { 2649 if (pcp->count >= pcp->high) {
2650 unsigned long batch = READ_ONCE(pcp->batch); 2650 unsigned long batch = READ_ONCE(pcp->batch);
2651 free_pcppages_bulk(zone, batch, pcp); 2651 free_pcppages_bulk(zone, batch, pcp);
2652 pcp->count -= batch; 2652 pcp->count -= batch;
2653 } 2653 }
2654 2654
2655 out: 2655 out:
2656 local_irq_restore(flags); 2656 local_irq_restore(flags);
2657 } 2657 }
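/*
 * The high/batch handling above gives the pcp list some hysteresis:
 * once pcp->count reaches pcp->high, one batch of pages is returned to
 * the buddy allocator, leaving roughly high - batch pages cached.
 */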
2658 2658
2659 /* 2659 /*
2660 * Free a list of 0-order pages 2660 * Free a list of 0-order pages
2661 */ 2661 */
2662 void free_hot_cold_page_list(struct list_head *list, bool cold) 2662 void free_hot_cold_page_list(struct list_head *list, bool cold)
2663 { 2663 {
2664 struct page *page, *next; 2664 struct page *page, *next;
2665 2665
2666 list_for_each_entry_safe(page, next, list, lru) { 2666 list_for_each_entry_safe(page, next, list, lru) {
2667 trace_mm_page_free_batched(page, cold); 2667 trace_mm_page_free_batched(page, cold);
2668 free_hot_cold_page(page, cold); 2668 free_hot_cold_page(page, cold);
2669 } 2669 }
2670 } 2670 }
2671 2671
2672 /* 2672 /*
2673 * split_page takes a non-compound higher-order page, and splits it into 2673 * split_page takes a non-compound higher-order page, and splits it into
2674 * n (1<<order) sub-pages: page[0..n-1] 2674 * n (1<<order) sub-pages: page[0..n-1]
2675 * Each sub-page must be freed individually. 2675 * Each sub-page must be freed individually.
2676 * 2676 *
2677 * Note: this is probably too low level an operation for use in drivers. 2677 * Note: this is probably too low level an operation for use in drivers.
2678 * Please consult with lkml before using this in your driver. 2678 * Please consult with lkml before using this in your driver.
2679 */ 2679 */
2680 void split_page(struct page *page, unsigned int order) 2680 void split_page(struct page *page, unsigned int order)
2681 { 2681 {
2682 int i; 2682 int i;
2683 2683
2684 VM_BUG_ON_PAGE(PageCompound(page), page); 2684 VM_BUG_ON_PAGE(PageCompound(page), page);
2685 VM_BUG_ON_PAGE(!page_count(page), page); 2685 VM_BUG_ON_PAGE(!page_count(page), page);
2686 2686
2687 for (i = 1; i < (1 << order); i++) 2687 for (i = 1; i < (1 << order); i++)
2688 set_page_refcounted(page + i); 2688 set_page_refcounted(page + i);
2689 split_page_owner(page, order); 2689 split_page_owner(page, order);
2690 } 2690 }
2691 EXPORT_SYMBOL_GPL(split_page); 2691 EXPORT_SYMBOL_GPL(split_page);
2692 2692
2693 int __isolate_free_page(struct page *page, unsigned int order) 2693 int __isolate_free_page(struct page *page, unsigned int order)
2694 { 2694 {
2695 unsigned long watermark; 2695 unsigned long watermark;
2696 struct zone *zone; 2696 struct zone *zone;
2697 int mt; 2697 int mt;
2698 2698
2699 BUG_ON(!PageBuddy(page)); 2699 BUG_ON(!PageBuddy(page));
2700 2700
2701 zone = page_zone(page); 2701 zone = page_zone(page);
2702 mt = get_pageblock_migratetype(page); 2702 mt = get_pageblock_migratetype(page);
2703 2703
2704 if (!is_migrate_isolate(mt)) { 2704 if (!is_migrate_isolate(mt)) {
2705 /* 2705 /*
2706 * Obey watermarks as if the page was being allocated. We can 2706 * Obey watermarks as if the page was being allocated. We can
2707 * emulate a high-order watermark check with a raised order-0 2707 * emulate a high-order watermark check with a raised order-0
2708 * watermark, because we already know our high-order page 2708 * watermark, because we already know our high-order page
2709 * exists. 2709 * exists.
2710 */ 2710 */
2711 watermark = min_wmark_pages(zone) + (1UL << order); 2711 watermark = min_wmark_pages(zone) + (1UL << order);
2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2713 return 0; 2713 return 0;
2714 2714
2715 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2715 __mod_zone_freepage_state(zone, -(1UL << order), mt);
2716 } 2716 }
2717 2717
2718 /* Remove page from free list */ 2718 /* Remove page from free list */
2719 list_del(&page->lru); 2719 list_del(&page->lru);
2720 zone->free_area[order].nr_free--; 2720 zone->free_area[order].nr_free--;
2721 rmv_page_order(page); 2721 rmv_page_order(page);
2722 2722
2723 /* 2723 /*
2724 * Set the pageblock's migratetype if the isolated page covers at 2724 * Set the pageblock's migratetype if the isolated page covers at
2725 * least half of a pageblock. 2725 * least half of a pageblock.
2726 */ 2726 */
2727 if (order >= pageblock_order - 1) { 2727 if (order >= pageblock_order - 1) {
2728 struct page *endpage = page + (1 << order) - 1; 2728 struct page *endpage = page + (1 << order) - 1;
2729 for (; page < endpage; page += pageblock_nr_pages) { 2729 for (; page < endpage; page += pageblock_nr_pages) {
2730 int mt = get_pageblock_migratetype(page); 2730 int mt = get_pageblock_migratetype(page);
2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2732 && !is_migrate_highatomic(mt)) 2732 && !is_migrate_highatomic(mt))
2733 set_pageblock_migratetype(page, 2733 set_pageblock_migratetype(page,
2734 MIGRATE_MOVABLE); 2734 MIGRATE_MOVABLE);
2735 } 2735 }
2736 } 2736 }
2737 2737
2738 2738
2739 return 1UL << order; 2739 return 1UL << order;
2740 } 2740 }
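/*
 * On success the function above returns the number of pages isolated,
 * i.e. 1UL << order; it returns 0 when taking the page would push the
 * zone below its minimum watermark.
 */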
2741 2741
2742 /* 2742 /*
2743 * Update NUMA hit/miss statistics 2743 * Update NUMA hit/miss statistics
2744 * 2744 *
2745 * Must be called with interrupts disabled. 2745 * Must be called with interrupts disabled.
2746 */ 2746 */
2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2748 { 2748 {
2749 #ifdef CONFIG_NUMA 2749 #ifdef CONFIG_NUMA
2750 enum numa_stat_item local_stat = NUMA_LOCAL; 2750 enum numa_stat_item local_stat = NUMA_LOCAL;
2751 2751
2752 if (z->node != numa_node_id()) 2752 if (z->node != numa_node_id())
2753 local_stat = NUMA_OTHER; 2753 local_stat = NUMA_OTHER;
2754 2754
2755 if (z->node == preferred_zone->node) 2755 if (z->node == preferred_zone->node)
2756 __inc_numa_state(z, NUMA_HIT); 2756 __inc_numa_state(z, NUMA_HIT);
2757 else { 2757 else {
2758 __inc_numa_state(z, NUMA_MISS); 2758 __inc_numa_state(z, NUMA_MISS);
2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN); 2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
2760 } 2760 }
2761 __inc_numa_state(z, local_stat); 2761 __inc_numa_state(z, local_stat);
2762 #endif 2762 #endif
2763 } 2763 }
2764 2764
2765 /* Remove page from the per-cpu list, caller must protect the list */ 2765 /* Remove page from the per-cpu list, caller must protect the list */
2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 bool cold, struct per_cpu_pages *pcp, 2767 bool cold, struct per_cpu_pages *pcp,
2768 struct list_head *list) 2768 struct list_head *list)
2769 { 2769 {
2770 struct page *page; 2770 struct page *page;
2771 2771
2772 do { 2772 do {
2773 if (list_empty(list)) { 2773 if (list_empty(list)) {
2774 pcp->count += rmqueue_bulk(zone, 0, 2774 pcp->count += rmqueue_bulk(zone, 0,
2775 pcp->batch, list, 2775 pcp->batch, list,
2776 migratetype, cold); 2776 migratetype, cold);
2777 if (unlikely(list_empty(list))) 2777 if (unlikely(list_empty(list)))
2778 return NULL; 2778 return NULL;
2779 } 2779 }
2780 2780
2781 if (cold) 2781 if (cold)
2782 page = list_last_entry(list, struct page, lru); 2782 page = list_last_entry(list, struct page, lru);
2783 else 2783 else
2784 page = list_first_entry(list, struct page, lru); 2784 page = list_first_entry(list, struct page, lru);
2785 2785
2786 list_del(&page->lru); 2786 list_del(&page->lru);
2787 pcp->count--; 2787 pcp->count--;
2788 } while (check_new_pcp(page)); 2788 } while (check_new_pcp(page));
2789 2789
2790 return page; 2790 return page;
2791 } 2791 }
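/*
 * Pages that fail check_new_pcp() have already been unlinked from the
 * pcp list above, so the loop simply retries with the next entry
 * instead of handing a bad page to the caller.
 */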
2792 2792
2793 /* Lock and remove page from the per-cpu list */ 2793 /* Lock and remove page from the per-cpu list */
2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2795 struct zone *zone, unsigned int order, 2795 struct zone *zone, unsigned int order,
2796 gfp_t gfp_flags, int migratetype) 2796 gfp_t gfp_flags, int migratetype)
2797 { 2797 {
2798 struct per_cpu_pages *pcp; 2798 struct per_cpu_pages *pcp;
2799 struct list_head *list; 2799 struct list_head *list;
2800 bool cold = ((gfp_flags & __GFP_COLD) != 0); 2800 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2801 struct page *page; 2801 struct page *page;
2802 unsigned long flags; 2802 unsigned long flags;
2803 2803
2804 local_irq_save(flags); 2804 local_irq_save(flags);
2805 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2805 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2806 list = &pcp->lists[migratetype]; 2806 list = &pcp->lists[migratetype];
2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
2808 if (page) { 2808 if (page) {
2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2810 zone_statistics(preferred_zone, zone); 2810 zone_statistics(preferred_zone, zone);
2811 } 2811 }
2812 local_irq_restore(flags); 2812 local_irq_restore(flags);
2813 return page; 2813 return page;
2814 } 2814 }
2815 2815
2816 /* 2816 /*
2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2818 */ 2818 */
2819 static inline 2819 static inline
2820 struct page *rmqueue(struct zone *preferred_zone, 2820 struct page *rmqueue(struct zone *preferred_zone,
2821 struct zone *zone, unsigned int order, 2821 struct zone *zone, unsigned int order,
2822 gfp_t gfp_flags, unsigned int alloc_flags, 2822 gfp_t gfp_flags, unsigned int alloc_flags,
2823 int migratetype) 2823 int migratetype)
2824 { 2824 {
2825 unsigned long flags; 2825 unsigned long flags;
2826 struct page *page; 2826 struct page *page;
2827 2827
2828 if (likely(order == 0)) { 2828 if (likely(order == 0)) {
2829 page = rmqueue_pcplist(preferred_zone, zone, order, 2829 page = rmqueue_pcplist(preferred_zone, zone, order,
2830 gfp_flags, migratetype); 2830 gfp_flags, migratetype);
2831 goto out; 2831 goto out;
2832 } 2832 }
2833 2833
2834 /* 2834 /*
2835 * We most definitely don't want callers attempting to 2835 * We most definitely don't want callers attempting to
2836 * allocate greater than order-1 page units with __GFP_NOFAIL. 2836 * allocate greater than order-1 page units with __GFP_NOFAIL.
2837 */ 2837 */
2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2839 spin_lock_irqsave(&zone->lock, flags); 2839 spin_lock_irqsave(&zone->lock, flags);
2840 2840
2841 do { 2841 do {
2842 page = NULL; 2842 page = NULL;
2843 if (alloc_flags & ALLOC_HARDER) { 2843 if (alloc_flags & ALLOC_HARDER) {
2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2845 if (page) 2845 if (page)
2846 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2846 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2847 } 2847 }
2848 if (!page) 2848 if (!page)
2849 page = __rmqueue(zone, order, migratetype); 2849 page = __rmqueue(zone, order, migratetype);
2850 } while (page && check_new_pages(page, order)); 2850 } while (page && check_new_pages(page, order));
2851 spin_unlock(&zone->lock); 2851 spin_unlock(&zone->lock);
2852 if (!page) 2852 if (!page)
2853 goto failed; 2853 goto failed;
2854 __mod_zone_freepage_state(zone, -(1 << order), 2854 __mod_zone_freepage_state(zone, -(1 << order),
2855 get_pcppage_migratetype(page)); 2855 get_pcppage_migratetype(page));
2856 2856
2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2858 zone_statistics(preferred_zone, zone); 2858 zone_statistics(preferred_zone, zone);
2859 local_irq_restore(flags); 2859 local_irq_restore(flags);
2860 2860
2861 out: 2861 out:
2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2863 return page; 2863 return page;
2864 2864
2865 failed: 2865 failed:
2866 local_irq_restore(flags); 2866 local_irq_restore(flags);
2867 return NULL; 2867 return NULL;
2868 } 2868 }
2869 2869
2870 #ifdef CONFIG_FAIL_PAGE_ALLOC 2870 #ifdef CONFIG_FAIL_PAGE_ALLOC
2871 2871
2872 static struct { 2872 static struct {
2873 struct fault_attr attr; 2873 struct fault_attr attr;
2874 2874
2875 bool ignore_gfp_highmem; 2875 bool ignore_gfp_highmem;
2876 bool ignore_gfp_reclaim; 2876 bool ignore_gfp_reclaim;
2877 u32 min_order; 2877 u32 min_order;
2878 } fail_page_alloc = { 2878 } fail_page_alloc = {
2879 .attr = FAULT_ATTR_INITIALIZER, 2879 .attr = FAULT_ATTR_INITIALIZER,
2880 .ignore_gfp_reclaim = true, 2880 .ignore_gfp_reclaim = true,
2881 .ignore_gfp_highmem = true, 2881 .ignore_gfp_highmem = true,
2882 .min_order = 1, 2882 .min_order = 1,
2883 }; 2883 };
2884 2884
2885 static int __init setup_fail_page_alloc(char *str) 2885 static int __init setup_fail_page_alloc(char *str)
2886 { 2886 {
2887 return setup_fault_attr(&fail_page_alloc.attr, str); 2887 return setup_fault_attr(&fail_page_alloc.attr, str);
2888 } 2888 }
2889 __setup("fail_page_alloc=", setup_fail_page_alloc); 2889 __setup("fail_page_alloc=", setup_fail_page_alloc);
2890 2890
2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2892 { 2892 {
2893 if (order < fail_page_alloc.min_order) 2893 if (order < fail_page_alloc.min_order)
2894 return false; 2894 return false;
2895 if (gfp_mask & __GFP_NOFAIL) 2895 if (gfp_mask & __GFP_NOFAIL)
2896 return false; 2896 return false;
2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
2898 return false; 2898 return false;
2899 if (fail_page_alloc.ignore_gfp_reclaim && 2899 if (fail_page_alloc.ignore_gfp_reclaim &&
2900 (gfp_mask & __GFP_DIRECT_RECLAIM)) 2900 (gfp_mask & __GFP_DIRECT_RECLAIM))
2901 return false; 2901 return false;
2902 2902
2903 return should_fail(&fail_page_alloc.attr, 1 << order); 2903 return should_fail(&fail_page_alloc.attr, 1 << order);
2904 } 2904 }
2905 2905
2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
2907 2907
2908 static int __init fail_page_alloc_debugfs(void) 2908 static int __init fail_page_alloc_debugfs(void)
2909 { 2909 {
2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
2911 struct dentry *dir; 2911 struct dentry *dir;
2912 2912
2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
2914 &fail_page_alloc.attr); 2914 &fail_page_alloc.attr);
2915 if (IS_ERR(dir)) 2915 if (IS_ERR(dir))
2916 return PTR_ERR(dir); 2916 return PTR_ERR(dir);
2917 2917
2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
2919 &fail_page_alloc.ignore_gfp_reclaim)) 2919 &fail_page_alloc.ignore_gfp_reclaim))
2920 goto fail; 2920 goto fail;
2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
2922 &fail_page_alloc.ignore_gfp_highmem)) 2922 &fail_page_alloc.ignore_gfp_highmem))
2923 goto fail; 2923 goto fail;
2924 if (!debugfs_create_u32("min-order", mode, dir, 2924 if (!debugfs_create_u32("min-order", mode, dir,
2925 &fail_page_alloc.min_order)) 2925 &fail_page_alloc.min_order))
2926 goto fail; 2926 goto fail;
2927 2927
2928 return 0; 2928 return 0;
2929 fail: 2929 fail:
2930 debugfs_remove_recursive(dir); 2930 debugfs_remove_recursive(dir);
2931 2931
2932 return -ENOMEM; 2932 return -ENOMEM;
2933 } 2933 }
2934 2934
2935 late_initcall(fail_page_alloc_debugfs); 2935 late_initcall(fail_page_alloc_debugfs);
2936 2936
2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2938 2938
2939 #else /* CONFIG_FAIL_PAGE_ALLOC */ 2939 #else /* CONFIG_FAIL_PAGE_ALLOC */
2940 2940
2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2942 { 2942 {
2943 return false; 2943 return false;
2944 } 2944 }
2945 2945
2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */
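should_fail_alloc_page() above is only a filter in front of the generic fault-injection core: small orders, __GFP_NOFAIL, highmem and direct-reclaim allocations can all be excluded before any failure is injected. A small standalone model of that gating, with the default filter values copied from fail_page_alloc above and a plain percentage coin flip standing in for should_fail():

#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>

/* Illustrative gfp bits, not the kernel's values. */
#define GFP_NOFAIL          0x1
#define GFP_HIGHMEM         0x2
#define GFP_DIRECT_RECLAIM  0x4

struct fail_cfg {
	bool ignore_gfp_highmem;
	bool ignore_gfp_reclaim;
	unsigned int min_order;
	int probability;            /* percent of eligible allocations to fail */
};

/* Model of the gating in should_fail_alloc_page(): only allocations that
 * pass every filter reach the (simulated) fault-injection decision. */
static bool should_fail_model(const struct fail_cfg *cfg,
			      unsigned int gfp, unsigned int order)
{
	if (order < cfg->min_order)
		return false;
	if (gfp & GFP_NOFAIL)
		return false;
	if (cfg->ignore_gfp_highmem && (gfp & GFP_HIGHMEM))
		return false;
	if (cfg->ignore_gfp_reclaim && (gfp & GFP_DIRECT_RECLAIM))
		return false;

	return (rand() % 100) < cfg->probability;
}

int main(void)
{
	struct fail_cfg cfg = {
		.ignore_gfp_highmem = true,
		.ignore_gfp_reclaim = true,
		.min_order = 1,
		.probability = 10,
	};

	printf("order 0 : %d\n", should_fail_model(&cfg, 0, 0));  /* filtered out */
	printf("order 2 : %d\n", should_fail_model(&cfg, 0, 2));  /* eligible; depends on the coin flip */
	return 0;
}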
2947 2947
2948 /* 2948 /*
2949 * Return true if free base pages are above 'mark'. For high-order checks it 2949 * Return true if free base pages are above 'mark'. For high-order checks it
2950 * will return true if the order-0 watermark is reached and there is at least 2950 * will return true if the order-0 watermark is reached and there is at least
2951 * one free page of a suitable size. Checking now avoids taking the zone lock 2951 * one free page of a suitable size. Checking now avoids taking the zone lock
2952 * to check in the allocation paths if no pages are free. 2952 * to check in the allocation paths if no pages are free.
2953 */ 2953 */
2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2955 int classzone_idx, unsigned int alloc_flags, 2955 int classzone_idx, unsigned int alloc_flags,
2956 long free_pages) 2956 long free_pages)
2957 { 2957 {
2958 long min = mark; 2958 long min = mark;
2959 int o; 2959 int o;
2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2961 2961
2962 /* free_pages may go negative - that's OK */ 2962 /* free_pages may go negative - that's OK */
2963 free_pages -= (1 << order) - 1; 2963 free_pages -= (1 << order) - 1;
2964 2964
2965 if (alloc_flags & ALLOC_HIGH) 2965 if (alloc_flags & ALLOC_HIGH)
2966 min -= min / 2; 2966 min -= min / 2;
2967 2967
2968 /* 2968 /*
2969 * If the caller does not have rights to ALLOC_HARDER then subtract 2969 * If the caller does not have rights to ALLOC_HARDER then subtract
2970 * the high-atomic reserves. This will over-estimate the size of the 2970 * the high-atomic reserves. This will over-estimate the size of the
2971 * atomic reserve but it avoids a search. 2971 * atomic reserve but it avoids a search.
2972 */ 2972 */
2973 if (likely(!alloc_harder)) { 2973 if (likely(!alloc_harder)) {
2974 free_pages -= z->nr_reserved_highatomic; 2974 free_pages -= z->nr_reserved_highatomic;
2975 } else { 2975 } else {
2976 /* 2976 /*
2977 * OOM victims can try even harder than normal ALLOC_HARDER 2977 * OOM victims can try even harder than normal ALLOC_HARDER
2978 * users on the grounds that they are definitely going to be in 2978 * users on the grounds that they are definitely going to be in
2979 * the exit path shortly and free memory. Any allocation they 2979 * the exit path shortly and free memory. Any allocation they
2980 * make during the free path will be small and short-lived. 2980 * make during the free path will be small and short-lived.
2981 */ 2981 */
2982 if (alloc_flags & ALLOC_OOM) 2982 if (alloc_flags & ALLOC_OOM)
2983 min -= min / 2; 2983 min -= min / 2;
2984 else 2984 else
2985 min -= min / 4; 2985 min -= min / 4;
2986 } 2986 }
2987 2987
2988 2988
2989 #ifdef CONFIG_CMA 2989 #ifdef CONFIG_CMA
2990 /* If allocation can't use CMA areas don't use free CMA pages */ 2990 /* If allocation can't use CMA areas don't use free CMA pages */
2991 if (!(alloc_flags & ALLOC_CMA)) 2991 if (!(alloc_flags & ALLOC_CMA))
2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2993 #endif 2993 #endif
2994 2994
2995 /* 2995 /*
2996 * Check watermarks for an order-0 allocation request. If these 2996 * Check watermarks for an order-0 allocation request. If these
2997 * are not met, then a high-order request also cannot go ahead 2997 * are not met, then a high-order request also cannot go ahead
2998 * even if a suitable page happened to be free. 2998 * even if a suitable page happened to be free.
2999 */ 2999 */
3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3001 return false; 3001 return false;
3002 3002
3003 /* If this is an order-0 request then the watermark is fine */ 3003 /* If this is an order-0 request then the watermark is fine */
3004 if (!order) 3004 if (!order)
3005 return true; 3005 return true;
3006 3006
3007 /* For a high-order request, check at least one suitable page is free */ 3007 /* For a high-order request, check at least one suitable page is free */
3008 for (o = order; o < MAX_ORDER; o++) { 3008 for (o = order; o < MAX_ORDER; o++) {
3009 struct free_area *area = &z->free_area[o]; 3009 struct free_area *area = &z->free_area[o];
3010 int mt; 3010 int mt;
3011 3011
3012 if (!area->nr_free) 3012 if (!area->nr_free)
3013 continue; 3013 continue;
3014 3014
3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3016 if (!list_empty(&area->free_list[mt])) 3016 if (!list_empty(&area->free_list[mt]))
3017 return true; 3017 return true;
3018 } 3018 }
3019 3019
3020 #ifdef CONFIG_CMA 3020 #ifdef CONFIG_CMA
3021 if ((alloc_flags & ALLOC_CMA) && 3021 if ((alloc_flags & ALLOC_CMA) &&
3022 !list_empty(&area->free_list[MIGRATE_CMA])) { 3022 !list_empty(&area->free_list[MIGRATE_CMA])) {
3023 return true; 3023 return true;
3024 } 3024 }
3025 #endif 3025 #endif
3026 if (alloc_harder && 3026 if (alloc_harder &&
3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) 3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3028 return true; 3028 return true;
3029 } 3029 }
3030 return false; 3030 return false;
3031 } 3031 }
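Ignoring the CMA adjustment and the per-order free_area scan, the order-0 part of the check above is plain arithmetic on the watermark: halve it for ALLOC_HIGH, halve or quarter it again for OOM victims and ALLOC_HARDER callers, and hide the highatomic reserve from everyone else. A standalone sketch with made-up flag bits and numbers:

#include <stdio.h>
#include <stdbool.h>

/* Flag values are illustrative only, not the kernel's definitions. */
#define ALLOC_HIGH   0x1
#define ALLOC_HARDER 0x2
#define ALLOC_OOM    0x4

/* Order-0 part of __zone_watermark_ok(): does 'free_pages' clear the
 * adjusted watermark plus the lowmem reserve? */
static bool wmark_order0_check(long free_pages, long mark, unsigned int order,
			       unsigned int alloc_flags, long highatomic_reserve,
			       long lowmem_reserve)
{
	long min = mark;

	free_pages -= (1L << order) - 1;        /* be pessimistic about the request itself */

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;                 /* __GFP_HIGH callers get half the watermark */

	if (!(alloc_flags & (ALLOC_HARDER | ALLOC_OOM)))
		free_pages -= highatomic_reserve;  /* don't count pages reserved for atomics */
	else if (alloc_flags & ALLOC_OOM)
		min -= min / 2;
	else
		min -= min / 4;

	return free_pages > min + lowmem_reserve;
}

int main(void)
{
	/* 1000 free pages, watermark 900, 150 pages held in the highatomic reserve. */
	printf("normal     : %d\n",
	       wmark_order0_check(1000, 900, 0, 0, 150, 0));                       /* 850 > 900 ? no  */
	printf("atomic-ish : %d\n",
	       wmark_order0_check(1000, 900, 0, ALLOC_HIGH | ALLOC_HARDER, 150, 0)); /* 1000 > 338 ? yes */
	return 0;
}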
3032 3032
3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3034 int classzone_idx, unsigned int alloc_flags) 3034 int classzone_idx, unsigned int alloc_flags)
3035 { 3035 {
3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3037 zone_page_state(z, NR_FREE_PAGES)); 3037 zone_page_state(z, NR_FREE_PAGES));
3038 } 3038 }
3039 3039
3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags) 3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3042 { 3042 {
3043 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3043 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3044 long cma_pages = 0; 3044 long cma_pages = 0;
3045 3045
3046 #ifdef CONFIG_CMA 3046 #ifdef CONFIG_CMA
3047 /* If allocation can't use CMA areas don't use free CMA pages */ 3047 /* If allocation can't use CMA areas don't use free CMA pages */
3048 if (!(alloc_flags & ALLOC_CMA)) 3048 if (!(alloc_flags & ALLOC_CMA))
3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); 3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3050 #endif 3050 #endif
3051 3051
3052 /* 3052 /*
3053 * Fast check for order-0 only. If this fails then the reserves 3053 * Fast check for order-0 only. If this fails then the reserves
3054 * need to be calculated. There is a corner case where the check 3054 * need to be calculated. There is a corner case where the check
3055 * passes but only the high-order atomic reserves are free. If 3055 * passes but only the high-order atomic reserves are free. If
3056 * the caller is !atomic then it'll uselessly search the free 3056 * the caller is !atomic then it'll uselessly search the free
3057 * list. That corner case is then slower but it is harmless. 3057 * list. That corner case is then slower but it is harmless.
3058 */ 3058 */
3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) 3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3060 return true; 3060 return true;
3061 3061
3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3063 free_pages); 3063 free_pages);
3064 } 3064 }
3065 3065
3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3067 unsigned long mark, int classzone_idx) 3067 unsigned long mark, int classzone_idx)
3068 { 3068 {
3069 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3069 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3070 3070
3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3073 3073
3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0, 3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3075 free_pages); 3075 free_pages);
3076 } 3076 }
3077 3077
3078 #ifdef CONFIG_NUMA 3078 #ifdef CONFIG_NUMA
3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3080 { 3080 {
3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3082 RECLAIM_DISTANCE; 3082 RECLAIM_DISTANCE;
3083 } 3083 }
3084 #else /* CONFIG_NUMA */ 3084 #else /* CONFIG_NUMA */
3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3086 { 3086 {
3087 return true; 3087 return true;
3088 } 3088 }
3089 #endif /* CONFIG_NUMA */ 3089 #endif /* CONFIG_NUMA */
3090 3090
3091 /* 3091 /*
3092 * get_page_from_freelist goes through the zonelist trying to allocate 3092 * get_page_from_freelist goes through the zonelist trying to allocate
3093 * a page. 3093 * a page.
3094 */ 3094 */
3095 static struct page * 3095 static struct page *
3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3097 const struct alloc_context *ac) 3097 const struct alloc_context *ac)
3098 { 3098 {
3099 struct zoneref *z = ac->preferred_zoneref; 3099 struct zoneref *z = ac->preferred_zoneref;
3100 struct zone *zone; 3100 struct zone *zone;
3101 struct pglist_data *last_pgdat_dirty_limit = NULL; 3101 struct pglist_data *last_pgdat_dirty_limit = NULL;
3102 3102
3103 /* 3103 /*
3104 * Scan zonelist, looking for a zone with enough free pages. 3104 * Scan zonelist, looking for a zone with enough free pages.
3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3106 */ 3106 */
3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3108 ac->nodemask) { 3108 ac->nodemask) {
3109 struct page *page; 3109 struct page *page;
3110 unsigned long mark; 3110 unsigned long mark;
3111 3111
3112 if (cpusets_enabled() && 3112 if (cpusets_enabled() &&
3113 (alloc_flags & ALLOC_CPUSET) && 3113 (alloc_flags & ALLOC_CPUSET) &&
3114 !__cpuset_zone_allowed(zone, gfp_mask)) 3114 !__cpuset_zone_allowed(zone, gfp_mask))
3115 continue; 3115 continue;
3116 /* 3116 /*
3117 * When allocating a page cache page for writing, we 3117 * When allocating a page cache page for writing, we
3118 * want to get it from a node that is within its dirty 3118 * want to get it from a node that is within its dirty
3119 * limit, such that no single node holds more than its 3119 * limit, such that no single node holds more than its
3120 * proportional share of globally allowed dirty pages. 3120 * proportional share of globally allowed dirty pages.
3121 * The dirty limits take into account the node's 3121 * The dirty limits take into account the node's
3122 * lowmem reserves and high watermark so that kswapd 3122 * lowmem reserves and high watermark so that kswapd
3123 * should be able to balance it without having to 3123 * should be able to balance it without having to
3124 * write pages from its LRU list. 3124 * write pages from its LRU list.
3125 * 3125 *
3126 * XXX: For now, allow allocations to potentially 3126 * XXX: For now, allow allocations to potentially
3127 * exceed the per-node dirty limit in the slowpath 3127 * exceed the per-node dirty limit in the slowpath
3128 * (spread_dirty_pages unset) before going into reclaim, 3128 * (spread_dirty_pages unset) before going into reclaim,
3129 * which is important when on a NUMA setup the allowed 3129 * which is important when on a NUMA setup the allowed
3130 * nodes are together not big enough to reach the 3130 * nodes are together not big enough to reach the
3131 * global limit. The proper fix for these situations 3131 * global limit. The proper fix for these situations
3132 * will require awareness of nodes in the 3132 * will require awareness of nodes in the
3133 * dirty-throttling and the flusher threads. 3133 * dirty-throttling and the flusher threads.
3134 */ 3134 */
3135 if (ac->spread_dirty_pages) { 3135 if (ac->spread_dirty_pages) {
3136 if (last_pgdat_dirty_limit == zone->zone_pgdat) 3136 if (last_pgdat_dirty_limit == zone->zone_pgdat)
3137 continue; 3137 continue;
3138 3138
3139 if (!node_dirty_ok(zone->zone_pgdat)) { 3139 if (!node_dirty_ok(zone->zone_pgdat)) {
3140 last_pgdat_dirty_limit = zone->zone_pgdat; 3140 last_pgdat_dirty_limit = zone->zone_pgdat;
3141 continue; 3141 continue;
3142 } 3142 }
3143 } 3143 }
3144 3144
3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
3146 if (!zone_watermark_fast(zone, order, mark, 3146 if (!zone_watermark_fast(zone, order, mark,
3147 ac_classzone_idx(ac), alloc_flags)) { 3147 ac_classzone_idx(ac), alloc_flags)) {
3148 int ret; 3148 int ret;
3149 3149
3150 /* Checked here to keep the fast path fast */ 3150 /* Checked here to keep the fast path fast */
3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3152 if (alloc_flags & ALLOC_NO_WATERMARKS) 3152 if (alloc_flags & ALLOC_NO_WATERMARKS)
3153 goto try_this_zone; 3153 goto try_this_zone;
3154 3154
3155 if (node_reclaim_mode == 0 || 3155 if (node_reclaim_mode == 0 ||
3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3157 continue; 3157 continue;
3158 3158
3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3160 switch (ret) { 3160 switch (ret) {
3161 case NODE_RECLAIM_NOSCAN: 3161 case NODE_RECLAIM_NOSCAN:
3162 /* did not scan */ 3162 /* did not scan */
3163 continue; 3163 continue;
3164 case NODE_RECLAIM_FULL: 3164 case NODE_RECLAIM_FULL:
3165 /* scanned but unreclaimable */ 3165 /* scanned but unreclaimable */
3166 continue; 3166 continue;
3167 default: 3167 default:
3168 /* did we reclaim enough */ 3168 /* did we reclaim enough */
3169 if (zone_watermark_ok(zone, order, mark, 3169 if (zone_watermark_ok(zone, order, mark,
3170 ac_classzone_idx(ac), alloc_flags)) 3170 ac_classzone_idx(ac), alloc_flags))
3171 goto try_this_zone; 3171 goto try_this_zone;
3172 3172
3173 continue; 3173 continue;
3174 } 3174 }
3175 } 3175 }
3176 3176
3177 try_this_zone: 3177 try_this_zone:
3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3179 gfp_mask, alloc_flags, ac->migratetype); 3179 gfp_mask, alloc_flags, ac->migratetype);
3180 if (page) { 3180 if (page) {
3181 prep_new_page(page, order, gfp_mask, alloc_flags); 3181 prep_new_page(page, order, gfp_mask, alloc_flags);
3182 3182
3183 /* 3183 /*
3184 * If this is a high-order atomic allocation then check 3184 * If this is a high-order atomic allocation then check
3185 * if the pageblock should be reserved for the future 3185 * if the pageblock should be reserved for the future
3186 */ 3186 */
3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3188 reserve_highatomic_pageblock(page, zone, order); 3188 reserve_highatomic_pageblock(page, zone, order);
3189 3189
3190 return page; 3190 return page;
3191 } 3191 }
3192 } 3192 }
3193 3193
3194 return NULL; 3194 return NULL;
3195 } 3195 }
3196 3196
3197 /* 3197 /*
3198 * Large machines with many possible nodes should not always dump per-node 3198 * Large machines with many possible nodes should not always dump per-node
3199 * meminfo in irq context. 3199 * meminfo in irq context.
3200 */ 3200 */
3201 static inline bool should_suppress_show_mem(void) 3201 static inline bool should_suppress_show_mem(void)
3202 { 3202 {
3203 bool ret = false; 3203 bool ret = false;
3204 3204
3205 #if NODES_SHIFT > 8 3205 #if NODES_SHIFT > 8
3206 ret = in_interrupt(); 3206 ret = in_interrupt();
3207 #endif 3207 #endif
3208 return ret; 3208 return ret;
3209 } 3209 }
3210 3210
3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3212 { 3212 {
3213 unsigned int filter = SHOW_MEM_FILTER_NODES; 3213 unsigned int filter = SHOW_MEM_FILTER_NODES;
3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3215 3215
3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) 3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3217 return; 3217 return;
3218 3218
3219 /* 3219 /*
3220 * This documents exceptions given to allocations in certain 3220 * This documents exceptions given to allocations in certain
3221 * contexts that are allowed to allocate outside current's set 3221 * contexts that are allowed to allocate outside current's set
3222 * of allowed nodes. 3222 * of allowed nodes.
3223 */ 3223 */
3224 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3224 if (!(gfp_mask & __GFP_NOMEMALLOC))
3225 if (tsk_is_oom_victim(current) || 3225 if (tsk_is_oom_victim(current) ||
3226 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3226 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3227 filter &= ~SHOW_MEM_FILTER_NODES; 3227 filter &= ~SHOW_MEM_FILTER_NODES;
3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3229 filter &= ~SHOW_MEM_FILTER_NODES; 3229 filter &= ~SHOW_MEM_FILTER_NODES;
3230 3230
3231 show_mem(filter, nodemask); 3231 show_mem(filter, nodemask);
3232 } 3232 }
3233 3233
3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 { 3235 {
3236 struct va_format vaf; 3236 struct va_format vaf;
3237 va_list args; 3237 va_list args;
3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, 3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3239 DEFAULT_RATELIMIT_BURST); 3239 DEFAULT_RATELIMIT_BURST);
3240 3240
3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3242 return; 3242 return;
3243 3243
3244 pr_warn("%s: ", current->comm); 3244 pr_warn("%s: ", current->comm);
3245 3245
3246 va_start(args, fmt); 3246 va_start(args, fmt);
3247 vaf.fmt = fmt; 3247 vaf.fmt = fmt;
3248 vaf.va = &args; 3248 vaf.va = &args;
3249 pr_cont("%pV", &vaf); 3249 pr_cont("%pV", &vaf);
3250 va_end(args); 3250 va_end(args);
3251 3251
3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); 3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3253 if (nodemask) 3253 if (nodemask)
3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); 3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3255 else 3255 else
3256 pr_cont("(null)\n"); 3256 pr_cont("(null)\n");
3257 3257
3258 cpuset_print_current_mems_allowed(); 3258 cpuset_print_current_mems_allowed();
3259 3259
3260 dump_stack(); 3260 dump_stack();
3261 warn_alloc_show_mem(gfp_mask, nodemask); 3261 warn_alloc_show_mem(gfp_mask, nodemask);
3262 } 3262 }
3263 3263
3264 static inline struct page * 3264 static inline struct page *
3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3266 unsigned int alloc_flags, 3266 unsigned int alloc_flags,
3267 const struct alloc_context *ac) 3267 const struct alloc_context *ac)
3268 { 3268 {
3269 struct page *page; 3269 struct page *page;
3270 3270
3271 page = get_page_from_freelist(gfp_mask, order, 3271 page = get_page_from_freelist(gfp_mask, order,
3272 alloc_flags|ALLOC_CPUSET, ac); 3272 alloc_flags|ALLOC_CPUSET, ac);
3273 /* 3273 /*
3274 * fallback to ignore cpuset restriction if our nodes 3274 * fallback to ignore cpuset restriction if our nodes
3275 * are depleted 3275 * are depleted
3276 */ 3276 */
3277 if (!page) 3277 if (!page)
3278 page = get_page_from_freelist(gfp_mask, order, 3278 page = get_page_from_freelist(gfp_mask, order,
3279 alloc_flags, ac); 3279 alloc_flags, ac);
3280 3280
3281 return page; 3281 return page;
3282 } 3282 }
3283 3283
3284 static inline struct page * 3284 static inline struct page *
3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3286 const struct alloc_context *ac, unsigned long *did_some_progress) 3286 const struct alloc_context *ac, unsigned long *did_some_progress)
3287 { 3287 {
3288 struct oom_control oc = { 3288 struct oom_control oc = {
3289 .zonelist = ac->zonelist, 3289 .zonelist = ac->zonelist,
3290 .nodemask = ac->nodemask, 3290 .nodemask = ac->nodemask,
3291 .memcg = NULL, 3291 .memcg = NULL,
3292 .gfp_mask = gfp_mask, 3292 .gfp_mask = gfp_mask,
3293 .order = order, 3293 .order = order,
3294 }; 3294 };
3295 struct page *page; 3295 struct page *page;
3296 3296
3297 *did_some_progress = 0; 3297 *did_some_progress = 0;
3298 3298
3299 /* 3299 /*
3300 * Acquire the oom lock. If that fails, somebody else is 3300 * Acquire the oom lock. If that fails, somebody else is
3301 * making progress for us. 3301 * making progress for us.
3302 */ 3302 */
3303 if (!mutex_trylock(&oom_lock)) { 3303 if (!mutex_trylock(&oom_lock)) {
3304 *did_some_progress = 1; 3304 *did_some_progress = 1;
3305 schedule_timeout_uninterruptible(1); 3305 schedule_timeout_uninterruptible(1);
3306 return NULL; 3306 return NULL;
3307 } 3307 }
3308 3308
3309 /* 3309 /*
3310 * Go through the zonelist yet one more time, keeping a very high watermark 3310 * Go through the zonelist yet one more time, keeping a very high watermark
3311 * here; this is only to catch a parallel oom killing, and we must fail if 3311 * here; this is only to catch a parallel oom killing, and we must fail if
3312 * we're still under heavy pressure. But make sure that this reclaim 3312 * we're still under heavy pressure. But make sure that this reclaim
3313 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 3313 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3314 * allocation which will never fail due to oom_lock already held. 3314 * allocation which will never fail due to oom_lock already held.
3315 */ 3315 */
3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3317 ~__GFP_DIRECT_RECLAIM, order, 3317 ~__GFP_DIRECT_RECLAIM, order,
3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3319 if (page) 3319 if (page)
3320 goto out; 3320 goto out;
3321 3321
3322 /* Coredumps can quickly deplete all memory reserves */ 3322 /* Coredumps can quickly deplete all memory reserves */
3323 if (current->flags & PF_DUMPCORE) 3323 if (current->flags & PF_DUMPCORE)
3324 goto out; 3324 goto out;
3325 /* The OOM killer will not help higher order allocs */ 3325 /* The OOM killer will not help higher order allocs */
3326 if (order > PAGE_ALLOC_COSTLY_ORDER) 3326 if (order > PAGE_ALLOC_COSTLY_ORDER)
3327 goto out; 3327 goto out;
3328 /* 3328 /*
3329 * We have already exhausted all our reclaim opportunities without any 3329 * We have already exhausted all our reclaim opportunities without any
3330 * success so it is time to admit defeat. We will skip the OOM killer 3330 * success so it is time to admit defeat. We will skip the OOM killer
3331 * because it is very likely that the caller has a more reasonable 3331 * because it is very likely that the caller has a more reasonable
3332 * fallback than shooting a random task. 3332 * fallback than shooting a random task.
3333 */ 3333 */
3334 if (gfp_mask & __GFP_RETRY_MAYFAIL) 3334 if (gfp_mask & __GFP_RETRY_MAYFAIL)
3335 goto out; 3335 goto out;
3336 /* The OOM killer does not needlessly kill tasks for lowmem */ 3336 /* The OOM killer does not needlessly kill tasks for lowmem */
3337 if (ac->high_zoneidx < ZONE_NORMAL) 3337 if (ac->high_zoneidx < ZONE_NORMAL)
3338 goto out; 3338 goto out;
3339 if (pm_suspended_storage()) 3339 if (pm_suspended_storage())
3340 goto out; 3340 goto out;
3341 /* 3341 /*
3342 * XXX: GFP_NOFS allocations should rather fail than rely on 3342 * XXX: GFP_NOFS allocations should rather fail than rely on
3343 * other requests to make forward progress. 3343 * other requests to make forward progress.
3344 * We are in an unfortunate situation where out_of_memory cannot 3344 * We are in an unfortunate situation where out_of_memory cannot
3345 * do much for this context but let's try it to at least get 3345 * do much for this context but let's try it to at least get
3346 * access to memory reserved if the current task is killed (see 3346 * access to memory reserved if the current task is killed (see
3347 * out_of_memory). Once filesystems are ready to handle allocation 3347 * out_of_memory). Once filesystems are ready to handle allocation
3348 * failures more gracefully we should just bail out here. 3348 * failures more gracefully we should just bail out here.
3349 */ 3349 */
3350 3350
3351 /* The OOM killer may not free memory on a specific node */ 3351 /* The OOM killer may not free memory on a specific node */
3352 if (gfp_mask & __GFP_THISNODE) 3352 if (gfp_mask & __GFP_THISNODE)
3353 goto out; 3353 goto out;
3354 3354
3355 /* Exhausted what can be done so it's blamo time */ 3355 /* Exhausted what can be done so it's blamo time */
3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3357 *did_some_progress = 1; 3357 *did_some_progress = 1;
3358 3358
3359 /* 3359 /*
3360 * Help non-failing allocations by giving them access to memory 3360 * Help non-failing allocations by giving them access to memory
3361 * reserves 3361 * reserves
3362 */ 3362 */
3363 if (gfp_mask & __GFP_NOFAIL) 3363 if (gfp_mask & __GFP_NOFAIL)
3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3365 ALLOC_NO_WATERMARKS, ac); 3365 ALLOC_NO_WATERMARKS, ac);
3366 } 3366 }
3367 out: 3367 out:
3368 mutex_unlock(&oom_lock); 3368 mutex_unlock(&oom_lock);
3369 return page; 3369 return page;
3370 } 3370 }
3371 3371
3372 /* 3372 /*
3373 * Maximum number of compaction retries with progress before the OOM 3373 * Maximum number of compaction retries with progress before the OOM
3374 * killer is considered the only way to move forward. 3374 * killer is considered the only way to move forward.
3375 */ 3375 */
3376 #define MAX_COMPACT_RETRIES 16 3376 #define MAX_COMPACT_RETRIES 16
3377 3377
3378 #ifdef CONFIG_COMPACTION 3378 #ifdef CONFIG_COMPACTION
3379 /* Try memory compaction for high-order allocations before reclaim */ 3379 /* Try memory compaction for high-order allocations before reclaim */
3380 static struct page * 3380 static struct page *
3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3382 unsigned int alloc_flags, const struct alloc_context *ac, 3382 unsigned int alloc_flags, const struct alloc_context *ac,
3383 enum compact_priority prio, enum compact_result *compact_result) 3383 enum compact_priority prio, enum compact_result *compact_result)
3384 { 3384 {
3385 struct page *page; 3385 struct page *page;
3386 unsigned int noreclaim_flag; 3386 unsigned int noreclaim_flag;
3387 3387
3388 if (!order) 3388 if (!order)
3389 return NULL; 3389 return NULL;
3390 3390
3391 noreclaim_flag = memalloc_noreclaim_save(); 3391 noreclaim_flag = memalloc_noreclaim_save();
3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3393 prio); 3393 prio);
3394 memalloc_noreclaim_restore(noreclaim_flag); 3394 memalloc_noreclaim_restore(noreclaim_flag);
3395 3395
3396 if (*compact_result <= COMPACT_INACTIVE) 3396 if (*compact_result <= COMPACT_INACTIVE)
3397 return NULL; 3397 return NULL;
3398 3398
3399 /* 3399 /*
3400 * In at least one zone, compaction wasn't deferred or skipped, so let's 3400 * In at least one zone, compaction wasn't deferred or skipped, so let's
3401 * count a compaction stall 3401 * count a compaction stall
3402 */ 3402 */
3403 count_vm_event(COMPACTSTALL); 3403 count_vm_event(COMPACTSTALL);
3404 3404
3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3406 3406
3407 if (page) { 3407 if (page) {
3408 struct zone *zone = page_zone(page); 3408 struct zone *zone = page_zone(page);
3409 3409
3410 zone->compact_blockskip_flush = false; 3410 zone->compact_blockskip_flush = false;
3411 compaction_defer_reset(zone, order, true); 3411 compaction_defer_reset(zone, order, true);
3412 count_vm_event(COMPACTSUCCESS); 3412 count_vm_event(COMPACTSUCCESS);
3413 return page; 3413 return page;
3414 } 3414 }
3415 3415
3416 /* 3416 /*
3417 * It's bad if a compaction run occurs and fails. The most likely reason 3417 * It's bad if a compaction run occurs and fails. The most likely reason
3418 * is that pages exist, but not enough to satisfy watermarks. 3418 * is that pages exist, but not enough to satisfy watermarks.
3419 */ 3419 */
3420 count_vm_event(COMPACTFAIL); 3420 count_vm_event(COMPACTFAIL);
3421 3421
3422 cond_resched(); 3422 cond_resched();
3423 3423
3424 return NULL; 3424 return NULL;
3425 } 3425 }
3426 3426
3427 static inline bool 3427 static inline bool
3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3429 enum compact_result compact_result, 3429 enum compact_result compact_result,
3430 enum compact_priority *compact_priority, 3430 enum compact_priority *compact_priority,
3431 int *compaction_retries) 3431 int *compaction_retries)
3432 { 3432 {
3433 int max_retries = MAX_COMPACT_RETRIES; 3433 int max_retries = MAX_COMPACT_RETRIES;
3434 int min_priority; 3434 int min_priority;
3435 bool ret = false; 3435 bool ret = false;
3436 int retries = *compaction_retries; 3436 int retries = *compaction_retries;
3437 enum compact_priority priority = *compact_priority; 3437 enum compact_priority priority = *compact_priority;
3438 3438
3439 if (!order) 3439 if (!order)
3440 return false; 3440 return false;
3441 3441
3442 if (compaction_made_progress(compact_result)) 3442 if (compaction_made_progress(compact_result))
3443 (*compaction_retries)++; 3443 (*compaction_retries)++;
3444 3444
3445 /* 3445 /*
3446 * compaction considers all the zones as desperately out of memory 3446 * compaction considers all the zones as desperately out of memory
3447 * so it doesn't really make much sense to retry except when the 3447 * so it doesn't really make much sense to retry except when the
3448 * failure could be caused by insufficient priority 3448 * failure could be caused by insufficient priority
3449 */ 3449 */
3450 if (compaction_failed(compact_result)) 3450 if (compaction_failed(compact_result))
3451 goto check_priority; 3451 goto check_priority;
3452 3452
3453 /* 3453 /*
3454 * make sure the compaction wasn't deferred or didn't bail out early 3454 * make sure the compaction wasn't deferred or didn't bail out early
3455 * due to lock contention before we declare that we should give up. 3455 * due to lock contention before we declare that we should give up.
3456 * But do not retry if the given zonelist is not suitable for 3456 * But do not retry if the given zonelist is not suitable for
3457 * compaction. 3457 * compaction.
3458 */ 3458 */
3459 if (compaction_withdrawn(compact_result)) { 3459 if (compaction_withdrawn(compact_result)) {
3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3461 goto out; 3461 goto out;
3462 } 3462 }
3463 3463
3464 /* 3464 /*
3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL 3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3466 * costly ones because they are de facto nofail and invoke OOM 3466 * costly ones because they are de facto nofail and invoke OOM
3467 * killer to move on while costly can fail and users are ready 3467 * killer to move on while costly can fail and users are ready
3468 * to cope with that. 1/4 retries is rather arbitrary but we 3468 * to cope with that. 1/4 retries is rather arbitrary but we
3469 * would need much more detailed feedback from compaction to 3469 * would need much more detailed feedback from compaction to
3470 * make a better decision. 3470 * make a better decision.
3471 */ 3471 */
3472 if (order > PAGE_ALLOC_COSTLY_ORDER) 3472 if (order > PAGE_ALLOC_COSTLY_ORDER)
3473 max_retries /= 4; 3473 max_retries /= 4;
3474 if (*compaction_retries <= max_retries) { 3474 if (*compaction_retries <= max_retries) {
3475 ret = true; 3475 ret = true;
3476 goto out; 3476 goto out;
3477 } 3477 }
3478 3478
3479 /* 3479 /*
3480 * Make sure there are attempts at the highest priority if we exhausted 3480 * Make sure there are attempts at the highest priority if we exhausted
3481 * all retries or failed at the lower priorities. 3481 * all retries or failed at the lower priorities.
3482 */ 3482 */
3483 check_priority: 3483 check_priority:
3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3486 3486
3487 if (*compact_priority > min_priority) { 3487 if (*compact_priority > min_priority) {
3488 (*compact_priority)--; 3488 (*compact_priority)--;
3489 *compaction_retries = 0; 3489 *compaction_retries = 0;
3490 ret = true; 3490 ret = true;
3491 } 3491 }
3492 out: 3492 out:
3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3494 return ret; 3494 return ret;
3495 } 3495 }
3496 #else 3496 #else
3497 static inline struct page * 3497 static inline struct page *
3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3499 unsigned int alloc_flags, const struct alloc_context *ac, 3499 unsigned int alloc_flags, const struct alloc_context *ac,
3500 enum compact_priority prio, enum compact_result *compact_result) 3500 enum compact_priority prio, enum compact_result *compact_result)
3501 { 3501 {
3502 *compact_result = COMPACT_SKIPPED; 3502 *compact_result = COMPACT_SKIPPED;
3503 return NULL; 3503 return NULL;
3504 } 3504 }
3505 3505
3506 static inline bool 3506 static inline bool
3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3508 enum compact_result compact_result, 3508 enum compact_result compact_result,
3509 enum compact_priority *compact_priority, 3509 enum compact_priority *compact_priority,
3510 int *compaction_retries) 3510 int *compaction_retries)
3511 { 3511 {
3512 struct zone *zone; 3512 struct zone *zone;
3513 struct zoneref *z; 3513 struct zoneref *z;
3514 3514
3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3516 return false; 3516 return false;
3517 3517
3518 /* 3518 /*
3519 * There are setups with compaction disabled which would prefer to loop 3519 * There are setups with compaction disabled which would prefer to loop
3520 * inside the allocator rather than hit the oom killer prematurely. 3520 * inside the allocator rather than hit the oom killer prematurely.
3521 * Let's give them some hope and keep retrying while the order-0 3521 * Let's give them some hope and keep retrying while the order-0
3522 * watermarks are OK. 3522 * watermarks are OK.
3523 */ 3523 */
3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3525 ac->nodemask) { 3525 ac->nodemask) {
3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3527 ac_classzone_idx(ac), alloc_flags)) 3527 ac_classzone_idx(ac), alloc_flags))
3528 return true; 3528 return true;
3529 } 3529 }
3530 return false; 3530 return false;
3531 } 3531 }
3532 #endif /* CONFIG_COMPACTION */ 3532 #endif /* CONFIG_COMPACTION */
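Taken together, the retry logic above amounts to a per-priority budget: MAX_COMPACT_RETRIES attempts normally, a quarter of that for costly orders, with the counter reset each time the compaction priority is escalated. A tiny standalone model of the budget calculation (COSTLY_ORDER stands in for PAGE_ALLOC_COSTLY_ORDER, assumed to be 3):

#include <stdio.h>

#define MAX_COMPACT_RETRIES 16
#define COSTLY_ORDER 3              /* stand-in for PAGE_ALLOC_COSTLY_ORDER */

/* Retry budget handed out per compaction priority by should_compact_retry():
 * costly orders only get a quarter of the retries before the priority is
 * raised (and the counter reset) or the allocator gives up. */
static int retry_budget(unsigned int order)
{
	int budget = MAX_COMPACT_RETRIES;

	if (order > COSTLY_ORDER)
		budget /= 4;
	return budget;
}

int main(void)
{
	printf("order 2 budget: %d retries per priority\n", retry_budget(2));
	printf("order 9 budget: %d retries per priority\n", retry_budget(9));
	return 0;
}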
3533 3533
3534 #ifdef CONFIG_LOCKDEP 3534 #ifdef CONFIG_LOCKDEP
3535 struct lockdep_map __fs_reclaim_map = 3535 struct lockdep_map __fs_reclaim_map =
3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3537 3537
3538 static bool __need_fs_reclaim(gfp_t gfp_mask) 3538 static bool __need_fs_reclaim(gfp_t gfp_mask)
3539 { 3539 {
3540 gfp_mask = current_gfp_context(gfp_mask); 3540 gfp_mask = current_gfp_context(gfp_mask);
3541 3541
3542 /* no reclaim without waiting on it */ 3542 /* no reclaim without waiting on it */
3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3544 return false; 3544 return false;
3545 3545
3546 /* this guy won't enter reclaim */ 3546 /* this guy won't enter reclaim */
3547 if (current->flags & PF_MEMALLOC) 3547 if (current->flags & PF_MEMALLOC)
3548 return false; 3548 return false;
3549 3549
3550 /* We're only interested in __GFP_FS allocations for now */ 3550 /* We're only interested in __GFP_FS allocations for now */
3551 if (!(gfp_mask & __GFP_FS)) 3551 if (!(gfp_mask & __GFP_FS))
3552 return false; 3552 return false;
3553 3553
3554 if (gfp_mask & __GFP_NOLOCKDEP) 3554 if (gfp_mask & __GFP_NOLOCKDEP)
3555 return false; 3555 return false;
3556 3556
3557 return true; 3557 return true;
3558 } 3558 }
3559 3559
3560 void fs_reclaim_acquire(gfp_t gfp_mask) 3560 void fs_reclaim_acquire(gfp_t gfp_mask)
3561 { 3561 {
3562 if (__need_fs_reclaim(gfp_mask)) 3562 if (__need_fs_reclaim(gfp_mask))
3563 lock_map_acquire(&__fs_reclaim_map); 3563 lock_map_acquire(&__fs_reclaim_map);
3564 } 3564 }
3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3566 3566
3567 void fs_reclaim_release(gfp_t gfp_mask) 3567 void fs_reclaim_release(gfp_t gfp_mask)
3568 { 3568 {
3569 if (__need_fs_reclaim(gfp_mask)) 3569 if (__need_fs_reclaim(gfp_mask))
3570 lock_map_release(&__fs_reclaim_map); 3570 lock_map_release(&__fs_reclaim_map);
3571 } 3571 }
3572 EXPORT_SYMBOL_GPL(fs_reclaim_release); 3572 EXPORT_SYMBOL_GPL(fs_reclaim_release);
3573 #endif 3573 #endif
3574 3574
3575 /* Perform direct synchronous page reclaim */ 3575 /* Perform direct synchronous page reclaim */
3576 static int 3576 static int
3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3578 const struct alloc_context *ac) 3578 const struct alloc_context *ac)
3579 { 3579 {
3580 struct reclaim_state reclaim_state; 3580 struct reclaim_state reclaim_state;
3581 int progress; 3581 int progress;
3582 unsigned int noreclaim_flag; 3582 unsigned int noreclaim_flag;
3583 3583
3584 cond_resched(); 3584 cond_resched();
3585 3585
3586 /* We now go into synchronous reclaim */ 3586 /* We now go into synchronous reclaim */
3587 cpuset_memory_pressure_bump(); 3587 cpuset_memory_pressure_bump();
3588 noreclaim_flag = memalloc_noreclaim_save(); 3588 noreclaim_flag = memalloc_noreclaim_save();
3589 fs_reclaim_acquire(gfp_mask); 3589 fs_reclaim_acquire(gfp_mask);
3590 reclaim_state.reclaimed_slab = 0; 3590 reclaim_state.reclaimed_slab = 0;
3591 current->reclaim_state = &reclaim_state; 3591 current->reclaim_state = &reclaim_state;
3592 3592
3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3594 ac->nodemask); 3594 ac->nodemask);
3595 3595
3596 current->reclaim_state = NULL; 3596 current->reclaim_state = NULL;
3597 fs_reclaim_release(gfp_mask); 3597 fs_reclaim_release(gfp_mask);
3598 memalloc_noreclaim_restore(noreclaim_flag); 3598 memalloc_noreclaim_restore(noreclaim_flag);
3599 3599
3600 cond_resched(); 3600 cond_resched();
3601 3601
3602 return progress; 3602 return progress;
3603 } 3603 }
3604 3604
3605 /* The really slow allocator path where we enter direct reclaim */ 3605 /* The really slow allocator path where we enter direct reclaim */
3606 static inline struct page * 3606 static inline struct page *
3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3608 unsigned int alloc_flags, const struct alloc_context *ac, 3608 unsigned int alloc_flags, const struct alloc_context *ac,
3609 unsigned long *did_some_progress) 3609 unsigned long *did_some_progress)
3610 { 3610 {
3611 struct page *page = NULL; 3611 struct page *page = NULL;
3612 bool drained = false; 3612 bool drained = false;
3613 3613
3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3615 if (unlikely(!(*did_some_progress))) 3615 if (unlikely(!(*did_some_progress)))
3616 return NULL; 3616 return NULL;
3617 3617
3618 retry: 3618 retry:
3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3620 3620
3621 /* 3621 /*
3622 * If an allocation failed after direct reclaim, it could be because 3622 * If an allocation failed after direct reclaim, it could be because
3623 * pages are pinned on the per-cpu lists or in high alloc reserves. 3623 * pages are pinned on the per-cpu lists or in high alloc reserves.
3624 * Shrink them and try again 3624 * Shrink them and try again
3625 */ 3625 */
3626 if (!page && !drained) { 3626 if (!page && !drained) {
3627 unreserve_highatomic_pageblock(ac, false); 3627 unreserve_highatomic_pageblock(ac, false);
3628 drain_all_pages(NULL); 3628 drain_all_pages(NULL);
3629 drained = true; 3629 drained = true;
3630 goto retry; 3630 goto retry;
3631 } 3631 }
3632 3632
3633 return page; 3633 return page;
3634 } 3634 }
3635 3635
3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) 3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3637 { 3637 {
3638 struct zoneref *z; 3638 struct zoneref *z;
3639 struct zone *zone; 3639 struct zone *zone;
3640 pg_data_t *last_pgdat = NULL; 3640 pg_data_t *last_pgdat = NULL;
3641 3641
3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3643 ac->high_zoneidx, ac->nodemask) { 3643 ac->high_zoneidx, ac->nodemask) {
3644 if (last_pgdat != zone->zone_pgdat) 3644 if (last_pgdat != zone->zone_pgdat)
3645 wakeup_kswapd(zone, order, ac->high_zoneidx); 3645 wakeup_kswapd(zone, order, ac->high_zoneidx);
3646 last_pgdat = zone->zone_pgdat; 3646 last_pgdat = zone->zone_pgdat;
3647 } 3647 }
3648 } 3648 }
3649 3649
3650 static inline unsigned int 3650 static inline unsigned int
3651 gfp_to_alloc_flags(gfp_t gfp_mask) 3651 gfp_to_alloc_flags(gfp_t gfp_mask)
3652 { 3652 {
3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3654 3654
3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3657 3657
3658 /* 3658 /*
3659 * The caller may dip into page reserves a bit more if it cannot run 3659 * The caller may dip into page reserves a bit more if it cannot run
3660 * direct reclaim, has a realtime scheduling policy, or is asking for 3660 * direct reclaim, has a realtime scheduling policy, or is asking for
3661 * __GFP_HIGH memory. GFP_ATOMIC requests will 3661 * __GFP_HIGH memory. GFP_ATOMIC requests will
3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3663 */ 3663 */
3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3665 3665
3666 if (gfp_mask & __GFP_ATOMIC) { 3666 if (gfp_mask & __GFP_ATOMIC) {
3667 /* 3667 /*
3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3669 * if it can't schedule. 3669 * if it can't schedule.
3670 */ 3670 */
3671 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3671 if (!(gfp_mask & __GFP_NOMEMALLOC))
3672 alloc_flags |= ALLOC_HARDER; 3672 alloc_flags |= ALLOC_HARDER;
3673 /* 3673 /*
3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3675 * comment for __cpuset_node_allowed(). 3675 * comment for __cpuset_node_allowed().
3676 */ 3676 */
3677 alloc_flags &= ~ALLOC_CPUSET; 3677 alloc_flags &= ~ALLOC_CPUSET;
3678 } else if (unlikely(rt_task(current)) && !in_interrupt()) 3678 } else if (unlikely(rt_task(current)) && !in_interrupt())
3679 alloc_flags |= ALLOC_HARDER; 3679 alloc_flags |= ALLOC_HARDER;
3680 3680
3681 #ifdef CONFIG_CMA 3681 #ifdef CONFIG_CMA
3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3683 alloc_flags |= ALLOC_CMA; 3683 alloc_flags |= ALLOC_CMA;
3684 #endif 3684 #endif
3685 return alloc_flags; 3685 return alloc_flags;
3686 } 3686 }
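For the common cases, the translation above is a fixed mapping from gfp bits to allocation flags. A standalone sketch with illustrative bit values (the rt_task and CMA branches are left out):

#include <stdio.h>

/* Bit values below are illustrative, not the kernel's. */
#define GFP_HIGH        0x01
#define GFP_ATOMIC_BIT  0x02
#define GFP_NOMEMALLOC  0x04

#define ALLOC_WMARK_MIN 0x01
#define ALLOC_CPUSET    0x02
#define ALLOC_HIGH      0x04
#define ALLOC_HARDER    0x08

/* Model of gfp_to_alloc_flags() for the flags handled above. */
static unsigned int to_alloc_flags(unsigned int gfp)
{
	unsigned int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	if (gfp & GFP_HIGH)
		flags |= ALLOC_HIGH;

	if (gfp & GFP_ATOMIC_BIT) {
		if (!(gfp & GFP_NOMEMALLOC))
			flags |= ALLOC_HARDER;       /* dip further into reserves */
		flags &= ~ALLOC_CPUSET;              /* don't fail on cpuset limits */
	}
	return flags;
}

int main(void)
{
	/* GFP_ATOMIC sets both __GFP_HIGH and __GFP_ATOMIC in the real kernel. */
	printf("GFP_KERNEL-ish: %#x\n", to_alloc_flags(0));
	printf("GFP_ATOMIC-ish: %#x\n", to_alloc_flags(GFP_HIGH | GFP_ATOMIC_BIT));
	return 0;
}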
3687 3687
3688 static bool oom_reserves_allowed(struct task_struct *tsk) 3688 static bool oom_reserves_allowed(struct task_struct *tsk)
3689 { 3689 {
3690 if (!tsk_is_oom_victim(tsk)) 3690 if (!tsk_is_oom_victim(tsk))
3691 return false; 3691 return false;
3692 3692
3693 /* 3693 /*
3694 * !MMU kernels don't have the oom reaper, so give access to memory reserves 3694 * !MMU kernels don't have the oom reaper, so give access to memory reserves
3695 * only to the thread with TIF_MEMDIE set 3695 * only to the thread with TIF_MEMDIE set
3696 */ 3696 */
3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3698 return false; 3698 return false;
3699 3699
3700 return true; 3700 return true;
3701 } 3701 }
3702 3702
3703 /* 3703 /*
3704 * Distinguish requests which really need access to full memory 3704 * Distinguish requests which really need access to full memory
3705 * reserves from oom victims which can live with a portion of it 3705 * reserves from oom victims which can live with a portion of it
3706 */ 3706 */
3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3708 { 3708 {
3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3710 return 0; 3710 return 0;
3711 if (gfp_mask & __GFP_MEMALLOC) 3711 if (gfp_mask & __GFP_MEMALLOC)
3712 return ALLOC_NO_WATERMARKS; 3712 return ALLOC_NO_WATERMARKS;
3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3714 return ALLOC_NO_WATERMARKS; 3714 return ALLOC_NO_WATERMARKS;
3715 if (!in_interrupt()) { 3715 if (!in_interrupt()) {
3716 if (current->flags & PF_MEMALLOC) 3716 if (current->flags & PF_MEMALLOC)
3717 return ALLOC_NO_WATERMARKS; 3717 return ALLOC_NO_WATERMARKS;
3718 else if (oom_reserves_allowed(current)) 3718 else if (oom_reserves_allowed(current))
3719 return ALLOC_OOM; 3719 return ALLOC_OOM;
3720 } 3720 }
3721 3721
3722 return 0; 3722 return 0;
3723 } 3723 }
3724 3724
3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3726 { 3726 {
3727 return !!__gfp_pfmemalloc_flags(gfp_mask); 3727 return !!__gfp_pfmemalloc_flags(gfp_mask);
3728 } 3728 }
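The ladder in __gfp_pfmemalloc_flags() above resolves in a fixed order: explicit opt-outs first, explicit opt-ins next, then the task's own state. A small standalone model of that ordering (flag values are made up and the oom_reserves_allowed() check is simplified to a boolean):

#include <stdio.h>
#include <stdbool.h>

#define ALLOC_NO_WATERMARKS 0x10    /* illustrative values, not the kernel's */
#define ALLOC_OOM           0x20

struct ctx {
	bool gfp_nomemalloc, gfp_memalloc;
	bool in_softirq, in_interrupt;
	bool pf_memalloc, oom_victim;
};

/* Model of the decision ladder: opt-outs win, then opt-ins, then task state. */
static int pfmemalloc_flags(const struct ctx *c)
{
	if (c->gfp_nomemalloc)
		return 0;
	if (c->gfp_memalloc)
		return ALLOC_NO_WATERMARKS;
	if (c->in_softirq && c->pf_memalloc)
		return ALLOC_NO_WATERMARKS;
	if (!c->in_interrupt) {
		if (c->pf_memalloc)
			return ALLOC_NO_WATERMARKS;
		if (c->oom_victim)
			return ALLOC_OOM;       /* partial access to reserves */
	}
	return 0;
}

int main(void)
{
	struct ctx reclaimer = { .pf_memalloc = true };
	struct ctx victim    = { .oom_victim = true };

	printf("PF_MEMALLOC task: %#x\n", pfmemalloc_flags(&reclaimer));
	printf("OOM victim      : %#x\n", pfmemalloc_flags(&victim));
	return 0;
}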
3729 3729
3730 /* 3730 /*
3731 * Checks whether it makes sense to retry the reclaim to make forward progress 3731 * Checks whether it makes sense to retry the reclaim to make forward progress
3732 * for the given allocation request. 3732 * for the given allocation request.
3733 * 3733 *
3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3735 * without success, or when we couldn't even meet the watermark if we 3735 * without success, or when we couldn't even meet the watermark if we
3736 * reclaimed all remaining pages on the LRU lists. 3736 * reclaimed all remaining pages on the LRU lists.
3737 * 3737 *
3738 * Returns true if a retry is viable or false to enter the oom path. 3738 * Returns true if a retry is viable or false to enter the oom path.
3739 */ 3739 */
3740 static inline bool 3740 static inline bool
3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3742 struct alloc_context *ac, int alloc_flags, 3742 struct alloc_context *ac, int alloc_flags,
3743 bool did_some_progress, int *no_progress_loops) 3743 bool did_some_progress, int *no_progress_loops)
3744 { 3744 {
3745 struct zone *zone; 3745 struct zone *zone;
3746 struct zoneref *z; 3746 struct zoneref *z;
3747 3747
3748 /* 3748 /*
3749 * Costly allocations might have made progress, but that doesn't mean 3749 * Costly allocations might have made progress, but that doesn't mean
3750 * their order will become available due to high fragmentation, so 3750 * their order will become available due to high fragmentation, so
3751 * always increment the no-progress counter for them 3751 * always increment the no-progress counter for them
3752 */ 3752 */
3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3754 *no_progress_loops = 0; 3754 *no_progress_loops = 0;
3755 else 3755 else
3756 (*no_progress_loops)++; 3756 (*no_progress_loops)++;
3757 3757
3758 /* 3758 /*
3759 * Make sure we converge to OOM if we cannot make any progress 3759 * Make sure we converge to OOM if we cannot make any progress
3760 * several times in a row. 3760 * several times in a row.
3761 */ 3761 */
3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3763 /* Before OOM, exhaust highatomic_reserve */ 3763 /* Before OOM, exhaust highatomic_reserve */
3764 return unreserve_highatomic_pageblock(ac, true); 3764 return unreserve_highatomic_pageblock(ac, true);
3765 } 3765 }
3766 3766
3767 /* 3767 /*
3768 * Keep reclaiming pages while there is a chance this will lead 3768 * Keep reclaiming pages while there is a chance this will lead
3769 * somewhere. If none of the target zones can satisfy our allocation 3769 * somewhere. If none of the target zones can satisfy our allocation
3770 * request even if all reclaimable pages are considered then we are 3770 * request even if all reclaimable pages are considered then we are
3771 * screwed and have to go OOM. 3771 * screwed and have to go OOM.
3772 */ 3772 */
3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3774 ac->nodemask) { 3774 ac->nodemask) {
3775 unsigned long available; 3775 unsigned long available;
3776 unsigned long reclaimable; 3776 unsigned long reclaimable;
3777 unsigned long min_wmark = min_wmark_pages(zone); 3777 unsigned long min_wmark = min_wmark_pages(zone);
3778 bool wmark; 3778 bool wmark;
3779 3779
3780 available = reclaimable = zone_reclaimable_pages(zone); 3780 available = reclaimable = zone_reclaimable_pages(zone);
3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3782 3782
3783 /* 3783 /*
3784 * Would the allocation succeed if we reclaimed all 3784 * Would the allocation succeed if we reclaimed all
3785 * reclaimable pages? 3785 * reclaimable pages?
3786 */ 3786 */
3787 wmark = __zone_watermark_ok(zone, order, min_wmark, 3787 wmark = __zone_watermark_ok(zone, order, min_wmark,
3788 ac_classzone_idx(ac), alloc_flags, available); 3788 ac_classzone_idx(ac), alloc_flags, available);
3789 trace_reclaim_retry_zone(z, order, reclaimable, 3789 trace_reclaim_retry_zone(z, order, reclaimable,
3790 available, min_wmark, *no_progress_loops, wmark); 3790 available, min_wmark, *no_progress_loops, wmark);
3791 if (wmark) { 3791 if (wmark) {
3792 /* 3792 /*
3793 * If we didn't make any progress and have a lot of 3793 * If we didn't make any progress and have a lot of
3794 * dirty + writeback pages then we should wait for 3794 * dirty + writeback pages then we should wait for
3795 * an IO to complete to slow down the reclaim and 3795 * an IO to complete to slow down the reclaim and
3796 * prevent a premature OOM 3796 * prevent a premature OOM
3797 */ 3797 */
3798 if (!did_some_progress) { 3798 if (!did_some_progress) {
3799 unsigned long write_pending; 3799 unsigned long write_pending;
3800 3800
3801 write_pending = zone_page_state_snapshot(zone, 3801 write_pending = zone_page_state_snapshot(zone,
3802 NR_ZONE_WRITE_PENDING); 3802 NR_ZONE_WRITE_PENDING);
3803 3803
3804 if (2 * write_pending > reclaimable) { 3804 if (2 * write_pending > reclaimable) {
3805 congestion_wait(BLK_RW_ASYNC, HZ/10); 3805 congestion_wait(BLK_RW_ASYNC, HZ/10);
3806 return true; 3806 return true;
3807 } 3807 }
3808 } 3808 }
3809 3809
3810 /* 3810 /*
3811 * Memory allocation/reclaim might be called from a WQ 3811 * Memory allocation/reclaim might be called from a WQ
3812 * context and the current implementation of the WQ 3812 * context and the current implementation of the WQ
3813 * concurrency control doesn't recognize that 3813 * concurrency control doesn't recognize that
3814 * a particular WQ is congested if the worker thread is 3814 * a particular WQ is congested if the worker thread is
3815 * looping without ever sleeping. Therefore we have to 3815 * looping without ever sleeping. Therefore we have to
3816 * do a short sleep here rather than calling 3816 * do a short sleep here rather than calling
3817 * cond_resched(). 3817 * cond_resched().
3818 */ 3818 */
3819 if (current->flags & PF_WQ_WORKER) 3819 if (current->flags & PF_WQ_WORKER)
3820 schedule_timeout_uninterruptible(1); 3820 schedule_timeout_uninterruptible(1);
3821 else 3821 else
3822 cond_resched(); 3822 cond_resched();
3823 3823
3824 return true; 3824 return true;
3825 } 3825 }
3826 } 3826 }
3827 3827
3828 return false; 3828 return false;
3829 } 3829 }
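For illustration, with hypothetical numbers: a zone with 1,000 free pages and 10,000 reclaimable pages yields available = 11,000, so an order-0 request checked against a min watermark of 4,000 passes __zone_watermark_ok() and the function returns true (after the optional writeback throttling or short sleep above). If every eligible zone failed that check, reclaiming more could not help, and the caller falls through towards the OOM path.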
3830 3830
3831 static inline bool 3831 static inline bool
3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
3833 { 3833 {
3834 /* 3834 /*
3835 * It's possible that cpuset's mems_allowed and the nodemask from 3835 * It's possible that cpuset's mems_allowed and the nodemask from
3836 * mempolicy don't intersect. This should normally be dealt with by 3836 * mempolicy don't intersect. This should normally be dealt with by
3837 * policy_nodemask(), but it's possible to race with cpuset update in 3837 * policy_nodemask(), but it's possible to race with cpuset update in
3838 * such a way the check therein was true, and then it became false 3838 * such a way the check therein was true, and then it became false
3839 * before we got our cpuset_mems_cookie here. 3839 * before we got our cpuset_mems_cookie here.
3840 * This assumes that for all allocations, ac->nodemask can come only 3840 * This assumes that for all allocations, ac->nodemask can come only
3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored 3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored
3842 * when it does not intersect with the cpuset restrictions) or the 3842 * when it does not intersect with the cpuset restrictions) or the
3843 * caller can deal with a violated nodemask. 3843 * caller can deal with a violated nodemask.
3844 */ 3844 */
3845 if (cpusets_enabled() && ac->nodemask && 3845 if (cpusets_enabled() && ac->nodemask &&
3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
3847 ac->nodemask = NULL; 3847 ac->nodemask = NULL;
3848 return true; 3848 return true;
3849 } 3849 }
3850 3850
3851 /* 3851 /*
3852 * When updating a task's mems_allowed or mempolicy nodemask, it is 3852 * When updating a task's mems_allowed or mempolicy nodemask, it is
3853 * possible to race with parallel threads in such a way that our 3853 * possible to race with parallel threads in such a way that our
3854 * allocation can fail while the mask is being updated. If we are about 3854 * allocation can fail while the mask is being updated. If we are about
3855 * to fail, check if the cpuset changed during allocation and if so, 3855 * to fail, check if the cpuset changed during allocation and if so,
3856 * retry. 3856 * retry.
3857 */ 3857 */
3858 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3858 if (read_mems_allowed_retry(cpuset_mems_cookie))
3859 return true; 3859 return true;
3860 3860
3861 return false; 3861 return false;
3862 } 3862 }
3863 3863
3864 static inline struct page * 3864 static inline struct page *
3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3866 struct alloc_context *ac) 3866 struct alloc_context *ac)
3867 { 3867 {
3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3870 struct page *page = NULL; 3870 struct page *page = NULL;
3871 unsigned int alloc_flags; 3871 unsigned int alloc_flags;
3872 unsigned long did_some_progress; 3872 unsigned long did_some_progress;
3873 enum compact_priority compact_priority; 3873 enum compact_priority compact_priority;
3874 enum compact_result compact_result; 3874 enum compact_result compact_result;
3875 int compaction_retries; 3875 int compaction_retries;
3876 int no_progress_loops; 3876 int no_progress_loops;
3877 unsigned int cpuset_mems_cookie; 3877 unsigned int cpuset_mems_cookie;
3878 int reserve_flags; 3878 int reserve_flags;
3879 3879
3880 /* 3880 /*
3881 * We also sanity-check to catch abuse of atomic reserves by 3881 * We also sanity-check to catch abuse of atomic reserves by
3882 * callers that are not in atomic context. 3882 * callers that are not in atomic context.
3883 */ 3883 */
3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3886 gfp_mask &= ~__GFP_ATOMIC; 3886 gfp_mask &= ~__GFP_ATOMIC;
3887 3887
3888 retry_cpuset: 3888 retry_cpuset:
3889 compaction_retries = 0; 3889 compaction_retries = 0;
3890 no_progress_loops = 0; 3890 no_progress_loops = 0;
3891 compact_priority = DEF_COMPACT_PRIORITY; 3891 compact_priority = DEF_COMPACT_PRIORITY;
3892 cpuset_mems_cookie = read_mems_allowed_begin(); 3892 cpuset_mems_cookie = read_mems_allowed_begin();
3893 3893
3894 /* 3894 /*
3895 * The fast path uses conservative alloc_flags to succeed only until 3895 * The fast path uses conservative alloc_flags to succeed only until
3896 * kswapd needs to be woken up, and to avoid the cost of setting up 3896 * kswapd needs to be woken up, and to avoid the cost of setting up
3897 * alloc_flags precisely. So we do that now. 3897 * alloc_flags precisely. So we do that now.
3898 */ 3898 */
3899 alloc_flags = gfp_to_alloc_flags(gfp_mask); 3899 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3900 3900
3901 /* 3901 /*
3902 * We need to recalculate the starting point for the zonelist iterator 3902 * We need to recalculate the starting point for the zonelist iterator
3903 * because we might have used a different nodemask in the fast path, or 3903 * because we might have used a different nodemask in the fast path, or
3904 * there was a cpuset modification and we are retrying - otherwise we 3904 * there was a cpuset modification and we are retrying - otherwise we
3905 * could end up iterating over non-eligible zones endlessly. 3905 * could end up iterating over non-eligible zones endlessly.
3906 */ 3906 */
3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3908 ac->high_zoneidx, ac->nodemask); 3908 ac->high_zoneidx, ac->nodemask);
3909 if (!ac->preferred_zoneref->zone) 3909 if (!ac->preferred_zoneref->zone)
3910 goto nopage; 3910 goto nopage;
3911 3911
3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3913 wake_all_kswapds(order, ac); 3913 wake_all_kswapds(order, ac);
3914 3914
3915 /* 3915 /*
3916 * The adjusted alloc_flags might result in immediate success, so try 3916 * The adjusted alloc_flags might result in immediate success, so try
3917 * that first 3917 * that first
3918 */ 3918 */
3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3920 if (page) 3920 if (page)
3921 goto got_pg; 3921 goto got_pg;
3922 3922
3923 /* 3923 /*
3924 * For costly allocations, try direct compaction first, as it's likely 3924 * For costly allocations, try direct compaction first, as it's likely
3925 * that we have enough base pages and don't need to reclaim. For non- 3925 * that we have enough base pages and don't need to reclaim. For non-
3926 * movable high-order allocations, do that as well, as compaction will 3926 * movable high-order allocations, do that as well, as compaction will
3927 * try to prevent permanent fragmentation by migrating from blocks of the 3927 * try to prevent permanent fragmentation by migrating from blocks of the
3928 * same migratetype. 3928 * same migratetype.
3929 * Don't try this for allocations that are allowed to ignore 3929 * Don't try this for allocations that are allowed to ignore
3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
3931 */ 3931 */
3932 if (can_direct_reclaim && 3932 if (can_direct_reclaim &&
3933 (costly_order || 3933 (costly_order ||
3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
3935 && !gfp_pfmemalloc_allowed(gfp_mask)) { 3935 && !gfp_pfmemalloc_allowed(gfp_mask)) {
3936 page = __alloc_pages_direct_compact(gfp_mask, order, 3936 page = __alloc_pages_direct_compact(gfp_mask, order,
3937 alloc_flags, ac, 3937 alloc_flags, ac,
3938 INIT_COMPACT_PRIORITY, 3938 INIT_COMPACT_PRIORITY,
3939 &compact_result); 3939 &compact_result);
3940 if (page) 3940 if (page)
3941 goto got_pg; 3941 goto got_pg;
3942 3942
3943 /* 3943 /*
3944 * Checks for costly allocations with __GFP_NORETRY, which 3944 * Checks for costly allocations with __GFP_NORETRY, which
3945 * includes THP page fault allocations 3945 * includes THP page fault allocations
3946 */ 3946 */
3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
3948 /* 3948 /*
3949 * If compaction is deferred for high-order allocations, 3949 * If compaction is deferred for high-order allocations,
3950 * it is because sync compaction recently failed. If 3950 * it is because sync compaction recently failed. If
3951 * this is the case and the caller requested a THP 3951 * this is the case and the caller requested a THP
3952 * allocation, we do not want to heavily disrupt the 3952 * allocation, we do not want to heavily disrupt the
3953 * system, so we fail the allocation instead of entering 3953 * system, so we fail the allocation instead of entering
3954 * direct reclaim. 3954 * direct reclaim.
3955 */ 3955 */
3956 if (compact_result == COMPACT_DEFERRED) 3956 if (compact_result == COMPACT_DEFERRED)
3957 goto nopage; 3957 goto nopage;
3958 3958
3959 /* 3959 /*
3960 * Looks like reclaim/compaction is worth trying, but 3960 * Looks like reclaim/compaction is worth trying, but
3961 * sync compaction could be very expensive, so keep 3961 * sync compaction could be very expensive, so keep
3962 * using async compaction. 3962 * using async compaction.
3963 */ 3963 */
3964 compact_priority = INIT_COMPACT_PRIORITY; 3964 compact_priority = INIT_COMPACT_PRIORITY;
3965 } 3965 }
3966 } 3966 }
3967 3967
3968 retry: 3968 retry:
3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3971 wake_all_kswapds(order, ac); 3971 wake_all_kswapds(order, ac);
3972 3972
3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3974 if (reserve_flags) 3974 if (reserve_flags)
3975 alloc_flags = reserve_flags; 3975 alloc_flags = reserve_flags;
3976 3976
3977 /* 3977 /*
3978 * Reset the zonelist iterators if memory policies can be ignored. 3978 * Reset the zonelist iterators if memory policies can be ignored.
3979 * These allocations are high priority and system rather than user 3979 * These allocations are high priority and system rather than user
3980 * oriented. 3980 * oriented.
3981 */ 3981 */
3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3984 ac->high_zoneidx, ac->nodemask); 3984 ac->high_zoneidx, ac->nodemask);
3985 } 3985 }
3986 3986
3987 /* Attempt with potentially adjusted zonelist and alloc_flags */ 3987 /* Attempt with potentially adjusted zonelist and alloc_flags */
3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3989 if (page) 3989 if (page)
3990 goto got_pg; 3990 goto got_pg;
3991 3991
3992 /* Caller is not willing to reclaim, we can't balance anything */ 3992 /* Caller is not willing to reclaim, we can't balance anything */
3993 if (!can_direct_reclaim) 3993 if (!can_direct_reclaim)
3994 goto nopage; 3994 goto nopage;
3995 3995
3996 /* Avoid recursion of direct reclaim */ 3996 /* Avoid recursion of direct reclaim */
3997 if (current->flags & PF_MEMALLOC) 3997 if (current->flags & PF_MEMALLOC)
3998 goto nopage; 3998 goto nopage;
3999 3999
4000 /* Try direct reclaim and then allocating */ 4000 /* Try direct reclaim and then allocating */
4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4002 &did_some_progress); 4002 &did_some_progress);
4003 if (page) 4003 if (page)
4004 goto got_pg; 4004 goto got_pg;
4005 4005
4006 /* Try direct compaction and then allocating */ 4006 /* Try direct compaction and then allocating */
4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4008 compact_priority, &compact_result); 4008 compact_priority, &compact_result);
4009 if (page) 4009 if (page)
4010 goto got_pg; 4010 goto got_pg;
4011 4011
4012 /* Do not loop if specifically requested */ 4012 /* Do not loop if specifically requested */
4013 if (gfp_mask & __GFP_NORETRY) 4013 if (gfp_mask & __GFP_NORETRY)
4014 goto nopage; 4014 goto nopage;
4015 4015
4016 /* 4016 /*
4017 * Do not retry costly high order allocations unless they are 4017 * Do not retry costly high order allocations unless they are
4018 * __GFP_RETRY_MAYFAIL 4018 * __GFP_RETRY_MAYFAIL
4019 */ 4019 */
4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4021 goto nopage; 4021 goto nopage;
4022 4022
4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4024 did_some_progress > 0, &no_progress_loops)) 4024 did_some_progress > 0, &no_progress_loops))
4025 goto retry; 4025 goto retry;
4026 4026
4027 /* 4027 /*
4028 * It doesn't make any sense to retry the compaction if the order-0 4028 * It doesn't make any sense to retry the compaction if the order-0
4029 * reclaim is not able to make any progress because the current 4029 * reclaim is not able to make any progress because the current
4030 * implementation of compaction depends on a sufficient amount 4030 * implementation of compaction depends on a sufficient amount
4031 * of free memory (see __compaction_suitable) 4031 * of free memory (see __compaction_suitable)
4032 */ 4032 */
4033 if (did_some_progress > 0 && 4033 if (did_some_progress > 0 &&
4034 should_compact_retry(ac, order, alloc_flags, 4034 should_compact_retry(ac, order, alloc_flags,
4035 compact_result, &compact_priority, 4035 compact_result, &compact_priority,
4036 &compaction_retries)) 4036 &compaction_retries))
4037 goto retry; 4037 goto retry;
4038 4038
4039 4039
4040 /* Deal with possible cpuset update races before we start OOM killing */ 4040 /* Deal with possible cpuset update races before we start OOM killing */
4041 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4041 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4042 goto retry_cpuset; 4042 goto retry_cpuset;
4043 4043
4044 /* Reclaim has failed us, start killing things */ 4044 /* Reclaim has failed us, start killing things */
4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4046 if (page) 4046 if (page)
4047 goto got_pg; 4047 goto got_pg;
4048 4048
4049 /* Avoid allocations with no watermarks from looping endlessly */ 4049 /* Avoid allocations with no watermarks from looping endlessly */
4050 if (tsk_is_oom_victim(current) && 4050 if (tsk_is_oom_victim(current) &&
4051 (alloc_flags == ALLOC_OOM || 4051 (alloc_flags == ALLOC_OOM ||
4052 (gfp_mask & __GFP_NOMEMALLOC))) 4052 (gfp_mask & __GFP_NOMEMALLOC)))
4053 goto nopage; 4053 goto nopage;
4054 4054
4055 /* Retry as long as the OOM killer is making progress */ 4055 /* Retry as long as the OOM killer is making progress */
4056 if (did_some_progress) { 4056 if (did_some_progress) {
4057 no_progress_loops = 0; 4057 no_progress_loops = 0;
4058 goto retry; 4058 goto retry;
4059 } 4059 }
4060 4060
4061 nopage: 4061 nopage:
4062 /* Deal with possible cpuset update races before we fail */ 4062 /* Deal with possible cpuset update races before we fail */
4063 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4063 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4064 goto retry_cpuset; 4064 goto retry_cpuset;
4065 4065
4066 /* 4066 /*
4067 * Make sure that a __GFP_NOFAIL request doesn't leak out and make sure 4067 * Make sure that a __GFP_NOFAIL request doesn't leak out and make sure
4068 * we always retry 4068 * we always retry
4069 */ 4069 */
4070 if (gfp_mask & __GFP_NOFAIL) { 4070 if (gfp_mask & __GFP_NOFAIL) {
4071 /* 4071 /*
4072 * All existing users of __GFP_NOFAIL are blockable, so warn 4072 * All existing users of __GFP_NOFAIL are blockable, so warn
4073 * of any new users that actually require GFP_NOWAIT 4073 * of any new users that actually require GFP_NOWAIT
4074 */ 4074 */
4075 if (WARN_ON_ONCE(!can_direct_reclaim)) 4075 if (WARN_ON_ONCE(!can_direct_reclaim))
4076 goto fail; 4076 goto fail;
4077 4077
4078 /* 4078 /*
4079 * PF_MEMALLOC request from this context is rather bizarre 4079 * PF_MEMALLOC request from this context is rather bizarre
4080 * because we cannot reclaim anything and can only loop waiting 4080 * because we cannot reclaim anything and can only loop waiting
4081 * for somebody else to do the work for us 4081 * for somebody else to do the work for us
4082 */ 4082 */
4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4084 4084
4085 /* 4085 /*
4086 * Non-failing costly orders are a hard requirement which we are 4086 * Non-failing costly orders are a hard requirement which we are
4087 * not well prepared for, so warn about these users 4087 * not well prepared for, so warn about these users
4088 * so that we can identify them and convert them to something 4088 * so that we can identify them and convert them to something
4089 * else. 4089 * else.
4090 */ 4090 */
4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4092 4092
4093 /* 4093 /*
4094 * Help non-failing allocations by giving them access to memory 4094 * Help non-failing allocations by giving them access to memory
4095 * reserves but do not use ALLOC_NO_WATERMARKS because this 4095 * reserves but do not use ALLOC_NO_WATERMARKS because this
4096 * could deplete whole memory reserves which would just make 4096 * could deplete whole memory reserves which would just make
4097 * the situation worse 4097 * the situation worse
4098 */ 4098 */
4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4100 if (page) 4100 if (page)
4101 goto got_pg; 4101 goto got_pg;
4102 4102
4103 cond_resched(); 4103 cond_resched();
4104 goto retry; 4104 goto retry;
4105 } 4105 }
4106 fail: 4106 fail:
4107 warn_alloc(gfp_mask, ac->nodemask, 4107 warn_alloc(gfp_mask, ac->nodemask,
4108 "page allocation failure: order:%u", order); 4108 "page allocation failure: order:%u", order);
4109 got_pg: 4109 got_pg:
4110 return page; 4110 return page;
4111 } 4111 }
4112 4112
4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4114 int preferred_nid, nodemask_t *nodemask, 4114 int preferred_nid, nodemask_t *nodemask,
4115 struct alloc_context *ac, gfp_t *alloc_mask, 4115 struct alloc_context *ac, gfp_t *alloc_mask,
4116 unsigned int *alloc_flags) 4116 unsigned int *alloc_flags)
4117 { 4117 {
4118 ac->high_zoneidx = gfp_zone(gfp_mask); 4118 ac->high_zoneidx = gfp_zone(gfp_mask);
4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4120 ac->nodemask = nodemask; 4120 ac->nodemask = nodemask;
4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask); 4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4122 4122
4123 if (cpusets_enabled()) { 4123 if (cpusets_enabled()) {
4124 *alloc_mask |= __GFP_HARDWALL; 4124 *alloc_mask |= __GFP_HARDWALL;
4125 if (!ac->nodemask) 4125 if (!ac->nodemask)
4126 ac->nodemask = &cpuset_current_mems_allowed; 4126 ac->nodemask = &cpuset_current_mems_allowed;
4127 else 4127 else
4128 *alloc_flags |= ALLOC_CPUSET; 4128 *alloc_flags |= ALLOC_CPUSET;
4129 } 4129 }
4130 4130
4131 fs_reclaim_acquire(gfp_mask); 4131 fs_reclaim_acquire(gfp_mask);
4132 fs_reclaim_release(gfp_mask); 4132 fs_reclaim_release(gfp_mask);
4133 4133
4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4135 4135
4136 if (should_fail_alloc_page(gfp_mask, order)) 4136 if (should_fail_alloc_page(gfp_mask, order))
4137 return false; 4137 return false;
4138 4138
4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) 4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4140 *alloc_flags |= ALLOC_CMA; 4140 *alloc_flags |= ALLOC_CMA;
4141 4141
4142 return true; 4142 return true;
4143 } 4143 }
4144 4144
4145 /* Determine whether to spread dirty pages and what the first usable zone is */ 4145 /* Determine whether to spread dirty pages and what the first usable zone is */
4146 static inline void finalise_ac(gfp_t gfp_mask, 4146 static inline void finalise_ac(gfp_t gfp_mask,
4147 unsigned int order, struct alloc_context *ac) 4147 unsigned int order, struct alloc_context *ac)
4148 { 4148 {
4149 /* Dirty zone balancing only done in the fast path */ 4149 /* Dirty zone balancing only done in the fast path */
4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4151 4151
4152 /* 4152 /*
4153 * The preferred zone is used for statistics but crucially it is 4153 * The preferred zone is used for statistics but crucially it is
4154 * also used as the starting point for the zonelist iterator. It 4154 * also used as the starting point for the zonelist iterator. It
4155 * may get reset for allocations that ignore memory policies. 4155 * may get reset for allocations that ignore memory policies.
4156 */ 4156 */
4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4158 ac->high_zoneidx, ac->nodemask); 4158 ac->high_zoneidx, ac->nodemask);
4159 } 4159 }
4160 4160
4161 /* 4161 /*
4162 * This is the 'heart' of the zoned buddy allocator. 4162 * This is the 'heart' of the zoned buddy allocator.
4163 */ 4163 */
4164 struct page * 4164 struct page *
4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, 4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4166 nodemask_t *nodemask) 4166 nodemask_t *nodemask)
4167 { 4167 {
4168 struct page *page; 4168 struct page *page;
4169 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4169 unsigned int alloc_flags = ALLOC_WMARK_LOW;
4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ 4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4171 struct alloc_context ac = { }; 4171 struct alloc_context ac = { };
4172 4172
4173 /* 4173 /*
4174 * There are several places where we assume that the order value is sane 4174 * There are several places where we assume that the order value is sane
4175 * so bail out early if the request is out of bounds. 4175 * so bail out early if the request is out of bounds.
4176 */ 4176 */
4177 if (unlikely(order >= MAX_ORDER)) { 4177 if (unlikely(order >= MAX_ORDER)) {
4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
4179 return NULL; 4179 return NULL;
4180 } 4180 }
4181 4181
4182 gfp_mask &= gfp_allowed_mask; 4182 gfp_mask &= gfp_allowed_mask;
4183 alloc_mask = gfp_mask; 4183 alloc_mask = gfp_mask;
4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) 4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4185 return NULL; 4185 return NULL;
4186 4186
4187 finalise_ac(gfp_mask, order, &ac); 4187 finalise_ac(gfp_mask, order, &ac);
4188 4188
4189 /* First allocation attempt */ 4189 /* First allocation attempt */
4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4191 if (likely(page)) 4191 if (likely(page))
4192 goto out; 4192 goto out;
4193 4193
4194 /* 4194 /*
4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4196 * and GFP_NOIO, which have to be inherited for all allocation requests 4196 * and GFP_NOIO, which have to be inherited for all allocation requests
4197 * from a particular context which has been marked by 4197 * from a particular context which has been marked by
4198 * memalloc_no{fs,io}_{save,restore}. 4198 * memalloc_no{fs,io}_{save,restore}.
4199 */ 4199 */
4200 alloc_mask = current_gfp_context(gfp_mask); 4200 alloc_mask = current_gfp_context(gfp_mask);
4201 ac.spread_dirty_pages = false; 4201 ac.spread_dirty_pages = false;
4202 4202
4203 /* 4203 /*
4204 * Restore the original nodemask if it was potentially replaced with 4204 * Restore the original nodemask if it was potentially replaced with
4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4206 */ 4206 */
4207 if (unlikely(ac.nodemask != nodemask)) 4207 if (unlikely(ac.nodemask != nodemask))
4208 ac.nodemask = nodemask; 4208 ac.nodemask = nodemask;
4209 4209
4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4211 4211
4212 out: 4212 out:
4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4215 __free_pages(page, order); 4215 __free_pages(page, order);
4216 page = NULL; 4216 page = NULL;
4217 } 4217 }
4218 4218
4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4220 4220
4221 return page; 4221 return page;
4222 } 4222 }
4223 EXPORT_SYMBOL(__alloc_pages_nodemask); 4223 EXPORT_SYMBOL(__alloc_pages_nodemask);
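A minimal usage sketch (module context assumed; the helper name is illustrative): most callers reach __alloc_pages_nodemask() through wrappers such as alloc_pages(), passing only a gfp mask and an order.

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static struct page *grab_four_pages(void)
    {
            /* order-2 request: four contiguous pages, failing quietly */
            struct page *page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, 2);

            /* the caller later releases the block with __free_pages(page, 2) */
            return page;
    }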
4224 4224
4225 /* 4225 /*
4226 * Common helper functions. 4226 * Common helper functions.
4227 */ 4227 */
4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4229 { 4229 {
4230 struct page *page; 4230 struct page *page;
4231 4231
4232 /* 4232 /*
4233 * __get_free_pages() returns a virtual address, which cannot represent 4233 * __get_free_pages() returns a virtual address, which cannot represent
4234 * a highmem page 4234 * a highmem page
4235 */ 4235 */
4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
4237 4237
4238 page = alloc_pages(gfp_mask, order); 4238 page = alloc_pages(gfp_mask, order);
4239 if (!page) 4239 if (!page)
4240 return 0; 4240 return 0;
4241 return (unsigned long) page_address(page); 4241 return (unsigned long) page_address(page);
4242 } 4242 }
4243 EXPORT_SYMBOL(__get_free_pages); 4243 EXPORT_SYMBOL(__get_free_pages);
4244 4244
4245 unsigned long get_zeroed_page(gfp_t gfp_mask) 4245 unsigned long get_zeroed_page(gfp_t gfp_mask)
4246 { 4246 {
4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4248 } 4248 }
4249 EXPORT_SYMBOL(get_zeroed_page); 4249 EXPORT_SYMBOL(get_zeroed_page);
4250 4250
4251 void __free_pages(struct page *page, unsigned int order) 4251 void __free_pages(struct page *page, unsigned int order)
4252 { 4252 {
4253 if (put_page_testzero(page)) { 4253 if (put_page_testzero(page)) {
4254 if (order == 0) 4254 if (order == 0)
4255 free_hot_cold_page(page, false); 4255 free_hot_cold_page(page, false);
4256 else 4256 else
4257 __free_pages_ok(page, order); 4257 __free_pages_ok(page, order);
4258 } 4258 }
4259 } 4259 }
4260 4260
4261 EXPORT_SYMBOL(__free_pages); 4261 EXPORT_SYMBOL(__free_pages);
4262 4262
4263 void free_pages(unsigned long addr, unsigned int order) 4263 void free_pages(unsigned long addr, unsigned int order)
4264 { 4264 {
4265 if (addr != 0) { 4265 if (addr != 0) {
4266 VM_BUG_ON(!virt_addr_valid((void *)addr)); 4266 VM_BUG_ON(!virt_addr_valid((void *)addr));
4267 __free_pages(virt_to_page((void *)addr), order); 4267 __free_pages(virt_to_page((void *)addr), order);
4268 } 4268 }
4269 } 4269 }
4270 4270
4271 EXPORT_SYMBOL(free_pages); 4271 EXPORT_SYMBOL(free_pages);
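A usage sketch for the address-based helpers above (a sleepable GFP_KERNEL context is assumed; the function name is illustrative): get_zeroed_page() and __get_free_pages() return a kernel virtual address that is later released with free_pages().

    #include <linux/errno.h>
    #include <linux/gfp.h>

    static int scratch_page_demo(void)
    {
            unsigned long addr = get_zeroed_page(GFP_KERNEL);

            if (!addr)
                    return -ENOMEM;
            /* ... use the zeroed page at (void *)addr ... */
            free_pages(addr, 0);
            return 0;
    }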
4272 4272
4273 /* 4273 /*
4274 * Page Fragment: 4274 * Page Fragment:
4275 * An arbitrary-length arbitrary-offset area of memory which resides 4275 * An arbitrary-length arbitrary-offset area of memory which resides
4276 * within a 0 or higher order page. Multiple fragments within that page 4276 * within a 0 or higher order page. Multiple fragments within that page
4277 * are individually refcounted, in the page's reference counter. 4277 * are individually refcounted, in the page's reference counter.
4278 * 4278 *
4279 * The page_frag functions below provide a simple allocation framework for 4279 * The page_frag functions below provide a simple allocation framework for
4280 * page fragments. This is used by the network stack and network device 4280 * page fragments. This is used by the network stack and network device
4281 * drivers to provide a backing region of memory for use as either an 4281 * drivers to provide a backing region of memory for use as either an
4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4283 */ 4283 */
4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4285 gfp_t gfp_mask) 4285 gfp_t gfp_mask)
4286 { 4286 {
4287 struct page *page = NULL; 4287 struct page *page = NULL;
4288 gfp_t gfp = gfp_mask; 4288 gfp_t gfp = gfp_mask;
4289 4289
4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4292 __GFP_NOMEMALLOC; 4292 __GFP_NOMEMALLOC;
4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4294 PAGE_FRAG_CACHE_MAX_ORDER); 4294 PAGE_FRAG_CACHE_MAX_ORDER);
4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4296 #endif 4296 #endif
4297 if (unlikely(!page)) 4297 if (unlikely(!page))
4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4299 4299
4300 nc->va = page ? page_address(page) : NULL; 4300 nc->va = page ? page_address(page) : NULL;
4301 4301
4302 return page; 4302 return page;
4303 } 4303 }
4304 4304
4305 void __page_frag_cache_drain(struct page *page, unsigned int count) 4305 void __page_frag_cache_drain(struct page *page, unsigned int count)
4306 { 4306 {
4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4308 4308
4309 if (page_ref_sub_and_test(page, count)) { 4309 if (page_ref_sub_and_test(page, count)) {
4310 unsigned int order = compound_order(page); 4310 unsigned int order = compound_order(page);
4311 4311
4312 if (order == 0) 4312 if (order == 0)
4313 free_hot_cold_page(page, false); 4313 free_hot_cold_page(page, false);
4314 else 4314 else
4315 __free_pages_ok(page, order); 4315 __free_pages_ok(page, order);
4316 } 4316 }
4317 } 4317 }
4318 EXPORT_SYMBOL(__page_frag_cache_drain); 4318 EXPORT_SYMBOL(__page_frag_cache_drain);
4319 4319
4320 void *page_frag_alloc(struct page_frag_cache *nc, 4320 void *page_frag_alloc(struct page_frag_cache *nc,
4321 unsigned int fragsz, gfp_t gfp_mask) 4321 unsigned int fragsz, gfp_t gfp_mask)
4322 { 4322 {
4323 unsigned int size = PAGE_SIZE; 4323 unsigned int size = PAGE_SIZE;
4324 struct page *page; 4324 struct page *page;
4325 int offset; 4325 int offset;
4326 4326
4327 if (unlikely(!nc->va)) { 4327 if (unlikely(!nc->va)) {
4328 refill: 4328 refill:
4329 page = __page_frag_cache_refill(nc, gfp_mask); 4329 page = __page_frag_cache_refill(nc, gfp_mask);
4330 if (!page) 4330 if (!page)
4331 return NULL; 4331 return NULL;
4332 4332
4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4334 /* if size can vary use size else just use PAGE_SIZE */ 4334 /* if size can vary use size else just use PAGE_SIZE */
4335 size = nc->size; 4335 size = nc->size;
4336 #endif 4336 #endif
4337 /* Even if we own the page, we do not use atomic_set(). 4337 /* Even if we own the page, we do not use atomic_set().
4338 * This would break get_page_unless_zero() users. 4338 * This would break get_page_unless_zero() users.
4339 */ 4339 */
4340 page_ref_add(page, size - 1); 4340 page_ref_add(page, size - 1);
4341 4341
4342 /* reset page count bias and offset to start of new frag */ 4342 /* reset page count bias and offset to start of new frag */
4343 nc->pfmemalloc = page_is_pfmemalloc(page); 4343 nc->pfmemalloc = page_is_pfmemalloc(page);
4344 nc->pagecnt_bias = size; 4344 nc->pagecnt_bias = size;
4345 nc->offset = size; 4345 nc->offset = size;
4346 } 4346 }
4347 4347
4348 offset = nc->offset - fragsz; 4348 offset = nc->offset - fragsz;
4349 if (unlikely(offset < 0)) { 4349 if (unlikely(offset < 0)) {
4350 page = virt_to_page(nc->va); 4350 page = virt_to_page(nc->va);
4351 4351
4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4353 goto refill; 4353 goto refill;
4354 4354
4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4356 /* if size can vary use size else just use PAGE_SIZE */ 4356 /* if size can vary use size else just use PAGE_SIZE */
4357 size = nc->size; 4357 size = nc->size;
4358 #endif 4358 #endif
4359 /* OK, page count is 0, we can safely set it */ 4359 /* OK, page count is 0, we can safely set it */
4360 set_page_count(page, size); 4360 set_page_count(page, size);
4361 4361
4362 /* reset page count bias and offset to start of new frag */ 4362 /* reset page count bias and offset to start of new frag */
4363 nc->pagecnt_bias = size; 4363 nc->pagecnt_bias = size;
4364 offset = size - fragsz; 4364 offset = size - fragsz;
4365 } 4365 }
4366 4366
4367 nc->pagecnt_bias--; 4367 nc->pagecnt_bias--;
4368 nc->offset = offset; 4368 nc->offset = offset;
4369 4369
4370 return nc->va + offset; 4370 return nc->va + offset;
4371 } 4371 }
4372 EXPORT_SYMBOL(page_frag_alloc); 4372 EXPORT_SYMBOL(page_frag_alloc);
4373 4373
4374 /* 4374 /*
4375 * Frees a page fragment allocated out of either a compound or order 0 page. 4375 * Frees a page fragment allocated out of either a compound or order 0 page.
4376 */ 4376 */
4377 void page_frag_free(void *addr) 4377 void page_frag_free(void *addr)
4378 { 4378 {
4379 struct page *page = virt_to_head_page(addr); 4379 struct page *page = virt_to_head_page(addr);
4380 4380
4381 if (unlikely(put_page_testzero(page))) 4381 if (unlikely(put_page_testzero(page)))
4382 __free_pages_ok(page, compound_order(page)); 4382 __free_pages_ok(page, compound_order(page));
4383 } 4383 }
4384 EXPORT_SYMBOL(page_frag_free); 4384 EXPORT_SYMBOL(page_frag_free);
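A brief sketch of the page_frag API described above (the static cache and function names are illustrative, not from this file): a driver keeps a page_frag_cache and carves fragments out of it, releasing each fragment individually with page_frag_free().

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static struct page_frag_cache demo_frag_cache;

    static void *grab_fragment(unsigned int len)
    {
            /* GFP_ATOMIC: this may be called from softirq context */
            return page_frag_alloc(&demo_frag_cache, len, GFP_ATOMIC);
    }

    static void drop_fragment(void *buf)
    {
            page_frag_free(buf);
    }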
4385 4385
4386 static void *make_alloc_exact(unsigned long addr, unsigned int order, 4386 static void *make_alloc_exact(unsigned long addr, unsigned int order,
4387 size_t size) 4387 size_t size)
4388 { 4388 {
4389 if (addr) { 4389 if (addr) {
4390 unsigned long alloc_end = addr + (PAGE_SIZE << order); 4390 unsigned long alloc_end = addr + (PAGE_SIZE << order);
4391 unsigned long used = addr + PAGE_ALIGN(size); 4391 unsigned long used = addr + PAGE_ALIGN(size);
4392 4392
4393 split_page(virt_to_page((void *)addr), order); 4393 split_page(virt_to_page((void *)addr), order);
4394 while (used < alloc_end) { 4394 while (used < alloc_end) {
4395 free_page(used); 4395 free_page(used);
4396 used += PAGE_SIZE; 4396 used += PAGE_SIZE;
4397 } 4397 }
4398 } 4398 }
4399 return (void *)addr; 4399 return (void *)addr;
4400 } 4400 }
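For example (illustrative numbers), a three-page request served through make_alloc_exact() comes from an order-2, four-page block: split_page() turns it into four independent order-0 pages and the unused fourth page is immediately returned with free_page(), so the caller keeps exactly PAGE_ALIGN(size) bytes.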
4401 4401
4402 /** 4402 /**
4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4404 * @size: the number of bytes to allocate 4404 * @size: the number of bytes to allocate
4405 * @gfp_mask: GFP flags for the allocation 4405 * @gfp_mask: GFP flags for the allocation
4406 * 4406 *
4407 * This function is similar to alloc_pages(), except that it allocates the 4407 * This function is similar to alloc_pages(), except that it allocates the
4408 * minimum number of pages to satisfy the request. alloc_pages() can only 4408 * minimum number of pages to satisfy the request. alloc_pages() can only
4409 * allocate memory in power-of-two pages. 4409 * allocate memory in power-of-two pages.
4410 * 4410 *
4411 * This function is also limited by MAX_ORDER. 4411 * This function is also limited by MAX_ORDER.
4412 * 4412 *
4413 * Memory allocated by this function must be released by free_pages_exact(). 4413 * Memory allocated by this function must be released by free_pages_exact().
4414 */ 4414 */
4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4416 { 4416 {