Commit 4cd64dcede969cf30f47a6e6ba8e378e74d0790d

Authored by Vlastimil Babka
Committed by Jiri Slaby
1 parent 2ce666c175

mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced

commit 5bcc9f86ef09a933255ee66bd899d4601785dad5 upstream.

For MIGRATE_RESERVE pages, it is useful that they do not get misplaced on
the free_list of another migratetype; otherwise they might get allocated
prematurely and e.g. fragment the MIGRATE_RESERVE pageblocks.  While this
cannot be avoided completely when allocating new MIGRATE_RESERVE
pageblocks in the min_free_kbytes sysctl handler, we should prevent the
misplacement where possible.

Currently, the misplacement can happen when a MIGRATE_RESERVE page is
allocated onto a pcplist through rmqueue_bulk() as a fallback for another
desired migratetype, and is then later freed back through
free_pcppages_bulk() without actually being used.  This happens because
free_pcppages_bulk() uses get_freepage_migratetype() to choose the
free_list, while rmqueue_bulk() calls set_freepage_migratetype() with the
*desired* migratetype rather than the page's original MIGRATE_RESERVE
migratetype.

This patch fixes the problem by moving the call to
set_freepage_migratetype() from rmqueue_bulk() down to
__rmqueue_smallest() and __rmqueue_fallback(), where the page's actual
migratetype (i.e. the free_list the page is taken from) is used.  Note
that this migratetype might differ from the pageblock's migratetype due
to freepage stealing decisions.  This is OK, as page stealing never uses
MIGRATE_RESERVE as a fallback, and also takes care to leave all
MIGRATE_CMA pages on the correct freelist.

As an additional benefit, the call to get_pageblock_migratetype() in
rmqueue_bulk() when CMA is enabled can be removed completely.  This
relies on the fact that MIGRATE_CMA pageblocks are created only during
system init, together with the guarantees above.  The related
is_migrate_isolate() check is also unnecessary, as memory isolation has
other means to move pages between freelists and to drain pcp lists
containing pages that should be isolated.  buffered_rmqueue() can also
benefit from calling get_freepage_migratetype() instead of
get_pageblock_migratetype().

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Yong-Taek Lee <ytk.lee@samsung.com>
Reported-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Suggested-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Wang, Yalin" <Yalin.Wang@sonymobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
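
[Editorial sketch, not part of the upstream commit] To make the bookkeeping issue concrete, below is a minimal userspace sketch of the before/after behaviour.  It is not kernel code: every identifier in it (enum values, struct fields, helper functions) is hypothetical, and it only models the migratetype tag that rmqueue_bulk() records on a page and that free_pcppages_bulk() later trusts when picking a free_list.

  /*
   * Illustrative userspace sketch only -- NOT kernel code.  All names are
   * made up.  It models the one piece of state that matters here: the
   * migratetype tag recorded when a page is pulled from a free_list onto
   * a pcplist, and later used to decide which free_list it goes back to.
   */
  #include <stdio.h>

  enum mt { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE, MT_NR };

  struct fake_page {
  	enum mt freelist_mt;	/* free_list the page was actually taken from */
  	enum mt tagged_mt;	/* what set_freepage_migratetype() recorded */
  };

  /* Stand-in for rmqueue_bulk(): a RESERVE page is used as fallback. */
  static void alloc_to_pcplist(struct fake_page *p, enum mt wanted, int buggy)
  {
  	p->freelist_mt = MT_RESERVE;		/* fallback source list */
  	p->tagged_mt = buggy ? wanted		/* old code: desired type */
  			     : p->freelist_mt;	/* patched: actual type */
  }

  /* Stand-in for free_pcppages_bulk(): it trusts the recorded tag. */
  static enum mt free_from_pcplist(const struct fake_page *p)
  {
  	return p->tagged_mt;
  }

  int main(void)
  {
  	static const char *names[MT_NR] = { "UNMOVABLE", "MOVABLE", "RESERVE" };
  	struct fake_page page;
  	int buggy;

  	for (buggy = 1; buggy >= 0; buggy--) {
  		alloc_to_pcplist(&page, MT_MOVABLE, buggy);
  		printf("%s: RESERVE page freed back to the %s free_list\n",
  		       buggy ? "before patch" : "after patch",
  		       names[free_from_pcplist(&page)]);
  	}
  	return 0;
  }

Compiled and run, the "before patch" case files the borrowed MIGRATE_RESERVE page on the MOVABLE free_list, while the "after patch" case returns it to the RESERVE free_list, which is the point of moving set_freepage_migratetype() down to where the source free_list is actually known.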

Showing 1 changed file with 13 additions and 10 deletions (inline diff)

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_zone_id(page) != page_zone_id(buddy)) 508 if (page_zone_id(page) != page_zone_id(buddy))
509 return 0; 509 return 0;
510 510
511 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
512 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON(page_count(buddy) != 0);
513 return 1; 513 return 1;
514 } 514 }
515 515
516 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
517 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON(page_count(buddy) != 0);
518 return 1; 518 return 1;
519 } 519 }
520 return 0; 520 return 0;
521 } 521 }
522 522
523 /* 523 /*
524 * Freeing function for a buddy system allocator. 524 * Freeing function for a buddy system allocator.
525 * 525 *
526 * The concept of a buddy system is to maintain direct-mapped table 526 * The concept of a buddy system is to maintain direct-mapped table
527 * (containing bit values) for memory blocks of various "orders". 527 * (containing bit values) for memory blocks of various "orders".
528 * The bottom level table contains the map for the smallest allocatable 528 * The bottom level table contains the map for the smallest allocatable
529 * units of memory (here, pages), and each level above it describes 529 * units of memory (here, pages), and each level above it describes
530 * pairs of units from the levels below, hence, "buddies". 530 * pairs of units from the levels below, hence, "buddies".
531 * At a high level, all that happens here is marking the table entry 531 * At a high level, all that happens here is marking the table entry
532 * at the bottom level available, and propagating the changes upward 532 * at the bottom level available, and propagating the changes upward
533 * as necessary, plus some accounting needed to play nicely with other 533 * as necessary, plus some accounting needed to play nicely with other
534 * parts of the VM system. 534 * parts of the VM system.
535 * At each level, we keep a list of pages, which are heads of continuous 535 * At each level, we keep a list of pages, which are heads of continuous
536 * free pages of length of (1 << order) and marked with _mapcount 536 * free pages of length of (1 << order) and marked with _mapcount
537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
538 * field. 538 * field.
539 * So when we are allocating or freeing one, we can derive the state of the 539 * So when we are allocating or freeing one, we can derive the state of the
540 * other. That is, if we allocate a small block, and both were 540 * other. That is, if we allocate a small block, and both were
541 * free, the remainder of the region must be split into blocks. 541 * free, the remainder of the region must be split into blocks.
542 * If a block is freed, and its buddy is also free, then this 542 * If a block is freed, and its buddy is also free, then this
543 * triggers coalescing into a block of larger size. 543 * triggers coalescing into a block of larger size.
544 * 544 *
545 * -- nyc 545 * -- nyc
546 */ 546 */
547 547
548 static inline void __free_one_page(struct page *page, 548 static inline void __free_one_page(struct page *page,
549 struct zone *zone, unsigned int order, 549 struct zone *zone, unsigned int order,
550 int migratetype) 550 int migratetype)
551 { 551 {
552 unsigned long page_idx; 552 unsigned long page_idx;
553 unsigned long combined_idx; 553 unsigned long combined_idx;
554 unsigned long uninitialized_var(buddy_idx); 554 unsigned long uninitialized_var(buddy_idx);
555 struct page *buddy; 555 struct page *buddy;
556 556
557 VM_BUG_ON(!zone_is_initialized(zone)); 557 VM_BUG_ON(!zone_is_initialized(zone));
558 558
559 if (unlikely(PageCompound(page))) 559 if (unlikely(PageCompound(page)))
560 if (unlikely(destroy_compound_page(page, order))) 560 if (unlikely(destroy_compound_page(page, order)))
561 return; 561 return;
562 562
563 VM_BUG_ON(migratetype == -1); 563 VM_BUG_ON(migratetype == -1);
564 564
565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
566 566
567 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON(page_idx & ((1 << order) - 1));
568 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON(bad_range(zone, page));
569 569
570 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
571 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
572 buddy = page + (buddy_idx - page_idx); 572 buddy = page + (buddy_idx - page_idx);
573 if (!page_is_buddy(page, buddy, order)) 573 if (!page_is_buddy(page, buddy, order))
574 break; 574 break;
575 /* 575 /*
576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
577 * merge with it and move up one order. 577 * merge with it and move up one order.
578 */ 578 */
579 if (page_is_guard(buddy)) { 579 if (page_is_guard(buddy)) {
580 clear_page_guard_flag(buddy); 580 clear_page_guard_flag(buddy);
581 set_page_private(page, 0); 581 set_page_private(page, 0);
582 __mod_zone_freepage_state(zone, 1 << order, 582 __mod_zone_freepage_state(zone, 1 << order,
583 migratetype); 583 migratetype);
584 } else { 584 } else {
585 list_del(&buddy->lru); 585 list_del(&buddy->lru);
586 zone->free_area[order].nr_free--; 586 zone->free_area[order].nr_free--;
587 rmv_page_order(buddy); 587 rmv_page_order(buddy);
588 } 588 }
589 combined_idx = buddy_idx & page_idx; 589 combined_idx = buddy_idx & page_idx;
590 page = page + (combined_idx - page_idx); 590 page = page + (combined_idx - page_idx);
591 page_idx = combined_idx; 591 page_idx = combined_idx;
592 order++; 592 order++;
593 } 593 }
594 set_page_order(page, order); 594 set_page_order(page, order);
595 595
596 /* 596 /*
597 * If this is not the largest possible page, check if the buddy 597 * If this is not the largest possible page, check if the buddy
598 * of the next-highest order is free. If it is, it's possible 598 * of the next-highest order is free. If it is, it's possible
599 * that pages are being freed that will coalesce soon. In case, 599 * that pages are being freed that will coalesce soon. In case,
600 * that is happening, add the free page to the tail of the list 600 * that is happening, add the free page to the tail of the list
601 * so it's less likely to be used soon and more likely to be merged 601 * so it's less likely to be used soon and more likely to be merged
602 * as a higher order page 602 * as a higher order page
603 */ 603 */
604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
605 struct page *higher_page, *higher_buddy; 605 struct page *higher_page, *higher_buddy;
606 combined_idx = buddy_idx & page_idx; 606 combined_idx = buddy_idx & page_idx;
607 higher_page = page + (combined_idx - page_idx); 607 higher_page = page + (combined_idx - page_idx);
608 buddy_idx = __find_buddy_index(combined_idx, order + 1); 608 buddy_idx = __find_buddy_index(combined_idx, order + 1);
609 higher_buddy = higher_page + (buddy_idx - combined_idx); 609 higher_buddy = higher_page + (buddy_idx - combined_idx);
610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
611 list_add_tail(&page->lru, 611 list_add_tail(&page->lru,
612 &zone->free_area[order].free_list[migratetype]); 612 &zone->free_area[order].free_list[migratetype]);
613 goto out; 613 goto out;
614 } 614 }
615 } 615 }
616 616
617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
618 out: 618 out:
619 zone->free_area[order].nr_free++; 619 zone->free_area[order].nr_free++;
620 } 620 }
621 621
622 static inline int free_pages_check(struct page *page) 622 static inline int free_pages_check(struct page *page)
623 { 623 {
624 if (unlikely(page_mapcount(page) | 624 if (unlikely(page_mapcount(page) |
625 (page->mapping != NULL) | 625 (page->mapping != NULL) |
626 (atomic_read(&page->_count) != 0) | 626 (atomic_read(&page->_count) != 0) |
627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
628 (mem_cgroup_bad_page_check(page)))) { 628 (mem_cgroup_bad_page_check(page)))) {
629 bad_page(page); 629 bad_page(page);
630 return 1; 630 return 1;
631 } 631 }
632 page_nid_reset_last(page); 632 page_nid_reset_last(page);
633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
635 return 0; 635 return 0;
636 } 636 }
637 637
638 /* 638 /*
639 * Frees a number of pages from the PCP lists 639 * Frees a number of pages from the PCP lists
640 * Assumes all pages on list are in same zone, and of same order. 640 * Assumes all pages on list are in same zone, and of same order.
641 * count is the number of pages to free. 641 * count is the number of pages to free.
642 * 642 *
643 * If the zone was previously in an "all pages pinned" state then look to 643 * If the zone was previously in an "all pages pinned" state then look to
644 * see if this freeing clears that state. 644 * see if this freeing clears that state.
645 * 645 *
646 * And clear the zone's pages_scanned counter, to hold off the "all pages are 646 * And clear the zone's pages_scanned counter, to hold off the "all pages are
647 * pinned" detection logic. 647 * pinned" detection logic.
648 */ 648 */
649 static void free_pcppages_bulk(struct zone *zone, int count, 649 static void free_pcppages_bulk(struct zone *zone, int count,
650 struct per_cpu_pages *pcp) 650 struct per_cpu_pages *pcp)
651 { 651 {
652 int migratetype = 0; 652 int migratetype = 0;
653 int batch_free = 0; 653 int batch_free = 0;
654 int to_free = count; 654 int to_free = count;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 zone->pages_scanned = 0; 657 zone->pages_scanned = 0;
658 658
659 while (to_free) { 659 while (to_free) {
660 struct page *page; 660 struct page *page;
661 struct list_head *list; 661 struct list_head *list;
662 662
663 /* 663 /*
664 * Remove pages from lists in a round-robin fashion. A 664 * Remove pages from lists in a round-robin fashion. A
665 * batch_free count is maintained that is incremented when an 665 * batch_free count is maintained that is incremented when an
666 * empty list is encountered. This is so more pages are freed 666 * empty list is encountered. This is so more pages are freed
667 * off fuller lists instead of spinning excessively around empty 667 * off fuller lists instead of spinning excessively around empty
668 * lists 668 * lists
669 */ 669 */
670 do { 670 do {
671 batch_free++; 671 batch_free++;
672 if (++migratetype == MIGRATE_PCPTYPES) 672 if (++migratetype == MIGRATE_PCPTYPES)
673 migratetype = 0; 673 migratetype = 0;
674 list = &pcp->lists[migratetype]; 674 list = &pcp->lists[migratetype];
675 } while (list_empty(list)); 675 } while (list_empty(list));
676 676
677 /* This is the only non-empty list. Free them all. */ 677 /* This is the only non-empty list. Free them all. */
678 if (batch_free == MIGRATE_PCPTYPES) 678 if (batch_free == MIGRATE_PCPTYPES)
679 batch_free = to_free; 679 batch_free = to_free;
680 680
681 do { 681 do {
682 int mt; /* migratetype of the to-be-freed page */ 682 int mt; /* migratetype of the to-be-freed page */
683 683
684 page = list_entry(list->prev, struct page, lru); 684 page = list_entry(list->prev, struct page, lru);
685 /* must delete as __free_one_page list manipulates */ 685 /* must delete as __free_one_page list manipulates */
686 list_del(&page->lru); 686 list_del(&page->lru);
687 mt = get_freepage_migratetype(page); 687 mt = get_freepage_migratetype(page);
688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
689 __free_one_page(page, zone, 0, mt); 689 __free_one_page(page, zone, 0, mt);
690 trace_mm_page_pcpu_drain(page, 0, mt); 690 trace_mm_page_pcpu_drain(page, 0, mt);
691 if (likely(!is_migrate_isolate_page(page))) { 691 if (likely(!is_migrate_isolate_page(page))) {
692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
693 if (is_migrate_cma(mt)) 693 if (is_migrate_cma(mt))
694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
695 } 695 }
696 } while (--to_free && --batch_free && !list_empty(list)); 696 } while (--to_free && --batch_free && !list_empty(list));
697 } 697 }
698 spin_unlock(&zone->lock); 698 spin_unlock(&zone->lock);
699 } 699 }
700 700
701 static void free_one_page(struct zone *zone, struct page *page, int order, 701 static void free_one_page(struct zone *zone, struct page *page, int order,
702 int migratetype) 702 int migratetype)
703 { 703 {
704 spin_lock(&zone->lock); 704 spin_lock(&zone->lock);
705 zone->pages_scanned = 0; 705 zone->pages_scanned = 0;
706 706
707 __free_one_page(page, zone, order, migratetype); 707 __free_one_page(page, zone, order, migratetype);
708 if (unlikely(!is_migrate_isolate(migratetype))) 708 if (unlikely(!is_migrate_isolate(migratetype)))
709 __mod_zone_freepage_state(zone, 1 << order, migratetype); 709 __mod_zone_freepage_state(zone, 1 << order, migratetype);
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static bool free_pages_prepare(struct page *page, unsigned int order) 713 static bool free_pages_prepare(struct page *page, unsigned int order)
714 { 714 {
715 int i; 715 int i;
716 int bad = 0; 716 int bad = 0;
717 717
718 trace_mm_page_free(page, order); 718 trace_mm_page_free(page, order);
719 kmemcheck_free_shadow(page, order); 719 kmemcheck_free_shadow(page, order);
720 720
721 if (PageAnon(page)) 721 if (PageAnon(page))
722 page->mapping = NULL; 722 page->mapping = NULL;
723 for (i = 0; i < (1 << order); i++) 723 for (i = 0; i < (1 << order); i++)
724 bad += free_pages_check(page + i); 724 bad += free_pages_check(page + i);
725 if (bad) 725 if (bad)
726 return false; 726 return false;
727 727
728 if (!PageHighMem(page)) { 728 if (!PageHighMem(page)) {
729 debug_check_no_locks_freed(page_address(page), 729 debug_check_no_locks_freed(page_address(page),
730 PAGE_SIZE << order); 730 PAGE_SIZE << order);
731 debug_check_no_obj_freed(page_address(page), 731 debug_check_no_obj_freed(page_address(page),
732 PAGE_SIZE << order); 732 PAGE_SIZE << order);
733 } 733 }
734 arch_free_page(page, order); 734 arch_free_page(page, order);
735 kernel_map_pages(page, 1 << order, 0); 735 kernel_map_pages(page, 1 << order, 0);
736 736
737 return true; 737 return true;
738 } 738 }
739 739
740 static void __free_pages_ok(struct page *page, unsigned int order) 740 static void __free_pages_ok(struct page *page, unsigned int order)
741 { 741 {
742 unsigned long flags; 742 unsigned long flags;
743 int migratetype; 743 int migratetype;
744 744
745 if (!free_pages_prepare(page, order)) 745 if (!free_pages_prepare(page, order))
746 return; 746 return;
747 747
748 local_irq_save(flags); 748 local_irq_save(flags);
749 __count_vm_events(PGFREE, 1 << order); 749 __count_vm_events(PGFREE, 1 << order);
750 migratetype = get_pageblock_migratetype(page); 750 migratetype = get_pageblock_migratetype(page);
751 set_freepage_migratetype(page, migratetype); 751 set_freepage_migratetype(page, migratetype);
752 free_one_page(page_zone(page), page, order, migratetype); 752 free_one_page(page_zone(page), page, order, migratetype);
753 local_irq_restore(flags); 753 local_irq_restore(flags);
754 } 754 }
755 755
756 void __init __free_pages_bootmem(struct page *page, unsigned int order) 756 void __init __free_pages_bootmem(struct page *page, unsigned int order)
757 { 757 {
758 unsigned int nr_pages = 1 << order; 758 unsigned int nr_pages = 1 << order;
759 struct page *p = page; 759 struct page *p = page;
760 unsigned int loop; 760 unsigned int loop;
761 761
762 prefetchw(p); 762 prefetchw(p);
763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
764 prefetchw(p + 1); 764 prefetchw(p + 1);
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } 767 }
768 __ClearPageReserved(p); 768 __ClearPageReserved(p);
769 set_page_count(p, 0); 769 set_page_count(p, 0);
770 770
771 page_zone(page)->managed_pages += nr_pages; 771 page_zone(page)->managed_pages += nr_pages;
772 set_page_refcounted(page); 772 set_page_refcounted(page);
773 __free_pages(page, order); 773 __free_pages(page, order);
774 } 774 }
775 775
776 #ifdef CONFIG_CMA 776 #ifdef CONFIG_CMA
777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
778 void __init init_cma_reserved_pageblock(struct page *page) 778 void __init init_cma_reserved_pageblock(struct page *page)
779 { 779 {
780 unsigned i = pageblock_nr_pages; 780 unsigned i = pageblock_nr_pages;
781 struct page *p = page; 781 struct page *p = page;
782 782
783 do { 783 do {
784 __ClearPageReserved(p); 784 __ClearPageReserved(p);
785 set_page_count(p, 0); 785 set_page_count(p, 0);
786 } while (++p, --i); 786 } while (++p, --i);
787 787
788 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
789 789
790 if (pageblock_order >= MAX_ORDER) { 790 if (pageblock_order >= MAX_ORDER) {
791 i = pageblock_nr_pages; 791 i = pageblock_nr_pages;
792 p = page; 792 p = page;
793 do { 793 do {
794 set_page_refcounted(p); 794 set_page_refcounted(p);
795 __free_pages(p, MAX_ORDER - 1); 795 __free_pages(p, MAX_ORDER - 1);
796 p += MAX_ORDER_NR_PAGES; 796 p += MAX_ORDER_NR_PAGES;
797 } while (i -= MAX_ORDER_NR_PAGES); 797 } while (i -= MAX_ORDER_NR_PAGES);
798 } else { 798 } else {
799 set_page_refcounted(page); 799 set_page_refcounted(page);
800 __free_pages(page, pageblock_order); 800 __free_pages(page, pageblock_order);
801 } 801 }
802 802
803 adjust_managed_page_count(page, pageblock_nr_pages); 803 adjust_managed_page_count(page, pageblock_nr_pages);
804 } 804 }
805 #endif 805 #endif
806 806
807 /* 807 /*
808 * The order of subdivision here is critical for the IO subsystem. 808 * The order of subdivision here is critical for the IO subsystem.
809 * Please do not alter this order without good reasons and regression 809 * Please do not alter this order without good reasons and regression
810 * testing. Specifically, as large blocks of memory are subdivided, 810 * testing. Specifically, as large blocks of memory are subdivided,
811 * the order in which smaller blocks are delivered depends on the order 811 * the order in which smaller blocks are delivered depends on the order
812 * they're subdivided in this function. This is the primary factor 812 * they're subdivided in this function. This is the primary factor
813 * influencing the order in which pages are delivered to the IO 813 * influencing the order in which pages are delivered to the IO
814 * subsystem according to empirical testing, and this is also justified 814 * subsystem according to empirical testing, and this is also justified
815 * by considering the behavior of a buddy system containing a single 815 * by considering the behavior of a buddy system containing a single
816 * large block of memory acted on by a series of small allocations. 816 * large block of memory acted on by a series of small allocations.
817 * This behavior is a critical factor in sglist merging's success. 817 * This behavior is a critical factor in sglist merging's success.
818 * 818 *
819 * -- nyc 819 * -- nyc
820 */ 820 */
821 static inline void expand(struct zone *zone, struct page *page, 821 static inline void expand(struct zone *zone, struct page *page,
822 int low, int high, struct free_area *area, 822 int low, int high, struct free_area *area,
823 int migratetype) 823 int migratetype)
824 { 824 {
825 unsigned long size = 1 << high; 825 unsigned long size = 1 << high;
826 826
827 while (high > low) { 827 while (high > low) {
828 area--; 828 area--;
829 high--; 829 high--;
830 size >>= 1; 830 size >>= 1;
831 VM_BUG_ON(bad_range(zone, &page[size])); 831 VM_BUG_ON(bad_range(zone, &page[size]));
832 832
833 #ifdef CONFIG_DEBUG_PAGEALLOC 833 #ifdef CONFIG_DEBUG_PAGEALLOC
834 if (high < debug_guardpage_minorder()) { 834 if (high < debug_guardpage_minorder()) {
835 /* 835 /*
836 * Mark as guard pages (or page), that will allow to 836 * Mark as guard pages (or page), that will allow to
837 * merge back to allocator when buddy will be freed. 837 * merge back to allocator when buddy will be freed.
838 * Corresponding page table entries will not be touched, 838 * Corresponding page table entries will not be touched,
839 * pages will stay not present in virtual address space 839 * pages will stay not present in virtual address space
840 */ 840 */
841 INIT_LIST_HEAD(&page[size].lru); 841 INIT_LIST_HEAD(&page[size].lru);
842 set_page_guard_flag(&page[size]); 842 set_page_guard_flag(&page[size]);
843 set_page_private(&page[size], high); 843 set_page_private(&page[size], high);
844 /* Guard pages are not available for any usage */ 844 /* Guard pages are not available for any usage */
845 __mod_zone_freepage_state(zone, -(1 << high), 845 __mod_zone_freepage_state(zone, -(1 << high),
846 migratetype); 846 migratetype);
847 continue; 847 continue;
848 } 848 }
849 #endif 849 #endif
850 list_add(&page[size].lru, &area->free_list[migratetype]); 850 list_add(&page[size].lru, &area->free_list[migratetype]);
851 area->nr_free++; 851 area->nr_free++;
852 set_page_order(&page[size], high); 852 set_page_order(&page[size], high);
853 } 853 }
854 } 854 }
855 855
856 /* 856 /*
857 * This page is about to be returned from the page allocator 857 * This page is about to be returned from the page allocator
858 */ 858 */
859 static inline int check_new_page(struct page *page) 859 static inline int check_new_page(struct page *page)
860 { 860 {
861 if (unlikely(page_mapcount(page) | 861 if (unlikely(page_mapcount(page) |
862 (page->mapping != NULL) | 862 (page->mapping != NULL) |
863 (atomic_read(&page->_count) != 0) | 863 (atomic_read(&page->_count) != 0) |
864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
865 (mem_cgroup_bad_page_check(page)))) { 865 (mem_cgroup_bad_page_check(page)))) {
866 bad_page(page); 866 bad_page(page);
867 return 1; 867 return 1;
868 } 868 }
869 return 0; 869 return 0;
870 } 870 }
871 871
872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
873 { 873 {
874 int i; 874 int i;
875 875
876 for (i = 0; i < (1 << order); i++) { 876 for (i = 0; i < (1 << order); i++) {
877 struct page *p = page + i; 877 struct page *p = page + i;
878 if (unlikely(check_new_page(p))) 878 if (unlikely(check_new_page(p)))
879 return 1; 879 return 1;
880 } 880 }
881 881
882 set_page_private(page, 0); 882 set_page_private(page, 0);
883 set_page_refcounted(page); 883 set_page_refcounted(page);
884 884
885 arch_alloc_page(page, order); 885 arch_alloc_page(page, order);
886 kernel_map_pages(page, 1 << order, 1); 886 kernel_map_pages(page, 1 << order, 1);
887 887
888 if (gfp_flags & __GFP_ZERO) 888 if (gfp_flags & __GFP_ZERO)
889 prep_zero_page(page, order, gfp_flags); 889 prep_zero_page(page, order, gfp_flags);
890 890
891 if (order && (gfp_flags & __GFP_COMP)) 891 if (order && (gfp_flags & __GFP_COMP))
892 prep_compound_page(page, order); 892 prep_compound_page(page, order);
893 893
894 return 0; 894 return 0;
895 } 895 }
896 896
897 /* 897 /*
898 * Go through the free lists for the given migratetype and remove 898 * Go through the free lists for the given migratetype and remove
899 * the smallest available page from the freelists 899 * the smallest available page from the freelists
900 */ 900 */
901 static inline 901 static inline
902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
903 int migratetype) 903 int migratetype)
904 { 904 {
905 unsigned int current_order; 905 unsigned int current_order;
906 struct free_area *area; 906 struct free_area *area;
907 struct page *page; 907 struct page *page;
908 908
909 /* Find a page of the appropriate size in the preferred list */ 909 /* Find a page of the appropriate size in the preferred list */
910 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 910 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
911 area = &(zone->free_area[current_order]); 911 area = &(zone->free_area[current_order]);
912 if (list_empty(&area->free_list[migratetype])) 912 if (list_empty(&area->free_list[migratetype]))
913 continue; 913 continue;
914 914
915 page = list_entry(area->free_list[migratetype].next, 915 page = list_entry(area->free_list[migratetype].next,
916 struct page, lru); 916 struct page, lru);
917 list_del(&page->lru); 917 list_del(&page->lru);
918 rmv_page_order(page); 918 rmv_page_order(page);
919 area->nr_free--; 919 area->nr_free--;
920 expand(zone, page, order, current_order, area, migratetype); 920 expand(zone, page, order, current_order, area, migratetype);
921 set_freepage_migratetype(page, migratetype);
921 return page; 922 return page;
922 } 923 }
923 924
924 return NULL; 925 return NULL;
925 } 926 }
926 927
927 928
928 /* 929 /*
929 * This array describes the order lists are fallen back to when 930 * This array describes the order lists are fallen back to when
930 * the free lists for the desirable migrate type are depleted 931 * the free lists for the desirable migrate type are depleted
931 */ 932 */
932 static int fallbacks[MIGRATE_TYPES][4] = { 933 static int fallbacks[MIGRATE_TYPES][4] = {
933 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
934 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
935 #ifdef CONFIG_CMA 936 #ifdef CONFIG_CMA
936 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
937 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
938 #else 939 #else
939 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
940 #endif 941 #endif
941 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
942 #ifdef CONFIG_MEMORY_ISOLATION 943 #ifdef CONFIG_MEMORY_ISOLATION
943 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
944 #endif 945 #endif
945 }; 946 };
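
As an aside, the table above reads as "for this start type, try these other free lists in this order, and stop at MIGRATE_RESERVE". A minimal standalone illustration of that lookup (CMA and memory isolation left out for brevity; not kernel code) follows.

/* Print the fallback order encoded in a simplified fallbacks[] table. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *names[MIGRATE_TYPES] = {
        "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE",
};

static const int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },   /* never used as a start type */
};

int main(void)
{
        for (int start = MIGRATE_UNMOVABLE; start <= MIGRATE_MOVABLE; start++) {
                printf("%s falls back to:", names[start]);
                for (int i = 0; fallbacks[start][i] != MIGRATE_RESERVE; i++)
                        printf(" %s", names[fallbacks[start][i]]);
                printf(" (MIGRATE_RESERVE itself is handled separately)\n");
        }
        return 0;
}
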
946 947
947 /* 948 /*
948 * Move the free pages in a range to the free lists of the requested type. 949 * Move the free pages in a range to the free lists of the requested type.
949 * Note that start_page and end_page are not aligned on a pageblock 950 * Note that start_page and end_page are not aligned on a pageblock
950 * boundary. If alignment is required, use move_freepages_block() 951 * boundary. If alignment is required, use move_freepages_block()
951 */ 952 */
952 int move_freepages(struct zone *zone, 953 int move_freepages(struct zone *zone,
953 struct page *start_page, struct page *end_page, 954 struct page *start_page, struct page *end_page,
954 int migratetype) 955 int migratetype)
955 { 956 {
956 struct page *page; 957 struct page *page;
957 unsigned long order; 958 unsigned long order;
958 int pages_moved = 0; 959 int pages_moved = 0;
959 960
960 #ifndef CONFIG_HOLES_IN_ZONE 961 #ifndef CONFIG_HOLES_IN_ZONE
961 /* 962 /*
962 * page_zone is not safe to call in this context when 963 * page_zone is not safe to call in this context when
963 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
964 * anyway as we check zone boundaries in move_freepages_block(). 965 * anyway as we check zone boundaries in move_freepages_block().
965 * Remove at a later date when no bug reports exist related to 966 * Remove at a later date when no bug reports exist related to
966 * grouping pages by mobility 967 * grouping pages by mobility
967 */ 968 */
968 BUG_ON(page_zone(start_page) != page_zone(end_page)); 969 BUG_ON(page_zone(start_page) != page_zone(end_page));
969 #endif 970 #endif
970 971
971 for (page = start_page; page <= end_page;) { 972 for (page = start_page; page <= end_page;) {
972 /* Make sure we are not inadvertently changing nodes */ 973 /* Make sure we are not inadvertently changing nodes */
973 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
974 975
975 if (!pfn_valid_within(page_to_pfn(page))) { 976 if (!pfn_valid_within(page_to_pfn(page))) {
976 page++; 977 page++;
977 continue; 978 continue;
978 } 979 }
979 980
980 if (!PageBuddy(page)) { 981 if (!PageBuddy(page)) {
981 page++; 982 page++;
982 continue; 983 continue;
983 } 984 }
984 985
985 order = page_order(page); 986 order = page_order(page);
986 list_move(&page->lru, 987 list_move(&page->lru,
987 &zone->free_area[order].free_list[migratetype]); 988 &zone->free_area[order].free_list[migratetype]);
988 set_freepage_migratetype(page, migratetype); 989 set_freepage_migratetype(page, migratetype);
989 page += 1 << order; 990 page += 1 << order;
990 pages_moved += 1 << order; 991 pages_moved += 1 << order;
991 } 992 }
992 993
993 return pages_moved; 994 return pages_moved;
994 } 995 }
995 996
996 int move_freepages_block(struct zone *zone, struct page *page, 997 int move_freepages_block(struct zone *zone, struct page *page,
997 int migratetype) 998 int migratetype)
998 { 999 {
999 unsigned long start_pfn, end_pfn; 1000 unsigned long start_pfn, end_pfn;
1000 struct page *start_page, *end_page; 1001 struct page *start_page, *end_page;
1001 1002
1002 start_pfn = page_to_pfn(page); 1003 start_pfn = page_to_pfn(page);
1003 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1004 start_page = pfn_to_page(start_pfn); 1005 start_page = pfn_to_page(start_pfn);
1005 end_page = start_page + pageblock_nr_pages - 1; 1006 end_page = start_page + pageblock_nr_pages - 1;
1006 end_pfn = start_pfn + pageblock_nr_pages - 1; 1007 end_pfn = start_pfn + pageblock_nr_pages - 1;
1007 1008
1008 /* Do not cross zone boundaries */ 1009 /* Do not cross zone boundaries */
1009 if (!zone_spans_pfn(zone, start_pfn)) 1010 if (!zone_spans_pfn(zone, start_pfn))
1010 start_page = page; 1011 start_page = page;
1011 if (!zone_spans_pfn(zone, end_pfn)) 1012 if (!zone_spans_pfn(zone, end_pfn))
1012 return 0; 1013 return 0;
1013 1014
1014 return move_freepages(zone, start_page, end_page, migratetype); 1015 return move_freepages(zone, start_page, end_page, migratetype);
1015 } 1016 }
1016 1017
1017 static void change_pageblock_range(struct page *pageblock_page, 1018 static void change_pageblock_range(struct page *pageblock_page,
1018 int start_order, int migratetype) 1019 int start_order, int migratetype)
1019 { 1020 {
1020 int nr_pageblocks = 1 << (start_order - pageblock_order); 1021 int nr_pageblocks = 1 << (start_order - pageblock_order);
1021 1022
1022 while (nr_pageblocks--) { 1023 while (nr_pageblocks--) {
1023 set_pageblock_migratetype(pageblock_page, migratetype); 1024 set_pageblock_migratetype(pageblock_page, migratetype);
1024 pageblock_page += pageblock_nr_pages; 1025 pageblock_page += pageblock_nr_pages;
1025 } 1026 }
1026 } 1027 }
1027 1028
1028 /* 1029 /*
1029 * If breaking a large block of pages, move all free pages to the preferred 1030 * If breaking a large block of pages, move all free pages to the preferred
1030 * allocation list. If falling back for a reclaimable kernel allocation, be 1031 * allocation list. If falling back for a reclaimable kernel allocation, be
1031 * more aggressive about taking ownership of free pages. 1032 * more aggressive about taking ownership of free pages.
1032 * 1033 *
1033 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1034 * nor move CMA pages to different free lists. We don't want unmovable pages 1035 * nor move CMA pages to different free lists. We don't want unmovable pages
1035 * to be allocated from MIGRATE_CMA areas. 1036 * to be allocated from MIGRATE_CMA areas.
1036 * 1037 *
1037 * Returns the new migratetype of the pageblock (or the same old migratetype 1038 * Returns the new migratetype of the pageblock (or the same old migratetype
1038 * if it was unchanged). 1039 * if it was unchanged).
1039 */ 1040 */
1040 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1041 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1041 int start_type, int fallback_type) 1042 int start_type, int fallback_type)
1042 { 1043 {
1043 int current_order = page_order(page); 1044 int current_order = page_order(page);
1044 1045
1045 /* 1046 /*
1046 * When borrowing from MIGRATE_CMA, we need to release the excess 1047 * When borrowing from MIGRATE_CMA, we need to release the excess
1047 * buddy pages to CMA itself. 1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1049 * is set to CMA so it is returned to the correct freelist in case
1050 * the page ends up not actually being allocated from the pcp lists.
1048 */ 1051 */
1049 if (is_migrate_cma(fallback_type)) 1052 if (is_migrate_cma(fallback_type))
1050 return fallback_type; 1053 return fallback_type;
1051 1054
1052 /* Take ownership for orders >= pageblock_order */ 1055 /* Take ownership for orders >= pageblock_order */
1053 if (current_order >= pageblock_order) { 1056 if (current_order >= pageblock_order) {
1054 change_pageblock_range(page, current_order, start_type); 1057 change_pageblock_range(page, current_order, start_type);
1055 return start_type; 1058 return start_type;
1056 } 1059 }
1057 1060
1058 if (current_order >= pageblock_order / 2 || 1061 if (current_order >= pageblock_order / 2 ||
1059 start_type == MIGRATE_RECLAIMABLE || 1062 start_type == MIGRATE_RECLAIMABLE ||
1060 page_group_by_mobility_disabled) { 1063 page_group_by_mobility_disabled) {
1061 int pages; 1064 int pages;
1062 1065
1063 pages = move_freepages_block(zone, page, start_type); 1066 pages = move_freepages_block(zone, page, start_type);
1064 1067
1065 /* Claim the whole block if over half of it is free */ 1068 /* Claim the whole block if over half of it is free */
1066 if (pages >= (1 << (pageblock_order-1)) || 1069 if (pages >= (1 << (pageblock_order-1)) ||
1067 page_group_by_mobility_disabled) { 1070 page_group_by_mobility_disabled) {
1068 1071
1069 set_pageblock_migratetype(page, start_type); 1072 set_pageblock_migratetype(page, start_type);
1070 return start_type; 1073 return start_type;
1071 } 1074 }
1072 1075
1073 } 1076 }
1074 1077
1075 return fallback_type; 1078 return fallback_type;
1076 } 1079 }
1077 1080
1078 /* Remove an element from the buddy allocator from the fallback list */ 1081 /* Remove an element from the buddy allocator from the fallback list */
1079 static inline struct page * 1082 static inline struct page *
1080 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1081 { 1084 {
1082 struct free_area *area; 1085 struct free_area *area;
1083 int current_order; 1086 int current_order;
1084 struct page *page; 1087 struct page *page;
1085 int migratetype, new_type, i; 1088 int migratetype, new_type, i;
1086 1089
1087 /* Find the largest possible block of pages in the other list */ 1090 /* Find the largest possible block of pages in the other list */
1088 for (current_order = MAX_ORDER-1; current_order >= order; 1091 for (current_order = MAX_ORDER-1; current_order >= order;
1089 --current_order) { 1092 --current_order) {
1090 for (i = 0;; i++) { 1093 for (i = 0;; i++) {
1091 migratetype = fallbacks[start_migratetype][i]; 1094 migratetype = fallbacks[start_migratetype][i];
1092 1095
1093 /* MIGRATE_RESERVE handled later if necessary */ 1096 /* MIGRATE_RESERVE handled later if necessary */
1094 if (migratetype == MIGRATE_RESERVE) 1097 if (migratetype == MIGRATE_RESERVE)
1095 break; 1098 break;
1096 1099
1097 area = &(zone->free_area[current_order]); 1100 area = &(zone->free_area[current_order]);
1098 if (list_empty(&area->free_list[migratetype])) 1101 if (list_empty(&area->free_list[migratetype]))
1099 continue; 1102 continue;
1100 1103
1101 page = list_entry(area->free_list[migratetype].next, 1104 page = list_entry(area->free_list[migratetype].next,
1102 struct page, lru); 1105 struct page, lru);
1103 area->nr_free--; 1106 area->nr_free--;
1104 1107
1105 new_type = try_to_steal_freepages(zone, page, 1108 new_type = try_to_steal_freepages(zone, page,
1106 start_migratetype, 1109 start_migratetype,
1107 migratetype); 1110 migratetype);
1108 1111
1109 /* Remove the page from the freelists */ 1112 /* Remove the page from the freelists */
1110 list_del(&page->lru); 1113 list_del(&page->lru);
1111 rmv_page_order(page); 1114 rmv_page_order(page);
1112 1115
1113 expand(zone, page, order, current_order, area, 1116 expand(zone, page, order, current_order, area,
1114 new_type); 1117 new_type);
1118 /* The freepage_migratetype may differ from pageblock's
1119 * migratetype depending on the decisions in
1120 * try_to_steal_freepages. This is OK as long as it does
1121 * not differ for MIGRATE_CMA type.
1122 */
1123 set_freepage_migratetype(page, new_type);
1115 1124
1116 trace_mm_page_alloc_extfrag(page, order, current_order, 1125 trace_mm_page_alloc_extfrag(page, order, current_order,
1117 start_migratetype, migratetype, new_type); 1126 start_migratetype, migratetype, new_type);
1118 1127
1119 return page; 1128 return page;
1120 } 1129 }
1121 } 1130 }
1122 1131
1123 return NULL; 1132 return NULL;
1124 } 1133 }
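
This set_freepage_migratetype() call is the crux of the change: the page now records the free list it was actually taken from, and that recorded value is what the free path (e.g. free_pcppages_bulk()) consults when the page comes back unused from a pcp list. A deliberately simplified userspace model of that effect, with made-up struct and function names, is sketched below.

/* Simplified model (not kernel code) of why recording the freepage
 * migratetype at rmqueue time matters. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE, MIGRATE_TYPES };

struct model_page {
        int pageblock_mt;       /* migratetype of the surrounding pageblock */
        int freepage_mt;        /* what set_freepage_migratetype() recorded */
};

static int nr_on_list[MIGRATE_TYPES];

/* Mirrors the free path's choice of free list: it looks only at the
 * recorded freepage migratetype, not at pageblock_mt. */
static void free_from_pcp(struct model_page *page)
{
        nr_on_list[page->freepage_mt]++;
}

int main(void)
{
        /* A MIGRATE_RESERVE page was grabbed as a fallback for a MOVABLE
         * request, parked on a pcp list, then freed without being used. */
        struct model_page page = { .pageblock_mt = MIGRATE_RESERVE };

        page.freepage_mt = MIGRATE_MOVABLE;     /* old behaviour: desired type */
        free_from_pcp(&page);                   /* -> misplaced on MOVABLE list */

        page.freepage_mt = MIGRATE_RESERVE;     /* patched behaviour: actual type */
        free_from_pcp(&page);                   /* -> returns to RESERVE list */

        printf("movable list: %d, reserve list: %d\n",
               nr_on_list[MIGRATE_MOVABLE], nr_on_list[MIGRATE_RESERVE]);
        return 0;
}
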
1125 1134
1126 /* 1135 /*
1127 * Do the hard work of removing an element from the buddy allocator. 1136 * Do the hard work of removing an element from the buddy allocator.
1128 * Call me with the zone->lock already held. 1137 * Call me with the zone->lock already held.
1129 */ 1138 */
1130 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1139 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1131 int migratetype) 1140 int migratetype)
1132 { 1141 {
1133 struct page *page; 1142 struct page *page;
1134 1143
1135 retry_reserve: 1144 retry_reserve:
1136 page = __rmqueue_smallest(zone, order, migratetype); 1145 page = __rmqueue_smallest(zone, order, migratetype);
1137 1146
1138 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1139 page = __rmqueue_fallback(zone, order, migratetype); 1148 page = __rmqueue_fallback(zone, order, migratetype);
1140 1149
1141 /* 1150 /*
1142 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1143 * is used because __rmqueue_smallest is an inline function 1152 * is used because __rmqueue_smallest is an inline function
1144 * and we want just one call site 1153 * and we want just one call site
1145 */ 1154 */
1146 if (!page) { 1155 if (!page) {
1147 migratetype = MIGRATE_RESERVE; 1156 migratetype = MIGRATE_RESERVE;
1148 goto retry_reserve; 1157 goto retry_reserve;
1149 } 1158 }
1150 } 1159 }
1151 1160
1152 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1161 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1153 return page; 1162 return page;
1154 } 1163 }
1155 1164
1156 /* 1165 /*
1157 * Obtain a specified number of elements from the buddy allocator, all under 1166 * Obtain a specified number of elements from the buddy allocator, all under
1158 * a single hold of the lock, for efficiency. Add them to the supplied list. 1167 * a single hold of the lock, for efficiency. Add them to the supplied list.
1159 * Returns the number of new pages which were placed at *list. 1168 * Returns the number of new pages which were placed at *list.
1160 */ 1169 */
1161 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1170 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1162 unsigned long count, struct list_head *list, 1171 unsigned long count, struct list_head *list,
1163 int migratetype, int cold) 1172 int migratetype, int cold)
1164 { 1173 {
1165 int mt = migratetype, i; 1174 int i;
1166 1175
1167 spin_lock(&zone->lock); 1176 spin_lock(&zone->lock);
1168 for (i = 0; i < count; ++i) { 1177 for (i = 0; i < count; ++i) {
1169 struct page *page = __rmqueue(zone, order, migratetype); 1178 struct page *page = __rmqueue(zone, order, migratetype);
1170 if (unlikely(page == NULL)) 1179 if (unlikely(page == NULL))
1171 break; 1180 break;
1172 1181
1173 /* 1182 /*
1174 * Split buddy pages returned by expand() are received here 1183 * Split buddy pages returned by expand() are received here
1175 * in physical page order. The page is added to the caller's 1184 * in physical page order. The page is added to the caller's
1176 * list and the list head then moves forward. From the caller's 1185 * list and the list head then moves forward. From the caller's
1177 * perspective, the linked list is ordered by page number in 1186 * perspective, the linked list is ordered by page number in
1178 * some conditions. This is useful for IO devices that can 1187 * some conditions. This is useful for IO devices that can
1179 * merge IO requests if the physical pages are ordered 1188 * merge IO requests if the physical pages are ordered
1180 * properly. 1189 * properly.
1181 */ 1190 */
1182 if (likely(cold == 0)) 1191 if (likely(cold == 0))
1183 list_add(&page->lru, list); 1192 list_add(&page->lru, list);
1184 else 1193 else
1185 list_add_tail(&page->lru, list); 1194 list_add_tail(&page->lru, list);
1186 if (IS_ENABLED(CONFIG_CMA)) {
1187 mt = get_pageblock_migratetype(page);
1188 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1189 mt = migratetype;
1190 }
1191 set_freepage_migratetype(page, mt);
1192 list = &page->lru; 1195 list = &page->lru;
1193 if (is_migrate_cma(mt)) 1196 if (is_migrate_cma(get_freepage_migratetype(page)))
1194 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1195 -(1 << order)); 1198 -(1 << order));
1196 } 1199 }
1197 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1198 spin_unlock(&zone->lock); 1201 spin_unlock(&zone->lock);
1199 return i; 1202 return i;
1200 } 1203 }
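
The hot/cold placement in the loop above pairs with buffered_rmqueue() further down, which takes list->next for ordinary requests and list->prev for __GFP_COLD ones. A small userspace sketch (hand-rolled list_head, illustrative names, no locking) of that head/tail behaviour:

/* Hot pages go to the front of the pcp list and are handed out first;
 * cold pages go to the tail and are consumed last. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{                                       /* insert right after head (hot) */
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{                                       /* insert right before head (cold) */
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

struct model_page { struct list_head lru; const char *name; };

int main(void)
{
        struct list_head pcp;
        struct model_page hot = { .name = "hot" }, cold = { .name = "cold" };

        list_init(&pcp);
        list_add(&hot.lru, &pcp);       /* cold == 0 path in rmqueue_bulk() */
        list_add_tail(&cold.lru, &pcp); /* cold == 1 path */

        /* lru is the first member, so the cast stands in for container_of. */
        struct model_page *first = (struct model_page *)pcp.next;
        struct model_page *last  = (struct model_page *)pcp.prev;

        printf("head of pcp list: %s, tail: %s\n", first->name, last->name);
        return 0;
}
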
1201 1204
1202 #ifdef CONFIG_NUMA 1205 #ifdef CONFIG_NUMA
1203 /* 1206 /*
1204 * Called from the vmstat counter updater to drain pagesets of this 1207 * Called from the vmstat counter updater to drain pagesets of this
1205 * currently executing processor on remote nodes after they have 1208 * currently executing processor on remote nodes after they have
1206 * expired. 1209 * expired.
1207 * 1210 *
1208 * Note that this function must be called with the thread pinned to 1211 * Note that this function must be called with the thread pinned to
1209 * a single processor. 1212 * a single processor.
1210 */ 1213 */
1211 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1212 { 1215 {
1213 unsigned long flags; 1216 unsigned long flags;
1214 int to_drain; 1217 int to_drain;
1215 unsigned long batch; 1218 unsigned long batch;
1216 1219
1217 local_irq_save(flags); 1220 local_irq_save(flags);
1218 batch = ACCESS_ONCE(pcp->batch); 1221 batch = ACCESS_ONCE(pcp->batch);
1219 if (pcp->count >= batch) 1222 if (pcp->count >= batch)
1220 to_drain = batch; 1223 to_drain = batch;
1221 else 1224 else
1222 to_drain = pcp->count; 1225 to_drain = pcp->count;
1223 if (to_drain > 0) { 1226 if (to_drain > 0) {
1224 free_pcppages_bulk(zone, to_drain, pcp); 1227 free_pcppages_bulk(zone, to_drain, pcp);
1225 pcp->count -= to_drain; 1228 pcp->count -= to_drain;
1226 } 1229 }
1227 local_irq_restore(flags); 1230 local_irq_restore(flags);
1228 } 1231 }
1229 #endif 1232 #endif
1230 1233
1231 /* 1234 /*
1232 * Drain pages of the indicated processor. 1235 * Drain pages of the indicated processor.
1233 * 1236 *
1234 * The processor must either be the current processor and the 1237 * The processor must either be the current processor and the
1235 * thread pinned to the current processor or a processor that 1238 * thread pinned to the current processor or a processor that
1236 * is not online. 1239 * is not online.
1237 */ 1240 */
1238 static void drain_pages(unsigned int cpu) 1241 static void drain_pages(unsigned int cpu)
1239 { 1242 {
1240 unsigned long flags; 1243 unsigned long flags;
1241 struct zone *zone; 1244 struct zone *zone;
1242 1245
1243 for_each_populated_zone(zone) { 1246 for_each_populated_zone(zone) {
1244 struct per_cpu_pageset *pset; 1247 struct per_cpu_pageset *pset;
1245 struct per_cpu_pages *pcp; 1248 struct per_cpu_pages *pcp;
1246 1249
1247 local_irq_save(flags); 1250 local_irq_save(flags);
1248 pset = per_cpu_ptr(zone->pageset, cpu); 1251 pset = per_cpu_ptr(zone->pageset, cpu);
1249 1252
1250 pcp = &pset->pcp; 1253 pcp = &pset->pcp;
1251 if (pcp->count) { 1254 if (pcp->count) {
1252 free_pcppages_bulk(zone, pcp->count, pcp); 1255 free_pcppages_bulk(zone, pcp->count, pcp);
1253 pcp->count = 0; 1256 pcp->count = 0;
1254 } 1257 }
1255 local_irq_restore(flags); 1258 local_irq_restore(flags);
1256 } 1259 }
1257 } 1260 }
1258 1261
1259 /* 1262 /*
1260 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1261 */ 1264 */
1262 void drain_local_pages(void *arg) 1265 void drain_local_pages(void *arg)
1263 { 1266 {
1264 drain_pages(smp_processor_id()); 1267 drain_pages(smp_processor_id());
1265 } 1268 }
1266 1269
1267 /* 1270 /*
1268 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1269 * 1272 *
1270 * Note that this code is protected against sending an IPI to an offline 1273 * Note that this code is protected against sending an IPI to an offline
1271 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1272 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1273 * nothing keeps CPUs from showing up after we populated the cpumask and 1276 * nothing keeps CPUs from showing up after we populated the cpumask and
1274 * before the call to on_each_cpu_mask(). 1277 * before the call to on_each_cpu_mask().
1275 */ 1278 */
1276 void drain_all_pages(void) 1279 void drain_all_pages(void)
1277 { 1280 {
1278 int cpu; 1281 int cpu;
1279 struct per_cpu_pageset *pcp; 1282 struct per_cpu_pageset *pcp;
1280 struct zone *zone; 1283 struct zone *zone;
1281 1284
1282 /* 1285 /*
1283 * Allocate in the BSS so we won't require allocation in 1286 * Allocate in the BSS so we won't require allocation in
1284 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1285 */ 1288 */
1286 static cpumask_t cpus_with_pcps; 1289 static cpumask_t cpus_with_pcps;
1287 1290
1288 /* 1291 /*
1289 * We don't care about racing with CPU hotplug event 1292 * We don't care about racing with CPU hotplug event
1290 * as offline notification will cause the notified 1293 * as offline notification will cause the notified
1291 * cpu to drain that CPU pcps and on_each_cpu_mask 1294 * cpu to drain that CPU pcps and on_each_cpu_mask
1292 * disables preemption as part of its processing 1295 * disables preemption as part of its processing
1293 */ 1296 */
1294 for_each_online_cpu(cpu) { 1297 for_each_online_cpu(cpu) {
1295 bool has_pcps = false; 1298 bool has_pcps = false;
1296 for_each_populated_zone(zone) { 1299 for_each_populated_zone(zone) {
1297 pcp = per_cpu_ptr(zone->pageset, cpu); 1300 pcp = per_cpu_ptr(zone->pageset, cpu);
1298 if (pcp->pcp.count) { 1301 if (pcp->pcp.count) {
1299 has_pcps = true; 1302 has_pcps = true;
1300 break; 1303 break;
1301 } 1304 }
1302 } 1305 }
1303 if (has_pcps) 1306 if (has_pcps)
1304 cpumask_set_cpu(cpu, &cpus_with_pcps); 1307 cpumask_set_cpu(cpu, &cpus_with_pcps);
1305 else 1308 else
1306 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1309 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1307 } 1310 }
1308 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1309 } 1312 }
1310 1313
1311 #ifdef CONFIG_HIBERNATION 1314 #ifdef CONFIG_HIBERNATION
1312 1315
1313 void mark_free_pages(struct zone *zone) 1316 void mark_free_pages(struct zone *zone)
1314 { 1317 {
1315 unsigned long pfn, max_zone_pfn; 1318 unsigned long pfn, max_zone_pfn;
1316 unsigned long flags; 1319 unsigned long flags;
1317 int order, t; 1320 int order, t;
1318 struct list_head *curr; 1321 struct list_head *curr;
1319 1322
1320 if (zone_is_empty(zone)) 1323 if (zone_is_empty(zone))
1321 return; 1324 return;
1322 1325
1323 spin_lock_irqsave(&zone->lock, flags); 1326 spin_lock_irqsave(&zone->lock, flags);
1324 1327
1325 max_zone_pfn = zone_end_pfn(zone); 1328 max_zone_pfn = zone_end_pfn(zone);
1326 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1327 if (pfn_valid(pfn)) { 1330 if (pfn_valid(pfn)) {
1328 struct page *page = pfn_to_page(pfn); 1331 struct page *page = pfn_to_page(pfn);
1329 1332
1330 if (!swsusp_page_is_forbidden(page)) 1333 if (!swsusp_page_is_forbidden(page))
1331 swsusp_unset_page_free(page); 1334 swsusp_unset_page_free(page);
1332 } 1335 }
1333 1336
1334 for_each_migratetype_order(order, t) { 1337 for_each_migratetype_order(order, t) {
1335 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1338 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1336 unsigned long i; 1339 unsigned long i;
1337 1340
1338 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1341 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1339 for (i = 0; i < (1UL << order); i++) 1342 for (i = 0; i < (1UL << order); i++)
1340 swsusp_set_page_free(pfn_to_page(pfn + i)); 1343 swsusp_set_page_free(pfn_to_page(pfn + i));
1341 } 1344 }
1342 } 1345 }
1343 spin_unlock_irqrestore(&zone->lock, flags); 1346 spin_unlock_irqrestore(&zone->lock, flags);
1344 } 1347 }
1345 #endif /* CONFIG_PM */ 1348 #endif /* CONFIG_PM */
1346 1349
1347 /* 1350 /*
1348 * Free a 0-order page 1351 * Free a 0-order page
1349 * cold == 1 ? free a cold page : free a hot page 1352 * cold == 1 ? free a cold page : free a hot page
1350 */ 1353 */
1351 void free_hot_cold_page(struct page *page, int cold) 1354 void free_hot_cold_page(struct page *page, int cold)
1352 { 1355 {
1353 struct zone *zone = page_zone(page); 1356 struct zone *zone = page_zone(page);
1354 struct per_cpu_pages *pcp; 1357 struct per_cpu_pages *pcp;
1355 unsigned long flags; 1358 unsigned long flags;
1356 int migratetype; 1359 int migratetype;
1357 1360
1358 if (!free_pages_prepare(page, 0)) 1361 if (!free_pages_prepare(page, 0))
1359 return; 1362 return;
1360 1363
1361 migratetype = get_pageblock_migratetype(page); 1364 migratetype = get_pageblock_migratetype(page);
1362 set_freepage_migratetype(page, migratetype); 1365 set_freepage_migratetype(page, migratetype);
1363 local_irq_save(flags); 1366 local_irq_save(flags);
1364 __count_vm_event(PGFREE); 1367 __count_vm_event(PGFREE);
1365 1368
1366 /* 1369 /*
1367 * We only track unmovable, reclaimable and movable on pcp lists. 1370 * We only track unmovable, reclaimable and movable on pcp lists.
1368 * Free ISOLATE pages back to the allocator because they are being 1371 * Free ISOLATE pages back to the allocator because they are being
1369 * offlined, but treat RESERVE as movable pages so we can get those 1372 * offlined, but treat RESERVE as movable pages so we can get those
1370 * areas back if necessary. Otherwise, we may have to free 1373 * areas back if necessary. Otherwise, we may have to free
1371 * excessively into the page allocator 1374 * excessively into the page allocator
1372 */ 1375 */
1373 if (migratetype >= MIGRATE_PCPTYPES) { 1376 if (migratetype >= MIGRATE_PCPTYPES) {
1374 if (unlikely(is_migrate_isolate(migratetype))) { 1377 if (unlikely(is_migrate_isolate(migratetype))) {
1375 free_one_page(zone, page, 0, migratetype); 1378 free_one_page(zone, page, 0, migratetype);
1376 goto out; 1379 goto out;
1377 } 1380 }
1378 migratetype = MIGRATE_MOVABLE; 1381 migratetype = MIGRATE_MOVABLE;
1379 } 1382 }
1380 1383
1381 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1384 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1382 if (cold) 1385 if (cold)
1383 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1386 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1384 else 1387 else
1385 list_add(&page->lru, &pcp->lists[migratetype]); 1388 list_add(&page->lru, &pcp->lists[migratetype]);
1386 pcp->count++; 1389 pcp->count++;
1387 if (pcp->count >= pcp->high) { 1390 if (pcp->count >= pcp->high) {
1388 unsigned long batch = ACCESS_ONCE(pcp->batch); 1391 unsigned long batch = ACCESS_ONCE(pcp->batch);
1389 free_pcppages_bulk(zone, batch, pcp); 1392 free_pcppages_bulk(zone, batch, pcp);
1390 pcp->count -= batch; 1393 pcp->count -= batch;
1391 } 1394 }
1392 1395
1393 out: 1396 out:
1394 local_irq_restore(flags); 1397 local_irq_restore(flags);
1395 } 1398 }
1396 1399
1397 /* 1400 /*
1398 * Free a list of 0-order pages 1401 * Free a list of 0-order pages
1399 */ 1402 */
1400 void free_hot_cold_page_list(struct list_head *list, int cold) 1403 void free_hot_cold_page_list(struct list_head *list, int cold)
1401 { 1404 {
1402 struct page *page, *next; 1405 struct page *page, *next;
1403 1406
1404 list_for_each_entry_safe(page, next, list, lru) { 1407 list_for_each_entry_safe(page, next, list, lru) {
1405 trace_mm_page_free_batched(page, cold); 1408 trace_mm_page_free_batched(page, cold);
1406 free_hot_cold_page(page, cold); 1409 free_hot_cold_page(page, cold);
1407 } 1410 }
1408 } 1411 }
1409 1412
1410 /* 1413 /*
1411 * split_page takes a non-compound higher-order page, and splits it into 1414 * split_page takes a non-compound higher-order page, and splits it into
1412 * n (1<<order) sub-pages: page[0..n-1] 1415 * n (1<<order) sub-pages: page[0..n-1]
1413 * Each sub-page must be freed individually. 1416 * Each sub-page must be freed individually.
1414 * 1417 *
1415 * Note: this is probably too low level an operation for use in drivers. 1418 * Note: this is probably too low level an operation for use in drivers.
1416 * Please consult with lkml before using this in your driver. 1419 * Please consult with lkml before using this in your driver.
1417 */ 1420 */
1418 void split_page(struct page *page, unsigned int order) 1421 void split_page(struct page *page, unsigned int order)
1419 { 1422 {
1420 int i; 1423 int i;
1421 1424
1422 VM_BUG_ON(PageCompound(page)); 1425 VM_BUG_ON(PageCompound(page));
1423 VM_BUG_ON(!page_count(page)); 1426 VM_BUG_ON(!page_count(page));
1424 1427
1425 #ifdef CONFIG_KMEMCHECK 1428 #ifdef CONFIG_KMEMCHECK
1426 /* 1429 /*
1427 * Split shadow pages too, because free(page[0]) would 1430 * Split shadow pages too, because free(page[0]) would
1428 * otherwise free the whole shadow. 1431 * otherwise free the whole shadow.
1429 */ 1432 */
1430 if (kmemcheck_page_is_tracked(page)) 1433 if (kmemcheck_page_is_tracked(page))
1431 split_page(virt_to_page(page[0].shadow), order); 1434 split_page(virt_to_page(page[0].shadow), order);
1432 #endif 1435 #endif
1433 1436
1434 for (i = 1; i < (1 << order); i++) 1437 for (i = 1; i < (1 << order); i++)
1435 set_page_refcounted(page + i); 1438 set_page_refcounted(page + i);
1436 } 1439 }
1437 EXPORT_SYMBOL_GPL(split_page); 1440 EXPORT_SYMBOL_GPL(split_page);
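
A hedged usage sketch of split_page() (hypothetical in-kernel caller, minimal error handling), showing the "each sub-page must be freed individually" rule from the comment above:

/* Illustrative only: allocate a higher-order block, split it, then free
 * the resulting order-0 pages one by one. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_split_and_free(unsigned int order)
{
        struct page *page = alloc_pages(GFP_KERNEL, order);
        unsigned int i;

        if (!page)
                return -ENOMEM;

        split_page(page, order);        /* page[0 .. (1 << order) - 1] are now
                                         * independent order-0 pages */
        for (i = 0; i < (1U << order); i++)
                __free_page(page + i);  /* each must be freed individually */

        return 0;
}
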
1438 1441
1439 static int __isolate_free_page(struct page *page, unsigned int order) 1442 static int __isolate_free_page(struct page *page, unsigned int order)
1440 { 1443 {
1441 unsigned long watermark; 1444 unsigned long watermark;
1442 struct zone *zone; 1445 struct zone *zone;
1443 int mt; 1446 int mt;
1444 1447
1445 BUG_ON(!PageBuddy(page)); 1448 BUG_ON(!PageBuddy(page));
1446 1449
1447 zone = page_zone(page); 1450 zone = page_zone(page);
1448 mt = get_pageblock_migratetype(page); 1451 mt = get_pageblock_migratetype(page);
1449 1452
1450 if (!is_migrate_isolate(mt)) { 1453 if (!is_migrate_isolate(mt)) {
1451 /* Obey watermarks as if the page was being allocated */ 1454 /* Obey watermarks as if the page was being allocated */
1452 watermark = low_wmark_pages(zone) + (1 << order); 1455 watermark = low_wmark_pages(zone) + (1 << order);
1453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1454 return 0; 1457 return 0;
1455 1458
1456 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1459 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1457 } 1460 }
1458 1461
1459 /* Remove page from free list */ 1462 /* Remove page from free list */
1460 list_del(&page->lru); 1463 list_del(&page->lru);
1461 zone->free_area[order].nr_free--; 1464 zone->free_area[order].nr_free--;
1462 rmv_page_order(page); 1465 rmv_page_order(page);
1463 1466
1464 /* Set the pageblock if the isolated page is at least a pageblock */ 1467 /* Set the pageblock if the isolated page is at least a pageblock */
1465 if (order >= pageblock_order - 1) { 1468 if (order >= pageblock_order - 1) {
1466 struct page *endpage = page + (1 << order) - 1; 1469 struct page *endpage = page + (1 << order) - 1;
1467 for (; page < endpage; page += pageblock_nr_pages) { 1470 for (; page < endpage; page += pageblock_nr_pages) {
1468 int mt = get_pageblock_migratetype(page); 1471 int mt = get_pageblock_migratetype(page);
1469 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1470 set_pageblock_migratetype(page, 1473 set_pageblock_migratetype(page,
1471 MIGRATE_MOVABLE); 1474 MIGRATE_MOVABLE);
1472 } 1475 }
1473 } 1476 }
1474 1477
1475 return 1UL << order; 1478 return 1UL << order;
1476 } 1479 }
1477 1480
1478 /* 1481 /*
1479 * Similar to split_page except the page is already free. As this is only 1482 * Similar to split_page except the page is already free. As this is only
1480 * being used for migration, the migratetype of the block also changes. 1483 * being used for migration, the migratetype of the block also changes.
1481 * As this is called with interrupts disabled, the caller is responsible 1484 * As this is called with interrupts disabled, the caller is responsible
1482 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1483 * are enabled. 1486 * are enabled.
1484 * 1487 *
1485 * Note: this is probably too low level an operation for use in drivers. 1488 * Note: this is probably too low level an operation for use in drivers.
1486 * Please consult with lkml before using this in your driver. 1489 * Please consult with lkml before using this in your driver.
1487 */ 1490 */
1488 int split_free_page(struct page *page) 1491 int split_free_page(struct page *page)
1489 { 1492 {
1490 unsigned int order; 1493 unsigned int order;
1491 int nr_pages; 1494 int nr_pages;
1492 1495
1493 order = page_order(page); 1496 order = page_order(page);
1494 1497
1495 nr_pages = __isolate_free_page(page, order); 1498 nr_pages = __isolate_free_page(page, order);
1496 if (!nr_pages) 1499 if (!nr_pages)
1497 return 0; 1500 return 0;
1498 1501
1499 /* Split into individual pages */ 1502 /* Split into individual pages */
1500 set_page_refcounted(page); 1503 set_page_refcounted(page);
1501 split_page(page, order); 1504 split_page(page, order);
1502 return nr_pages; 1505 return nr_pages;
1503 } 1506 }
1504 1507
1505 /* 1508 /*
1506 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1507 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1508 * or two. 1511 * or two.
1509 */ 1512 */
1510 static inline 1513 static inline
1511 struct page *buffered_rmqueue(struct zone *preferred_zone, 1514 struct page *buffered_rmqueue(struct zone *preferred_zone,
1512 struct zone *zone, int order, gfp_t gfp_flags, 1515 struct zone *zone, int order, gfp_t gfp_flags,
1513 int migratetype) 1516 int migratetype)
1514 { 1517 {
1515 unsigned long flags; 1518 unsigned long flags;
1516 struct page *page; 1519 struct page *page;
1517 int cold = !!(gfp_flags & __GFP_COLD); 1520 int cold = !!(gfp_flags & __GFP_COLD);
1518 1521
1519 again: 1522 again:
1520 if (likely(order == 0)) { 1523 if (likely(order == 0)) {
1521 struct per_cpu_pages *pcp; 1524 struct per_cpu_pages *pcp;
1522 struct list_head *list; 1525 struct list_head *list;
1523 1526
1524 local_irq_save(flags); 1527 local_irq_save(flags);
1525 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1528 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1526 list = &pcp->lists[migratetype]; 1529 list = &pcp->lists[migratetype];
1527 if (list_empty(list)) { 1530 if (list_empty(list)) {
1528 pcp->count += rmqueue_bulk(zone, 0, 1531 pcp->count += rmqueue_bulk(zone, 0,
1529 pcp->batch, list, 1532 pcp->batch, list,
1530 migratetype, cold); 1533 migratetype, cold);
1531 if (unlikely(list_empty(list))) 1534 if (unlikely(list_empty(list)))
1532 goto failed; 1535 goto failed;
1533 } 1536 }
1534 1537
1535 if (cold) 1538 if (cold)
1536 page = list_entry(list->prev, struct page, lru); 1539 page = list_entry(list->prev, struct page, lru);
1537 else 1540 else
1538 page = list_entry(list->next, struct page, lru); 1541 page = list_entry(list->next, struct page, lru);
1539 1542
1540 list_del(&page->lru); 1543 list_del(&page->lru);
1541 pcp->count--; 1544 pcp->count--;
1542 } else { 1545 } else {
1543 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1544 /* 1547 /*
1545 * __GFP_NOFAIL is not to be used in new code. 1548 * __GFP_NOFAIL is not to be used in new code.
1546 * 1549 *
1547 * All __GFP_NOFAIL callers should be fixed so that they 1550 * All __GFP_NOFAIL callers should be fixed so that they
1548 * properly detect and handle allocation failures. 1551 * properly detect and handle allocation failures.
1549 * 1552 *
1550 * We most definitely don't want callers attempting to 1553 * We most definitely don't want callers attempting to
1551 * allocate greater than order-1 page units with 1554 * allocate greater than order-1 page units with
1552 * __GFP_NOFAIL. 1555 * __GFP_NOFAIL.
1553 */ 1556 */
1554 WARN_ON_ONCE(order > 1); 1557 WARN_ON_ONCE(order > 1);
1555 } 1558 }
1556 spin_lock_irqsave(&zone->lock, flags); 1559 spin_lock_irqsave(&zone->lock, flags);
1557 page = __rmqueue(zone, order, migratetype); 1560 page = __rmqueue(zone, order, migratetype);
1558 spin_unlock(&zone->lock); 1561 spin_unlock(&zone->lock);
1559 if (!page) 1562 if (!page)
1560 goto failed; 1563 goto failed;
1561 __mod_zone_freepage_state(zone, -(1 << order), 1564 __mod_zone_freepage_state(zone, -(1 << order),
1562 get_pageblock_migratetype(page)); 1565 get_freepage_migratetype(page));
1563 } 1566 }
1564 1567
1565 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1566 1569
1567 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1570 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1568 zone_statistics(preferred_zone, zone, gfp_flags); 1571 zone_statistics(preferred_zone, zone, gfp_flags);
1569 local_irq_restore(flags); 1572 local_irq_restore(flags);
1570 1573
1571 VM_BUG_ON(bad_range(zone, page)); 1574 VM_BUG_ON(bad_range(zone, page));
1572 if (prep_new_page(page, order, gfp_flags)) 1575 if (prep_new_page(page, order, gfp_flags))
1573 goto again; 1576 goto again;
1574 return page; 1577 return page;
1575 1578
1576 failed: 1579 failed:
1577 local_irq_restore(flags); 1580 local_irq_restore(flags);
1578 return NULL; 1581 return NULL;
1579 } 1582 }
1580 1583
1581 #ifdef CONFIG_FAIL_PAGE_ALLOC 1584 #ifdef CONFIG_FAIL_PAGE_ALLOC
1582 1585
1583 static struct { 1586 static struct {
1584 struct fault_attr attr; 1587 struct fault_attr attr;
1585 1588
1586 u32 ignore_gfp_highmem; 1589 u32 ignore_gfp_highmem;
1587 u32 ignore_gfp_wait; 1590 u32 ignore_gfp_wait;
1588 u32 min_order; 1591 u32 min_order;
1589 } fail_page_alloc = { 1592 } fail_page_alloc = {
1590 .attr = FAULT_ATTR_INITIALIZER, 1593 .attr = FAULT_ATTR_INITIALIZER,
1591 .ignore_gfp_wait = 1, 1594 .ignore_gfp_wait = 1,
1592 .ignore_gfp_highmem = 1, 1595 .ignore_gfp_highmem = 1,
1593 .min_order = 1, 1596 .min_order = 1,
1594 }; 1597 };
1595 1598
1596 static int __init setup_fail_page_alloc(char *str) 1599 static int __init setup_fail_page_alloc(char *str)
1597 { 1600 {
1598 return setup_fault_attr(&fail_page_alloc.attr, str); 1601 return setup_fault_attr(&fail_page_alloc.attr, str);
1599 } 1602 }
1600 __setup("fail_page_alloc=", setup_fail_page_alloc); 1603 __setup("fail_page_alloc=", setup_fail_page_alloc);
1601 1604
1602 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1603 { 1606 {
1604 if (order < fail_page_alloc.min_order) 1607 if (order < fail_page_alloc.min_order)
1605 return false; 1608 return false;
1606 if (gfp_mask & __GFP_NOFAIL) 1609 if (gfp_mask & __GFP_NOFAIL)
1607 return false; 1610 return false;
1608 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1609 return false; 1612 return false;
1610 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1611 return false; 1614 return false;
1612 1615
1613 return should_fail(&fail_page_alloc.attr, 1 << order); 1616 return should_fail(&fail_page_alloc.attr, 1 << order);
1614 } 1617 }
1615 1618
1616 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1617 1620
1618 static int __init fail_page_alloc_debugfs(void) 1621 static int __init fail_page_alloc_debugfs(void)
1619 { 1622 {
1620 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1621 struct dentry *dir; 1624 struct dentry *dir;
1622 1625
1623 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1624 &fail_page_alloc.attr); 1627 &fail_page_alloc.attr);
1625 if (IS_ERR(dir)) 1628 if (IS_ERR(dir))
1626 return PTR_ERR(dir); 1629 return PTR_ERR(dir);
1627 1630
1628 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1629 &fail_page_alloc.ignore_gfp_wait)) 1632 &fail_page_alloc.ignore_gfp_wait))
1630 goto fail; 1633 goto fail;
1631 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1632 &fail_page_alloc.ignore_gfp_highmem)) 1635 &fail_page_alloc.ignore_gfp_highmem))
1633 goto fail; 1636 goto fail;
1634 if (!debugfs_create_u32("min-order", mode, dir, 1637 if (!debugfs_create_u32("min-order", mode, dir,
1635 &fail_page_alloc.min_order)) 1638 &fail_page_alloc.min_order))
1636 goto fail; 1639 goto fail;
1637 1640
1638 return 0; 1641 return 0;
1639 fail: 1642 fail:
1640 debugfs_remove_recursive(dir); 1643 debugfs_remove_recursive(dir);
1641 1644
1642 return -ENOMEM; 1645 return -ENOMEM;
1643 } 1646 }
1644 1647
1645 late_initcall(fail_page_alloc_debugfs); 1648 late_initcall(fail_page_alloc_debugfs);
1646 1649
1647 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1648 1651
1649 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1652 #else /* CONFIG_FAIL_PAGE_ALLOC */
1650 1653
1651 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1652 { 1655 {
1653 return false; 1656 return false;
1654 } 1657 }
1655 1658
1656 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1657 1660
1658 /* 1661 /*
1659 * Return true if free pages are above 'mark'. This takes into account the order 1662 * Return true if free pages are above 'mark'. This takes into account the order
1660 * of the allocation. 1663 * of the allocation.
1661 */ 1664 */
1662 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1663 int classzone_idx, int alloc_flags, long free_pages) 1666 int classzone_idx, int alloc_flags, long free_pages)
1664 { 1667 {
1665 /* free_pages may go negative - that's OK */ 1668 /* free_pages may go negative - that's OK */
1666 long min = mark; 1669 long min = mark;
1667 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1668 int o; 1671 int o;
1669 long free_cma = 0; 1672 long free_cma = 0;
1670 1673
1671 free_pages -= (1 << order) - 1; 1674 free_pages -= (1 << order) - 1;
1672 if (alloc_flags & ALLOC_HIGH) 1675 if (alloc_flags & ALLOC_HIGH)
1673 min -= min / 2; 1676 min -= min / 2;
1674 if (alloc_flags & ALLOC_HARDER) 1677 if (alloc_flags & ALLOC_HARDER)
1675 min -= min / 4; 1678 min -= min / 4;
1676 #ifdef CONFIG_CMA 1679 #ifdef CONFIG_CMA
1677 /* If allocation can't use CMA areas don't use free CMA pages */ 1680 /* If allocation can't use CMA areas don't use free CMA pages */
1678 if (!(alloc_flags & ALLOC_CMA)) 1681 if (!(alloc_flags & ALLOC_CMA))
1679 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1680 #endif 1683 #endif
1681 1684
1682 if (free_pages - free_cma <= min + lowmem_reserve) 1685 if (free_pages - free_cma <= min + lowmem_reserve)
1683 return false; 1686 return false;
1684 for (o = 0; o < order; o++) { 1687 for (o = 0; o < order; o++) {
1685 /* At the next order, this order's pages become unavailable */ 1688 /* At the next order, this order's pages become unavailable */
1686 free_pages -= z->free_area[o].nr_free << o; 1689 free_pages -= z->free_area[o].nr_free << o;
1687 1690
1688 /* Require fewer higher order pages to be free */ 1691 /* Require fewer higher order pages to be free */
1689 min >>= 1; 1692 min >>= 1;
1690 1693
1691 if (free_pages <= min) 1694 if (free_pages <= min)
1692 return false; 1695 return false;
1693 } 1696 }
1694 return true; 1697 return true;
1695 } 1698 }
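
A standalone numeric walk-through of the same watermark arithmetic may help (made-up numbers; the ALLOC_HIGH/ALLOC_HARDER adjustments and the CMA correction are omitted, and lowmem_reserve is passed in directly):

/* Simplified model of __zone_watermark_ok(): an order-0 headroom check
 * first, then per-order checks with the required reserve halved each step. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool zone_watermark_ok_model(int order, long mark, long lowmem_reserve,
                                    long free_pages,
                                    const long nr_free[MAX_ORDER])
{
        long min = mark;

        free_pages -= (1L << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;

        for (int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* lower orders don't help */
                min >>= 1;                      /* demand less of higher orders */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long nr_free[MAX_ORDER] = { [0] = 300, [1] = 40, [2] = 10, [3] = 2 };
        long free_pages = 300 + 40 * 2 + 10 * 4 + 2 * 8;   /* 436 free pages */

        printf("order-0 ok: %d\n",
               zone_watermark_ok_model(0, 128, 0, free_pages, nr_free));
        printf("order-3 ok: %d\n",
               zone_watermark_ok_model(3, 128, 0, free_pages, nr_free));
        return 0;
}

With these numbers the zone easily passes an order-0 check but fails at order 3, because almost all of its free memory sits in order-0 and order-1 pages.
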
1696 1699
1697 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1698 int classzone_idx, int alloc_flags) 1701 int classzone_idx, int alloc_flags)
1699 { 1702 {
1700 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1701 zone_page_state(z, NR_FREE_PAGES)); 1704 zone_page_state(z, NR_FREE_PAGES));
1702 } 1705 }
1703 1706
1704 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1705 int classzone_idx, int alloc_flags) 1708 int classzone_idx, int alloc_flags)
1706 { 1709 {
1707 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1710 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1708 1711
1709 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1710 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1711 1714
1712 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1713 free_pages); 1716 free_pages);
1714 } 1717 }
1715 1718
1716 #ifdef CONFIG_NUMA 1719 #ifdef CONFIG_NUMA
1717 /* 1720 /*
1718 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1719 * skip over zones that are not allowed by the cpuset, or that have 1722 * skip over zones that are not allowed by the cpuset, or that have
1720 * been recently (in the last second) found to be nearly full. See further 1723 * been recently (in the last second) found to be nearly full. See further
1721 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1722 * that have to skip over a lot of full or unallowed zones. 1725 * that have to skip over a lot of full or unallowed zones.
1723 * 1726 *
1724 * If the zonelist cache is present in the passed in zonelist, then 1727 * If the zonelist cache is present in the passed in zonelist, then
1725 * returns a pointer to the allowed node mask (either the current 1728 * returns a pointer to the allowed node mask (either the current
1726 * tasks mems_allowed, or node_states[N_MEMORY].) 1729 * tasks mems_allowed, or node_states[N_MEMORY].)
1727 * 1730 *
1728 * If the zonelist cache is not available for this zonelist, does 1731 * If the zonelist cache is not available for this zonelist, does
1729 * nothing and returns NULL. 1732 * nothing and returns NULL.
1730 * 1733 *
1731 * If the fullzones BITMAP in the zonelist cache is stale (more than 1734 * If the fullzones BITMAP in the zonelist cache is stale (more than
1732 * a second since last zap'd) then we zap it out (clear its bits.) 1735 * a second since last zap'd) then we zap it out (clear its bits.)
1733 * 1736 *
1734 * We hold off even calling zlc_setup, until after we've checked the 1737 * We hold off even calling zlc_setup, until after we've checked the
1735 * first zone in the zonelist, on the theory that most allocations will 1738 * first zone in the zonelist, on the theory that most allocations will
1736 * be satisfied from that first zone, so best to examine that zone as 1739 * be satisfied from that first zone, so best to examine that zone as
1737 * quickly as we can. 1740 * quickly as we can.
1738 */ 1741 */
1739 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1740 { 1743 {
1741 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1742 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1745 nodemask_t *allowednodes; /* zonelist_cache approximation */
1743 1746
1744 zlc = zonelist->zlcache_ptr; 1747 zlc = zonelist->zlcache_ptr;
1745 if (!zlc) 1748 if (!zlc)
1746 return NULL; 1749 return NULL;
1747 1750
1748 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1749 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1750 zlc->last_full_zap = jiffies; 1753 zlc->last_full_zap = jiffies;
1751 } 1754 }
1752 1755
1753 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1754 &cpuset_current_mems_allowed : 1757 &cpuset_current_mems_allowed :
1755 &node_states[N_MEMORY]; 1758 &node_states[N_MEMORY];
1756 return allowednodes; 1759 return allowednodes;
1757 } 1760 }
1758 1761
1759 /* 1762 /*
1760 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1761 * if it is worth looking at further for free memory: 1764 * if it is worth looking at further for free memory:
1762 * 1) Check that the zone isn't thought to be full (doesn't have its 1765 * 1) Check that the zone isn't thought to be full (doesn't have its
1763 * bit set in the zonelist_cache fullzones BITMAP). 1766 * bit set in the zonelist_cache fullzones BITMAP).
1764 * 2) Check that the zones node (obtained from the zonelist_cache 1767 * 2) Check that the zones node (obtained from the zonelist_cache
1765 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1766 * Return true (non-zero) if zone is worth looking at further, or 1769 * Return true (non-zero) if zone is worth looking at further, or
1767 * else return false (zero) if it is not. 1770 * else return false (zero) if it is not.
1768 * 1771 *
1769 * This check -ignores- the distinction between various watermarks, 1772 * This check -ignores- the distinction between various watermarks,
1770 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1771 * found to be full for any variation of these watermarks, it will 1774 * found to be full for any variation of these watermarks, it will
1772 * be considered full for up to one second by all requests, unless 1775 * be considered full for up to one second by all requests, unless
1773 * we are so low on memory on all allowed nodes that we are forced 1776 * we are so low on memory on all allowed nodes that we are forced
1774 * into the second scan of the zonelist. 1777 * into the second scan of the zonelist.
1775 * 1778 *
1776 * In the second scan we ignore this zonelist cache and exactly 1779 * In the second scan we ignore this zonelist cache and exactly
1777 * apply the watermarks to all zones, even if it is slower to do so. 1780 * apply the watermarks to all zones, even if it is slower to do so.
1778 * We are low on memory in the second scan, and should leave no stone 1781 * We are low on memory in the second scan, and should leave no stone
1779 * unturned looking for a free page. 1782 * unturned looking for a free page.
1780 */ 1783 */
1781 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1782 nodemask_t *allowednodes) 1785 nodemask_t *allowednodes)
1783 { 1786 {
1784 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1785 int i; /* index of *z in zonelist zones */ 1788 int i; /* index of *z in zonelist zones */
1786 int n; /* node that zone *z is on */ 1789 int n; /* node that zone *z is on */
1787 1790
1788 zlc = zonelist->zlcache_ptr; 1791 zlc = zonelist->zlcache_ptr;
1789 if (!zlc) 1792 if (!zlc)
1790 return 1; 1793 return 1;
1791 1794
1792 i = z - zonelist->_zonerefs; 1795 i = z - zonelist->_zonerefs;
1793 n = zlc->z_to_n[i]; 1796 n = zlc->z_to_n[i];
1794 1797
1795 /* This zone is worth trying if it is allowed but not full */ 1798 /* This zone is worth trying if it is allowed but not full */
1796 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1797 } 1800 }
1798 1801
1799 /* 1802 /*
1800 * Given 'z' scanning a zonelist, set the corresponding bit in 1803 * Given 'z' scanning a zonelist, set the corresponding bit in
1801 * zlc->fullzones, so that subsequent attempts to allocate a page 1804 * zlc->fullzones, so that subsequent attempts to allocate a page
1802 * from that zone don't waste time re-examining it. 1805 * from that zone don't waste time re-examining it.
1803 */ 1806 */
1804 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1805 { 1808 {
1806 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1807 int i; /* index of *z in zonelist zones */ 1810 int i; /* index of *z in zonelist zones */
1808 1811
1809 zlc = zonelist->zlcache_ptr; 1812 zlc = zonelist->zlcache_ptr;
1810 if (!zlc) 1813 if (!zlc)
1811 return; 1814 return;
1812 1815
1813 i = z - zonelist->_zonerefs; 1816 i = z - zonelist->_zonerefs;
1814 1817
1815 set_bit(i, zlc->fullzones); 1818 set_bit(i, zlc->fullzones);
1816 } 1819 }
1817 1820
1818 /* 1821 /*
1819 * clear all zones full, called after direct reclaim makes progress so that 1822 * clear all zones full, called after direct reclaim makes progress so that
1820 * a zone that was recently full is not skipped over for up to a second 1823 * a zone that was recently full is not skipped over for up to a second
1821 */ 1824 */
1822 static void zlc_clear_zones_full(struct zonelist *zonelist) 1825 static void zlc_clear_zones_full(struct zonelist *zonelist)
1823 { 1826 {
1824 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1825 1828
1826 zlc = zonelist->zlcache_ptr; 1829 zlc = zonelist->zlcache_ptr;
1827 if (!zlc) 1830 if (!zlc)
1828 return; 1831 return;
1829 1832
1830 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1831 } 1834 }
1832 1835
1833 static bool zone_local(struct zone *local_zone, struct zone *zone) 1836 static bool zone_local(struct zone *local_zone, struct zone *zone)
1834 { 1837 {
1835 return local_zone->node == zone->node; 1838 return local_zone->node == zone->node;
1836 } 1839 }
1837 1840
1838 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1839 { 1842 {
1840 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1841 } 1844 }
1842 1845
1843 static void __paginginit init_zone_allows_reclaim(int nid) 1846 static void __paginginit init_zone_allows_reclaim(int nid)
1844 { 1847 {
1845 int i; 1848 int i;
1846 1849
1847 for_each_node_state(i, N_MEMORY) 1850 for_each_node_state(i, N_MEMORY)
1848 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1849 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1852 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1850 else 1853 else
1851 zone_reclaim_mode = 1; 1854 zone_reclaim_mode = 1;
1852 } 1855 }
1853 1856
1854 #else /* CONFIG_NUMA */ 1857 #else /* CONFIG_NUMA */
1855 1858
1856 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1857 { 1860 {
1858 return NULL; 1861 return NULL;
1859 } 1862 }
1860 1863
1861 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1862 nodemask_t *allowednodes) 1865 nodemask_t *allowednodes)
1863 { 1866 {
1864 return 1; 1867 return 1;
1865 } 1868 }
1866 1869
1867 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1868 { 1871 {
1869 } 1872 }
1870 1873
1871 static void zlc_clear_zones_full(struct zonelist *zonelist) 1874 static void zlc_clear_zones_full(struct zonelist *zonelist)
1872 { 1875 {
1873 } 1876 }
1874 1877
1875 static bool zone_local(struct zone *local_zone, struct zone *zone) 1878 static bool zone_local(struct zone *local_zone, struct zone *zone)
1876 { 1879 {
1877 return true; 1880 return true;
1878 } 1881 }
1879 1882
1880 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1881 { 1884 {
1882 return true; 1885 return true;
1883 } 1886 }
1884 1887
1885 static inline void init_zone_allows_reclaim(int nid) 1888 static inline void init_zone_allows_reclaim(int nid)
1886 { 1889 {
1887 } 1890 }
1888 #endif /* CONFIG_NUMA */ 1891 #endif /* CONFIG_NUMA */
1889 1892
1890 /* 1893 /*
1891 * get_page_from_freelist goes through the zonelist trying to allocate 1894 * get_page_from_freelist goes through the zonelist trying to allocate
1892 * a page. 1895 * a page.
1893 */ 1896 */
1894 static struct page * 1897 static struct page *
1895 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1896 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1897 struct zone *preferred_zone, int migratetype) 1900 struct zone *preferred_zone, int migratetype)
1898 { 1901 {
1899 struct zoneref *z; 1902 struct zoneref *z;
1900 struct page *page = NULL; 1903 struct page *page = NULL;
1901 int classzone_idx; 1904 int classzone_idx;
1902 struct zone *zone; 1905 struct zone *zone;
1903 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1906 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1904 int zlc_active = 0; /* set if using zonelist_cache */ 1907 int zlc_active = 0; /* set if using zonelist_cache */
1905 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1908 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1906 1909
1907 classzone_idx = zone_idx(preferred_zone); 1910 classzone_idx = zone_idx(preferred_zone);
1908 zonelist_scan: 1911 zonelist_scan:
1909 /* 1912 /*
1910 * Scan zonelist, looking for a zone with enough free. 1913 * Scan zonelist, looking for a zone with enough free.
1911 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1914 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1912 */ 1915 */
1913 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1916 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1914 high_zoneidx, nodemask) { 1917 high_zoneidx, nodemask) {
1915 unsigned long mark; 1918 unsigned long mark;
1916 1919
1917 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1918 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1921 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1919 continue; 1922 continue;
1920 if ((alloc_flags & ALLOC_CPUSET) && 1923 if ((alloc_flags & ALLOC_CPUSET) &&
1921 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1924 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1922 continue; 1925 continue;
1923 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1926 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1924 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1927 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1925 goto try_this_zone; 1928 goto try_this_zone;
1926 /* 1929 /*
1927 * Distribute pages in proportion to the individual 1930 * Distribute pages in proportion to the individual
1928 * zone size to ensure fair page aging. The zone a 1931 * zone size to ensure fair page aging. The zone a
1929 * page was allocated in should have no effect on the 1932 * page was allocated in should have no effect on the
1930 * time the page has in memory before being reclaimed. 1933 * time the page has in memory before being reclaimed.
1931 */ 1934 */
1932 if (alloc_flags & ALLOC_FAIR) { 1935 if (alloc_flags & ALLOC_FAIR) {
1933 if (!zone_local(preferred_zone, zone)) 1936 if (!zone_local(preferred_zone, zone))
1934 continue; 1937 continue;
1935 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1938 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1936 continue; 1939 continue;
1937 } 1940 }
1938 /* 1941 /*
1939 * When allocating a page cache page for writing, we 1942 * When allocating a page cache page for writing, we
1940 * want to get it from a zone that is within its dirty 1943 * want to get it from a zone that is within its dirty
1941 * limit, such that no single zone holds more than its 1944 * limit, such that no single zone holds more than its
1942 * proportional share of globally allowed dirty pages. 1945 * proportional share of globally allowed dirty pages.
1943 * The dirty limits take into account the zone's 1946 * The dirty limits take into account the zone's
1944 * lowmem reserves and high watermark so that kswapd 1947 * lowmem reserves and high watermark so that kswapd
1945 * should be able to balance it without having to 1948 * should be able to balance it without having to
1946 * write pages from its LRU list. 1949 * write pages from its LRU list.
1947 * 1950 *
1948 * This may look like it could increase pressure on 1951 * This may look like it could increase pressure on
1949 * lower zones by failing allocations in higher zones 1952 * lower zones by failing allocations in higher zones
1950 * before they are full. But the pages that do spill 1953 * before they are full. But the pages that do spill
1951 * over are limited as the lower zones are protected 1954 * over are limited as the lower zones are protected
1952 * by this very same mechanism. It should not become 1955 * by this very same mechanism. It should not become
1953 * a practical burden to them. 1956 * a practical burden to them.
1954 * 1957 *
1955 * XXX: For now, allow allocations to potentially 1958 * XXX: For now, allow allocations to potentially
1956 * exceed the per-zone dirty limit in the slowpath 1959 * exceed the per-zone dirty limit in the slowpath
1957 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1960 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1958 * which is important when on a NUMA setup the allowed 1961 * which is important when on a NUMA setup the allowed
1959 * zones are together not big enough to reach the 1962 * zones are together not big enough to reach the
1960 * global limit. The proper fix for these situations 1963 * global limit. The proper fix for these situations
1961 * will require awareness of zones in the 1964 * will require awareness of zones in the
1962 * dirty-throttling and the flusher threads. 1965 * dirty-throttling and the flusher threads.
1963 */ 1966 */
1964 if ((alloc_flags & ALLOC_WMARK_LOW) && 1967 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1965 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1968 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1966 goto this_zone_full; 1969 goto this_zone_full;
1967 1970
1968 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1971 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1969 if (!zone_watermark_ok(zone, order, mark, 1972 if (!zone_watermark_ok(zone, order, mark,
1970 classzone_idx, alloc_flags)) { 1973 classzone_idx, alloc_flags)) {
1971 int ret; 1974 int ret;
1972 1975
1973 if (IS_ENABLED(CONFIG_NUMA) && 1976 if (IS_ENABLED(CONFIG_NUMA) &&
1974 !did_zlc_setup && nr_online_nodes > 1) { 1977 !did_zlc_setup && nr_online_nodes > 1) {
1975 /* 1978 /*
1976 * we do zlc_setup if there are multiple nodes 1979 * we do zlc_setup if there are multiple nodes
1977 * and before considering the first zone allowed 1980 * and before considering the first zone allowed
1978 * by the cpuset. 1981 * by the cpuset.
1979 */ 1982 */
1980 allowednodes = zlc_setup(zonelist, alloc_flags); 1983 allowednodes = zlc_setup(zonelist, alloc_flags);
1981 zlc_active = 1; 1984 zlc_active = 1;
1982 did_zlc_setup = 1; 1985 did_zlc_setup = 1;
1983 } 1986 }
1984 1987
1985 if (zone_reclaim_mode == 0 || 1988 if (zone_reclaim_mode == 0 ||
1986 !zone_allows_reclaim(preferred_zone, zone)) 1989 !zone_allows_reclaim(preferred_zone, zone))
1987 goto this_zone_full; 1990 goto this_zone_full;
1988 1991
1989 /* 1992 /*
1990 * As we may have just activated ZLC, check if the first 1993 * As we may have just activated ZLC, check if the first
1991 * eligible zone has failed zone_reclaim recently. 1994 * eligible zone has failed zone_reclaim recently.
1992 */ 1995 */
1993 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1996 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1994 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1997 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1995 continue; 1998 continue;
1996 1999
1997 ret = zone_reclaim(zone, gfp_mask, order); 2000 ret = zone_reclaim(zone, gfp_mask, order);
1998 switch (ret) { 2001 switch (ret) {
1999 case ZONE_RECLAIM_NOSCAN: 2002 case ZONE_RECLAIM_NOSCAN:
2000 /* did not scan */ 2003 /* did not scan */
2001 continue; 2004 continue;
2002 case ZONE_RECLAIM_FULL: 2005 case ZONE_RECLAIM_FULL:
2003 /* scanned but unreclaimable */ 2006 /* scanned but unreclaimable */
2004 continue; 2007 continue;
2005 default: 2008 default:
2006 /* did we reclaim enough */ 2009 /* did we reclaim enough */
2007 if (zone_watermark_ok(zone, order, mark, 2010 if (zone_watermark_ok(zone, order, mark,
2008 classzone_idx, alloc_flags)) 2011 classzone_idx, alloc_flags))
2009 goto try_this_zone; 2012 goto try_this_zone;
2010 2013
2011 /* 2014 /*
2012 * Failed to reclaim enough to meet watermark. 2015 * Failed to reclaim enough to meet watermark.
2013 * Only mark the zone full if checking the min 2016 * Only mark the zone full if checking the min
2014 * watermark or if we failed to reclaim just 2017 * watermark or if we failed to reclaim just
2015 * 1<<order pages or else the page allocator 2018 * 1<<order pages or else the page allocator
2016 * fastpath will prematurely mark zones full 2019 * fastpath will prematurely mark zones full
2017 * when the watermark is between the low and 2020 * when the watermark is between the low and
2018 * min watermarks. 2021 * min watermarks.
2019 */ 2022 */
2020 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2023 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2021 ret == ZONE_RECLAIM_SOME) 2024 ret == ZONE_RECLAIM_SOME)
2022 goto this_zone_full; 2025 goto this_zone_full;
2023 2026
2024 continue; 2027 continue;
2025 } 2028 }
2026 } 2029 }
2027 2030
2028 try_this_zone: 2031 try_this_zone:
2029 page = buffered_rmqueue(preferred_zone, zone, order, 2032 page = buffered_rmqueue(preferred_zone, zone, order,
2030 gfp_mask, migratetype); 2033 gfp_mask, migratetype);
2031 if (page) 2034 if (page)
2032 break; 2035 break;
2033 this_zone_full: 2036 this_zone_full:
2034 if (IS_ENABLED(CONFIG_NUMA)) 2037 if (IS_ENABLED(CONFIG_NUMA))
2035 zlc_mark_zone_full(zonelist, z); 2038 zlc_mark_zone_full(zonelist, z);
2036 } 2039 }
2037 2040
2038 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2041 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2039 /* Disable zlc cache for second zonelist scan */ 2042 /* Disable zlc cache for second zonelist scan */
2040 zlc_active = 0; 2043 zlc_active = 0;
2041 goto zonelist_scan; 2044 goto zonelist_scan;
2042 } 2045 }
2043 2046
2044 if (page) 2047 if (page)
2045 /* 2048 /*
2046 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2049 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2047 * necessary to allocate the page. The expectation is 2050 * necessary to allocate the page. The expectation is
2048 * that the caller is taking steps that will free more 2051 * that the caller is taking steps that will free more
2049 * memory. The caller should avoid the page being used 2052 * memory. The caller should avoid the page being used
2050 * for !PFMEMALLOC purposes. 2053 * for !PFMEMALLOC purposes.
2051 */ 2054 */
2052 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2055 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2053 2056
2054 return page; 2057 return page;
2055 } 2058 }
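
Within get_page_from_freelist(), the ALLOC_FAIR branch only considers local zones whose NR_ALLOC_BATCH credit is still positive, which is what spreads allocations across local zones before the fairness pass is dropped. The userspace sketch below shows that selection rule under stated assumptions; toy_zone, pick_zone_fair and the explicit batch decrement are illustrative only (the kernel charges the batch elsewhere in the allocation path).

/*
 * Userspace sketch (not kernel code) of the ALLOC_FAIR pass: while the
 * fair pass is active, only local zones with a positive batch credit are
 * eligible, so allocations are spread across local zones until their
 * credits run out and the caller retries without the fairness constraint.
 */
#include <stdio.h>

struct toy_zone {
	const char *name;
	int node;
	long alloc_batch;   /* analogue of NR_ALLOC_BATCH */
};

static struct toy_zone *pick_zone_fair(struct toy_zone *zones, int nr,
				       int preferred_node)
{
	for (int i = 0; i < nr; i++) {
		if (zones[i].node != preferred_node)   /* zone_local() check */
			continue;
		if (zones[i].alloc_batch <= 0)         /* batch exhausted */
			continue;
		zones[i].alloc_batch--;                /* charge this allocation */
		return &zones[i];
	}
	return NULL;    /* caller would retry without ALLOC_FAIR */
}

int main(void)
{
	struct toy_zone zones[] = {
		{ "Normal(node0)", 0, 2 },
		{ "DMA32(node0)",  0, 1 },
		{ "Normal(node1)", 1, 4 },
	};

	for (int i = 0; i < 5; i++) {
		struct toy_zone *z = pick_zone_fair(zones, 3, 0);
		printf("alloc %d -> %s\n", i, z ? z->name : "fair pass failed");
	}
	return 0;
}
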
2056 2059
2057 /* 2060 /*
2058 * Large machines with many possible nodes should not always dump per-node 2061 * Large machines with many possible nodes should not always dump per-node
2059 * meminfo in irq context. 2062 * meminfo in irq context.
2060 */ 2063 */
2061 static inline bool should_suppress_show_mem(void) 2064 static inline bool should_suppress_show_mem(void)
2062 { 2065 {
2063 bool ret = false; 2066 bool ret = false;
2064 2067
2065 #if NODES_SHIFT > 8 2068 #if NODES_SHIFT > 8
2066 ret = in_interrupt(); 2069 ret = in_interrupt();
2067 #endif 2070 #endif
2068 return ret; 2071 return ret;
2069 } 2072 }
2070 2073
2071 static DEFINE_RATELIMIT_STATE(nopage_rs, 2074 static DEFINE_RATELIMIT_STATE(nopage_rs,
2072 DEFAULT_RATELIMIT_INTERVAL, 2075 DEFAULT_RATELIMIT_INTERVAL,
2073 DEFAULT_RATELIMIT_BURST); 2076 DEFAULT_RATELIMIT_BURST);
2074 2077
2075 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2078 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2076 { 2079 {
2077 unsigned int filter = SHOW_MEM_FILTER_NODES; 2080 unsigned int filter = SHOW_MEM_FILTER_NODES;
2078 2081
2079 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2082 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2080 debug_guardpage_minorder() > 0) 2083 debug_guardpage_minorder() > 0)
2081 return; 2084 return;
2082 2085
2083 /* 2086 /*
2084 * Walking all memory to count page types is very expensive and should 2087 * Walking all memory to count page types is very expensive and should
2085 * be inhibited in non-blockable contexts. 2088 * be inhibited in non-blockable contexts.
2086 */ 2089 */
2087 if (!(gfp_mask & __GFP_WAIT)) 2090 if (!(gfp_mask & __GFP_WAIT))
2088 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2091 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2089 2092
2090 /* 2093 /*
2091 * This documents exceptions given to allocations in certain 2094 * This documents exceptions given to allocations in certain
2092 * contexts that are allowed to allocate outside current's set 2095 * contexts that are allowed to allocate outside current's set
2093 * of allowed nodes. 2096 * of allowed nodes.
2094 */ 2097 */
2095 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2098 if (!(gfp_mask & __GFP_NOMEMALLOC))
2096 if (test_thread_flag(TIF_MEMDIE) || 2099 if (test_thread_flag(TIF_MEMDIE) ||
2097 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2100 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2098 filter &= ~SHOW_MEM_FILTER_NODES; 2101 filter &= ~SHOW_MEM_FILTER_NODES;
2099 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2102 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2100 filter &= ~SHOW_MEM_FILTER_NODES; 2103 filter &= ~SHOW_MEM_FILTER_NODES;
2101 2104
2102 if (fmt) { 2105 if (fmt) {
2103 struct va_format vaf; 2106 struct va_format vaf;
2104 va_list args; 2107 va_list args;
2105 2108
2106 va_start(args, fmt); 2109 va_start(args, fmt);
2107 2110
2108 vaf.fmt = fmt; 2111 vaf.fmt = fmt;
2109 vaf.va = &args; 2112 vaf.va = &args;
2110 2113
2111 pr_warn("%pV", &vaf); 2114 pr_warn("%pV", &vaf);
2112 2115
2113 va_end(args); 2116 va_end(args);
2114 } 2117 }
2115 2118
2116 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2119 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2117 current->comm, order, gfp_mask); 2120 current->comm, order, gfp_mask);
2118 2121
2119 dump_stack(); 2122 dump_stack();
2120 if (!should_suppress_show_mem()) 2123 if (!should_suppress_show_mem())
2121 show_mem(filter); 2124 show_mem(filter);
2122 } 2125 }
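
warn_alloc_failed() drops messages that fail the __ratelimit() check, so a flood of failing allocations cannot spam the log. A minimal userspace sketch of such an interval/burst limiter follows; struct toy_ratelimit and its policy are assumptions for illustration, not the kernel's ratelimit implementation.

/*
 * Userspace sketch (not kernel code) of the ratelimit idea used by
 * warn_alloc_failed(): allow at most `burst` messages per `interval`
 * seconds and silently drop the rest.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct toy_ratelimit {
	time_t window_start;
	int interval;	/* seconds per window */
	int burst;	/* messages allowed per window */
	int printed;
};

static bool toy_ratelimit_ok(struct toy_ratelimit *rs)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= rs->interval) {
		rs->window_start = now;	/* start a new window */
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return false;		/* suppress this message */
	rs->printed++;
	return true;
}

int main(void)
{
	struct toy_ratelimit rs = { .window_start = 0, .interval = 5, .burst = 2 };

	for (int i = 0; i < 5; i++)
		if (toy_ratelimit_ok(&rs))
			printf("warning %d printed\n", i);
		else
			printf("warning %d suppressed\n", i);
	return 0;
}
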
2123 2126
2124 static inline int 2127 static inline int
2125 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2128 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2126 unsigned long did_some_progress, 2129 unsigned long did_some_progress,
2127 unsigned long pages_reclaimed) 2130 unsigned long pages_reclaimed)
2128 { 2131 {
2129 /* Do not loop if specifically requested */ 2132 /* Do not loop if specifically requested */
2130 if (gfp_mask & __GFP_NORETRY) 2133 if (gfp_mask & __GFP_NORETRY)
2131 return 0; 2134 return 0;
2132 2135
2133 /* Always retry if specifically requested */ 2136 /* Always retry if specifically requested */
2134 if (gfp_mask & __GFP_NOFAIL) 2137 if (gfp_mask & __GFP_NOFAIL)
2135 return 1; 2138 return 1;
2136 2139
2137 /* 2140 /*
2138 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2141 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2139 * making forward progress without invoking OOM. Suspend also disables 2142 * making forward progress without invoking OOM. Suspend also disables
2140 * storage devices so kswapd will not help. Bail if we are suspending. 2143 * storage devices so kswapd will not help. Bail if we are suspending.
2141 */ 2144 */
2142 if (!did_some_progress && pm_suspended_storage()) 2145 if (!did_some_progress && pm_suspended_storage())
2143 return 0; 2146 return 0;
2144 2147
2145 /* 2148 /*
2146 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2149 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2147 * means __GFP_NOFAIL, but that may not be true in other 2150 * means __GFP_NOFAIL, but that may not be true in other
2148 * implementations. 2151 * implementations.
2149 */ 2152 */
2150 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2153 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2151 return 1; 2154 return 1;
2152 2155
2153 /* 2156 /*
2154 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2157 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2155 * specified, then we retry until we no longer reclaim any pages 2158 * specified, then we retry until we no longer reclaim any pages
2156 * (above), or we've reclaimed an order of pages at least as 2159 * (above), or we've reclaimed an order of pages at least as
2157 * large as the allocation's order. In both cases, if the 2160 * large as the allocation's order. In both cases, if the
2158 * allocation still fails, we stop retrying. 2161 * allocation still fails, we stop retrying.
2159 */ 2162 */
2160 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2163 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2161 return 1; 2164 return 1;
2162 2165
2163 return 0; 2166 return 0;
2164 } 2167 }
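
The heuristic above means that orders up to PAGE_ALLOC_COSTLY_ORDER effectively behave as __GFP_NOFAIL, while costlier orders only retry with __GFP_REPEAT and only until roughly 1 << order pages have been reclaimed. The userspace sketch below mirrors just that decision (the pm_suspended_storage() case is omitted); the TOY_* flag values are illustrative, not the kernel's.

/*
 * Userspace sketch (not kernel code) of the should_alloc_retry() heuristic.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_COSTLY_ORDER 3
#define TOY_GFP_NORETRY  0x1u
#define TOY_GFP_NOFAIL   0x2u
#define TOY_GFP_REPEAT   0x4u

static bool toy_should_retry(unsigned gfp, unsigned order,
			     unsigned long pages_reclaimed)
{
	if (gfp & TOY_GFP_NORETRY)
		return false;
	if (gfp & TOY_GFP_NOFAIL)
		return true;
	if (order <= TOY_COSTLY_ORDER)		/* implicit "no fail" range */
		return true;
	if ((gfp & TOY_GFP_REPEAT) && pages_reclaimed < (1ul << order))
		return true;
	return false;
}

int main(void)
{
	printf("order 2, plain:          %d\n", toy_should_retry(0, 2, 1000));
	printf("order 5, plain:          %d\n", toy_should_retry(0, 5, 0));
	printf("order 5, REPEAT, 10 pg:  %d\n", toy_should_retry(TOY_GFP_REPEAT, 5, 10));
	printf("order 5, REPEAT, 64 pg:  %d\n", toy_should_retry(TOY_GFP_REPEAT, 5, 64));
	return 0;
}
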
2165 2168
2166 static inline struct page * 2169 static inline struct page *
2167 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2170 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2168 struct zonelist *zonelist, enum zone_type high_zoneidx, 2171 struct zonelist *zonelist, enum zone_type high_zoneidx,
2169 nodemask_t *nodemask, struct zone *preferred_zone, 2172 nodemask_t *nodemask, struct zone *preferred_zone,
2170 int migratetype) 2173 int migratetype)
2171 { 2174 {
2172 struct page *page; 2175 struct page *page;
2173 2176
2174 /* Acquire the OOM killer lock for the zones in zonelist */ 2177 /* Acquire the OOM killer lock for the zones in zonelist */
2175 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2178 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2176 schedule_timeout_uninterruptible(1); 2179 schedule_timeout_uninterruptible(1);
2177 return NULL; 2180 return NULL;
2178 } 2181 }
2179 2182
2180 /* 2183 /*
2181 * Go through the zonelist yet one more time, keep very high watermark 2184 * Go through the zonelist yet one more time, keep very high watermark
2182 * here, this is only to catch a parallel oom killing, we must fail if 2185 * here, this is only to catch a parallel oom killing, we must fail if
2183 * we're still under heavy pressure. 2186 * we're still under heavy pressure.
2184 */ 2187 */
2185 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2188 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2186 order, zonelist, high_zoneidx, 2189 order, zonelist, high_zoneidx,
2187 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2190 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2188 preferred_zone, migratetype); 2191 preferred_zone, migratetype);
2189 if (page) 2192 if (page)
2190 goto out; 2193 goto out;
2191 2194
2192 if (!(gfp_mask & __GFP_NOFAIL)) { 2195 if (!(gfp_mask & __GFP_NOFAIL)) {
2193 /* The OOM killer will not help higher order allocs */ 2196 /* The OOM killer will not help higher order allocs */
2194 if (order > PAGE_ALLOC_COSTLY_ORDER) 2197 if (order > PAGE_ALLOC_COSTLY_ORDER)
2195 goto out; 2198 goto out;
2196 /* The OOM killer does not needlessly kill tasks for lowmem */ 2199 /* The OOM killer does not needlessly kill tasks for lowmem */
2197 if (high_zoneidx < ZONE_NORMAL) 2200 if (high_zoneidx < ZONE_NORMAL)
2198 goto out; 2201 goto out;
2199 /* 2202 /*
2200 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2203 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2201 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2204 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2202 * The caller should handle page allocation failure by itself if 2205 * The caller should handle page allocation failure by itself if
2203 * it specifies __GFP_THISNODE. 2206 * it specifies __GFP_THISNODE.
2204 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2207 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2205 */ 2208 */
2206 if (gfp_mask & __GFP_THISNODE) 2209 if (gfp_mask & __GFP_THISNODE)
2207 goto out; 2210 goto out;
2208 } 2211 }
2209 /* Exhausted what can be done so it's blamo time */ 2212 /* Exhausted what can be done so it's blamo time */
2210 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2213 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2211 2214
2212 out: 2215 out:
2213 clear_zonelist_oom(zonelist, gfp_mask); 2216 clear_zonelist_oom(zonelist, gfp_mask);
2214 return page; 2217 return page;
2215 } 2218 }
2216 2219
2217 #ifdef CONFIG_COMPACTION 2220 #ifdef CONFIG_COMPACTION
2218 /* Try memory compaction for high-order allocations before reclaim */ 2221 /* Try memory compaction for high-order allocations before reclaim */
2219 static struct page * 2222 static struct page *
2220 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2223 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2221 struct zonelist *zonelist, enum zone_type high_zoneidx, 2224 struct zonelist *zonelist, enum zone_type high_zoneidx,
2222 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2225 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2223 int migratetype, enum migrate_mode mode, 2226 int migratetype, enum migrate_mode mode,
2224 bool *contended_compaction, bool *deferred_compaction, 2227 bool *contended_compaction, bool *deferred_compaction,
2225 unsigned long *did_some_progress) 2228 unsigned long *did_some_progress)
2226 { 2229 {
2227 if (!order) 2230 if (!order)
2228 return NULL; 2231 return NULL;
2229 2232
2230 if (compaction_deferred(preferred_zone, order)) { 2233 if (compaction_deferred(preferred_zone, order)) {
2231 *deferred_compaction = true; 2234 *deferred_compaction = true;
2232 return NULL; 2235 return NULL;
2233 } 2236 }
2234 2237
2235 current->flags |= PF_MEMALLOC; 2238 current->flags |= PF_MEMALLOC;
2236 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2239 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2237 nodemask, mode, 2240 nodemask, mode,
2238 contended_compaction); 2241 contended_compaction);
2239 current->flags &= ~PF_MEMALLOC; 2242 current->flags &= ~PF_MEMALLOC;
2240 2243
2241 if (*did_some_progress != COMPACT_SKIPPED) { 2244 if (*did_some_progress != COMPACT_SKIPPED) {
2242 struct page *page; 2245 struct page *page;
2243 2246
2244 /* Page migration frees to the PCP lists but we want merging */ 2247 /* Page migration frees to the PCP lists but we want merging */
2245 drain_pages(get_cpu()); 2248 drain_pages(get_cpu());
2246 put_cpu(); 2249 put_cpu();
2247 2250
2248 page = get_page_from_freelist(gfp_mask, nodemask, 2251 page = get_page_from_freelist(gfp_mask, nodemask,
2249 order, zonelist, high_zoneidx, 2252 order, zonelist, high_zoneidx,
2250 alloc_flags & ~ALLOC_NO_WATERMARKS, 2253 alloc_flags & ~ALLOC_NO_WATERMARKS,
2251 preferred_zone, migratetype); 2254 preferred_zone, migratetype);
2252 if (page) { 2255 if (page) {
2253 preferred_zone->compact_blockskip_flush = false; 2256 preferred_zone->compact_blockskip_flush = false;
2254 compaction_defer_reset(preferred_zone, order, true); 2257 compaction_defer_reset(preferred_zone, order, true);
2255 count_vm_event(COMPACTSUCCESS); 2258 count_vm_event(COMPACTSUCCESS);
2256 return page; 2259 return page;
2257 } 2260 }
2258 2261
2259 /* 2262 /*
2260 * It's bad if a compaction run occurs and fails. 2263 * It's bad if a compaction run occurs and fails.
2261 * The most likely reason is that pages exist, 2264 * The most likely reason is that pages exist,
2262 * but not enough to satisfy watermarks. 2265 * but not enough to satisfy watermarks.
2263 */ 2266 */
2264 count_vm_event(COMPACTFAIL); 2267 count_vm_event(COMPACTFAIL);
2265 2268
2266 /* 2269 /*
2267 * As async compaction considers a subset of pageblocks, only 2270 * As async compaction considers a subset of pageblocks, only
2268 * defer if the failure was a sync compaction failure. 2271 * defer if the failure was a sync compaction failure.
2269 */ 2272 */
2270 if (mode != MIGRATE_ASYNC) 2273 if (mode != MIGRATE_ASYNC)
2271 defer_compaction(preferred_zone, order); 2274 defer_compaction(preferred_zone, order);
2272 2275
2273 cond_resched(); 2276 cond_resched();
2274 } 2277 }
2275 2278
2276 return NULL; 2279 return NULL;
2277 } 2280 }
2278 #else 2281 #else
2279 static inline struct page * 2282 static inline struct page *
2280 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2283 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2284 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2285 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2283 int migratetype, enum migrate_mode mode, bool *contended_compaction, 2286 int migratetype, enum migrate_mode mode, bool *contended_compaction,
2284 bool *deferred_compaction, unsigned long *did_some_progress) 2287 bool *deferred_compaction, unsigned long *did_some_progress)
2285 { 2288 {
2286 return NULL; 2289 return NULL;
2287 } 2290 }
2288 #endif /* CONFIG_COMPACTION */ 2291 #endif /* CONFIG_COMPACTION */
2289 2292
2290 /* Perform direct synchronous page reclaim */ 2293 /* Perform direct synchronous page reclaim */
2291 static int 2294 static int
2292 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2295 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2293 nodemask_t *nodemask) 2296 nodemask_t *nodemask)
2294 { 2297 {
2295 struct reclaim_state reclaim_state; 2298 struct reclaim_state reclaim_state;
2296 int progress; 2299 int progress;
2297 2300
2298 cond_resched(); 2301 cond_resched();
2299 2302
2300 /* We now go into synchronous reclaim */ 2303 /* We now go into synchronous reclaim */
2301 cpuset_memory_pressure_bump(); 2304 cpuset_memory_pressure_bump();
2302 current->flags |= PF_MEMALLOC; 2305 current->flags |= PF_MEMALLOC;
2303 lockdep_set_current_reclaim_state(gfp_mask); 2306 lockdep_set_current_reclaim_state(gfp_mask);
2304 reclaim_state.reclaimed_slab = 0; 2307 reclaim_state.reclaimed_slab = 0;
2305 current->reclaim_state = &reclaim_state; 2308 current->reclaim_state = &reclaim_state;
2306 2309
2307 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2310 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2308 2311
2309 current->reclaim_state = NULL; 2312 current->reclaim_state = NULL;
2310 lockdep_clear_current_reclaim_state(); 2313 lockdep_clear_current_reclaim_state();
2311 current->flags &= ~PF_MEMALLOC; 2314 current->flags &= ~PF_MEMALLOC;
2312 2315
2313 cond_resched(); 2316 cond_resched();
2314 2317
2315 return progress; 2318 return progress;
2316 } 2319 }
2317 2320
2318 /* The really slow allocator path where we enter direct reclaim */ 2321 /* The really slow allocator path where we enter direct reclaim */
2319 static inline struct page * 2322 static inline struct page *
2320 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2323 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2321 struct zonelist *zonelist, enum zone_type high_zoneidx, 2324 struct zonelist *zonelist, enum zone_type high_zoneidx,
2322 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2325 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2323 int migratetype, unsigned long *did_some_progress) 2326 int migratetype, unsigned long *did_some_progress)
2324 { 2327 {
2325 struct page *page = NULL; 2328 struct page *page = NULL;
2326 bool drained = false; 2329 bool drained = false;
2327 2330
2328 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2331 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2329 nodemask); 2332 nodemask);
2330 if (unlikely(!(*did_some_progress))) 2333 if (unlikely(!(*did_some_progress)))
2331 return NULL; 2334 return NULL;
2332 2335
2333 /* After successful reclaim, reconsider all zones for allocation */ 2336 /* After successful reclaim, reconsider all zones for allocation */
2334 if (IS_ENABLED(CONFIG_NUMA)) 2337 if (IS_ENABLED(CONFIG_NUMA))
2335 zlc_clear_zones_full(zonelist); 2338 zlc_clear_zones_full(zonelist);
2336 2339
2337 retry: 2340 retry:
2338 page = get_page_from_freelist(gfp_mask, nodemask, order, 2341 page = get_page_from_freelist(gfp_mask, nodemask, order,
2339 zonelist, high_zoneidx, 2342 zonelist, high_zoneidx,
2340 alloc_flags & ~ALLOC_NO_WATERMARKS, 2343 alloc_flags & ~ALLOC_NO_WATERMARKS,
2341 preferred_zone, migratetype); 2344 preferred_zone, migratetype);
2342 2345
2343 /* 2346 /*
2344 * If an allocation failed after direct reclaim, it could be because 2347 * If an allocation failed after direct reclaim, it could be because
2345 * pages are pinned on the per-cpu lists. Drain them and try again 2348 * pages are pinned on the per-cpu lists. Drain them and try again
2346 */ 2349 */
2347 if (!page && !drained) { 2350 if (!page && !drained) {
2348 drain_all_pages(); 2351 drain_all_pages();
2349 drained = true; 2352 drained = true;
2350 goto retry; 2353 goto retry;
2351 } 2354 }
2352 2355
2353 return page; 2356 return page;
2354 } 2357 }
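
The function above retries a failed post-reclaim allocation exactly once after draining the per-cpu lists, since freshly reclaimed pages may still be parked there. A small userspace sketch of that drain-once-then-retry pattern follows; pcp_cached, drain_all_pages_toy() and the counters are illustrative assumptions.

/*
 * Userspace sketch (not kernel code) of the drain-and-retry pattern in
 * __alloc_pages_direct_reclaim(): a failed allocation is retried exactly
 * once after flushing the per-cpu caches back to the free lists.
 */
#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 3;	/* pretend pages pinned on per-cpu lists */
static int free_pages = 0;

static bool try_alloc(void)
{
	if (free_pages > 0) {
		free_pages--;
		return true;
	}
	return false;
}

static void drain_all_pages_toy(void)
{
	free_pages += pcp_cached;	/* give pcp pages back to the free lists */
	pcp_cached = 0;
}

int main(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = try_alloc();
	if (!ok && !drained) {
		drain_all_pages_toy();
		drained = true;
		goto retry;
	}
	printf("allocation %s (drained: %d)\n", ok ? "succeeded" : "failed", drained);
	return 0;
}
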
2355 2358
2356 /* 2359 /*
2357 * This is called in the allocator slow-path if the allocation request is of 2360 * This is called in the allocator slow-path if the allocation request is of
2358 * sufficient urgency to ignore watermarks and take other desperate measures 2361 * sufficient urgency to ignore watermarks and take other desperate measures
2359 */ 2362 */
2360 static inline struct page * 2363 static inline struct page *
2361 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2364 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2362 struct zonelist *zonelist, enum zone_type high_zoneidx, 2365 struct zonelist *zonelist, enum zone_type high_zoneidx,
2363 nodemask_t *nodemask, struct zone *preferred_zone, 2366 nodemask_t *nodemask, struct zone *preferred_zone,
2364 int migratetype) 2367 int migratetype)
2365 { 2368 {
2366 struct page *page; 2369 struct page *page;
2367 2370
2368 do { 2371 do {
2369 page = get_page_from_freelist(gfp_mask, nodemask, order, 2372 page = get_page_from_freelist(gfp_mask, nodemask, order,
2370 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2373 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2371 preferred_zone, migratetype); 2374 preferred_zone, migratetype);
2372 2375
2373 if (!page && gfp_mask & __GFP_NOFAIL) 2376 if (!page && gfp_mask & __GFP_NOFAIL)
2374 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2377 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2375 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2378 } while (!page && (gfp_mask & __GFP_NOFAIL));
2376 2379
2377 return page; 2380 return page;
2378 } 2381 }
2379 2382
2380 static void reset_alloc_batches(struct zonelist *zonelist, 2383 static void reset_alloc_batches(struct zonelist *zonelist,
2381 enum zone_type high_zoneidx, 2384 enum zone_type high_zoneidx,
2382 struct zone *preferred_zone) 2385 struct zone *preferred_zone)
2383 { 2386 {
2384 struct zoneref *z; 2387 struct zoneref *z;
2385 struct zone *zone; 2388 struct zone *zone;
2386 2389
2387 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2390 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2388 /* 2391 /*
2389 * Only reset the batches of zones that were actually 2392 * Only reset the batches of zones that were actually
2390 * considered in the fairness pass, we don't want to 2393 * considered in the fairness pass, we don't want to
2391 * trash fairness information for zones that are not 2394 * trash fairness information for zones that are not
2392 * actually part of this zonelist's round-robin cycle. 2395 * actually part of this zonelist's round-robin cycle.
2393 */ 2396 */
2394 if (!zone_local(preferred_zone, zone)) 2397 if (!zone_local(preferred_zone, zone))
2395 continue; 2398 continue;
2396 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2399 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2397 high_wmark_pages(zone) - low_wmark_pages(zone) - 2400 high_wmark_pages(zone) - low_wmark_pages(zone) -
2398 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2401 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2399 } 2402 }
2400 } 2403 }
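
reset_alloc_batches() restores each local zone's batch to high_wmark - low_wmark by adding the difference between that target and the current counter value, which works even when the counter has gone negative. A tiny userspace sketch of the arithmetic, with made-up watermark numbers:

/*
 * Userspace sketch (not kernel code) of the reset_alloc_batches() arithmetic:
 * adding (target - current) to the counter sets it back to
 * target = high_wmark - low_wmark without needing a "set" primitive.
 */
#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;
	long batch = -37;		/* overdrawn after the fairness pass */

	long target = high_wmark - low_wmark;
	batch += target - batch;	/* what mod_zone_page_state() applies */

	printf("batch reset to %ld (target %ld)\n", batch, target);
	return 0;
}
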
2401 2404
2402 static void wake_all_kswapds(unsigned int order, 2405 static void wake_all_kswapds(unsigned int order,
2403 struct zonelist *zonelist, 2406 struct zonelist *zonelist,
2404 enum zone_type high_zoneidx, 2407 enum zone_type high_zoneidx,
2405 struct zone *preferred_zone) 2408 struct zone *preferred_zone)
2406 { 2409 {
2407 struct zoneref *z; 2410 struct zoneref *z;
2408 struct zone *zone; 2411 struct zone *zone;
2409 2412
2410 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2413 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2411 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2414 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2412 } 2415 }
2413 2416
2414 static inline int 2417 static inline int
2415 gfp_to_alloc_flags(gfp_t gfp_mask) 2418 gfp_to_alloc_flags(gfp_t gfp_mask)
2416 { 2419 {
2417 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2420 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2418 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2421 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2419 2422
2420 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2423 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2421 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2424 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2422 2425
2423 /* 2426 /*
2424 * The caller may dip into page reserves a bit more if the caller 2427 * The caller may dip into page reserves a bit more if the caller
2425 * cannot run direct reclaim, or if the caller has realtime scheduling 2428 * cannot run direct reclaim, or if the caller has realtime scheduling
2426 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2429 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2427 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2430 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2428 */ 2431 */
2429 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2432 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2430 2433
2431 if (atomic) { 2434 if (atomic) {
2432 /* 2435 /*
2433 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2436 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2434 * if it can't schedule. 2437 * if it can't schedule.
2435 */ 2438 */
2436 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2439 if (!(gfp_mask & __GFP_NOMEMALLOC))
2437 alloc_flags |= ALLOC_HARDER; 2440 alloc_flags |= ALLOC_HARDER;
2438 /* 2441 /*
2439 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2442 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2440 * comment for __cpuset_node_allowed_softwall(). 2443 * comment for __cpuset_node_allowed_softwall().
2441 */ 2444 */
2442 alloc_flags &= ~ALLOC_CPUSET; 2445 alloc_flags &= ~ALLOC_CPUSET;
2443 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2446 } else if (unlikely(rt_task(current)) && !in_interrupt())
2444 alloc_flags |= ALLOC_HARDER; 2447 alloc_flags |= ALLOC_HARDER;
2445 2448
2446 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2449 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2447 if (gfp_mask & __GFP_MEMALLOC) 2450 if (gfp_mask & __GFP_MEMALLOC)
2448 alloc_flags |= ALLOC_NO_WATERMARKS; 2451 alloc_flags |= ALLOC_NO_WATERMARKS;
2449 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2452 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2450 alloc_flags |= ALLOC_NO_WATERMARKS; 2453 alloc_flags |= ALLOC_NO_WATERMARKS;
2451 else if (!in_interrupt() && 2454 else if (!in_interrupt() &&
2452 ((current->flags & PF_MEMALLOC) || 2455 ((current->flags & PF_MEMALLOC) ||
2453 unlikely(test_thread_flag(TIF_MEMDIE)))) 2456 unlikely(test_thread_flag(TIF_MEMDIE))))
2454 alloc_flags |= ALLOC_NO_WATERMARKS; 2457 alloc_flags |= ALLOC_NO_WATERMARKS;
2455 } 2458 }
2456 #ifdef CONFIG_CMA 2459 #ifdef CONFIG_CMA
2457 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2460 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2458 alloc_flags |= ALLOC_CMA; 2461 alloc_flags |= ALLOC_CMA;
2459 #endif 2462 #endif
2460 return alloc_flags; 2463 return alloc_flags;
2461 } 2464 }
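
gfp_to_alloc_flags() relies on __GFP_HIGH and ALLOC_HIGH having the same bit value (enforced by the BUILD_BUG_ON above), so the request flag can be ORed straight into alloc_flags without a branch. The userspace sketch below shows the same trick with illustrative TOY_* constants; it is not the kernel's flag layout.

/*
 * Userspace sketch (not kernel code) of the "same bit value" trick:
 * because the toy GFP_HIGH and ALLOC_HIGH flags share one bit, masking
 * and ORing transfers the flag branchlessly.
 */
#include <assert.h>
#include <stdio.h>

#define TOY_GFP_HIGH   0x20u
#define TOY_ALLOC_HIGH 0x20u	/* deliberately identical, checked below */

static unsigned toy_gfp_to_alloc_flags(unsigned gfp_mask)
{
	unsigned alloc_flags = 0;

	/* Compile-time check in the kernel (BUILD_BUG_ON); runtime assert here. */
	assert(TOY_GFP_HIGH == TOY_ALLOC_HIGH);

	alloc_flags |= (gfp_mask & TOY_GFP_HIGH);	/* branchless transfer */
	return alloc_flags;
}

int main(void)
{
	printf("flags without GFP_HIGH: %#x\n", toy_gfp_to_alloc_flags(0x3));
	printf("flags with GFP_HIGH:    %#x\n", toy_gfp_to_alloc_flags(0x3 | TOY_GFP_HIGH));
	return 0;
}
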
2462 2465
2463 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2466 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2464 { 2467 {
2465 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2468 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2466 } 2469 }
2467 2470
2468 static inline struct page * 2471 static inline struct page *
2469 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2472 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2470 struct zonelist *zonelist, enum zone_type high_zoneidx, 2473 struct zonelist *zonelist, enum zone_type high_zoneidx,
2471 nodemask_t *nodemask, struct zone *preferred_zone, 2474 nodemask_t *nodemask, struct zone *preferred_zone,
2472 int migratetype) 2475 int migratetype)
2473 { 2476 {
2474 const gfp_t wait = gfp_mask & __GFP_WAIT; 2477 const gfp_t wait = gfp_mask & __GFP_WAIT;
2475 struct page *page = NULL; 2478 struct page *page = NULL;
2476 int alloc_flags; 2479 int alloc_flags;
2477 unsigned long pages_reclaimed = 0; 2480 unsigned long pages_reclaimed = 0;
2478 unsigned long did_some_progress; 2481 unsigned long did_some_progress;
2479 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2482 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2480 bool deferred_compaction = false; 2483 bool deferred_compaction = false;
2481 bool contended_compaction = false; 2484 bool contended_compaction = false;
2482 2485
2483 /* 2486 /*
2484 * In the slowpath, we sanity check order to avoid ever trying to 2487 * In the slowpath, we sanity check order to avoid ever trying to
2485 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2488 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2486 * be using allocators in order of preference for an area that is 2489 * be using allocators in order of preference for an area that is
2487 * too large. 2490 * too large.
2488 */ 2491 */
2489 if (order >= MAX_ORDER) { 2492 if (order >= MAX_ORDER) {
2490 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2493 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2491 return NULL; 2494 return NULL;
2492 } 2495 }
2493 2496
2494 /* 2497 /*
2495 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2498 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2496 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2499 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2497 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2500 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2498 * using a larger set of nodes after it has established that the 2501 * using a larger set of nodes after it has established that the
2499 * allowed per node queues are empty and that nodes are 2502 * allowed per node queues are empty and that nodes are
2500 * over allocated. 2503 * over allocated.
2501 */ 2504 */
2502 if (IS_ENABLED(CONFIG_NUMA) && 2505 if (IS_ENABLED(CONFIG_NUMA) &&
2503 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2506 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2504 goto nopage; 2507 goto nopage;
2505 2508
2506 restart: 2509 restart:
2507 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2510 if (!(gfp_mask & __GFP_NO_KSWAPD))
2508 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2511 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2509 2512
2510 /* 2513 /*
2511 * OK, we're below the kswapd watermark and have kicked background 2514 * OK, we're below the kswapd watermark and have kicked background
2512 * reclaim. Now things get more complex, so set up alloc_flags according 2515 * reclaim. Now things get more complex, so set up alloc_flags according
2513 * to how we want to proceed. 2516 * to how we want to proceed.
2514 */ 2517 */
2515 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2518 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2516 2519
2517 /* 2520 /*
2518 * Find the true preferred zone if the allocation is unconstrained by 2521 * Find the true preferred zone if the allocation is unconstrained by
2519 * cpusets. 2522 * cpusets.
2520 */ 2523 */
2521 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2524 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2522 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2525 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2523 &preferred_zone); 2526 &preferred_zone);
2524 2527
2525 rebalance: 2528 rebalance:
2526 /* This is the last chance, in general, before the goto nopage. */ 2529 /* This is the last chance, in general, before the goto nopage. */
2527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2530 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2528 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2531 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2529 preferred_zone, migratetype); 2532 preferred_zone, migratetype);
2530 if (page) 2533 if (page)
2531 goto got_pg; 2534 goto got_pg;
2532 2535
2533 /* Allocate without watermarks if the context allows */ 2536 /* Allocate without watermarks if the context allows */
2534 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2537 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2535 /* 2538 /*
2536 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2539 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2538 * the allocation is high priority and these types of 2541 * the allocation is high priority and these types of
2539 * allocations are system rather than user oriented 2542 * allocations are system rather than user oriented
2539 */ 2542 */
2540 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2543 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2541 2544
2542 page = __alloc_pages_high_priority(gfp_mask, order, 2545 page = __alloc_pages_high_priority(gfp_mask, order,
2543 zonelist, high_zoneidx, nodemask, 2546 zonelist, high_zoneidx, nodemask,
2544 preferred_zone, migratetype); 2547 preferred_zone, migratetype);
2545 if (page) { 2548 if (page) {
2546 goto got_pg; 2549 goto got_pg;
2547 } 2550 }
2548 } 2551 }
2549 2552
2550 /* Atomic allocations - we can't balance anything */ 2553 /* Atomic allocations - we can't balance anything */
2551 if (!wait) 2554 if (!wait)
2552 goto nopage; 2555 goto nopage;
2553 2556
2554 /* Avoid recursion of direct reclaim */ 2557 /* Avoid recursion of direct reclaim */
2555 if (current->flags & PF_MEMALLOC) 2558 if (current->flags & PF_MEMALLOC)
2556 goto nopage; 2559 goto nopage;
2557 2560
2558 /* Avoid allocations with no watermarks from looping endlessly */ 2561 /* Avoid allocations with no watermarks from looping endlessly */
2559 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2562 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2560 goto nopage; 2563 goto nopage;
2561 2564
2562 /* 2565 /*
2563 * Try direct compaction. The first pass is asynchronous. Subsequent 2566 * Try direct compaction. The first pass is asynchronous. Subsequent
2564 * attempts after direct reclaim are synchronous 2567 * attempts after direct reclaim are synchronous
2565 */ 2568 */
2566 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2569 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2567 high_zoneidx, nodemask, alloc_flags, 2570 high_zoneidx, nodemask, alloc_flags,
2568 preferred_zone, migratetype, 2571 preferred_zone, migratetype,
2569 migration_mode, &contended_compaction, 2572 migration_mode, &contended_compaction,
2570 &deferred_compaction, 2573 &deferred_compaction,
2571 &did_some_progress); 2574 &did_some_progress);
2572 if (page) 2575 if (page)
2573 goto got_pg; 2576 goto got_pg;
2574 migration_mode = MIGRATE_SYNC_LIGHT; 2577 migration_mode = MIGRATE_SYNC_LIGHT;
2575 2578
2576 /* 2579 /*
2577 * If compaction is deferred for high-order allocations, it is because 2580 * If compaction is deferred for high-order allocations, it is because
2578 * sync compaction recently failed. If this is the case and the caller 2581 * sync compaction recently failed. If this is the case and the caller
2579 * requested a movable allocation that does not heavily disrupt the 2582 * requested a movable allocation that does not heavily disrupt the
2580 * system then fail the allocation instead of entering direct reclaim. 2583 * system then fail the allocation instead of entering direct reclaim.
2581 */ 2584 */
2582 if ((deferred_compaction || contended_compaction) && 2585 if ((deferred_compaction || contended_compaction) &&
2583 (gfp_mask & __GFP_NO_KSWAPD)) 2586 (gfp_mask & __GFP_NO_KSWAPD))
2584 goto nopage; 2587 goto nopage;
2585 2588
2586 /* Try direct reclaim and then allocating */ 2589 /* Try direct reclaim and then allocating */
2587 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2590 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2588 zonelist, high_zoneidx, 2591 zonelist, high_zoneidx,
2589 nodemask, 2592 nodemask,
2590 alloc_flags, preferred_zone, 2593 alloc_flags, preferred_zone,
2591 migratetype, &did_some_progress); 2594 migratetype, &did_some_progress);
2592 if (page) 2595 if (page)
2593 goto got_pg; 2596 goto got_pg;
2594 2597
2595 /* 2598 /*
2596 * If we failed to make any progress reclaiming, then we are 2599 * If we failed to make any progress reclaiming, then we are
2597 * running out of options and have to consider going OOM 2600 * running out of options and have to consider going OOM
2598 */ 2601 */
2599 if (!did_some_progress) { 2602 if (!did_some_progress) {
2600 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2603 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2601 if (oom_killer_disabled) 2604 if (oom_killer_disabled)
2602 goto nopage; 2605 goto nopage;
2603 /* Coredumps can quickly deplete all memory reserves */ 2606 /* Coredumps can quickly deplete all memory reserves */
2604 if ((current->flags & PF_DUMPCORE) && 2607 if ((current->flags & PF_DUMPCORE) &&
2605 !(gfp_mask & __GFP_NOFAIL)) 2608 !(gfp_mask & __GFP_NOFAIL))
2606 goto nopage; 2609 goto nopage;
2607 page = __alloc_pages_may_oom(gfp_mask, order, 2610 page = __alloc_pages_may_oom(gfp_mask, order,
2608 zonelist, high_zoneidx, 2611 zonelist, high_zoneidx,
2609 nodemask, preferred_zone, 2612 nodemask, preferred_zone,
2610 migratetype); 2613 migratetype);
2611 if (page) 2614 if (page)
2612 goto got_pg; 2615 goto got_pg;
2613 2616
2614 if (!(gfp_mask & __GFP_NOFAIL)) { 2617 if (!(gfp_mask & __GFP_NOFAIL)) {
2615 /* 2618 /*
2616 * The oom killer is not called for high-order 2619 * The oom killer is not called for high-order
2617 * allocations that may fail, so if no progress 2620 * allocations that may fail, so if no progress
2618 * is being made, there are no other options and 2621 * is being made, there are no other options and
2619 * retrying is unlikely to help. 2622 * retrying is unlikely to help.
2620 */ 2623 */
2621 if (order > PAGE_ALLOC_COSTLY_ORDER) 2624 if (order > PAGE_ALLOC_COSTLY_ORDER)
2622 goto nopage; 2625 goto nopage;
2623 /* 2626 /*
2624 * The oom killer is not called for lowmem 2627 * The oom killer is not called for lowmem
2625 * allocations to prevent needlessly killing 2628 * allocations to prevent needlessly killing
2626 * innocent tasks. 2629 * innocent tasks.
2627 */ 2630 */
2628 if (high_zoneidx < ZONE_NORMAL) 2631 if (high_zoneidx < ZONE_NORMAL)
2629 goto nopage; 2632 goto nopage;
2630 } 2633 }
2631 2634
2632 goto restart; 2635 goto restart;
2633 } 2636 }
2634 } 2637 }
2635 2638
2636 /* Check if we should retry the allocation */ 2639 /* Check if we should retry the allocation */
2637 pages_reclaimed += did_some_progress; 2640 pages_reclaimed += did_some_progress;
2638 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2641 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2639 pages_reclaimed)) { 2642 pages_reclaimed)) {
2640 /* Wait for some write requests to complete then retry */ 2643 /* Wait for some write requests to complete then retry */
2641 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2644 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2642 goto rebalance; 2645 goto rebalance;
2643 } else { 2646 } else {
2644 /* 2647 /*
2645 * High-order allocations do not necessarily loop after 2648 * High-order allocations do not necessarily loop after
2646 * direct reclaim and reclaim/compaction depends on compaction 2649 * direct reclaim and reclaim/compaction depends on compaction
2647 * being called after reclaim so call directly if necessary 2650 * being called after reclaim so call directly if necessary
2648 */ 2651 */
2649 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2652 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2650 high_zoneidx, nodemask, alloc_flags, 2653 high_zoneidx, nodemask, alloc_flags,
2651 preferred_zone, migratetype, 2654 preferred_zone, migratetype,
2652 migration_mode, &contended_compaction, 2655 migration_mode, &contended_compaction,
2653 &deferred_compaction, 2656 &deferred_compaction,
2654 &did_some_progress); 2657 &did_some_progress);
2655 if (page) 2658 if (page)
2656 goto got_pg; 2659 goto got_pg;
2657 } 2660 }
2658 2661
2659 nopage: 2662 nopage:
2660 warn_alloc_failed(gfp_mask, order, NULL); 2663 warn_alloc_failed(gfp_mask, order, NULL);
2661 return page; 2664 return page;
2662 got_pg: 2665 got_pg:
2663 if (kmemcheck_enabled) 2666 if (kmemcheck_enabled)
2664 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2667 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2665 2668
2666 return page; 2669 return page;
2667 } 2670 }
2668 2671
2669 /* 2672 /*
2670 * This is the 'heart' of the zoned buddy allocator. 2673 * This is the 'heart' of the zoned buddy allocator.
2671 */ 2674 */
2672 struct page * 2675 struct page *
2673 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2676 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2674 struct zonelist *zonelist, nodemask_t *nodemask) 2677 struct zonelist *zonelist, nodemask_t *nodemask)
2675 { 2678 {
2676 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2679 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2677 struct zone *preferred_zone; 2680 struct zone *preferred_zone;
2678 struct page *page = NULL; 2681 struct page *page = NULL;
2679 int migratetype = allocflags_to_migratetype(gfp_mask); 2682 int migratetype = allocflags_to_migratetype(gfp_mask);
2680 unsigned int cpuset_mems_cookie; 2683 unsigned int cpuset_mems_cookie;
2681 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2684 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2682 struct mem_cgroup *memcg = NULL; 2685 struct mem_cgroup *memcg = NULL;
2683 2686
2684 gfp_mask &= gfp_allowed_mask; 2687 gfp_mask &= gfp_allowed_mask;
2685 2688
2686 lockdep_trace_alloc(gfp_mask); 2689 lockdep_trace_alloc(gfp_mask);
2687 2690
2688 might_sleep_if(gfp_mask & __GFP_WAIT); 2691 might_sleep_if(gfp_mask & __GFP_WAIT);
2689 2692
2690 if (should_fail_alloc_page(gfp_mask, order)) 2693 if (should_fail_alloc_page(gfp_mask, order))
2691 return NULL; 2694 return NULL;
2692 2695
2693 /* 2696 /*
2694 * Check the zones suitable for the gfp_mask contain at least one 2697 * Check the zones suitable for the gfp_mask contain at least one
2695 * valid zone. It's possible to have an empty zonelist as a result 2698 * valid zone. It's possible to have an empty zonelist as a result
2696 * of GFP_THISNODE and a memoryless node 2699 * of GFP_THISNODE and a memoryless node
2697 */ 2700 */
2698 if (unlikely(!zonelist->_zonerefs->zone)) 2701 if (unlikely(!zonelist->_zonerefs->zone))
2699 return NULL; 2702 return NULL;
2700 2703
2701 /* 2704 /*
2702 * Will only have any effect when __GFP_KMEMCG is set. This is 2705 * Will only have any effect when __GFP_KMEMCG is set. This is
2703 * verified in the (always inline) callee 2706 * verified in the (always inline) callee
2704 */ 2707 */
2705 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2708 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2706 return NULL; 2709 return NULL;
2707 2710
2708 retry_cpuset: 2711 retry_cpuset:
2709 cpuset_mems_cookie = read_mems_allowed_begin(); 2712 cpuset_mems_cookie = read_mems_allowed_begin();
2710 2713
2711 /* The preferred zone is used for statistics later */ 2714 /* The preferred zone is used for statistics later */
2712 first_zones_zonelist(zonelist, high_zoneidx, 2715 first_zones_zonelist(zonelist, high_zoneidx,
2713 nodemask ? : &cpuset_current_mems_allowed, 2716 nodemask ? : &cpuset_current_mems_allowed,
2714 &preferred_zone); 2717 &preferred_zone);
2715 if (!preferred_zone) 2718 if (!preferred_zone)
2716 goto out; 2719 goto out;
2717 2720
2718 #ifdef CONFIG_CMA 2721 #ifdef CONFIG_CMA
2719 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2722 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2720 alloc_flags |= ALLOC_CMA; 2723 alloc_flags |= ALLOC_CMA;
2721 #endif 2724 #endif
2722 retry: 2725 retry:
2723 /* First allocation attempt */ 2726 /* First allocation attempt */
2724 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2727 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2725 zonelist, high_zoneidx, alloc_flags, 2728 zonelist, high_zoneidx, alloc_flags,
2726 preferred_zone, migratetype); 2729 preferred_zone, migratetype);
2727 if (unlikely(!page)) { 2730 if (unlikely(!page)) {
2728 /* 2731 /*
2729 * The first pass makes sure allocations are spread 2732 * The first pass makes sure allocations are spread
2730 * fairly within the local node. However, the local 2733 * fairly within the local node. However, the local
2731 * node might have free pages left after the fairness 2734 * node might have free pages left after the fairness
2732 * batches are exhausted, and remote zones haven't 2735 * batches are exhausted, and remote zones haven't
2733 * even been considered yet. Try once more without 2736 * even been considered yet. Try once more without
2734 * fairness, and include remote zones now, before 2737 * fairness, and include remote zones now, before
2735 * entering the slowpath and waking kswapd: prefer 2738 * entering the slowpath and waking kswapd: prefer
2736 * spilling to a remote zone over swapping locally. 2739 * spilling to a remote zone over swapping locally.
2737 */ 2740 */
2738 if (alloc_flags & ALLOC_FAIR) { 2741 if (alloc_flags & ALLOC_FAIR) {
2739 reset_alloc_batches(zonelist, high_zoneidx, 2742 reset_alloc_batches(zonelist, high_zoneidx,
2740 preferred_zone); 2743 preferred_zone);
2741 alloc_flags &= ~ALLOC_FAIR; 2744 alloc_flags &= ~ALLOC_FAIR;
2742 goto retry; 2745 goto retry;
2743 } 2746 }
2744 /* 2747 /*
2745 * Runtime PM, block IO and its error handling path 2748 * Runtime PM, block IO and its error handling path
2746 * can deadlock because I/O on the device might not 2749 * can deadlock because I/O on the device might not
2747 * complete. 2750 * complete.
2748 */ 2751 */
2749 gfp_mask = memalloc_noio_flags(gfp_mask); 2752 gfp_mask = memalloc_noio_flags(gfp_mask);
2750 page = __alloc_pages_slowpath(gfp_mask, order, 2753 page = __alloc_pages_slowpath(gfp_mask, order,
2751 zonelist, high_zoneidx, nodemask, 2754 zonelist, high_zoneidx, nodemask,
2752 preferred_zone, migratetype); 2755 preferred_zone, migratetype);
2753 } 2756 }
2754 2757
2755 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2758 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2756 2759
2757 out: 2760 out:
2758 /* 2761 /*
2759 * When updating a task's mems_allowed, it is possible to race with 2762 * When updating a task's mems_allowed, it is possible to race with
2760 * parallel threads in such a way that an allocation can fail while 2763 * parallel threads in such a way that an allocation can fail while
2761 * the mask is being updated. If a page allocation is about to fail, 2764 * the mask is being updated. If a page allocation is about to fail,
2762 * check if the cpuset changed during allocation and if so, retry. 2765 * check if the cpuset changed during allocation and if so, retry.
2763 */ 2766 */
2764 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2767 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2765 goto retry_cpuset; 2768 goto retry_cpuset;
2766 2769
2767 memcg_kmem_commit_charge(page, memcg, order); 2770 memcg_kmem_commit_charge(page, memcg, order);
2768 2771
2769 return page; 2772 return page;
2770 } 2773 }
2771 EXPORT_SYMBOL(__alloc_pages_nodemask); 2774 EXPORT_SYMBOL(__alloc_pages_nodemask);
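Editorial aside (not part of this diff): callers normally reach __alloc_pages_nodemask() through wrappers such as alloc_pages(), which supply the zonelist and nodemask for the current context. A minimal sketch under that assumption; example_grab_pages() is a hypothetical helper, not kernel code.

#include <linux/gfp.h>

static struct page *example_grab_pages(void)
{
	/* order-2 request: four contiguous pages, pre-zeroed */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);

	if (!page)
		return NULL;	/* even the slowpath could not satisfy it */

	return page;		/* release later with __free_pages(page, 2) */
}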
2772 2775
2773 /* 2776 /*
2774 * Common helper functions. 2777 * Common helper functions.
2775 */ 2778 */
2776 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2779 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2777 { 2780 {
2778 struct page *page; 2781 struct page *page;
2779 2782
2780 /* 2783 /*
2781 * __get_free_pages() returns a 32-bit address, which cannot represent 2784 * __get_free_pages() returns a 32-bit address, which cannot represent
2782 * a highmem page 2785 * a highmem page
2783 */ 2786 */
2784 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2787 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2785 2788
2786 page = alloc_pages(gfp_mask, order); 2789 page = alloc_pages(gfp_mask, order);
2787 if (!page) 2790 if (!page)
2788 return 0; 2791 return 0;
2789 return (unsigned long) page_address(page); 2792 return (unsigned long) page_address(page);
2790 } 2793 }
2791 EXPORT_SYMBOL(__get_free_pages); 2794 EXPORT_SYMBOL(__get_free_pages);
2792 2795
2793 unsigned long get_zeroed_page(gfp_t gfp_mask) 2796 unsigned long get_zeroed_page(gfp_t gfp_mask)
2794 { 2797 {
2795 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2798 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2796 } 2799 }
2797 EXPORT_SYMBOL(get_zeroed_page); 2800 EXPORT_SYMBOL(get_zeroed_page);
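A usage sketch (editorial): these two helpers hand back kernel virtual addresses rather than struct page pointers, so they cannot serve highmem and pair with free_pages()/free_page() defined just below. The caller context is hypothetical and assumed to allow GFP_KERNEL.

	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);	/* two pages */
	unsigned long zeroed = get_zeroed_page(GFP_KERNEL);	/* one zeroed page */

	if (buf)
		free_pages(buf, 1);	/* order must match the allocation */
	if (zeroed)
		free_page(zeroed);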
2798 2801
2799 void __free_pages(struct page *page, unsigned int order) 2802 void __free_pages(struct page *page, unsigned int order)
2800 { 2803 {
2801 if (put_page_testzero(page)) { 2804 if (put_page_testzero(page)) {
2802 if (order == 0) 2805 if (order == 0)
2803 free_hot_cold_page(page, 0); 2806 free_hot_cold_page(page, 0);
2804 else 2807 else
2805 __free_pages_ok(page, order); 2808 __free_pages_ok(page, order);
2806 } 2809 }
2807 } 2810 }
2808 2811
2809 EXPORT_SYMBOL(__free_pages); 2812 EXPORT_SYMBOL(__free_pages);
2810 2813
2811 void free_pages(unsigned long addr, unsigned int order) 2814 void free_pages(unsigned long addr, unsigned int order)
2812 { 2815 {
2813 if (addr != 0) { 2816 if (addr != 0) {
2814 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2817 VM_BUG_ON(!virt_addr_valid((void *)addr));
2815 __free_pages(virt_to_page((void *)addr), order); 2818 __free_pages(virt_to_page((void *)addr), order);
2816 } 2819 }
2817 } 2820 }
2818 2821
2819 EXPORT_SYMBOL(free_pages); 2822 EXPORT_SYMBOL(free_pages);
2820 2823
2821 /* 2824 /*
2822 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2825 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2823 * pages allocated with __GFP_KMEMCG. 2826 * pages allocated with __GFP_KMEMCG.
2824 * 2827 *
2825 * Those pages are accounted to a particular memcg, embedded in the 2828 * Those pages are accounted to a particular memcg, embedded in the
2826 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2829 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2827 * for that information only to find out that it is NULL for users who have no 2830 * for that information only to find out that it is NULL for users who have no
2828 * interest in that whatsoever, we provide these functions. 2831 * interest in that whatsoever, we provide these functions.
2829 * 2832 *
2830 * The caller knows better which flags it relies on. 2833 * The caller knows better which flags it relies on.
2831 */ 2834 */
2832 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2835 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2833 { 2836 {
2834 memcg_kmem_uncharge_pages(page, order); 2837 memcg_kmem_uncharge_pages(page, order);
2835 __free_pages(page, order); 2838 __free_pages(page, order);
2836 } 2839 }
2837 2840
2838 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2841 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2839 { 2842 {
2840 if (addr != 0) { 2843 if (addr != 0) {
2841 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2844 VM_BUG_ON(!virt_addr_valid((void *)addr));
2842 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2845 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2843 } 2846 }
2844 } 2847 }
2845 2848
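As the comment above notes, these free helpers exist only for pages that were charged to a memcg at allocation time via __GFP_KMEMCG. A hedged sketch of the intended pairing, assuming such an allocation; the surrounding caller is hypothetical:

	/* charge the current memcg on allocation ... */
	unsigned long addr = __get_free_pages(GFP_KERNEL | __GFP_KMEMCG, 0);

	/* ... and uncharge it again on the free path */
	if (addr)
		free_memcg_kmem_pages(addr, 0);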
2846 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2849 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2847 { 2850 {
2848 if (addr) { 2851 if (addr) {
2849 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2852 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2850 unsigned long used = addr + PAGE_ALIGN(size); 2853 unsigned long used = addr + PAGE_ALIGN(size);
2851 2854
2852 split_page(virt_to_page((void *)addr), order); 2855 split_page(virt_to_page((void *)addr), order);
2853 while (used < alloc_end) { 2856 while (used < alloc_end) {
2854 free_page(used); 2857 free_page(used);
2855 used += PAGE_SIZE; 2858 used += PAGE_SIZE;
2856 } 2859 }
2857 } 2860 }
2858 return (void *)addr; 2861 return (void *)addr;
2859 } 2862 }
2860 2863
2861 /** 2864 /**
2862 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2865 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2863 * @size: the number of bytes to allocate 2866 * @size: the number of bytes to allocate
2864 * @gfp_mask: GFP flags for the allocation 2867 * @gfp_mask: GFP flags for the allocation
2865 * 2868 *
2866 * This function is similar to alloc_pages(), except that it allocates the 2869 * This function is similar to alloc_pages(), except that it allocates the
2867 * minimum number of pages to satisfy the request. alloc_pages() can only 2870 * minimum number of pages to satisfy the request. alloc_pages() can only
2868 * allocate memory in power-of-two pages. 2871 * allocate memory in power-of-two pages.
2869 * 2872 *
2870 * This function is also limited by MAX_ORDER. 2873 * This function is also limited by MAX_ORDER.
2871 * 2874 *
2872 * Memory allocated by this function must be released by free_pages_exact(). 2875 * Memory allocated by this function must be released by free_pages_exact().
2873 */ 2876 */
2874 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2877 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2875 { 2878 {
2876 unsigned int order = get_order(size); 2879 unsigned int order = get_order(size);
2877 unsigned long addr; 2880 unsigned long addr;
2878 2881
2879 addr = __get_free_pages(gfp_mask, order); 2882 addr = __get_free_pages(gfp_mask, order);
2880 return make_alloc_exact(addr, order, size); 2883 return make_alloc_exact(addr, order, size);
2881 } 2884 }
2882 EXPORT_SYMBOL(alloc_pages_exact); 2885 EXPORT_SYMBOL(alloc_pages_exact);
2883 2886
2884 /** 2887 /**
2885 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2888 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2886 * pages on a node. 2889 * pages on a node.
2887 * @nid: the preferred node ID where memory should be allocated 2890 * @nid: the preferred node ID where memory should be allocated
2888 * @size: the number of bytes to allocate 2891 * @size: the number of bytes to allocate
2889 * @gfp_mask: GFP flags for the allocation 2892 * @gfp_mask: GFP flags for the allocation
2890 * 2893 *
2891 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2894 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2892 * back. 2895 * back.
2893 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2896 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2894 * but is not exact. 2897 * but is not exact.
2895 */ 2898 */
2896 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2899 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2897 { 2900 {
2898 unsigned order = get_order(size); 2901 unsigned order = get_order(size);
2899 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2902 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2900 if (!p) 2903 if (!p)
2901 return NULL; 2904 return NULL;
2902 return make_alloc_exact((unsigned long)page_address(p), order, size); 2905 return make_alloc_exact((unsigned long)page_address(p), order, size);
2903 } 2906 }
2904 EXPORT_SYMBOL(alloc_pages_exact_nid); 2907 EXPORT_SYMBOL(alloc_pages_exact_nid);
2905 2908
2906 /** 2909 /**
2907 * free_pages_exact - release memory allocated via alloc_pages_exact() 2910 * free_pages_exact - release memory allocated via alloc_pages_exact()
2908 * @virt: the value returned by alloc_pages_exact. 2911 * @virt: the value returned by alloc_pages_exact.
2909 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2912 * @size: size of allocation, same value as passed to alloc_pages_exact().
2910 * 2913 *
2911 * Release the memory allocated by a previous call to alloc_pages_exact. 2914 * Release the memory allocated by a previous call to alloc_pages_exact.
2912 */ 2915 */
2913 void free_pages_exact(void *virt, size_t size) 2916 void free_pages_exact(void *virt, size_t size)
2914 { 2917 {
2915 unsigned long addr = (unsigned long)virt; 2918 unsigned long addr = (unsigned long)virt;
2916 unsigned long end = addr + PAGE_ALIGN(size); 2919 unsigned long end = addr + PAGE_ALIGN(size);
2917 2920
2918 while (addr < end) { 2921 while (addr < end) {
2919 free_page(addr); 2922 free_page(addr);
2920 addr += PAGE_SIZE; 2923 addr += PAGE_SIZE;
2921 } 2924 }
2922 } 2925 }
2923 EXPORT_SYMBOL(free_pages_exact); 2926 EXPORT_SYMBOL(free_pages_exact);
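A round-trip sketch (editorial, sizes hypothetical): with 4 KiB pages, a 10 KiB request is raised to an order-2 (16 KiB) allocation, make_alloc_exact() splits it and immediately gives back the unused fourth page, and the caller ends up with exactly three pages.

	void *buf = alloc_pages_exact(10 * 1024, GFP_KERNEL);

	if (buf) {
		/* ... use 10 KiB of physically contiguous memory ... */
		free_pages_exact(buf, 10 * 1024);	/* same size as requested */
	}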
2924 2927
2925 /** 2928 /**
2926 * nr_free_zone_pages - count number of pages beyond high watermark 2929 * nr_free_zone_pages - count number of pages beyond high watermark
2927 * @offset: The zone index of the highest zone 2930 * @offset: The zone index of the highest zone
2928 * 2931 *
2929 * nr_free_zone_pages() counts the number of pages which are beyond the 2932 * nr_free_zone_pages() counts the number of pages which are beyond the
2930 * high watermark within all zones at or below a given zone index. For each 2933 * high watermark within all zones at or below a given zone index. For each
2931 * zone, the number of pages is calculated as: 2934 * zone, the number of pages is calculated as:
2932 * managed_pages - high_pages 2935 * managed_pages - high_pages
2933 */ 2936 */
2934 static unsigned long nr_free_zone_pages(int offset) 2937 static unsigned long nr_free_zone_pages(int offset)
2935 { 2938 {
2936 struct zoneref *z; 2939 struct zoneref *z;
2937 struct zone *zone; 2940 struct zone *zone;
2938 2941
2939 /* Just pick one node, since fallback list is circular */ 2942 /* Just pick one node, since fallback list is circular */
2940 unsigned long sum = 0; 2943 unsigned long sum = 0;
2941 2944
2942 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2945 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2943 2946
2944 for_each_zone_zonelist(zone, z, zonelist, offset) { 2947 for_each_zone_zonelist(zone, z, zonelist, offset) {
2945 unsigned long size = zone->managed_pages; 2948 unsigned long size = zone->managed_pages;
2946 unsigned long high = high_wmark_pages(zone); 2949 unsigned long high = high_wmark_pages(zone);
2947 if (size > high) 2950 if (size > high)
2948 sum += size - high; 2951 sum += size - high;
2949 } 2952 }
2950 2953
2951 return sum; 2954 return sum;
2952 } 2955 }
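Worked example of the per-zone formula above (numbers hypothetical): a zone with managed_pages = 262144 and a high watermark of 4096 pages contributes 262144 - 4096 = 258048 pages to the sum, while a zone already at or below its high watermark contributes nothing because of the size > high guard.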
2953 2956
2954 /** 2957 /**
2955 * nr_free_buffer_pages - count number of pages beyond high watermark 2958 * nr_free_buffer_pages - count number of pages beyond high watermark
2956 * 2959 *
2957 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2960 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2958 * watermark within ZONE_DMA and ZONE_NORMAL. 2961 * watermark within ZONE_DMA and ZONE_NORMAL.
2959 */ 2962 */
2960 unsigned long nr_free_buffer_pages(void) 2963 unsigned long nr_free_buffer_pages(void)
2961 { 2964 {
2962 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2965 return nr_free_zone_pages(gfp_zone(GFP_USER));
2963 } 2966 }
2964 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2967 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2965 2968
2966 /** 2969 /**
2967 * nr_free_pagecache_pages - count number of pages beyond high watermark 2970 * nr_free_pagecache_pages - count number of pages beyond high watermark
2968 * 2971 *
2969 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2972 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2970 * high watermark within all zones. 2973 * high watermark within all zones.
2971 */ 2974 */
2972 unsigned long nr_free_pagecache_pages(void) 2975 unsigned long nr_free_pagecache_pages(void)
2973 { 2976 {
2974 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2977 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2975 } 2978 }
2976 2979
2977 static inline void show_node(struct zone *zone) 2980 static inline void show_node(struct zone *zone)
2978 { 2981 {
2979 if (IS_ENABLED(CONFIG_NUMA)) 2982 if (IS_ENABLED(CONFIG_NUMA))
2980 printk("Node %d ", zone_to_nid(zone)); 2983 printk("Node %d ", zone_to_nid(zone));
2981 } 2984 }
2982 2985
2983 void si_meminfo(struct sysinfo *val) 2986 void si_meminfo(struct sysinfo *val)
2984 { 2987 {
2985 val->totalram = totalram_pages; 2988 val->totalram = totalram_pages;
2986 val->sharedram = 0; 2989 val->sharedram = 0;
2987 val->freeram = global_page_state(NR_FREE_PAGES); 2990 val->freeram = global_page_state(NR_FREE_PAGES);
2988 val->bufferram = nr_blockdev_pages(); 2991 val->bufferram = nr_blockdev_pages();
2989 val->totalhigh = totalhigh_pages; 2992 val->totalhigh = totalhigh_pages;
2990 val->freehigh = nr_free_highpages(); 2993 val->freehigh = nr_free_highpages();
2991 val->mem_unit = PAGE_SIZE; 2994 val->mem_unit = PAGE_SIZE;
2992 } 2995 }
2993 2996
2994 EXPORT_SYMBOL(si_meminfo); 2997 EXPORT_SYMBOL(si_meminfo);
2995 2998
2996 #ifdef CONFIG_NUMA 2999 #ifdef CONFIG_NUMA
2997 void si_meminfo_node(struct sysinfo *val, int nid) 3000 void si_meminfo_node(struct sysinfo *val, int nid)
2998 { 3001 {
2999 int zone_type; /* needs to be signed */ 3002 int zone_type; /* needs to be signed */
3000 unsigned long managed_pages = 0; 3003 unsigned long managed_pages = 0;
3001 pg_data_t *pgdat = NODE_DATA(nid); 3004 pg_data_t *pgdat = NODE_DATA(nid);
3002 3005
3003 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3006 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3004 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3007 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3005 val->totalram = managed_pages; 3008 val->totalram = managed_pages;
3006 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3009 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3007 #ifdef CONFIG_HIGHMEM 3010 #ifdef CONFIG_HIGHMEM
3008 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3011 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3009 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3012 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3010 NR_FREE_PAGES); 3013 NR_FREE_PAGES);
3011 #else 3014 #else
3012 val->totalhigh = 0; 3015 val->totalhigh = 0;
3013 val->freehigh = 0; 3016 val->freehigh = 0;
3014 #endif 3017 #endif
3015 val->mem_unit = PAGE_SIZE; 3018 val->mem_unit = PAGE_SIZE;
3016 } 3019 }
3017 #endif 3020 #endif
3018 3021
3019 /* 3022 /*
3020 * Determine whether the node should be displayed or not, depending on whether 3023 * Determine whether the node should be displayed or not, depending on whether
3021 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3024 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3022 */ 3025 */
3023 bool skip_free_areas_node(unsigned int flags, int nid) 3026 bool skip_free_areas_node(unsigned int flags, int nid)
3024 { 3027 {
3025 bool ret = false; 3028 bool ret = false;
3026 unsigned int cpuset_mems_cookie; 3029 unsigned int cpuset_mems_cookie;
3027 3030
3028 if (!(flags & SHOW_MEM_FILTER_NODES)) 3031 if (!(flags & SHOW_MEM_FILTER_NODES))
3029 goto out; 3032 goto out;
3030 3033
3031 do { 3034 do {
3032 cpuset_mems_cookie = read_mems_allowed_begin(); 3035 cpuset_mems_cookie = read_mems_allowed_begin();
3033 ret = !node_isset(nid, cpuset_current_mems_allowed); 3036 ret = !node_isset(nid, cpuset_current_mems_allowed);
3034 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3037 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3035 out: 3038 out:
3036 return ret; 3039 return ret;
3037 } 3040 }
3038 3041
3039 #define K(x) ((x) << (PAGE_SHIFT-10)) 3042 #define K(x) ((x) << (PAGE_SHIFT-10))
3040 3043
3041 static void show_migration_types(unsigned char type) 3044 static void show_migration_types(unsigned char type)
3042 { 3045 {
3043 static const char types[MIGRATE_TYPES] = { 3046 static const char types[MIGRATE_TYPES] = {
3044 [MIGRATE_UNMOVABLE] = 'U', 3047 [MIGRATE_UNMOVABLE] = 'U',
3045 [MIGRATE_RECLAIMABLE] = 'E', 3048 [MIGRATE_RECLAIMABLE] = 'E',
3046 [MIGRATE_MOVABLE] = 'M', 3049 [MIGRATE_MOVABLE] = 'M',
3047 [MIGRATE_RESERVE] = 'R', 3050 [MIGRATE_RESERVE] = 'R',
3048 #ifdef CONFIG_CMA 3051 #ifdef CONFIG_CMA
3049 [MIGRATE_CMA] = 'C', 3052 [MIGRATE_CMA] = 'C',
3050 #endif 3053 #endif
3051 #ifdef CONFIG_MEMORY_ISOLATION 3054 #ifdef CONFIG_MEMORY_ISOLATION
3052 [MIGRATE_ISOLATE] = 'I', 3055 [MIGRATE_ISOLATE] = 'I',
3053 #endif 3056 #endif
3054 }; 3057 };
3055 char tmp[MIGRATE_TYPES + 1]; 3058 char tmp[MIGRATE_TYPES + 1];
3056 char *p = tmp; 3059 char *p = tmp;
3057 int i; 3060 int i;
3058 3061
3059 for (i = 0; i < MIGRATE_TYPES; i++) { 3062 for (i = 0; i < MIGRATE_TYPES; i++) {
3060 if (type & (1 << i)) 3063 if (type & (1 << i))
3061 *p++ = types[i]; 3064 *p++ = types[i];
3062 } 3065 }
3063 3066
3064 *p = '\0'; 3067 *p = '\0';
3065 printk("(%s) ", tmp); 3068 printk("(%s) ", tmp);
3066 } 3069 }
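For example (values illustrative): if a given order has pages on both the unmovable and movable free lists, the caller passes type = (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE) and the annotation printed next to that order is "(UM) "; an order holding only CMA pages would show "(C) " on CONFIG_CMA kernels.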
3067 3070
3068 /* 3071 /*
3069 * Show free area list (used inside shift_scroll-lock stuff) 3072 * Show free area list (used inside shift_scroll-lock stuff)
3070 * We also calculate the percentage fragmentation. We do this by counting the 3073 * We also calculate the percentage fragmentation. We do this by counting the
3071 * memory on each free list with the exception of the first item on the list. 3074 * memory on each free list with the exception of the first item on the list.
3072 * Suppresses nodes that are not allowed by current's cpuset if 3075 * Suppresses nodes that are not allowed by current's cpuset if
3073 * SHOW_MEM_FILTER_NODES is passed. 3076 * SHOW_MEM_FILTER_NODES is passed.
3074 */ 3077 */
3075 void show_free_areas(unsigned int filter) 3078 void show_free_areas(unsigned int filter)
3076 { 3079 {
3077 int cpu; 3080 int cpu;
3078 struct zone *zone; 3081 struct zone *zone;
3079 3082
3080 for_each_populated_zone(zone) { 3083 for_each_populated_zone(zone) {
3081 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3084 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3082 continue; 3085 continue;
3083 show_node(zone); 3086 show_node(zone);
3084 printk("%s per-cpu:\n", zone->name); 3087 printk("%s per-cpu:\n", zone->name);
3085 3088
3086 for_each_online_cpu(cpu) { 3089 for_each_online_cpu(cpu) {
3087 struct per_cpu_pageset *pageset; 3090 struct per_cpu_pageset *pageset;
3088 3091
3089 pageset = per_cpu_ptr(zone->pageset, cpu); 3092 pageset = per_cpu_ptr(zone->pageset, cpu);
3090 3093
3091 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3094 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3092 cpu, pageset->pcp.high, 3095 cpu, pageset->pcp.high,
3093 pageset->pcp.batch, pageset->pcp.count); 3096 pageset->pcp.batch, pageset->pcp.count);
3094 } 3097 }
3095 } 3098 }
3096 3099
3097 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3100 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3098 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3101 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3099 " unevictable:%lu" 3102 " unevictable:%lu"
3100 " dirty:%lu writeback:%lu unstable:%lu\n" 3103 " dirty:%lu writeback:%lu unstable:%lu\n"
3101 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3104 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3102 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3105 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3103 " free_cma:%lu\n", 3106 " free_cma:%lu\n",
3104 global_page_state(NR_ACTIVE_ANON), 3107 global_page_state(NR_ACTIVE_ANON),
3105 global_page_state(NR_INACTIVE_ANON), 3108 global_page_state(NR_INACTIVE_ANON),
3106 global_page_state(NR_ISOLATED_ANON), 3109 global_page_state(NR_ISOLATED_ANON),
3107 global_page_state(NR_ACTIVE_FILE), 3110 global_page_state(NR_ACTIVE_FILE),
3108 global_page_state(NR_INACTIVE_FILE), 3111 global_page_state(NR_INACTIVE_FILE),
3109 global_page_state(NR_ISOLATED_FILE), 3112 global_page_state(NR_ISOLATED_FILE),
3110 global_page_state(NR_UNEVICTABLE), 3113 global_page_state(NR_UNEVICTABLE),
3111 global_page_state(NR_FILE_DIRTY), 3114 global_page_state(NR_FILE_DIRTY),
3112 global_page_state(NR_WRITEBACK), 3115 global_page_state(NR_WRITEBACK),
3113 global_page_state(NR_UNSTABLE_NFS), 3116 global_page_state(NR_UNSTABLE_NFS),
3114 global_page_state(NR_FREE_PAGES), 3117 global_page_state(NR_FREE_PAGES),
3115 global_page_state(NR_SLAB_RECLAIMABLE), 3118 global_page_state(NR_SLAB_RECLAIMABLE),
3116 global_page_state(NR_SLAB_UNRECLAIMABLE), 3119 global_page_state(NR_SLAB_UNRECLAIMABLE),
3117 global_page_state(NR_FILE_MAPPED), 3120 global_page_state(NR_FILE_MAPPED),
3118 global_page_state(NR_SHMEM), 3121 global_page_state(NR_SHMEM),
3119 global_page_state(NR_PAGETABLE), 3122 global_page_state(NR_PAGETABLE),
3120 global_page_state(NR_BOUNCE), 3123 global_page_state(NR_BOUNCE),
3121 global_page_state(NR_FREE_CMA_PAGES)); 3124 global_page_state(NR_FREE_CMA_PAGES));
3122 3125
3123 for_each_populated_zone(zone) { 3126 for_each_populated_zone(zone) {
3124 int i; 3127 int i;
3125 3128
3126 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3129 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3127 continue; 3130 continue;
3128 show_node(zone); 3131 show_node(zone);
3129 printk("%s" 3132 printk("%s"
3130 " free:%lukB" 3133 " free:%lukB"
3131 " min:%lukB" 3134 " min:%lukB"
3132 " low:%lukB" 3135 " low:%lukB"
3133 " high:%lukB" 3136 " high:%lukB"
3134 " active_anon:%lukB" 3137 " active_anon:%lukB"
3135 " inactive_anon:%lukB" 3138 " inactive_anon:%lukB"
3136 " active_file:%lukB" 3139 " active_file:%lukB"
3137 " inactive_file:%lukB" 3140 " inactive_file:%lukB"
3138 " unevictable:%lukB" 3141 " unevictable:%lukB"
3139 " isolated(anon):%lukB" 3142 " isolated(anon):%lukB"
3140 " isolated(file):%lukB" 3143 " isolated(file):%lukB"
3141 " present:%lukB" 3144 " present:%lukB"
3142 " managed:%lukB" 3145 " managed:%lukB"
3143 " mlocked:%lukB" 3146 " mlocked:%lukB"
3144 " dirty:%lukB" 3147 " dirty:%lukB"
3145 " writeback:%lukB" 3148 " writeback:%lukB"
3146 " mapped:%lukB" 3149 " mapped:%lukB"
3147 " shmem:%lukB" 3150 " shmem:%lukB"
3148 " slab_reclaimable:%lukB" 3151 " slab_reclaimable:%lukB"
3149 " slab_unreclaimable:%lukB" 3152 " slab_unreclaimable:%lukB"
3150 " kernel_stack:%lukB" 3153 " kernel_stack:%lukB"
3151 " pagetables:%lukB" 3154 " pagetables:%lukB"
3152 " unstable:%lukB" 3155 " unstable:%lukB"
3153 " bounce:%lukB" 3156 " bounce:%lukB"
3154 " free_cma:%lukB" 3157 " free_cma:%lukB"
3155 " writeback_tmp:%lukB" 3158 " writeback_tmp:%lukB"
3156 " pages_scanned:%lu" 3159 " pages_scanned:%lu"
3157 " all_unreclaimable? %s" 3160 " all_unreclaimable? %s"
3158 "\n", 3161 "\n",
3159 zone->name, 3162 zone->name,
3160 K(zone_page_state(zone, NR_FREE_PAGES)), 3163 K(zone_page_state(zone, NR_FREE_PAGES)),
3161 K(min_wmark_pages(zone)), 3164 K(min_wmark_pages(zone)),
3162 K(low_wmark_pages(zone)), 3165 K(low_wmark_pages(zone)),
3163 K(high_wmark_pages(zone)), 3166 K(high_wmark_pages(zone)),
3164 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3167 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3165 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3168 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3166 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3169 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3167 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3170 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3168 K(zone_page_state(zone, NR_UNEVICTABLE)), 3171 K(zone_page_state(zone, NR_UNEVICTABLE)),
3169 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3172 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3170 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3173 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3171 K(zone->present_pages), 3174 K(zone->present_pages),
3172 K(zone->managed_pages), 3175 K(zone->managed_pages),
3173 K(zone_page_state(zone, NR_MLOCK)), 3176 K(zone_page_state(zone, NR_MLOCK)),
3174 K(zone_page_state(zone, NR_FILE_DIRTY)), 3177 K(zone_page_state(zone, NR_FILE_DIRTY)),
3175 K(zone_page_state(zone, NR_WRITEBACK)), 3178 K(zone_page_state(zone, NR_WRITEBACK)),
3176 K(zone_page_state(zone, NR_FILE_MAPPED)), 3179 K(zone_page_state(zone, NR_FILE_MAPPED)),
3177 K(zone_page_state(zone, NR_SHMEM)), 3180 K(zone_page_state(zone, NR_SHMEM)),
3178 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3181 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3179 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3182 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3180 zone_page_state(zone, NR_KERNEL_STACK) * 3183 zone_page_state(zone, NR_KERNEL_STACK) *
3181 THREAD_SIZE / 1024, 3184 THREAD_SIZE / 1024,
3182 K(zone_page_state(zone, NR_PAGETABLE)), 3185 K(zone_page_state(zone, NR_PAGETABLE)),
3183 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3186 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3184 K(zone_page_state(zone, NR_BOUNCE)), 3187 K(zone_page_state(zone, NR_BOUNCE)),
3185 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3188 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3186 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3189 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3187 zone->pages_scanned, 3190 zone->pages_scanned,
3188 (!zone_reclaimable(zone) ? "yes" : "no") 3191 (!zone_reclaimable(zone) ? "yes" : "no")
3189 ); 3192 );
3190 printk("lowmem_reserve[]:"); 3193 printk("lowmem_reserve[]:");
3191 for (i = 0; i < MAX_NR_ZONES; i++) 3194 for (i = 0; i < MAX_NR_ZONES; i++)
3192 printk(" %lu", zone->lowmem_reserve[i]); 3195 printk(" %lu", zone->lowmem_reserve[i]);
3193 printk("\n"); 3196 printk("\n");
3194 } 3197 }
3195 3198
3196 for_each_populated_zone(zone) { 3199 for_each_populated_zone(zone) {
3197 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3200 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3198 unsigned char types[MAX_ORDER]; 3201 unsigned char types[MAX_ORDER];
3199 3202
3200 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3203 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3201 continue; 3204 continue;
3202 show_node(zone); 3205 show_node(zone);
3203 printk("%s: ", zone->name); 3206 printk("%s: ", zone->name);
3204 3207
3205 spin_lock_irqsave(&zone->lock, flags); 3208 spin_lock_irqsave(&zone->lock, flags);
3206 for (order = 0; order < MAX_ORDER; order++) { 3209 for (order = 0; order < MAX_ORDER; order++) {
3207 struct free_area *area = &zone->free_area[order]; 3210 struct free_area *area = &zone->free_area[order];
3208 int type; 3211 int type;
3209 3212
3210 nr[order] = area->nr_free; 3213 nr[order] = area->nr_free;
3211 total += nr[order] << order; 3214 total += nr[order] << order;
3212 3215
3213 types[order] = 0; 3216 types[order] = 0;
3214 for (type = 0; type < MIGRATE_TYPES; type++) { 3217 for (type = 0; type < MIGRATE_TYPES; type++) {
3215 if (!list_empty(&area->free_list[type])) 3218 if (!list_empty(&area->free_list[type]))
3216 types[order] |= 1 << type; 3219 types[order] |= 1 << type;
3217 } 3220 }
3218 } 3221 }
3219 spin_unlock_irqrestore(&zone->lock, flags); 3222 spin_unlock_irqrestore(&zone->lock, flags);
3220 for (order = 0; order < MAX_ORDER; order++) { 3223 for (order = 0; order < MAX_ORDER; order++) {
3221 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3224 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3222 if (nr[order]) 3225 if (nr[order])
3223 show_migration_types(types[order]); 3226 show_migration_types(types[order]);
3224 } 3227 }
3225 printk("= %lukB\n", K(total)); 3228 printk("= %lukB\n", K(total));
3226 } 3229 }
3227 3230
3228 hugetlb_show_meminfo(); 3231 hugetlb_show_meminfo();
3229 3232
3230 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3233 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3231 3234
3232 show_swap_cache_info(); 3235 show_swap_cache_info();
3233 } 3236 }
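Editorial note on how this output is normally seen: the dump is produced via show_mem(), e.g. when the OOM killer fires or when SysRq-m is triggered, and the SHOW_MEM_FILTER_NODES filter decides whether zones on nodes outside the current task's cpuset are suppressed, per skip_free_areas_node() above.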
3234 3237
3235 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3238 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3236 { 3239 {
3237 zoneref->zone = zone; 3240 zoneref->zone = zone;
3238 zoneref->zone_idx = zone_idx(zone); 3241 zoneref->zone_idx = zone_idx(zone);
3239 } 3242 }
3240 3243
3241 /* 3244 /*
3242 * Builds allocation fallback zone lists. 3245 * Builds allocation fallback zone lists.
3243 * 3246 *
3244 * Add all populated zones of a node to the zonelist. 3247 * Add all populated zones of a node to the zonelist.
3245 */ 3248 */
3246 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3249 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3247 int nr_zones) 3250 int nr_zones)
3248 { 3251 {
3249 struct zone *zone; 3252 struct zone *zone;
3250 enum zone_type zone_type = MAX_NR_ZONES; 3253 enum zone_type zone_type = MAX_NR_ZONES;
3251 3254
3252 do { 3255 do {
3253 zone_type--; 3256 zone_type--;
3254 zone = pgdat->node_zones + zone_type; 3257 zone = pgdat->node_zones + zone_type;
3255 if (populated_zone(zone)) { 3258 if (populated_zone(zone)) {
3256 zoneref_set_zone(zone, 3259 zoneref_set_zone(zone,
3257 &zonelist->_zonerefs[nr_zones++]); 3260 &zonelist->_zonerefs[nr_zones++]);
3258 check_highest_zone(zone_type); 3261 check_highest_zone(zone_type);
3259 } 3262 }
3260 } while (zone_type); 3263 } while (zone_type);
3261 3264
3262 return nr_zones; 3265 return nr_zones;
3263 } 3266 }
3264 3267
3265 3268
3266 /* 3269 /*
3267 * zonelist_order: 3270 * zonelist_order:
3268 * 0 = automatic detection of better ordering. 3271 * 0 = automatic detection of better ordering.
3269 * 1 = order by ([node] distance, -zonetype) 3272 * 1 = order by ([node] distance, -zonetype)
3270 * 2 = order by (-zonetype, [node] distance) 3273 * 2 = order by (-zonetype, [node] distance)
3271 * 3274 *
3272 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3275 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3273 * the same zonelist. So only NUMA can configure this param. 3276 * the same zonelist. So only NUMA can configure this param.
3274 */ 3277 */
3275 #define ZONELIST_ORDER_DEFAULT 0 3278 #define ZONELIST_ORDER_DEFAULT 0
3276 #define ZONELIST_ORDER_NODE 1 3279 #define ZONELIST_ORDER_NODE 1
3277 #define ZONELIST_ORDER_ZONE 2 3280 #define ZONELIST_ORDER_ZONE 2
3278 3281
3279 /* zonelist order in the kernel. 3282 /* zonelist order in the kernel.
3280 * set_zonelist_order() will set this to NODE or ZONE. 3283 * set_zonelist_order() will set this to NODE or ZONE.
3281 */ 3284 */
3282 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3285 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3283 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3286 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3284 3287
3285 3288
3286 #ifdef CONFIG_NUMA 3289 #ifdef CONFIG_NUMA
3287 /* The value the user specified, possibly changed by config */ 3290 /* The value the user specified, possibly changed by config */
3288 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3291 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3289 /* string for sysctl */ 3292 /* string for sysctl */
3290 #define NUMA_ZONELIST_ORDER_LEN 16 3293 #define NUMA_ZONELIST_ORDER_LEN 16
3291 char numa_zonelist_order[16] = "default"; 3294 char numa_zonelist_order[16] = "default";
3292 3295
3293 /* 3296 /*
3294 * interface for configuring zonelist ordering. 3297 * interface for configuring zonelist ordering.
3295 * command line option "numa_zonelist_order" 3298 * command line option "numa_zonelist_order"
3296 * = "[dD]efault - default, automatic configuration. 3299 * = "[dD]efault - default, automatic configuration.
3297 * = "[nN]ode - order by node locality, then by zone within node 3300 * = "[nN]ode - order by node locality, then by zone within node
3298 * = "[zZ]one - order by zone, then by locality within zone 3301 * = "[zZ]one - order by zone, then by locality within zone
3299 */ 3302 */
3300 3303
3301 static int __parse_numa_zonelist_order(char *s) 3304 static int __parse_numa_zonelist_order(char *s)
3302 { 3305 {
3303 if (*s == 'd' || *s == 'D') { 3306 if (*s == 'd' || *s == 'D') {
3304 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3307 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3305 } else if (*s == 'n' || *s == 'N') { 3308 } else if (*s == 'n' || *s == 'N') {
3306 user_zonelist_order = ZONELIST_ORDER_NODE; 3309 user_zonelist_order = ZONELIST_ORDER_NODE;
3307 } else if (*s == 'z' || *s == 'Z') { 3310 } else if (*s == 'z' || *s == 'Z') {
3308 user_zonelist_order = ZONELIST_ORDER_ZONE; 3311 user_zonelist_order = ZONELIST_ORDER_ZONE;
3309 } else { 3312 } else {
3310 printk(KERN_WARNING 3313 printk(KERN_WARNING
3311 "Ignoring invalid numa_zonelist_order value: " 3314 "Ignoring invalid numa_zonelist_order value: "
3312 "%s\n", s); 3315 "%s\n", s);
3313 return -EINVAL; 3316 return -EINVAL;
3314 } 3317 }
3315 return 0; 3318 return 0;
3316 } 3319 }
3317 3320
3318 static __init int setup_numa_zonelist_order(char *s) 3321 static __init int setup_numa_zonelist_order(char *s)
3319 { 3322 {
3320 int ret; 3323 int ret;
3321 3324
3322 if (!s) 3325 if (!s)
3323 return 0; 3326 return 0;
3324 3327
3325 ret = __parse_numa_zonelist_order(s); 3328 ret = __parse_numa_zonelist_order(s);
3326 if (ret == 0) 3329 if (ret == 0)
3327 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3330 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3328 3331
3329 return ret; 3332 return ret;
3330 } 3333 }
3331 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3334 early_param("numa_zonelist_order", setup_numa_zonelist_order);
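In practice (hedged, spellings per the parser above): the ordering can be chosen at boot with numa_zonelist_order=d|n|z (or Default/Node/Zone), and on NUMA kernels of this vintage it can also be changed at runtime through the numa_zonelist_order sysctl handled just below, e.g. by writing "zone" to /proc/sys/vm/numa_zonelist_order.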
3332 3335
3333 /* 3336 /*
3334 * sysctl handler for numa_zonelist_order 3337 * sysctl handler for numa_zonelist_order
3335 */ 3338 */
3336 int numa_zonelist_order_handler(ctl_table *table, int write, 3339 int numa_zonelist_order_handler(ctl_table *table, int write,
3337 void __user *buffer, size_t *length, 3340 void __user *buffer, size_t *length,
3338 loff_t *ppos) 3341 loff_t *ppos)
3339 { 3342 {
3340 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3343 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3341 int ret; 3344 int ret;
3342 static DEFINE_MUTEX(zl_order_mutex); 3345 static DEFINE_MUTEX(zl_order_mutex);
3343 3346
3344 mutex_lock(&zl_order_mutex); 3347 mutex_lock(&zl_order_mutex);
3345 if (write) { 3348 if (write) {
3346 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3349 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3347 ret = -EINVAL; 3350 ret = -EINVAL;
3348 goto out; 3351 goto out;
3349 } 3352 }
3350 strcpy(saved_string, (char *)table->data); 3353 strcpy(saved_string, (char *)table->data);
3351 } 3354 }
3352 ret = proc_dostring(table, write, buffer, length, ppos); 3355 ret = proc_dostring(table, write, buffer, length, ppos);
3353 if (ret) 3356 if (ret)
3354 goto out; 3357 goto out;
3355 if (write) { 3358 if (write) {
3356 int oldval = user_zonelist_order; 3359 int oldval = user_zonelist_order;
3357 3360
3358 ret = __parse_numa_zonelist_order((char *)table->data); 3361 ret = __parse_numa_zonelist_order((char *)table->data);
3359 if (ret) { 3362 if (ret) {
3360 /* 3363 /*
3361 * bogus value. restore saved string 3364 * bogus value. restore saved string
3362 */ 3365 */
3363 strncpy((char *)table->data, saved_string, 3366 strncpy((char *)table->data, saved_string,
3364 NUMA_ZONELIST_ORDER_LEN); 3367 NUMA_ZONELIST_ORDER_LEN);
3365 user_zonelist_order = oldval; 3368 user_zonelist_order = oldval;
3366 } else if (oldval != user_zonelist_order) { 3369 } else if (oldval != user_zonelist_order) {
3367 mutex_lock(&zonelists_mutex); 3370 mutex_lock(&zonelists_mutex);
3368 build_all_zonelists(NULL, NULL); 3371 build_all_zonelists(NULL, NULL);
3369 mutex_unlock(&zonelists_mutex); 3372 mutex_unlock(&zonelists_mutex);
3370 } 3373 }
3371 } 3374 }
3372 out: 3375 out:
3373 mutex_unlock(&zl_order_mutex); 3376 mutex_unlock(&zl_order_mutex);
3374 return ret; 3377 return ret;
3375 } 3378 }
3376 3379
3377 3380
3378 #define MAX_NODE_LOAD (nr_online_nodes) 3381 #define MAX_NODE_LOAD (nr_online_nodes)
3379 static int node_load[MAX_NUMNODES]; 3382 static int node_load[MAX_NUMNODES];
3380 3383
3381 /** 3384 /**
3382 * find_next_best_node - find the next node that should appear in a given node's fallback list 3385 * find_next_best_node - find the next node that should appear in a given node's fallback list
3383 * @node: node whose fallback list we're appending 3386 * @node: node whose fallback list we're appending
3384 * @used_node_mask: nodemask_t of already used nodes 3387 * @used_node_mask: nodemask_t of already used nodes
3385 * 3388 *
3386 * We use a number of factors to determine which is the next node that should 3389 * We use a number of factors to determine which is the next node that should
3387 * appear on a given node's fallback list. The node should not have appeared 3390 * appear on a given node's fallback list. The node should not have appeared
3388 * already in @node's fallback list, and it should be the next closest node 3391 * already in @node's fallback list, and it should be the next closest node
3389 * according to the distance array (which contains arbitrary distance values 3392 * according to the distance array (which contains arbitrary distance values
3390 * from each node to each node in the system), and should also prefer nodes 3393 * from each node to each node in the system), and should also prefer nodes
3391 * with no CPUs, since presumably they'll have very little allocation pressure 3394 * with no CPUs, since presumably they'll have very little allocation pressure
3392 * on them otherwise. 3395 * on them otherwise.
3393 * It returns -1 if no node is found. 3396 * It returns -1 if no node is found.
3394 */ 3397 */
3395 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3398 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3396 { 3399 {
3397 int n, val; 3400 int n, val;
3398 int min_val = INT_MAX; 3401 int min_val = INT_MAX;
3399 int best_node = NUMA_NO_NODE; 3402 int best_node = NUMA_NO_NODE;
3400 const struct cpumask *tmp = cpumask_of_node(0); 3403 const struct cpumask *tmp = cpumask_of_node(0);
3401 3404
3402 /* Use the local node if we haven't already */ 3405 /* Use the local node if we haven't already */
3403 if (!node_isset(node, *used_node_mask)) { 3406 if (!node_isset(node, *used_node_mask)) {
3404 node_set(node, *used_node_mask); 3407 node_set(node, *used_node_mask);
3405 return node; 3408 return node;
3406 } 3409 }
3407 3410
3408 for_each_node_state(n, N_MEMORY) { 3411 for_each_node_state(n, N_MEMORY) {
3409 3412
3410 /* Don't want a node to appear more than once */ 3413 /* Don't want a node to appear more than once */
3411 if (node_isset(n, *used_node_mask)) 3414 if (node_isset(n, *used_node_mask))
3412 continue; 3415 continue;
3413 3416
3414 /* Use the distance array to find the distance */ 3417 /* Use the distance array to find the distance */
3415 val = node_distance(node, n); 3418 val = node_distance(node, n);
3416 3419
3417 /* Penalize nodes under us ("prefer the next node") */ 3420 /* Penalize nodes under us ("prefer the next node") */
3418 val += (n < node); 3421 val += (n < node);
3419 3422
3420 /* Give preference to headless and unused nodes */ 3423 /* Give preference to headless and unused nodes */
3421 tmp = cpumask_of_node(n); 3424 tmp = cpumask_of_node(n);
3422 if (!cpumask_empty(tmp)) 3425 if (!cpumask_empty(tmp))
3423 val += PENALTY_FOR_NODE_WITH_CPUS; 3426 val += PENALTY_FOR_NODE_WITH_CPUS;
3424 3427
3425 /* Slight preference for less loaded node */ 3428 /* Slight preference for less loaded node */
3426 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3429 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3427 val += node_load[n]; 3430 val += node_load[n];
3428 3431
3429 if (val < min_val) { 3432 if (val < min_val) {
3430 min_val = val; 3433 min_val = val;
3431 best_node = n; 3434 best_node = n;
3432 } 3435 }
3433 } 3436 }
3434 3437
3435 if (best_node >= 0) 3438 if (best_node >= 0)
3436 node_set(best_node, *used_node_mask); 3439 node_set(best_node, *used_node_mask);
3437 3440
3438 return best_node; 3441 return best_node;
3439 } 3442 }
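A worked example of the scoring, with hypothetical distances: suppose node 0 is building its list, nodes 1 and 2 both sit at distance 20, node 1 has CPUs while node 2 is memory-only, and node_load[] is still zero for both. Node 1 scores (20 + PENALTY_FOR_NODE_WITH_CPUS) scaled by MAX_NODE_LOAD*MAX_NUMNODES, node 2 scores a plain 20 scaled by the same factor, so the headless node 2 wins the min_val comparison and is appended first; the (n < node) term never triggers here since no candidate id is below 0.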
3440 3443
3441 3444
3442 /* 3445 /*
3443 * Build zonelists ordered by node and zones within node. 3446 * Build zonelists ordered by node and zones within node.
3444 * This results in maximum locality--normal zone overflows into local 3447 * This results in maximum locality--normal zone overflows into local
3445 * DMA zone, if any--but risks exhausting DMA zone. 3448 * DMA zone, if any--but risks exhausting DMA zone.
3446 */ 3449 */
3447 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3450 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3448 { 3451 {
3449 int j; 3452 int j;
3450 struct zonelist *zonelist; 3453 struct zonelist *zonelist;
3451 3454
3452 zonelist = &pgdat->node_zonelists[0]; 3455 zonelist = &pgdat->node_zonelists[0];
3453 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3456 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3454 ; 3457 ;
3455 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3458 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3456 zonelist->_zonerefs[j].zone = NULL; 3459 zonelist->_zonerefs[j].zone = NULL;
3457 zonelist->_zonerefs[j].zone_idx = 0; 3460 zonelist->_zonerefs[j].zone_idx = 0;
3458 } 3461 }
3459 3462
3460 /* 3463 /*
3461 * Build gfp_thisnode zonelists 3464 * Build gfp_thisnode zonelists
3462 */ 3465 */
3463 static void build_thisnode_zonelists(pg_data_t *pgdat) 3466 static void build_thisnode_zonelists(pg_data_t *pgdat)
3464 { 3467 {
3465 int j; 3468 int j;
3466 struct zonelist *zonelist; 3469 struct zonelist *zonelist;
3467 3470
3468 zonelist = &pgdat->node_zonelists[1]; 3471 zonelist = &pgdat->node_zonelists[1];
3469 j = build_zonelists_node(pgdat, zonelist, 0); 3472 j = build_zonelists_node(pgdat, zonelist, 0);
3470 zonelist->_zonerefs[j].zone = NULL; 3473 zonelist->_zonerefs[j].zone = NULL;
3471 zonelist->_zonerefs[j].zone_idx = 0; 3474 zonelist->_zonerefs[j].zone_idx = 0;
3472 } 3475 }
3473 3476
3474 /* 3477 /*
3475 * Build zonelists ordered by zone and nodes within zones. 3478 * Build zonelists ordered by zone and nodes within zones.
3476 * This results in conserving DMA zone[s] until all Normal memory is 3479 * This results in conserving DMA zone[s] until all Normal memory is
3477 * exhausted, but results in overflowing to remote node while memory 3480 * exhausted, but results in overflowing to remote node while memory
3478 * may still exist in local DMA zone. 3481 * may still exist in local DMA zone.
3479 */ 3482 */
3480 static int node_order[MAX_NUMNODES]; 3483 static int node_order[MAX_NUMNODES];
3481 3484
3482 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3485 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3483 { 3486 {
3484 int pos, j, node; 3487 int pos, j, node;
3485 int zone_type; /* needs to be signed */ 3488 int zone_type; /* needs to be signed */
3486 struct zone *z; 3489 struct zone *z;
3487 struct zonelist *zonelist; 3490 struct zonelist *zonelist;
3488 3491
3489 zonelist = &pgdat->node_zonelists[0]; 3492 zonelist = &pgdat->node_zonelists[0];
3490 pos = 0; 3493 pos = 0;
3491 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3494 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3492 for (j = 0; j < nr_nodes; j++) { 3495 for (j = 0; j < nr_nodes; j++) {
3493 node = node_order[j]; 3496 node = node_order[j];
3494 z = &NODE_DATA(node)->node_zones[zone_type]; 3497 z = &NODE_DATA(node)->node_zones[zone_type];
3495 if (populated_zone(z)) { 3498 if (populated_zone(z)) {
3496 zoneref_set_zone(z, 3499 zoneref_set_zone(z,
3497 &zonelist->_zonerefs[pos++]); 3500 &zonelist->_zonerefs[pos++]);
3498 check_highest_zone(zone_type); 3501 check_highest_zone(zone_type);
3499 } 3502 }
3500 } 3503 }
3501 } 3504 }
3502 zonelist->_zonerefs[pos].zone = NULL; 3505 zonelist->_zonerefs[pos].zone = NULL;
3503 zonelist->_zonerefs[pos].zone_idx = 0; 3506 zonelist->_zonerefs[pos].zone_idx = 0;
3504 } 3507 }
3505 3508
3506 static int default_zonelist_order(void) 3509 static int default_zonelist_order(void)
3507 { 3510 {
3508 int nid, zone_type; 3511 int nid, zone_type;
3509 unsigned long low_kmem_size, total_size; 3512 unsigned long low_kmem_size, total_size;
3510 struct zone *z; 3513 struct zone *z;
3511 int average_size; 3514 int average_size;
3512 /* 3515 /*
3513 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3516 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3514 * If they are really small and used heavily, the system can fall 3517 * If they are really small and used heavily, the system can fall
3515 * into OOM very easily. 3518 * into OOM very easily.
3516 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3519 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3517 */ 3520 */
3518 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3521 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3519 low_kmem_size = 0; 3522 low_kmem_size = 0;
3520 total_size = 0; 3523 total_size = 0;
3521 for_each_online_node(nid) { 3524 for_each_online_node(nid) {
3522 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3525 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3523 z = &NODE_DATA(nid)->node_zones[zone_type]; 3526 z = &NODE_DATA(nid)->node_zones[zone_type];
3524 if (populated_zone(z)) { 3527 if (populated_zone(z)) {
3525 if (zone_type < ZONE_NORMAL) 3528 if (zone_type < ZONE_NORMAL)
3526 low_kmem_size += z->managed_pages; 3529 low_kmem_size += z->managed_pages;
3527 total_size += z->managed_pages; 3530 total_size += z->managed_pages;
3528 } else if (zone_type == ZONE_NORMAL) { 3531 } else if (zone_type == ZONE_NORMAL) {
3529 /* 3532 /*
3530 * If any node has only lowmem, then node order 3533 * If any node has only lowmem, then node order
3531 * is preferred to allow kernel allocations 3534 * is preferred to allow kernel allocations
3532 * locally; otherwise, they can easily infringe 3535 * locally; otherwise, they can easily infringe
3533 * on other nodes when there is an abundance of 3536 * on other nodes when there is an abundance of
3534 * lowmem available to allocate from. 3537 * lowmem available to allocate from.
3535 */ 3538 */
3536 return ZONELIST_ORDER_NODE; 3539 return ZONELIST_ORDER_NODE;
3537 } 3540 }
3538 } 3541 }
3539 } 3542 }
3540 if (!low_kmem_size || /* there is no DMA area. */ 3543 if (!low_kmem_size || /* there is no DMA area. */
3541 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3544 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3542 return ZONELIST_ORDER_NODE; 3545 return ZONELIST_ORDER_NODE;
3543 /* 3546 /*
3544 * look into each node's config. 3547 * look into each node's config.
3545 * If there is a node whose DMA/DMA32 memory makes up a very big share of 3548 * If there is a node whose DMA/DMA32 memory makes up a very big share of
3546 * its local memory, NODE_ORDER may be suitable. 3549 * its local memory, NODE_ORDER may be suitable.
3547 */ 3550 */
3548 average_size = total_size / 3551 average_size = total_size /
3549 (nodes_weight(node_states[N_MEMORY]) + 1); 3552 (nodes_weight(node_states[N_MEMORY]) + 1);
3550 for_each_online_node(nid) { 3553 for_each_online_node(nid) {
3551 low_kmem_size = 0; 3554 low_kmem_size = 0;
3552 total_size = 0; 3555 total_size = 0;
3553 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3556 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3554 z = &NODE_DATA(nid)->node_zones[zone_type]; 3557 z = &NODE_DATA(nid)->node_zones[zone_type];
3555 if (populated_zone(z)) { 3558 if (populated_zone(z)) {
3556 if (zone_type < ZONE_NORMAL) 3559 if (zone_type < ZONE_NORMAL)
3557 low_kmem_size += z->present_pages; 3560 low_kmem_size += z->present_pages;
3558 total_size += z->present_pages; 3561 total_size += z->present_pages;
3559 } 3562 }
3560 } 3563 }
3561 if (low_kmem_size && 3564 if (low_kmem_size &&
3562 total_size > average_size && /* ignore small node */ 3565 total_size > average_size && /* ignore small node */
3563 low_kmem_size > total_size * 70/100) 3566 low_kmem_size > total_size * 70/100)
3564 return ZONELIST_ORDER_NODE; 3567 return ZONELIST_ORDER_NODE;
3565 } 3568 }
3566 return ZONELIST_ORDER_ZONE; 3569 return ZONELIST_ORDER_ZONE;
3567 } 3570 }
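Worked example of the heuristic (sizes hypothetical): if there is no lowmem at all, if DMA/DMA32 exceeds half of all memory, or if any node lacks a populated ZONE_NORMAL entirely, Node ordering is chosen outright. Otherwise a node only forces Node ordering when it is bigger than the (slightly discounted) average node size and more than 70% of its own pages are lowmem, e.g. a 4 GB node with 3.5 GB of DMA32; smaller or more balanced configurations fall through to Zone ordering, which conserves the scarce low zones.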
3568 3571
3569 static void set_zonelist_order(void) 3572 static void set_zonelist_order(void)
3570 { 3573 {
3571 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3574 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3572 current_zonelist_order = default_zonelist_order(); 3575 current_zonelist_order = default_zonelist_order();
3573 else 3576 else
3574 current_zonelist_order = user_zonelist_order; 3577 current_zonelist_order = user_zonelist_order;
3575 } 3578 }
3576 3579
3577 static void build_zonelists(pg_data_t *pgdat) 3580 static void build_zonelists(pg_data_t *pgdat)
3578 { 3581 {
3579 int j, node, load; 3582 int j, node, load;
3580 enum zone_type i; 3583 enum zone_type i;
3581 nodemask_t used_mask; 3584 nodemask_t used_mask;
3582 int local_node, prev_node; 3585 int local_node, prev_node;
3583 struct zonelist *zonelist; 3586 struct zonelist *zonelist;
3584 int order = current_zonelist_order; 3587 int order = current_zonelist_order;
3585 3588
3586 /* initialize zonelists */ 3589 /* initialize zonelists */
3587 for (i = 0; i < MAX_ZONELISTS; i++) { 3590 for (i = 0; i < MAX_ZONELISTS; i++) {
3588 zonelist = pgdat->node_zonelists + i; 3591 zonelist = pgdat->node_zonelists + i;
3589 zonelist->_zonerefs[0].zone = NULL; 3592 zonelist->_zonerefs[0].zone = NULL;
3590 zonelist->_zonerefs[0].zone_idx = 0; 3593 zonelist->_zonerefs[0].zone_idx = 0;
3591 } 3594 }
3592 3595
3593 /* NUMA-aware ordering of nodes */ 3596 /* NUMA-aware ordering of nodes */
3594 local_node = pgdat->node_id; 3597 local_node = pgdat->node_id;
3595 load = nr_online_nodes; 3598 load = nr_online_nodes;
3596 prev_node = local_node; 3599 prev_node = local_node;
3597 nodes_clear(used_mask); 3600 nodes_clear(used_mask);
3598 3601
3599 memset(node_order, 0, sizeof(node_order)); 3602 memset(node_order, 0, sizeof(node_order));
3600 j = 0; 3603 j = 0;
3601 3604
3602 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3605 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3603 /* 3606 /*
3604 * We don't want to pressure a particular node. 3607 * We don't want to pressure a particular node.
3605 * So we add a penalty to the first node in the same 3608 * So we add a penalty to the first node in the same
3606 * distance group to make it round-robin. 3609 * distance group to make it round-robin.
3607 */ 3610 */
3608 if (node_distance(local_node, node) != 3611 if (node_distance(local_node, node) !=
3609 node_distance(local_node, prev_node)) 3612 node_distance(local_node, prev_node))
3610 node_load[node] = load; 3613 node_load[node] = load;
3611 3614
3612 prev_node = node; 3615 prev_node = node;
3613 load--; 3616 load--;
3614 if (order == ZONELIST_ORDER_NODE) 3617 if (order == ZONELIST_ORDER_NODE)
3615 build_zonelists_in_node_order(pgdat, node); 3618 build_zonelists_in_node_order(pgdat, node);
3616 else 3619 else
3617 node_order[j++] = node; /* remember order */ 3620 node_order[j++] = node; /* remember order */
3618 } 3621 }
3619 3622
3620 if (order == ZONELIST_ORDER_ZONE) { 3623 if (order == ZONELIST_ORDER_ZONE) {
3621 /* calculate node order -- i.e., DMA last! */ 3624 /* calculate node order -- i.e., DMA last! */
3622 build_zonelists_in_zone_order(pgdat, j); 3625 build_zonelists_in_zone_order(pgdat, j);
3623 } 3626 }
3624 3627
3625 build_thisnode_zonelists(pgdat); 3628 build_thisnode_zonelists(pgdat);
3626 } 3629 }
3627 3630
3628 /* Construct the zonelist performance cache - see further mmzone.h */ 3631 /* Construct the zonelist performance cache - see further mmzone.h */
3629 static void build_zonelist_cache(pg_data_t *pgdat) 3632 static void build_zonelist_cache(pg_data_t *pgdat)
3630 { 3633 {
3631 struct zonelist *zonelist; 3634 struct zonelist *zonelist;
3632 struct zonelist_cache *zlc; 3635 struct zonelist_cache *zlc;
3633 struct zoneref *z; 3636 struct zoneref *z;
3634 3637
3635 zonelist = &pgdat->node_zonelists[0]; 3638 zonelist = &pgdat->node_zonelists[0];
3636 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3639 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3637 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3640 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3638 for (z = zonelist->_zonerefs; z->zone; z++) 3641 for (z = zonelist->_zonerefs; z->zone; z++)
3639 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3642 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3640 } 3643 }
3641 3644
3642 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3645 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3643 /* 3646 /*
3644 * Return node id of node used for "local" allocations. 3647 * Return node id of node used for "local" allocations.
3645 * I.e., first node id of first zone in arg node's generic zonelist. 3648 * I.e., first node id of first zone in arg node's generic zonelist.
3646 * Used for initializing percpu 'numa_mem', which is used primarily 3649 * Used for initializing percpu 'numa_mem', which is used primarily
3647 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3650 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3648 */ 3651 */
3649 int local_memory_node(int node) 3652 int local_memory_node(int node)
3650 { 3653 {
3651 struct zone *zone; 3654 struct zone *zone;
3652 3655
3653 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3656 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3654 gfp_zone(GFP_KERNEL), 3657 gfp_zone(GFP_KERNEL),
3655 NULL, 3658 NULL,
3656 &zone); 3659 &zone);
3657 return zone->node; 3660 return zone->node;
3658 } 3661 }
3659 #endif 3662 #endif
3660 3663
3661 #else /* CONFIG_NUMA */ 3664 #else /* CONFIG_NUMA */
3662 3665
3663 static void set_zonelist_order(void) 3666 static void set_zonelist_order(void)
3664 { 3667 {
3665 current_zonelist_order = ZONELIST_ORDER_ZONE; 3668 current_zonelist_order = ZONELIST_ORDER_ZONE;
3666 } 3669 }
3667 3670
3668 static void build_zonelists(pg_data_t *pgdat) 3671 static void build_zonelists(pg_data_t *pgdat)
3669 { 3672 {
3670 int node, local_node; 3673 int node, local_node;
3671 enum zone_type j; 3674 enum zone_type j;
3672 struct zonelist *zonelist; 3675 struct zonelist *zonelist;
3673 3676
3674 local_node = pgdat->node_id; 3677 local_node = pgdat->node_id;
3675 3678
3676 zonelist = &pgdat->node_zonelists[0]; 3679 zonelist = &pgdat->node_zonelists[0];
3677 j = build_zonelists_node(pgdat, zonelist, 0); 3680 j = build_zonelists_node(pgdat, zonelist, 0);
3678 3681
3679 /* 3682 /*
3680 * Now we build the zonelist so that it contains the zones 3683 * Now we build the zonelist so that it contains the zones
3681 * of all the other nodes. 3684 * of all the other nodes.
3682 * We don't want to pressure a particular node, so when 3685 * We don't want to pressure a particular node, so when
3683 * building the zones for node N, we make sure that the 3686 * building the zones for node N, we make sure that the
3684 * zones coming right after the local ones are those from 3687 * zones coming right after the local ones are those from
3685 * node N+1 (modulo N) 3688 * node N+1 (modulo N)
3686 */ 3689 */
3687 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3690 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3688 if (!node_online(node)) 3691 if (!node_online(node))
3689 continue; 3692 continue;
3690 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3693 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3691 } 3694 }
3692 for (node = 0; node < local_node; node++) { 3695 for (node = 0; node < local_node; node++) {
3693 if (!node_online(node)) 3696 if (!node_online(node))
3694 continue; 3697 continue;
3695 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3698 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3696 } 3699 }
3697 3700
3698 zonelist->_zonerefs[j].zone = NULL; 3701 zonelist->_zonerefs[j].zone = NULL;
3699 zonelist->_zonerefs[j].zone_idx = 0; 3702 zonelist->_zonerefs[j].zone_idx = 0;
3700 } 3703 }
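/*
 * Illustrative sketch (not part of page_alloc.c): the non-NUMA
 * build_zonelists() above appends the local node's zones first and then
 * walks the remaining online nodes starting at local_node + 1, wrapping
 * around to 0, so each node gets a different "next" node and no single
 * node is preferred. A minimal userspace rendering of that rotation, with
 * MAX_NUMNODES and node_online() replaced by example stand-ins:
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 8

/* Example stand-in for node_online(): pretend nodes 0, 1, 2 and 5 exist. */
static bool node_online(int node)
{
	return node == 0 || node == 1 || node == 2 || node == 5;
}

static void print_remote_node_order(int local_node)
{
	int node;

	printf("node %d falls back to:", local_node);
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		if (node_online(node))
			printf(" %d", node);
	for (node = 0; node < local_node; node++)
		if (node_online(node))
			printf(" %d", node);
	printf("\n");
}

int main(void)
{
	print_remote_node_order(1);	/* -> 2 5 0 */
	print_remote_node_order(5);	/* -> 0 1 2 */
	return 0;
}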
3701 3704
3702 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3705 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3703 static void build_zonelist_cache(pg_data_t *pgdat) 3706 static void build_zonelist_cache(pg_data_t *pgdat)
3704 { 3707 {
3705 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3708 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3706 } 3709 }
3707 3710
3708 #endif /* CONFIG_NUMA */ 3711 #endif /* CONFIG_NUMA */
3709 3712
3710 /* 3713 /*
3711 * Boot pageset table. One per cpu which is going to be used for all 3714 * Boot pageset table. One per cpu which is going to be used for all
3712 * zones and all nodes. The parameters will be set in such a way 3715 * zones and all nodes. The parameters will be set in such a way
3713 * that an item put on a list will immediately be handed over to 3716 * that an item put on a list will immediately be handed over to
3714 * the buddy list. This is safe since pageset manipulation is done 3717 * the buddy list. This is safe since pageset manipulation is done
3715 * with interrupts disabled. 3718 * with interrupts disabled.
3716 * 3719 *
3717 * The boot_pagesets must be kept even after bootup is complete for 3720 * The boot_pagesets must be kept even after bootup is complete for
3718 * unused processors and/or zones. They do play a role for bootstrapping 3721 * unused processors and/or zones. They do play a role for bootstrapping
3719 * hotplugged processors. 3722 * hotplugged processors.
3720 * 3723 *
3721 * zoneinfo_show() and maybe other functions do 3724 * zoneinfo_show() and maybe other functions do
3722 * not check if the processor is online before following the pageset pointer. 3725 * not check if the processor is online before following the pageset pointer.
3723 * Other parts of the kernel may not check if the zone is available. 3726 * Other parts of the kernel may not check if the zone is available.
3724 */ 3727 */
3725 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3728 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3726 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3729 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3727 static void setup_zone_pageset(struct zone *zone); 3730 static void setup_zone_pageset(struct zone *zone);
3728 3731
3729 /* 3732 /*
3730 * Global mutex to protect against size modification of zonelists 3733 * Global mutex to protect against size modification of zonelists
3731 * as well as to serialize pageset setup for the new populated zone. 3734 * as well as to serialize pageset setup for the new populated zone.
3732 */ 3735 */
3733 DEFINE_MUTEX(zonelists_mutex); 3736 DEFINE_MUTEX(zonelists_mutex);
3734 3737
3735 /* return type is int just to satisfy stop_machine() */ 3738 /* return type is int just to satisfy stop_machine() */
3736 static int __build_all_zonelists(void *data) 3739 static int __build_all_zonelists(void *data)
3737 { 3740 {
3738 int nid; 3741 int nid;
3739 int cpu; 3742 int cpu;
3740 pg_data_t *self = data; 3743 pg_data_t *self = data;
3741 3744
3742 #ifdef CONFIG_NUMA 3745 #ifdef CONFIG_NUMA
3743 memset(node_load, 0, sizeof(node_load)); 3746 memset(node_load, 0, sizeof(node_load));
3744 #endif 3747 #endif
3745 3748
3746 if (self && !node_online(self->node_id)) { 3749 if (self && !node_online(self->node_id)) {
3747 build_zonelists(self); 3750 build_zonelists(self);
3748 build_zonelist_cache(self); 3751 build_zonelist_cache(self);
3749 } 3752 }
3750 3753
3751 for_each_online_node(nid) { 3754 for_each_online_node(nid) {
3752 pg_data_t *pgdat = NODE_DATA(nid); 3755 pg_data_t *pgdat = NODE_DATA(nid);
3753 3756
3754 build_zonelists(pgdat); 3757 build_zonelists(pgdat);
3755 build_zonelist_cache(pgdat); 3758 build_zonelist_cache(pgdat);
3756 } 3759 }
3757 3760
3758 /* 3761 /*
3759 * Initialize the boot_pagesets that are going to be used 3762 * Initialize the boot_pagesets that are going to be used
3760 * for bootstrapping processors. The real pagesets for 3763 * for bootstrapping processors. The real pagesets for
3761 * each zone will be allocated later when the per cpu 3764 * each zone will be allocated later when the per cpu
3762 * allocator is available. 3765 * allocator is available.
3763 * 3766 *
3764 * boot_pagesets are used also for bootstrapping offline 3767 * boot_pagesets are used also for bootstrapping offline
3765 * cpus if the system is already booted because the pagesets 3768 * cpus if the system is already booted because the pagesets
3766 * are needed to initialize allocators on a specific cpu too. 3769 * are needed to initialize allocators on a specific cpu too.
3767 * F.e. the percpu allocator needs the page allocator which 3770 * F.e. the percpu allocator needs the page allocator which
3768 * needs the percpu allocator in order to allocate its pagesets 3771 * needs the percpu allocator in order to allocate its pagesets
3769 * (a chicken-egg dilemma). 3772 * (a chicken-egg dilemma).
3770 */ 3773 */
3771 for_each_possible_cpu(cpu) { 3774 for_each_possible_cpu(cpu) {
3772 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3775 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3773 3776
3774 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3777 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3775 /* 3778 /*
3776 * We now know the "local memory node" for each node-- 3779 * We now know the "local memory node" for each node--
3777 * i.e., the node of the first zone in the generic zonelist. 3780 * i.e., the node of the first zone in the generic zonelist.
3778 * Set up numa_mem percpu variable for on-line cpus. During 3781 * Set up numa_mem percpu variable for on-line cpus. During
3779 * boot, only the boot cpu should be on-line; we'll init the 3782 * boot, only the boot cpu should be on-line; we'll init the
3780 * secondary cpus' numa_mem as they come on-line. During 3783 * secondary cpus' numa_mem as they come on-line. During
3781 * node/memory hotplug, we'll fixup all on-line cpus. 3784 * node/memory hotplug, we'll fixup all on-line cpus.
3782 */ 3785 */
3783 if (cpu_online(cpu)) 3786 if (cpu_online(cpu))
3784 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3787 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3785 #endif 3788 #endif
3786 } 3789 }
3787 3790
3788 return 0; 3791 return 0;
3789 } 3792 }
3790 3793
3791 /* 3794 /*
3792 * Called with zonelists_mutex held always 3795 * Called with zonelists_mutex held always
3793 * unless system_state == SYSTEM_BOOTING. 3796 * unless system_state == SYSTEM_BOOTING.
3794 */ 3797 */
3795 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3798 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3796 { 3799 {
3797 set_zonelist_order(); 3800 set_zonelist_order();
3798 3801
3799 if (system_state == SYSTEM_BOOTING) { 3802 if (system_state == SYSTEM_BOOTING) {
3800 __build_all_zonelists(NULL); 3803 __build_all_zonelists(NULL);
3801 mminit_verify_zonelist(); 3804 mminit_verify_zonelist();
3802 cpuset_init_current_mems_allowed(); 3805 cpuset_init_current_mems_allowed();
3803 } else { 3806 } else {
3804 #ifdef CONFIG_MEMORY_HOTPLUG 3807 #ifdef CONFIG_MEMORY_HOTPLUG
3805 if (zone) 3808 if (zone)
3806 setup_zone_pageset(zone); 3809 setup_zone_pageset(zone);
3807 #endif 3810 #endif
3808 /* we have to stop all cpus to guarantee there is no user 3811 /* we have to stop all cpus to guarantee there is no user
3809 of zonelist */ 3812 of zonelist */
3810 stop_machine(__build_all_zonelists, pgdat, NULL); 3813 stop_machine(__build_all_zonelists, pgdat, NULL);
3811 /* cpuset refresh routine should be here */ 3814 /* cpuset refresh routine should be here */
3812 } 3815 }
3813 vm_total_pages = nr_free_pagecache_pages(); 3816 vm_total_pages = nr_free_pagecache_pages();
3814 /* 3817 /*
3815 * Disable grouping by mobility if the number of pages in the 3818 * Disable grouping by mobility if the number of pages in the
3816 * system is too low to allow the mechanism to work. It would be 3819 * system is too low to allow the mechanism to work. It would be
3817 * more accurate, but expensive to check per-zone. This check is 3820 * more accurate, but expensive to check per-zone. This check is
3818 * made on memory-hotadd so a system can start with mobility 3821 * made on memory-hotadd so a system can start with mobility
3819 * disabled and enable it later 3822 * disabled and enable it later
3820 */ 3823 */
3821 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3824 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3822 page_group_by_mobility_disabled = 1; 3825 page_group_by_mobility_disabled = 1;
3823 else 3826 else
3824 page_group_by_mobility_disabled = 0; 3827 page_group_by_mobility_disabled = 0;
3825 3828
3826 printk("Built %i zonelists in %s order, mobility grouping %s. " 3829 printk("Built %i zonelists in %s order, mobility grouping %s. "
3827 "Total pages: %ld\n", 3830 "Total pages: %ld\n",
3828 nr_online_nodes, 3831 nr_online_nodes,
3829 zonelist_order_name[current_zonelist_order], 3832 zonelist_order_name[current_zonelist_order],
3830 page_group_by_mobility_disabled ? "off" : "on", 3833 page_group_by_mobility_disabled ? "off" : "on",
3831 vm_total_pages); 3834 vm_total_pages);
3832 #ifdef CONFIG_NUMA 3835 #ifdef CONFIG_NUMA
3833 printk("Policy zone: %s\n", zone_names[policy_zone]); 3836 printk("Policy zone: %s\n", zone_names[policy_zone]);
3834 #endif 3837 #endif
3835 } 3838 }
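/*
 * Illustrative sketch (not part of page_alloc.c): build_all_zonelists()
 * above disables grouping by mobility when the whole system holds fewer
 * pages than one pageblock per migratetype. The constants below are
 * example assumptions (2 MiB pageblocks of 4 KiB pages, five migratetypes);
 * the real values depend on architecture and config.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL
#define NR_MIGRATE_TYPES	5UL

static bool mobility_grouping_enabled(unsigned long vm_total_pages)
{
	return vm_total_pages >= PAGEBLOCK_NR_PAGES * NR_MIGRATE_TYPES;
}

int main(void)
{
	printf("8 MiB of pages:  %s\n",
	       mobility_grouping_enabled(2048) ? "on" : "off");	/* off */
	printf("64 MiB of pages: %s\n",
	       mobility_grouping_enabled(16384) ? "on" : "off");	/* on */
	return 0;
}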
3836 3839
3837 /* 3840 /*
3838 * Helper functions to size the waitqueue hash table. 3841 * Helper functions to size the waitqueue hash table.
3839 * Essentially these want to choose hash table sizes sufficiently 3842 * Essentially these want to choose hash table sizes sufficiently
3840 * large so that collisions trying to wait on pages are rare. 3843 * large so that collisions trying to wait on pages are rare.
3841 * But in fact, the number of active page waitqueues on typical 3844 * But in fact, the number of active page waitqueues on typical
3842 * systems is ridiculously low, less than 200. So this is even 3845 * systems is ridiculously low, less than 200. So this is even
3843 * conservative, even though it seems large. 3846 * conservative, even though it seems large.
3844 * 3847 *
3845 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3848 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3846 * waitqueues, i.e. the size of the waitq table given the number of pages. 3849 * waitqueues, i.e. the size of the waitq table given the number of pages.
3847 */ 3850 */
3848 #define PAGES_PER_WAITQUEUE 256 3851 #define PAGES_PER_WAITQUEUE 256
3849 3852
3850 #ifndef CONFIG_MEMORY_HOTPLUG 3853 #ifndef CONFIG_MEMORY_HOTPLUG
3851 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3854 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3852 { 3855 {
3853 unsigned long size = 1; 3856 unsigned long size = 1;
3854 3857
3855 pages /= PAGES_PER_WAITQUEUE; 3858 pages /= PAGES_PER_WAITQUEUE;
3856 3859
3857 while (size < pages) 3860 while (size < pages)
3858 size <<= 1; 3861 size <<= 1;
3859 3862
3860 /* 3863 /*
3861 * Once we have dozens or even hundreds of threads sleeping 3864 * Once we have dozens or even hundreds of threads sleeping
3862 * on IO we've got bigger problems than wait queue collision. 3865 * on IO we've got bigger problems than wait queue collision.
3863 * Limit the size of the wait table to a reasonable size. 3866 * Limit the size of the wait table to a reasonable size.
3864 */ 3867 */
3865 size = min(size, 4096UL); 3868 size = min(size, 4096UL);
3866 3869
3867 return max(size, 4UL); 3870 return max(size, 4UL);
3868 } 3871 }
3869 #else 3872 #else
3870 /* 3873 /*
3871 * A zone's size might be changed by hot-add, so it is not possible to determine 3874 * A zone's size might be changed by hot-add, so it is not possible to determine
3872 * a suitable size for its wait_table. So we use the maximum size now. 3875 * a suitable size for its wait_table. So we use the maximum size now.
3873 * 3876 *
3874 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3877 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3875 * 3878 *
3876 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3879 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3877 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3880 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3878 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3881 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3879 * 3882 *
3880 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3883 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3881 * or more by the traditional way. (See above). It equals: 3884 * or more by the traditional way. (See above). It equals:
3882 * 3885 *
3883 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3886 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3884 * ia64(16K page size) : = ( 8G + 4M)byte. 3887 * ia64(16K page size) : = ( 8G + 4M)byte.
3885 * powerpc (64K page size) : = (32G +16M)byte. 3888 * powerpc (64K page size) : = (32G +16M)byte.
3886 */ 3889 */
3887 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3890 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3888 { 3891 {
3889 return 4096UL; 3892 return 4096UL;
3890 } 3893 }
3891 #endif 3894 #endif
3892 3895
3893 /* 3896 /*
3894 * This is an integer logarithm so that shifts can be used later 3897 * This is an integer logarithm so that shifts can be used later
3895 * to extract the more random high bits from the multiplicative 3898 * to extract the more random high bits from the multiplicative
3896 * hash function before the remainder is taken. 3899 * hash function before the remainder is taken.
3897 */ 3900 */
3898 static inline unsigned long wait_table_bits(unsigned long size) 3901 static inline unsigned long wait_table_bits(unsigned long size)
3899 { 3902 {
3900 return ffz(~size); 3903 return ffz(~size);
3901 } 3904 }
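/*
 * Illustrative sketch (not part of page_alloc.c): the !CONFIG_MEMORY_HOTPLUG
 * sizing above rounds pages / PAGES_PER_WAITQUEUE up to a power of two and
 * clamps it to [4, 4096]; wait_table_bits() then recovers log2(size), which
 * is what ffz(~size) yields for a power-of-two size. A userspace rendering
 * of both steps:
 */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long waitq_hash_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* log2 of a power-of-two size, the portable equivalent of ffz(~size). */
static unsigned long waitq_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1UL) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long pages = 262144UL;	/* a 1 GiB zone of 4 KiB pages */
	unsigned long size = waitq_hash_entries(pages);

	/* prints: 262144 pages -> 1024 waitqueues (10 bits) */
	printf("%lu pages -> %lu waitqueues (%lu bits)\n",
	       pages, size, waitq_bits(size));
	return 0;
}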
3902 3905
3903 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3906 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3904 3907
3905 /* 3908 /*
3906 * Check if a pageblock contains reserved pages 3909 * Check if a pageblock contains reserved pages
3907 */ 3910 */
3908 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3911 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3909 { 3912 {
3910 unsigned long pfn; 3913 unsigned long pfn;
3911 3914
3912 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3915 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3913 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3916 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3914 return 1; 3917 return 1;
3915 } 3918 }
3916 return 0; 3919 return 0;
3917 } 3920 }
3918 3921
3919 /* 3922 /*
3920 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3923 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3921 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3924 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3922 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3925 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3923 * higher will lead to a bigger reserve which will get freed as contiguous 3926 * higher will lead to a bigger reserve which will get freed as contiguous
3924 * blocks as reclaim kicks in 3927 * blocks as reclaim kicks in
3925 */ 3928 */
3926 static void setup_zone_migrate_reserve(struct zone *zone) 3929 static void setup_zone_migrate_reserve(struct zone *zone)
3927 { 3930 {
3928 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3931 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3929 struct page *page; 3932 struct page *page;
3930 unsigned long block_migratetype; 3933 unsigned long block_migratetype;
3931 int reserve; 3934 int reserve;
3932 int old_reserve; 3935 int old_reserve;
3933 3936
3934 /* 3937 /*
3935 * Get the start pfn, end pfn and the number of blocks to reserve 3938 * Get the start pfn, end pfn and the number of blocks to reserve
3936 * We have to be careful to be aligned to pageblock_nr_pages to 3939 * We have to be careful to be aligned to pageblock_nr_pages to
3937 * make sure that we always check pfn_valid for the first page in 3940 * make sure that we always check pfn_valid for the first page in
3938 * the block. 3941 * the block.
3939 */ 3942 */
3940 start_pfn = zone->zone_start_pfn; 3943 start_pfn = zone->zone_start_pfn;
3941 end_pfn = zone_end_pfn(zone); 3944 end_pfn = zone_end_pfn(zone);
3942 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3945 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3943 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3946 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3944 pageblock_order; 3947 pageblock_order;
3945 3948
3946 /* 3949 /*
3947 * Reserve blocks are generally in place to help high-order atomic 3950 * Reserve blocks are generally in place to help high-order atomic
3948 * allocations that are short-lived. A min_free_kbytes value that 3951 * allocations that are short-lived. A min_free_kbytes value that
3949 * would result in more than 2 reserve blocks for atomic allocations 3952 * would result in more than 2 reserve blocks for atomic allocations
3950 * is assumed to be in place to help anti-fragmentation for the 3953 * is assumed to be in place to help anti-fragmentation for the
3951 * future allocation of hugepages at runtime. 3954 * future allocation of hugepages at runtime.
3952 */ 3955 */
3953 reserve = min(2, reserve); 3956 reserve = min(2, reserve);
3954 old_reserve = zone->nr_migrate_reserve_block; 3957 old_reserve = zone->nr_migrate_reserve_block;
3955 3958
3956 /* On memory hot-add, we almost always need to do nothing */ 3959 /* On memory hot-add, we almost always need to do nothing */
3957 if (reserve == old_reserve) 3960 if (reserve == old_reserve)
3958 return; 3961 return;
3959 zone->nr_migrate_reserve_block = reserve; 3962 zone->nr_migrate_reserve_block = reserve;
3960 3963
3961 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3964 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3962 if (!pfn_valid(pfn)) 3965 if (!pfn_valid(pfn))
3963 continue; 3966 continue;
3964 page = pfn_to_page(pfn); 3967 page = pfn_to_page(pfn);
3965 3968
3966 /* Watch out for overlapping nodes */ 3969 /* Watch out for overlapping nodes */
3967 if (page_to_nid(page) != zone_to_nid(zone)) 3970 if (page_to_nid(page) != zone_to_nid(zone))
3968 continue; 3971 continue;
3969 3972
3970 block_migratetype = get_pageblock_migratetype(page); 3973 block_migratetype = get_pageblock_migratetype(page);
3971 3974
3972 /* Only test what is necessary when the reserves are not met */ 3975 /* Only test what is necessary when the reserves are not met */
3973 if (reserve > 0) { 3976 if (reserve > 0) {
3974 /* 3977 /*
3975 * Blocks with reserved pages will never free, skip 3978 * Blocks with reserved pages will never free, skip
3976 * them. 3979 * them.
3977 */ 3980 */
3978 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3981 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3979 if (pageblock_is_reserved(pfn, block_end_pfn)) 3982 if (pageblock_is_reserved(pfn, block_end_pfn))
3980 continue; 3983 continue;
3981 3984
3982 /* If this block is reserved, account for it */ 3985 /* If this block is reserved, account for it */
3983 if (block_migratetype == MIGRATE_RESERVE) { 3986 if (block_migratetype == MIGRATE_RESERVE) {
3984 reserve--; 3987 reserve--;
3985 continue; 3988 continue;
3986 } 3989 }
3987 3990
3988 /* Suitable for reserving if this block is movable */ 3991 /* Suitable for reserving if this block is movable */
3989 if (block_migratetype == MIGRATE_MOVABLE) { 3992 if (block_migratetype == MIGRATE_MOVABLE) {
3990 set_pageblock_migratetype(page, 3993 set_pageblock_migratetype(page,
3991 MIGRATE_RESERVE); 3994 MIGRATE_RESERVE);
3992 move_freepages_block(zone, page, 3995 move_freepages_block(zone, page,
3993 MIGRATE_RESERVE); 3996 MIGRATE_RESERVE);
3994 reserve--; 3997 reserve--;
3995 continue; 3998 continue;
3996 } 3999 }
3997 } else if (!old_reserve) { 4000 } else if (!old_reserve) {
3998 /* 4001 /*
3999 * At boot time we don't need to scan the whole zone 4002 * At boot time we don't need to scan the whole zone
4000 * for turning off MIGRATE_RESERVE. 4003 * for turning off MIGRATE_RESERVE.
4001 */ 4004 */
4002 break; 4005 break;
4003 } 4006 }
4004 4007
4005 /* 4008 /*
4006 * If the reserve is met and this is a previous reserved block, 4009 * If the reserve is met and this is a previous reserved block,
4007 * take it back 4010 * take it back
4008 */ 4011 */
4009 if (block_migratetype == MIGRATE_RESERVE) { 4012 if (block_migratetype == MIGRATE_RESERVE) {
4010 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4013 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4011 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4014 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4012 } 4015 }
4013 } 4016 }
4014 } 4017 }
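/*
 * Illustrative sketch (not part of page_alloc.c): the reserve size computed
 * at the top of setup_zone_migrate_reserve() is the zone's min watermark
 * rounded up to whole pageblocks and capped at two blocks. The pageblock
 * geometry below is an example assumption (order-9 blocks of 4 KiB pages).
 */
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL
#define PAGEBLOCK_ORDER		9

static unsigned long migrate_reserve_blocks(unsigned long min_wmark_pages)
{
	unsigned long rounded = (min_wmark_pages + PAGEBLOCK_NR_PAGES - 1) /
				PAGEBLOCK_NR_PAGES * PAGEBLOCK_NR_PAGES;
	unsigned long reserve = rounded >> PAGEBLOCK_ORDER;

	return reserve < 2UL ? reserve : 2UL;	/* min(2, reserve) */
}

int main(void)
{
	printf("min watermark  128 pages -> %lu reserve block(s)\n",
	       migrate_reserve_blocks(128));	/* 1 */
	printf("min watermark 4096 pages -> %lu reserve block(s)\n",
	       migrate_reserve_blocks(4096));	/* capped at 2 */
	return 0;
}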
4015 4018
4016 /* 4019 /*
4017 * Initially all pages are reserved - free ones are freed 4020 * Initially all pages are reserved - free ones are freed
4018 * up by free_all_bootmem() once the early boot process is 4021 * up by free_all_bootmem() once the early boot process is
4019 * done. Non-atomic initialization, single-pass. 4022 * done. Non-atomic initialization, single-pass.
4020 */ 4023 */
4021 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4024 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4022 unsigned long start_pfn, enum memmap_context context) 4025 unsigned long start_pfn, enum memmap_context context)
4023 { 4026 {
4024 struct page *page; 4027 struct page *page;
4025 unsigned long end_pfn = start_pfn + size; 4028 unsigned long end_pfn = start_pfn + size;
4026 unsigned long pfn; 4029 unsigned long pfn;
4027 struct zone *z; 4030 struct zone *z;
4028 4031
4029 if (highest_memmap_pfn < end_pfn - 1) 4032 if (highest_memmap_pfn < end_pfn - 1)
4030 highest_memmap_pfn = end_pfn - 1; 4033 highest_memmap_pfn = end_pfn - 1;
4031 4034
4032 z = &NODE_DATA(nid)->node_zones[zone]; 4035 z = &NODE_DATA(nid)->node_zones[zone];
4033 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4036 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4034 /* 4037 /*
4035 * There can be holes in boot-time mem_map[]s 4038 * There can be holes in boot-time mem_map[]s
4036 * handed to this function. They do not 4039 * handed to this function. They do not
4037 * exist on hotplugged memory. 4040 * exist on hotplugged memory.
4038 */ 4041 */
4039 if (context == MEMMAP_EARLY) { 4042 if (context == MEMMAP_EARLY) {
4040 if (!early_pfn_valid(pfn)) 4043 if (!early_pfn_valid(pfn))
4041 continue; 4044 continue;
4042 if (!early_pfn_in_nid(pfn, nid)) 4045 if (!early_pfn_in_nid(pfn, nid))
4043 continue; 4046 continue;
4044 } 4047 }
4045 page = pfn_to_page(pfn); 4048 page = pfn_to_page(pfn);
4046 set_page_links(page, zone, nid, pfn); 4049 set_page_links(page, zone, nid, pfn);
4047 mminit_verify_page_links(page, zone, nid, pfn); 4050 mminit_verify_page_links(page, zone, nid, pfn);
4048 init_page_count(page); 4051 init_page_count(page);
4049 page_mapcount_reset(page); 4052 page_mapcount_reset(page);
4050 page_nid_reset_last(page); 4053 page_nid_reset_last(page);
4051 SetPageReserved(page); 4054 SetPageReserved(page);
4052 /* 4055 /*
4053 * Mark the block movable so that blocks are reserved for 4056 * Mark the block movable so that blocks are reserved for
4054 * movable at startup. This will force kernel allocations 4057 * movable at startup. This will force kernel allocations
4055 * to reserve their blocks rather than leaking throughout 4058 * to reserve their blocks rather than leaking throughout
4056 * the address space during boot when many long-lived 4059 * the address space during boot when many long-lived
4057 * kernel allocations are made. Later some blocks near 4060 * kernel allocations are made. Later some blocks near
4058 * the start are marked MIGRATE_RESERVE by 4061 * the start are marked MIGRATE_RESERVE by
4059 * setup_zone_migrate_reserve() 4062 * setup_zone_migrate_reserve()
4060 * 4063 *
4061 * bitmap is created for the zone's valid pfn range, but the memmap 4064 * bitmap is created for the zone's valid pfn range, but the memmap
4062 * can also be created for invalid pages (for alignment), so check 4065 * can also be created for invalid pages (for alignment), so check
4063 * here that we do not call set_pageblock_migratetype() against a 4066 * here that we do not call set_pageblock_migratetype() against a
4064 * pfn out of the zone. 4067 * pfn out of the zone.
4065 */ 4068 */
4066 if ((z->zone_start_pfn <= pfn) 4069 if ((z->zone_start_pfn <= pfn)
4067 && (pfn < zone_end_pfn(z)) 4070 && (pfn < zone_end_pfn(z))
4068 && !(pfn & (pageblock_nr_pages - 1))) 4071 && !(pfn & (pageblock_nr_pages - 1)))
4069 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4072 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4070 4073
4071 INIT_LIST_HEAD(&page->lru); 4074 INIT_LIST_HEAD(&page->lru);
4072 #ifdef WANT_PAGE_VIRTUAL 4075 #ifdef WANT_PAGE_VIRTUAL
4073 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4076 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4074 if (!is_highmem_idx(zone)) 4077 if (!is_highmem_idx(zone))
4075 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4078 set_page_address(page, __va(pfn << PAGE_SHIFT));
4076 #endif 4079 #endif
4077 } 4080 }
4078 } 4081 }
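/*
 * Illustrative sketch (not part of page_alloc.c): memmap_init_zone() above
 * only marks a pageblock MIGRATE_MOVABLE when the pfn lies inside the zone
 * and sits on a pageblock boundary, which the mask test below detects.
 * The pageblock size is an example assumption (order-9, i.e. 512 pages).
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

static bool is_pageblock_start(unsigned long pfn)
{
	return (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0;
}

int main(void)
{
	printf("pfn 1024 starts a pageblock: %d\n", is_pageblock_start(1024));	/* 1 */
	printf("pfn 1030 starts a pageblock: %d\n", is_pageblock_start(1030));	/* 0 */
	return 0;
}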
4079 4082
4080 static void __meminit zone_init_free_lists(struct zone *zone) 4083 static void __meminit zone_init_free_lists(struct zone *zone)
4081 { 4084 {
4082 int order, t; 4085 int order, t;
4083 for_each_migratetype_order(order, t) { 4086 for_each_migratetype_order(order, t) {
4084 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4087 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4085 zone->free_area[order].nr_free = 0; 4088 zone->free_area[order].nr_free = 0;
4086 } 4089 }
4087 } 4090 }
4088 4091
4089 #ifndef __HAVE_ARCH_MEMMAP_INIT 4092 #ifndef __HAVE_ARCH_MEMMAP_INIT
4090 #define memmap_init(size, nid, zone, start_pfn) \ 4093 #define memmap_init(size, nid, zone, start_pfn) \
4091 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4094 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4092 #endif 4095 #endif
4093 4096
4094 static int zone_batchsize(struct zone *zone) 4097 static int zone_batchsize(struct zone *zone)
4095 { 4098 {
4096 #ifdef CONFIG_MMU 4099 #ifdef CONFIG_MMU
4097 int batch; 4100 int batch;
4098 4101
4099 /* 4102 /*
4100 * The per-cpu-pages pools are set to around 1000th of the 4103 * The per-cpu-pages pools are set to around 1000th of the
4101 * size of the zone. But no more than 1/2 of a meg. 4104 * size of the zone. But no more than 1/2 of a meg.
4102 * 4105 *
4103 * OK, so we don't know how big the cache is. So guess. 4106 * OK, so we don't know how big the cache is. So guess.
4104 */ 4107 */
4105 batch = zone->managed_pages / 1024; 4108 batch = zone->managed_pages / 1024;
4106 if (batch * PAGE_SIZE > 512 * 1024) 4109 if (batch * PAGE_SIZE > 512 * 1024)
4107 batch = (512 * 1024) / PAGE_SIZE; 4110 batch = (512 * 1024) / PAGE_SIZE;
4108 batch /= 4; /* We effectively *= 4 below */ 4111 batch /= 4; /* We effectively *= 4 below */
4109 if (batch < 1) 4112 if (batch < 1)
4110 batch = 1; 4113 batch = 1;
4111 4114
4112 /* 4115 /*
4113 * Clamp the batch to a 2^n - 1 value. Having a power 4116 * Clamp the batch to a 2^n - 1 value. Having a power
4114 * of 2 value was found to be more likely to have 4117 * of 2 value was found to be more likely to have
4115 * suboptimal cache aliasing properties in some cases. 4118 * suboptimal cache aliasing properties in some cases.
4116 * 4119 *
4117 * For example if 2 tasks are alternately allocating 4120 * For example if 2 tasks are alternately allocating
4118 * batches of pages, one task can end up with a lot 4121 * batches of pages, one task can end up with a lot
4119 * of pages of one half of the possible page colors 4122 * of pages of one half of the possible page colors
4120 * and the other with pages of the other colors. 4123 * and the other with pages of the other colors.
4121 */ 4124 */
4122 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4125 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4123 4126
4124 return batch; 4127 return batch;
4125 4128
4126 #else 4129 #else
4127 /* The deferral and batching of frees should be suppressed under NOMMU 4130 /* The deferral and batching of frees should be suppressed under NOMMU
4128 * conditions. 4131 * conditions.
4129 * 4132 *
4130 * The problem is that NOMMU needs to be able to allocate large chunks 4133 * The problem is that NOMMU needs to be able to allocate large chunks
4131 * of contiguous memory as there's no hardware page translation to 4134 * of contiguous memory as there's no hardware page translation to
4132 * assemble apparent contiguous memory from discontiguous pages. 4135 * assemble apparent contiguous memory from discontiguous pages.
4133 * 4136 *
4134 * Queueing large contiguous runs of pages for batching, however, 4137 * Queueing large contiguous runs of pages for batching, however,
4135 * causes the pages to actually be freed in smaller chunks. As there 4138 * causes the pages to actually be freed in smaller chunks. As there
4136 * can be a significant delay between the individual batches being 4139 * can be a significant delay between the individual batches being
4137 * recycled, this leads to the once large chunks of space being 4140 * recycled, this leads to the once large chunks of space being
4138 * fragmented and becoming unavailable for high-order allocations. 4141 * fragmented and becoming unavailable for high-order allocations.
4139 */ 4142 */
4140 return 0; 4143 return 0;
4141 #endif 4144 #endif
4142 } 4145 }
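/*
 * Illustrative sketch (not part of page_alloc.c): the CONFIG_MMU branch of
 * zone_batchsize() above takes ~1/1024 of the zone, caps it at 512 KiB
 * worth of pages, divides by four and clamps the result to one below a
 * power of two. Reproduced here with an assumed 4 KiB page size:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long rounddown_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

static int batchsize(unsigned long managed_pages)
{
	unsigned long batch = managed_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;		/* pcp->high becomes roughly 6 * batch */
	if (batch < 1)
		batch = 1;
	/* clamp to 2^n - 1 to avoid pathological cache colouring */
	return (int)(rounddown_pow_of_two(batch + batch / 2) - 1);
}

int main(void)
{
	printf("128 MiB zone -> batch %d\n", batchsize(32768));		/* 7 */
	printf("  1 GiB zone -> batch %d\n", batchsize(262144));	/* 31 */
	return 0;
}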
4143 4146
4144 /* 4147 /*
4145 * pcp->high and pcp->batch values are related and dependent on one another: 4148 * pcp->high and pcp->batch values are related and dependent on one another:
4146 * ->batch must never be higher than ->high. 4149 * ->batch must never be higher than ->high.
4147 * The following function updates them in a safe manner without read side 4150 * The following function updates them in a safe manner without read side
4148 * locking. 4151 * locking.
4149 * 4152 *
4150 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4153 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4151 * those fields changing asynchronously (according to the above rule). 4154 * those fields changing asynchronously (according to the above rule).
4152 * 4155 *
4153 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4156 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4154 * outside of boot time (or some other assurance that no concurrent updaters 4157 * outside of boot time (or some other assurance that no concurrent updaters
4155 * exist). 4158 * exist).
4156 */ 4159 */
4157 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4160 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4158 unsigned long batch) 4161 unsigned long batch)
4159 { 4162 {
4160 /* start with a fail safe value for batch */ 4163 /* start with a fail safe value for batch */
4161 pcp->batch = 1; 4164 pcp->batch = 1;
4162 smp_wmb(); 4165 smp_wmb();
4163 4166
4164 /* Update high, then batch, in order */ 4167 /* Update high, then batch, in order */
4165 pcp->high = high; 4168 pcp->high = high;
4166 smp_wmb(); 4169 smp_wmb();
4167 4170
4168 pcp->batch = batch; 4171 pcp->batch = batch;
4169 } 4172 }
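/*
 * Illustrative sketch (not part of page_alloc.c): pageset_update() above
 * first parks ->batch at a fail-safe value of 1, then publishes the new
 * ->high, then the new ->batch, with a write barrier after each of the
 * first two stores, so the new batch can never become visible before the
 * new high; readers must still tolerate the fields changing underneath
 * them. The userspace analogue below uses C11 release fences in place of
 * smp_wmb().
 */
#include <stdatomic.h>
#include <stdio.h>

struct pcp {
	atomic_ulong high;
	atomic_ulong batch;
};

static void pcp_update(struct pcp *p, unsigned long high, unsigned long batch)
{
	atomic_store_explicit(&p->batch, 1UL, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */

	atomic_store_explicit(&p->high, high, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */

	atomic_store_explicit(&p->batch, batch, memory_order_relaxed);
}

int main(void)
{
	struct pcp p;

	atomic_init(&p.high, 6UL);
	atomic_init(&p.batch, 1UL);
	pcp_update(&p, 186UL, 31UL);
	printf("high=%lu batch=%lu\n",
	       (unsigned long)atomic_load(&p.high),
	       (unsigned long)atomic_load(&p.batch));	/* high=186 batch=31 */
	return 0;
}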
4170 4173
4171 /* a companion to pageset_set_high() */ 4174 /* a companion to pageset_set_high() */
4172 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4175 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4173 { 4176 {
4174 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4177 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4175 } 4178 }
4176 4179
4177 static void pageset_init(struct per_cpu_pageset *p) 4180 static void pageset_init(struct per_cpu_pageset *p)
4178 { 4181 {
4179 struct per_cpu_pages *pcp; 4182 struct per_cpu_pages *pcp;
4180 int migratetype; 4183 int migratetype;
4181 4184
4182 memset(p, 0, sizeof(*p)); 4185 memset(p, 0, sizeof(*p));
4183 4186
4184 pcp = &p->pcp; 4187 pcp = &p->pcp;
4185 pcp->count = 0; 4188 pcp->count = 0;
4186 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4189 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4187 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4190 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4188 } 4191 }
4189 4192
4190 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4193 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4191 { 4194 {
4192 pageset_init(p); 4195 pageset_init(p);
4193 pageset_set_batch(p, batch); 4196 pageset_set_batch(p, batch);
4194 } 4197 }
4195 4198
4196 /* 4199 /*
4197 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4200 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4198 * to the value high for the pageset p. 4201 * to the value high for the pageset p.
4199 */ 4202 */
4200 static void pageset_set_high(struct per_cpu_pageset *p, 4203 static void pageset_set_high(struct per_cpu_pageset *p,
4201 unsigned long high) 4204 unsigned long high)
4202 { 4205 {
4203 unsigned long batch = max(1UL, high / 4); 4206 unsigned long batch = max(1UL, high / 4);
4204 if ((high / 4) > (PAGE_SHIFT * 8)) 4207 if ((high / 4) > (PAGE_SHIFT * 8))
4205 batch = PAGE_SHIFT * 8; 4208 batch = PAGE_SHIFT * 8;
4206 4209
4207 pageset_update(&p->pcp, high, batch); 4210 pageset_update(&p->pcp, high, batch);
4208 } 4211 }
4209 4212
4210 static void pageset_set_high_and_batch(struct zone *zone, 4213 static void pageset_set_high_and_batch(struct zone *zone,
4211 struct per_cpu_pageset *pcp) 4214 struct per_cpu_pageset *pcp)
4212 { 4215 {
4213 if (percpu_pagelist_fraction) 4216 if (percpu_pagelist_fraction)
4214 pageset_set_high(pcp, 4217 pageset_set_high(pcp,
4215 (zone->managed_pages / 4218 (zone->managed_pages /
4216 percpu_pagelist_fraction)); 4219 percpu_pagelist_fraction));
4217 else 4220 else
4218 pageset_set_batch(pcp, zone_batchsize(zone)); 4221 pageset_set_batch(pcp, zone_batchsize(zone));
4219 } 4222 }
4220 4223
4221 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4224 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4222 { 4225 {
4223 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4226 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4224 4227
4225 pageset_init(pcp); 4228 pageset_init(pcp);
4226 pageset_set_high_and_batch(zone, pcp); 4229 pageset_set_high_and_batch(zone, pcp);
4227 } 4230 }
4228 4231
4229 static void __meminit setup_zone_pageset(struct zone *zone) 4232 static void __meminit setup_zone_pageset(struct zone *zone)
4230 { 4233 {
4231 int cpu; 4234 int cpu;
4232 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4235 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4233 for_each_possible_cpu(cpu) 4236 for_each_possible_cpu(cpu)
4234 zone_pageset_init(zone, cpu); 4237 zone_pageset_init(zone, cpu);
4235 } 4238 }
4236 4239
4237 /* 4240 /*
4238 * Allocate per cpu pagesets and initialize them. 4241 * Allocate per cpu pagesets and initialize them.
4239 * Before this call only boot pagesets were available. 4242 * Before this call only boot pagesets were available.
4240 */ 4243 */
4241 void __init setup_per_cpu_pageset(void) 4244 void __init setup_per_cpu_pageset(void)
4242 { 4245 {
4243 struct zone *zone; 4246 struct zone *zone;
4244 4247
4245 for_each_populated_zone(zone) 4248 for_each_populated_zone(zone)
4246 setup_zone_pageset(zone); 4249 setup_zone_pageset(zone);
4247 } 4250 }
4248 4251
4249 static noinline __init_refok 4252 static noinline __init_refok
4250 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4253 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4251 { 4254 {
4252 int i; 4255 int i;
4253 struct pglist_data *pgdat = zone->zone_pgdat; 4256 struct pglist_data *pgdat = zone->zone_pgdat;
4254 size_t alloc_size; 4257 size_t alloc_size;
4255 4258
4256 /* 4259 /*
4257 * The per-page waitqueue mechanism uses hashed waitqueues 4260 * The per-page waitqueue mechanism uses hashed waitqueues
4258 * per zone. 4261 * per zone.
4259 */ 4262 */
4260 zone->wait_table_hash_nr_entries = 4263 zone->wait_table_hash_nr_entries =
4261 wait_table_hash_nr_entries(zone_size_pages); 4264 wait_table_hash_nr_entries(zone_size_pages);
4262 zone->wait_table_bits = 4265 zone->wait_table_bits =
4263 wait_table_bits(zone->wait_table_hash_nr_entries); 4266 wait_table_bits(zone->wait_table_hash_nr_entries);
4264 alloc_size = zone->wait_table_hash_nr_entries 4267 alloc_size = zone->wait_table_hash_nr_entries
4265 * sizeof(wait_queue_head_t); 4268 * sizeof(wait_queue_head_t);
4266 4269
4267 if (!slab_is_available()) { 4270 if (!slab_is_available()) {
4268 zone->wait_table = (wait_queue_head_t *) 4271 zone->wait_table = (wait_queue_head_t *)
4269 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4272 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4270 } else { 4273 } else {
4271 /* 4274 /*
4272 * This case means that a zone whose size was 0 gets new memory 4275 * This case means that a zone whose size was 0 gets new memory
4273 * via memory hot-add. 4276 * via memory hot-add.
4274 * But it may be the case that a new node was hot-added. In 4277 * But it may be the case that a new node was hot-added. In
4275 * this case vmalloc() will not be able to use this new node's 4278 * this case vmalloc() will not be able to use this new node's
4276 * memory - this wait_table must be initialized to use this new 4279 * memory - this wait_table must be initialized to use this new
4277 * node itself as well. 4280 * node itself as well.
4278 * To use this new node's memory, further consideration will be 4281 * To use this new node's memory, further consideration will be
4279 * necessary. 4282 * necessary.
4280 */ 4283 */
4281 zone->wait_table = vmalloc(alloc_size); 4284 zone->wait_table = vmalloc(alloc_size);
4282 } 4285 }
4283 if (!zone->wait_table) 4286 if (!zone->wait_table)
4284 return -ENOMEM; 4287 return -ENOMEM;
4285 4288
4286 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4289 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4287 init_waitqueue_head(zone->wait_table + i); 4290 init_waitqueue_head(zone->wait_table + i);
4288 4291
4289 return 0; 4292 return 0;
4290 } 4293 }
4291 4294
4292 static __meminit void zone_pcp_init(struct zone *zone) 4295 static __meminit void zone_pcp_init(struct zone *zone)
4293 { 4296 {
4294 /* 4297 /*
4295 * per cpu subsystem is not up at this point. The following code 4298 * per cpu subsystem is not up at this point. The following code
4296 * relies on the ability of the linker to provide the 4299 * relies on the ability of the linker to provide the
4297 * offset of a (static) per cpu variable into the per cpu area. 4300 * offset of a (static) per cpu variable into the per cpu area.
4298 */ 4301 */
4299 zone->pageset = &boot_pageset; 4302 zone->pageset = &boot_pageset;
4300 4303
4301 if (zone->present_pages) 4304 if (zone->present_pages)
4302 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4305 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4303 zone->name, zone->present_pages, 4306 zone->name, zone->present_pages,
4304 zone_batchsize(zone)); 4307 zone_batchsize(zone));
4305 } 4308 }
4306 4309
4307 int __meminit init_currently_empty_zone(struct zone *zone, 4310 int __meminit init_currently_empty_zone(struct zone *zone,
4308 unsigned long zone_start_pfn, 4311 unsigned long zone_start_pfn,
4309 unsigned long size, 4312 unsigned long size,
4310 enum memmap_context context) 4313 enum memmap_context context)
4311 { 4314 {
4312 struct pglist_data *pgdat = zone->zone_pgdat; 4315 struct pglist_data *pgdat = zone->zone_pgdat;
4313 int ret; 4316 int ret;
4314 ret = zone_wait_table_init(zone, size); 4317 ret = zone_wait_table_init(zone, size);
4315 if (ret) 4318 if (ret)
4316 return ret; 4319 return ret;
4317 pgdat->nr_zones = zone_idx(zone) + 1; 4320 pgdat->nr_zones = zone_idx(zone) + 1;
4318 4321
4319 zone->zone_start_pfn = zone_start_pfn; 4322 zone->zone_start_pfn = zone_start_pfn;
4320 4323
4321 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4324 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4322 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4325 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4323 pgdat->node_id, 4326 pgdat->node_id,
4324 (unsigned long)zone_idx(zone), 4327 (unsigned long)zone_idx(zone),
4325 zone_start_pfn, (zone_start_pfn + size)); 4328 zone_start_pfn, (zone_start_pfn + size));
4326 4329
4327 zone_init_free_lists(zone); 4330 zone_init_free_lists(zone);
4328 4331
4329 return 0; 4332 return 0;
4330 } 4333 }
4331 4334
4332 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4335 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4333 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4336 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4334 /* 4337 /*
4335 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4338 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4336 * Architectures may implement their own version but if add_active_range() 4339 * Architectures may implement their own version but if add_active_range()
4337 * was used and there are no special requirements, this is a convenient 4340 * was used and there are no special requirements, this is a convenient
4338 * alternative 4341 * alternative
4339 */ 4342 */
4340 int __meminit __early_pfn_to_nid(unsigned long pfn) 4343 int __meminit __early_pfn_to_nid(unsigned long pfn)
4341 { 4344 {
4342 unsigned long start_pfn, end_pfn; 4345 unsigned long start_pfn, end_pfn;
4343 int nid; 4346 int nid;
4344 /* 4347 /*
4345 * NOTE: The following SMP-unsafe globals are only used early in boot 4348 * NOTE: The following SMP-unsafe globals are only used early in boot
4346 * when the kernel is running single-threaded. 4349 * when the kernel is running single-threaded.
4347 */ 4350 */
4348 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4351 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4349 static int __meminitdata last_nid; 4352 static int __meminitdata last_nid;
4350 4353
4351 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4354 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4352 return last_nid; 4355 return last_nid;
4353 4356
4354 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4357 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4355 if (nid != -1) { 4358 if (nid != -1) {
4356 last_start_pfn = start_pfn; 4359 last_start_pfn = start_pfn;
4357 last_end_pfn = end_pfn; 4360 last_end_pfn = end_pfn;
4358 last_nid = nid; 4361 last_nid = nid;
4359 } 4362 }
4360 4363
4361 return nid; 4364 return nid;
4362 } 4365 }
4363 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4366 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
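/*
 * Illustrative sketch (not part of page_alloc.c): __early_pfn_to_nid()
 * above memoises the last matching [start_pfn, end_pfn) range so that the
 * many consecutive lookups made during boot skip the memblock search.
 * slow_lookup() below is a made-up stand-in for memblock_search_pfn_nid().
 */
#include <stdio.h>

static int slow_lookup(unsigned long pfn, unsigned long *start, unsigned long *end)
{
	if (pfn < 0x40000UL) {			/* example range on node 0 */
		*start = 0;
		*end = 0x40000UL;
		return 0;
	}
	if (pfn < 0x80000UL) {			/* example range on node 1 */
		*start = 0x40000UL;
		*end = 0x80000UL;
		return 1;
	}
	return -1;
}

/* Single-entry cache; like the kernel's, only safe while single-threaded. */
static int pfn_to_nid_cached(unsigned long pfn)
{
	static unsigned long last_start, last_end;
	static int last_nid = -1;
	unsigned long start, end;
	int nid;

	if (last_start <= pfn && pfn < last_end)
		return last_nid;

	nid = slow_lookup(pfn, &start, &end);
	if (nid != -1) {
		last_start = start;
		last_end = end;
		last_nid = nid;
	}
	return nid;
}

int main(void)
{
	printf("%d %d %d\n",
	       pfn_to_nid_cached(0x100),	/* 0, searches */
	       pfn_to_nid_cached(0x200),	/* 0, cache hit */
	       pfn_to_nid_cached(0x50000));	/* 1, searches again */
	return 0;
}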
4364 4367
4365 int __meminit early_pfn_to_nid(unsigned long pfn) 4368 int __meminit early_pfn_to_nid(unsigned long pfn)
4366 { 4369 {
4367 int nid; 4370 int nid;
4368 4371
4369 nid = __early_pfn_to_nid(pfn); 4372 nid = __early_pfn_to_nid(pfn);
4370 if (nid >= 0) 4373 if (nid >= 0)
4371 return nid; 4374 return nid;
4372 /* just returns 0 */ 4375 /* just returns 0 */
4373 return 0; 4376 return 0;
4374 } 4377 }
4375 4378
4376 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4379 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4377 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4380 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4378 { 4381 {
4379 int nid; 4382 int nid;
4380 4383
4381 nid = __early_pfn_to_nid(pfn); 4384 nid = __early_pfn_to_nid(pfn);
4382 if (nid >= 0 && nid != node) 4385 if (nid >= 0 && nid != node)
4383 return false; 4386 return false;
4384 return true; 4387 return true;
4385 } 4388 }
4386 #endif 4389 #endif
4387 4390
4388 /** 4391 /**
4389 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4392 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4390 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4393 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4391 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4394 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4392 * 4395 *
4393 * If an architecture guarantees that all ranges registered with 4396 * If an architecture guarantees that all ranges registered with
4394 * add_active_ranges() contain no holes and may be freed, 4397 * add_active_ranges() contain no holes and may be freed,
4395 * this function may be used instead of calling free_bootmem() manually. 4398 * this function may be used instead of calling free_bootmem() manually.
4396 */ 4399 */
4397 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4400 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4398 { 4401 {
4399 unsigned long start_pfn, end_pfn; 4402 unsigned long start_pfn, end_pfn;
4400 int i, this_nid; 4403 int i, this_nid;
4401 4404
4402 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4405 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4403 start_pfn = min(start_pfn, max_low_pfn); 4406 start_pfn = min(start_pfn, max_low_pfn);
4404 end_pfn = min(end_pfn, max_low_pfn); 4407 end_pfn = min(end_pfn, max_low_pfn);
4405 4408
4406 if (start_pfn < end_pfn) 4409 if (start_pfn < end_pfn)
4407 free_bootmem_node(NODE_DATA(this_nid), 4410 free_bootmem_node(NODE_DATA(this_nid),
4408 PFN_PHYS(start_pfn), 4411 PFN_PHYS(start_pfn),
4409 (end_pfn - start_pfn) << PAGE_SHIFT); 4412 (end_pfn - start_pfn) << PAGE_SHIFT);
4410 } 4413 }
4411 } 4414 }
4412 4415
4413 /** 4416 /**
4414 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4417 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4415 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4418 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4416 * 4419 *
4417 * If an architecture guarantees that all ranges registered with 4420 * If an architecture guarantees that all ranges registered with
4418 * add_active_ranges() contain no holes and may be freed, this 4421 * add_active_ranges() contain no holes and may be freed, this
4419 * function may be used instead of calling memory_present() manually. 4422 * function may be used instead of calling memory_present() manually.
4420 */ 4423 */
4421 void __init sparse_memory_present_with_active_regions(int nid) 4424 void __init sparse_memory_present_with_active_regions(int nid)
4422 { 4425 {
4423 unsigned long start_pfn, end_pfn; 4426 unsigned long start_pfn, end_pfn;
4424 int i, this_nid; 4427 int i, this_nid;
4425 4428
4426 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4429 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4427 memory_present(this_nid, start_pfn, end_pfn); 4430 memory_present(this_nid, start_pfn, end_pfn);
4428 } 4431 }
4429 4432
4430 /** 4433 /**
4431 * get_pfn_range_for_nid - Return the start and end page frames for a node 4434 * get_pfn_range_for_nid - Return the start and end page frames for a node
4432 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4435 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4433 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4436 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4434 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4437 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4435 * 4438 *
4436 * It returns the start and end page frame of a node based on information 4439 * It returns the start and end page frame of a node based on information
4437 * provided by an arch calling add_active_range(). If called for a node 4440 * provided by an arch calling add_active_range(). If called for a node
4438 * with no available memory, a warning is printed and the start and end 4441 * with no available memory, a warning is printed and the start and end
4439 * PFNs will be 0. 4442 * PFNs will be 0.
4440 */ 4443 */
4441 void __meminit get_pfn_range_for_nid(unsigned int nid, 4444 void __meminit get_pfn_range_for_nid(unsigned int nid,
4442 unsigned long *start_pfn, unsigned long *end_pfn) 4445 unsigned long *start_pfn, unsigned long *end_pfn)
4443 { 4446 {
4444 unsigned long this_start_pfn, this_end_pfn; 4447 unsigned long this_start_pfn, this_end_pfn;
4445 int i; 4448 int i;
4446 4449
4447 *start_pfn = -1UL; 4450 *start_pfn = -1UL;
4448 *end_pfn = 0; 4451 *end_pfn = 0;
4449 4452
4450 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4453 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4451 *start_pfn = min(*start_pfn, this_start_pfn); 4454 *start_pfn = min(*start_pfn, this_start_pfn);
4452 *end_pfn = max(*end_pfn, this_end_pfn); 4455 *end_pfn = max(*end_pfn, this_end_pfn);
4453 } 4456 }
4454 4457
4455 if (*start_pfn == -1UL) 4458 if (*start_pfn == -1UL)
4456 *start_pfn = 0; 4459 *start_pfn = 0;
4457 } 4460 }
4458 4461
4459 /* 4462 /*
4460 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4463 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4461 * assumption is made that zones within a node are ordered by monotonically 4464 * assumption is made that zones within a node are ordered by monotonically
4462 * increasing memory addresses so that the "highest" populated zone is used 4465 * increasing memory addresses so that the "highest" populated zone is used
4463 */ 4466 */
4464 static void __init find_usable_zone_for_movable(void) 4467 static void __init find_usable_zone_for_movable(void)
4465 { 4468 {
4466 int zone_index; 4469 int zone_index;
4467 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4470 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4468 if (zone_index == ZONE_MOVABLE) 4471 if (zone_index == ZONE_MOVABLE)
4469 continue; 4472 continue;
4470 4473
4471 if (arch_zone_highest_possible_pfn[zone_index] > 4474 if (arch_zone_highest_possible_pfn[zone_index] >
4472 arch_zone_lowest_possible_pfn[zone_index]) 4475 arch_zone_lowest_possible_pfn[zone_index])
4473 break; 4476 break;
4474 } 4477 }
4475 4478
4476 VM_BUG_ON(zone_index == -1); 4479 VM_BUG_ON(zone_index == -1);
4477 movable_zone = zone_index; 4480 movable_zone = zone_index;
4478 } 4481 }
4479 4482
4480 /* 4483 /*
4481 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4484 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4482 * because it is sized independent of architecture. Unlike the other zones, 4485 * because it is sized independent of architecture. Unlike the other zones,
4483 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4486 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4484 * in each node depending on the size of each node and how evenly kernelcore 4487 * in each node depending on the size of each node and how evenly kernelcore
4485 * is distributed. This helper function adjusts the zone ranges 4488 * is distributed. This helper function adjusts the zone ranges
4486 * provided by the architecture for a given node by using the end of the 4489 * provided by the architecture for a given node by using the end of the
4487 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4490 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4488 * zones within a node are in order of monotonically increasing memory addresses 4491 * zones within a node are in order of monotonically increasing memory addresses
4489 */ 4492 */
4490 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4493 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4491 unsigned long zone_type, 4494 unsigned long zone_type,
4492 unsigned long node_start_pfn, 4495 unsigned long node_start_pfn,
4493 unsigned long node_end_pfn, 4496 unsigned long node_end_pfn,
4494 unsigned long *zone_start_pfn, 4497 unsigned long *zone_start_pfn,
4495 unsigned long *zone_end_pfn) 4498 unsigned long *zone_end_pfn)
4496 { 4499 {
4497 /* Only adjust if ZONE_MOVABLE is on this node */ 4500 /* Only adjust if ZONE_MOVABLE is on this node */
4498 if (zone_movable_pfn[nid]) { 4501 if (zone_movable_pfn[nid]) {
4499 /* Size ZONE_MOVABLE */ 4502 /* Size ZONE_MOVABLE */
4500 if (zone_type == ZONE_MOVABLE) { 4503 if (zone_type == ZONE_MOVABLE) {
4501 *zone_start_pfn = zone_movable_pfn[nid]; 4504 *zone_start_pfn = zone_movable_pfn[nid];
4502 *zone_end_pfn = min(node_end_pfn, 4505 *zone_end_pfn = min(node_end_pfn,
4503 arch_zone_highest_possible_pfn[movable_zone]); 4506 arch_zone_highest_possible_pfn[movable_zone]);
4504 4507
4505 /* Adjust for ZONE_MOVABLE starting within this range */ 4508 /* Adjust for ZONE_MOVABLE starting within this range */
4506 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4509 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4507 *zone_end_pfn > zone_movable_pfn[nid]) { 4510 *zone_end_pfn > zone_movable_pfn[nid]) {
4508 *zone_end_pfn = zone_movable_pfn[nid]; 4511 *zone_end_pfn = zone_movable_pfn[nid];
4509 4512
4510 /* Check if this whole range is within ZONE_MOVABLE */ 4513 /* Check if this whole range is within ZONE_MOVABLE */
4511 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4514 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4512 *zone_start_pfn = *zone_end_pfn; 4515 *zone_start_pfn = *zone_end_pfn;
4513 } 4516 }
4514 } 4517 }
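/*
 * A minimal userspace sketch of the middle case above: a zone that straddles
 * the start of ZONE_MOVABLE gets cut short at zone_movable_pfn. The PFN
 * values are invented for illustration and not taken from real hardware.
 */
#include <stdio.h>

int main(void)
{
	/* invented: a zone [0x40000, 0x100000) on a node whose ZONE_MOVABLE
	 * begins at PFN 0xc0000 */
	unsigned long zone_start_pfn = 0x40000, zone_end_pfn = 0x100000;
	unsigned long zone_movable_pfn = 0xc0000;

	if (zone_start_pfn < zone_movable_pfn && zone_end_pfn > zone_movable_pfn)
		zone_end_pfn = zone_movable_pfn;	/* zone is cut short */
	else if (zone_start_pfn >= zone_movable_pfn)
		zone_start_pfn = zone_end_pfn;		/* zone becomes empty */

	/* prints: adjusted zone: [0x40000, 0xc0000) */
	printf("adjusted zone: [%#lx, %#lx)\n", zone_start_pfn, zone_end_pfn);
	return 0;
}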
4515 4518
4516 /* 4519 /*
4517 * Return the number of pages a zone spans in a node, including holes 4520 * Return the number of pages a zone spans in a node, including holes
4518 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4521 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4519 */ 4522 */
4520 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4523 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4521 unsigned long zone_type, 4524 unsigned long zone_type,
4522 unsigned long node_start_pfn, 4525 unsigned long node_start_pfn,
4523 unsigned long node_end_pfn, 4526 unsigned long node_end_pfn,
4524 unsigned long *ignored) 4527 unsigned long *ignored)
4525 { 4528 {
4526 unsigned long zone_start_pfn, zone_end_pfn; 4529 unsigned long zone_start_pfn, zone_end_pfn;
4527 4530
4528 /* Get the start and end of the zone */ 4531 /* Get the start and end of the zone */
4529 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4532 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4530 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4533 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4531 adjust_zone_range_for_zone_movable(nid, zone_type, 4534 adjust_zone_range_for_zone_movable(nid, zone_type,
4532 node_start_pfn, node_end_pfn, 4535 node_start_pfn, node_end_pfn,
4533 &zone_start_pfn, &zone_end_pfn); 4536 &zone_start_pfn, &zone_end_pfn);
4534 4537
4535 /* Check that this node has pages within the zone's required range */ 4538 /* Check that this node has pages within the zone's required range */
4536 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4539 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4537 return 0; 4540 return 0;
4538 4541
4539 /* Move the zone boundaries inside the node if necessary */ 4542 /* Move the zone boundaries inside the node if necessary */
4540 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4543 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4541 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4544 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4542 4545
4543 /* Return the spanned pages */ 4546 /* Return the spanned pages */
4544 return zone_end_pfn - zone_start_pfn; 4547 return zone_end_pfn - zone_start_pfn;
4545 } 4548 }
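/*
 * A toy userspace sketch of the clamping in zone_spanned_pages_in_node():
 * the zone boundaries are moved inside the node before the span is counted.
 * All PFN values below are invented for the example.
 */
#include <stdio.h>

int main(void)
{
	unsigned long zone_start_pfn = 0x10000, zone_end_pfn = 0x80000;
	unsigned long node_start_pfn = 0x40000, node_end_pfn = 0x60000;

	/* no overlap at all -> the zone spans zero pages on this node */
	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) {
		printf("0 spanned pages\n");
		return 0;
	}

	if (zone_end_pfn > node_end_pfn)
		zone_end_pfn = node_end_pfn;
	if (zone_start_pfn < node_start_pfn)
		zone_start_pfn = node_start_pfn;

	/* prints 131072 (0x20000) spanned pages */
	printf("%lu spanned pages\n", zone_end_pfn - zone_start_pfn);
	return 0;
}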
4546 4549
4547 /* 4550 /*
4548 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4551 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4549 * then all holes in the requested range will be accounted for. 4552 * then all holes in the requested range will be accounted for.
4550 */ 4553 */
4551 unsigned long __meminit __absent_pages_in_range(int nid, 4554 unsigned long __meminit __absent_pages_in_range(int nid,
4552 unsigned long range_start_pfn, 4555 unsigned long range_start_pfn,
4553 unsigned long range_end_pfn) 4556 unsigned long range_end_pfn)
4554 { 4557 {
4555 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4558 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4556 unsigned long start_pfn, end_pfn; 4559 unsigned long start_pfn, end_pfn;
4557 int i; 4560 int i;
4558 4561
4559 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4562 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4560 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4563 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4561 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4564 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4562 nr_absent -= end_pfn - start_pfn; 4565 nr_absent -= end_pfn - start_pfn;
4563 } 4566 }
4564 return nr_absent; 4567 return nr_absent;
4565 } 4568 }
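/*
 * A worked example of the hole accounting: the holes in a PFN range are the
 * range size minus every (clamped) chunk of real memory inside it. The two
 * memory ranges below are invented for illustration.
 */
#include <stdio.h>

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long mem_start[] = { 0, 700 }, mem_end[] = { 500, 900 };
	unsigned long range_start = 0, range_end = 1000;
	unsigned long nr_absent = range_end - range_start;
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long s = clampul(mem_start[i], range_start, range_end);
		unsigned long e = clampul(mem_end[i], range_start, range_end);
		nr_absent -= e - s;
	}
	/* 1000 - 500 - 200 = 300 pages of holes */
	printf("%lu pages of holes\n", nr_absent);
	return 0;
}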
4566 4569
4567 /** 4570 /**
4568 * absent_pages_in_range - Return number of page frames in holes within a range 4571 * absent_pages_in_range - Return number of page frames in holes within a range
4569 * @start_pfn: The start PFN to start searching for holes 4572 * @start_pfn: The start PFN to start searching for holes
4570 * @end_pfn: The end PFN to stop searching for holes 4573 * @end_pfn: The end PFN to stop searching for holes
4571 * 4574 *
4572 * It returns the number of page frames in memory holes within a range. 4575 * It returns the number of page frames in memory holes within a range.
4573 */ 4576 */
4574 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4577 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4575 unsigned long end_pfn) 4578 unsigned long end_pfn)
4576 { 4579 {
4577 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4580 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4578 } 4581 }
4579 4582
4580 /* Return the number of page frames in holes in a zone on a node */ 4583 /* Return the number of page frames in holes in a zone on a node */
4581 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4584 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4582 unsigned long zone_type, 4585 unsigned long zone_type,
4583 unsigned long node_start_pfn, 4586 unsigned long node_start_pfn,
4584 unsigned long node_end_pfn, 4587 unsigned long node_end_pfn,
4585 unsigned long *ignored) 4588 unsigned long *ignored)
4586 { 4589 {
4587 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4590 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4588 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4591 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4589 unsigned long zone_start_pfn, zone_end_pfn; 4592 unsigned long zone_start_pfn, zone_end_pfn;
4590 4593
4591 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4594 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4592 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4595 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4593 4596
4594 adjust_zone_range_for_zone_movable(nid, zone_type, 4597 adjust_zone_range_for_zone_movable(nid, zone_type,
4595 node_start_pfn, node_end_pfn, 4598 node_start_pfn, node_end_pfn,
4596 &zone_start_pfn, &zone_end_pfn); 4599 &zone_start_pfn, &zone_end_pfn);
4597 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4600 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4598 } 4601 }
4599 4602
4600 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4603 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4601 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4604 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4602 unsigned long zone_type, 4605 unsigned long zone_type,
4603 unsigned long node_start_pfn, 4606 unsigned long node_start_pfn,
4604 unsigned long node_end_pfn, 4607 unsigned long node_end_pfn,
4605 unsigned long *zones_size) 4608 unsigned long *zones_size)
4606 { 4609 {
4607 return zones_size[zone_type]; 4610 return zones_size[zone_type];
4608 } 4611 }
4609 4612
4610 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4613 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4611 unsigned long zone_type, 4614 unsigned long zone_type,
4612 unsigned long node_start_pfn, 4615 unsigned long node_start_pfn,
4613 unsigned long node_end_pfn, 4616 unsigned long node_end_pfn,
4614 unsigned long *zholes_size) 4617 unsigned long *zholes_size)
4615 { 4618 {
4616 if (!zholes_size) 4619 if (!zholes_size)
4617 return 0; 4620 return 0;
4618 4621
4619 return zholes_size[zone_type]; 4622 return zholes_size[zone_type];
4620 } 4623 }
4621 4624
4622 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4625 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4623 4626
4624 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4627 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4625 unsigned long node_start_pfn, 4628 unsigned long node_start_pfn,
4626 unsigned long node_end_pfn, 4629 unsigned long node_end_pfn,
4627 unsigned long *zones_size, 4630 unsigned long *zones_size,
4628 unsigned long *zholes_size) 4631 unsigned long *zholes_size)
4629 { 4632 {
4630 unsigned long realtotalpages, totalpages = 0; 4633 unsigned long realtotalpages, totalpages = 0;
4631 enum zone_type i; 4634 enum zone_type i;
4632 4635
4633 for (i = 0; i < MAX_NR_ZONES; i++) 4636 for (i = 0; i < MAX_NR_ZONES; i++)
4634 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4637 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4635 node_start_pfn, 4638 node_start_pfn,
4636 node_end_pfn, 4639 node_end_pfn,
4637 zones_size); 4640 zones_size);
4638 pgdat->node_spanned_pages = totalpages; 4641 pgdat->node_spanned_pages = totalpages;
4639 4642
4640 realtotalpages = totalpages; 4643 realtotalpages = totalpages;
4641 for (i = 0; i < MAX_NR_ZONES; i++) 4644 for (i = 0; i < MAX_NR_ZONES; i++)
4642 realtotalpages -= 4645 realtotalpages -=
4643 zone_absent_pages_in_node(pgdat->node_id, i, 4646 zone_absent_pages_in_node(pgdat->node_id, i,
4644 node_start_pfn, node_end_pfn, 4647 node_start_pfn, node_end_pfn,
4645 zholes_size); 4648 zholes_size);
4646 pgdat->node_present_pages = realtotalpages; 4649 pgdat->node_present_pages = realtotalpages;
4647 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4650 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4648 realtotalpages); 4651 realtotalpages);
4649 } 4652 }
4650 4653
4651 #ifndef CONFIG_SPARSEMEM 4654 #ifndef CONFIG_SPARSEMEM
4652 /* 4655 /*
4653 * Calculate the size of the zone->blockflags rounded to an unsigned long 4656 * Calculate the size of the zone->blockflags rounded to an unsigned long
4654 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 4657 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
4655 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally 4658 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
4656 * round what is now in bits to nearest long in bits, then return it in 4659 * round what is now in bits to nearest long in bits, then return it in
4657 * bytes. 4660 * bytes.
4658 */ 4661 */
4659 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4662 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4660 { 4663 {
4661 unsigned long usemapsize; 4664 unsigned long usemapsize;
4662 4665
4663 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4666 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4664 usemapsize = roundup(zonesize, pageblock_nr_pages); 4667 usemapsize = roundup(zonesize, pageblock_nr_pages);
4665 usemapsize = usemapsize >> pageblock_order; 4668 usemapsize = usemapsize >> pageblock_order;
4666 usemapsize *= NR_PAGEBLOCK_BITS; 4669 usemapsize *= NR_PAGEBLOCK_BITS;
4667 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4670 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4668 4671
4669 return usemapsize / 8; 4672 return usemapsize / 8;
4670 } 4673 }
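/*
 * A worked instance of the usemap sizing above, assuming pageblock_order = 9
 * (512 pages per pageblock), NR_PAGEBLOCK_BITS = 4 and 64-bit longs; the
 * zone start and size are invented for the example.
 */
#include <stdio.h>

#define EX_PAGEBLOCK_NR_PAGES	512UL	/* assumed pageblock_order = 9 */
#define EX_NR_PAGEBLOCK_BITS	4UL

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long zone_start_pfn = 0x100, zonesize = 1UL << 20;
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (EX_PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, EX_PAGEBLOCK_NR_PAGES);
	usemapsize >>= 9;			/* assumed pageblock_order */
	usemapsize *= EX_NR_PAGEBLOCK_BITS;
	usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

	/* prints 1032 bytes on an LP64 machine */
	printf("%lu bytes of pageblock flags\n", usemapsize / 8);
	return 0;
}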
4671 4674
4672 static void __init setup_usemap(struct pglist_data *pgdat, 4675 static void __init setup_usemap(struct pglist_data *pgdat,
4673 struct zone *zone, 4676 struct zone *zone,
4674 unsigned long zone_start_pfn, 4677 unsigned long zone_start_pfn,
4675 unsigned long zonesize) 4678 unsigned long zonesize)
4676 { 4679 {
4677 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4680 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4678 zone->pageblock_flags = NULL; 4681 zone->pageblock_flags = NULL;
4679 if (usemapsize) 4682 if (usemapsize)
4680 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4683 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4681 usemapsize); 4684 usemapsize);
4682 } 4685 }
4683 #else 4686 #else
4684 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4687 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4685 unsigned long zone_start_pfn, unsigned long zonesize) {} 4688 unsigned long zone_start_pfn, unsigned long zonesize) {}
4686 #endif /* CONFIG_SPARSEMEM */ 4689 #endif /* CONFIG_SPARSEMEM */
4687 4690
4688 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4691 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4689 4692
4690 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4693 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4691 void __paginginit set_pageblock_order(void) 4694 void __paginginit set_pageblock_order(void)
4692 { 4695 {
4693 unsigned int order; 4696 unsigned int order;
4694 4697
4695 /* Check that pageblock_nr_pages has not already been set up */ 4698 /* Check that pageblock_nr_pages has not already been set up */
4696 if (pageblock_order) 4699 if (pageblock_order)
4697 return; 4700 return;
4698 4701
4699 if (HPAGE_SHIFT > PAGE_SHIFT) 4702 if (HPAGE_SHIFT > PAGE_SHIFT)
4700 order = HUGETLB_PAGE_ORDER; 4703 order = HUGETLB_PAGE_ORDER;
4701 else 4704 else
4702 order = MAX_ORDER - 1; 4705 order = MAX_ORDER - 1;
4703 4706
4704 /* 4707 /*
4705 * Assume the largest contiguous order of interest is a huge page. 4708 * Assume the largest contiguous order of interest is a huge page.
4706 * This value may be variable depending on boot parameters on IA64 and 4709 * This value may be variable depending on boot parameters on IA64 and
4707 * powerpc. 4710 * powerpc.
4708 */ 4711 */
4709 pageblock_order = order; 4712 pageblock_order = order;
4710 } 4713 }
4711 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4714 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4712 4715
4713 /* 4716 /*
4714 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4717 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4715 * is unused as pageblock_order is set at compile-time. See 4718 * is unused as pageblock_order is set at compile-time. See
4716 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4719 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4717 * the kernel config 4720 * the kernel config
4718 */ 4721 */
4719 void __paginginit set_pageblock_order(void) 4722 void __paginginit set_pageblock_order(void)
4720 { 4723 {
4721 } 4724 }
4722 4725
4723 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4726 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4724 4727
4725 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4728 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4726 unsigned long present_pages) 4729 unsigned long present_pages)
4727 { 4730 {
4728 unsigned long pages = spanned_pages; 4731 unsigned long pages = spanned_pages;
4729 4732
4730 /* 4733 /*
4731 * Provide a more accurate estimation if there are holes within 4734 * Provide a more accurate estimation if there are holes within
4732 * the zone and SPARSEMEM is in use. If there are holes within the 4735 * the zone and SPARSEMEM is in use. If there are holes within the
4733 * zone, each populated memory region may cost us one or two extra 4736 * zone, each populated memory region may cost us one or two extra
4734 * memmap pages due to alignment because memmap pages for each 4737 * memmap pages due to alignment because memmap pages for each
4735 * populated region may not be naturally aligned on a page boundary. 4738 * populated region may not be naturally aligned on a page boundary.
4736 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4739 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4737 */ 4740 */
4738 if (spanned_pages > present_pages + (present_pages >> 4) && 4741 if (spanned_pages > present_pages + (present_pages >> 4) &&
4739 IS_ENABLED(CONFIG_SPARSEMEM)) 4742 IS_ENABLED(CONFIG_SPARSEMEM))
4740 pages = present_pages; 4743 pages = present_pages;
4741 4744
4742 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4745 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4743 } 4746 }
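/*
 * A sketch of the memmap estimate for a very sparse zone, assuming 4 KiB
 * pages and a 64-byte struct page (typical, but configuration dependent);
 * the SPARSEMEM check is dropped and the page counts are invented.
 */
#include <stdio.h>

#define EX_PAGE_SIZE		4096UL
#define EX_STRUCT_PAGE_SIZE	64UL	/* assumed */

int main(void)
{
	unsigned long spanned = 1UL << 20;	/* 1M PFNs spanned */
	unsigned long present = 1UL << 19;	/* but half of them are holes */
	unsigned long pages = spanned;
	unsigned long memmap_pages;

	/* sparse enough that the estimate is based on present pages */
	if (spanned > present + (present >> 4))
		pages = present;

	/* PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT */
	memmap_pages = (pages * EX_STRUCT_PAGE_SIZE + EX_PAGE_SIZE - 1) /
		       EX_PAGE_SIZE;

	/* prints 8192 memmap pages for 524288 present pages */
	printf("%lu memmap pages for %lu present pages\n", memmap_pages, pages);
	return 0;
}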
4744 4747
4745 /* 4748 /*
4746 * Set up the zone data structures: 4749 * Set up the zone data structures:
4747 * - mark all pages reserved 4750 * - mark all pages reserved
4748 * - mark all memory queues empty 4751 * - mark all memory queues empty
4749 * - clear the memory bitmaps 4752 * - clear the memory bitmaps
4750 * 4753 *
4751 * NOTE: pgdat should get zeroed by caller. 4754 * NOTE: pgdat should get zeroed by caller.
4752 */ 4755 */
4753 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4756 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4754 unsigned long node_start_pfn, unsigned long node_end_pfn, 4757 unsigned long node_start_pfn, unsigned long node_end_pfn,
4755 unsigned long *zones_size, unsigned long *zholes_size) 4758 unsigned long *zones_size, unsigned long *zholes_size)
4756 { 4759 {
4757 enum zone_type j; 4760 enum zone_type j;
4758 int nid = pgdat->node_id; 4761 int nid = pgdat->node_id;
4759 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4762 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4760 int ret; 4763 int ret;
4761 4764
4762 pgdat_resize_init(pgdat); 4765 pgdat_resize_init(pgdat);
4763 #ifdef CONFIG_NUMA_BALANCING 4766 #ifdef CONFIG_NUMA_BALANCING
4764 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4767 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4765 pgdat->numabalancing_migrate_nr_pages = 0; 4768 pgdat->numabalancing_migrate_nr_pages = 0;
4766 pgdat->numabalancing_migrate_next_window = jiffies; 4769 pgdat->numabalancing_migrate_next_window = jiffies;
4767 #endif 4770 #endif
4768 init_waitqueue_head(&pgdat->kswapd_wait); 4771 init_waitqueue_head(&pgdat->kswapd_wait);
4769 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4772 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4770 pgdat_page_cgroup_init(pgdat); 4773 pgdat_page_cgroup_init(pgdat);
4771 4774
4772 for (j = 0; j < MAX_NR_ZONES; j++) { 4775 for (j = 0; j < MAX_NR_ZONES; j++) {
4773 struct zone *zone = pgdat->node_zones + j; 4776 struct zone *zone = pgdat->node_zones + j;
4774 unsigned long size, realsize, freesize, memmap_pages; 4777 unsigned long size, realsize, freesize, memmap_pages;
4775 4778
4776 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4779 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4777 node_end_pfn, zones_size); 4780 node_end_pfn, zones_size);
4778 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4781 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4779 node_start_pfn, 4782 node_start_pfn,
4780 node_end_pfn, 4783 node_end_pfn,
4781 zholes_size); 4784 zholes_size);
4782 4785
4783 /* 4786 /*
4784 * Adjust freesize so that it accounts for how much memory 4787 * Adjust freesize so that it accounts for how much memory
4785 * is used by this zone for memmap. This affects the watermark 4788 * is used by this zone for memmap. This affects the watermark
4786 * and per-cpu initialisations 4789 * and per-cpu initialisations
4787 */ 4790 */
4788 memmap_pages = calc_memmap_size(size, realsize); 4791 memmap_pages = calc_memmap_size(size, realsize);
4789 if (freesize >= memmap_pages) { 4792 if (freesize >= memmap_pages) {
4790 freesize -= memmap_pages; 4793 freesize -= memmap_pages;
4791 if (memmap_pages) 4794 if (memmap_pages)
4792 printk(KERN_DEBUG 4795 printk(KERN_DEBUG
4793 " %s zone: %lu pages used for memmap\n", 4796 " %s zone: %lu pages used for memmap\n",
4794 zone_names[j], memmap_pages); 4797 zone_names[j], memmap_pages);
4795 } else 4798 } else
4796 printk(KERN_WARNING 4799 printk(KERN_WARNING
4797 " %s zone: %lu pages exceeds freesize %lu\n", 4800 " %s zone: %lu pages exceeds freesize %lu\n",
4798 zone_names[j], memmap_pages, freesize); 4801 zone_names[j], memmap_pages, freesize);
4799 4802
4800 /* Account for reserved pages */ 4803 /* Account for reserved pages */
4801 if (j == 0 && freesize > dma_reserve) { 4804 if (j == 0 && freesize > dma_reserve) {
4802 freesize -= dma_reserve; 4805 freesize -= dma_reserve;
4803 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4806 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4804 zone_names[0], dma_reserve); 4807 zone_names[0], dma_reserve);
4805 } 4808 }
4806 4809
4807 if (!is_highmem_idx(j)) 4810 if (!is_highmem_idx(j))
4808 nr_kernel_pages += freesize; 4811 nr_kernel_pages += freesize;
4809 /* Charge for highmem memmap if there are enough kernel pages */ 4812 /* Charge for highmem memmap if there are enough kernel pages */
4810 else if (nr_kernel_pages > memmap_pages * 2) 4813 else if (nr_kernel_pages > memmap_pages * 2)
4811 nr_kernel_pages -= memmap_pages; 4814 nr_kernel_pages -= memmap_pages;
4812 nr_all_pages += freesize; 4815 nr_all_pages += freesize;
4813 4816
4814 zone->spanned_pages = size; 4817 zone->spanned_pages = size;
4815 zone->present_pages = realsize; 4818 zone->present_pages = realsize;
4816 /* 4819 /*
4817 * Set an approximate value for lowmem here, it will be adjusted 4820 * Set an approximate value for lowmem here, it will be adjusted
4818 * when the bootmem allocator frees pages into the buddy system. 4821 * when the bootmem allocator frees pages into the buddy system.
4819 * And all highmem pages will be managed by the buddy system. 4822 * And all highmem pages will be managed by the buddy system.
4820 */ 4823 */
4821 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4824 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4822 #ifdef CONFIG_NUMA 4825 #ifdef CONFIG_NUMA
4823 zone->node = nid; 4826 zone->node = nid;
4824 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4827 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4825 / 100; 4828 / 100;
4826 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4829 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4827 #endif 4830 #endif
4828 zone->name = zone_names[j]; 4831 zone->name = zone_names[j];
4829 spin_lock_init(&zone->lock); 4832 spin_lock_init(&zone->lock);
4830 spin_lock_init(&zone->lru_lock); 4833 spin_lock_init(&zone->lru_lock);
4831 zone_seqlock_init(zone); 4834 zone_seqlock_init(zone);
4832 zone->zone_pgdat = pgdat; 4835 zone->zone_pgdat = pgdat;
4833 zone_pcp_init(zone); 4836 zone_pcp_init(zone);
4834 4837
4835 /* For bootup, initialized properly in watermark setup */ 4838 /* For bootup, initialized properly in watermark setup */
4836 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4839 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4837 4840
4838 lruvec_init(&zone->lruvec); 4841 lruvec_init(&zone->lruvec);
4839 if (!size) 4842 if (!size)
4840 continue; 4843 continue;
4841 4844
4842 set_pageblock_order(); 4845 set_pageblock_order();
4843 setup_usemap(pgdat, zone, zone_start_pfn, size); 4846 setup_usemap(pgdat, zone, zone_start_pfn, size);
4844 ret = init_currently_empty_zone(zone, zone_start_pfn, 4847 ret = init_currently_empty_zone(zone, zone_start_pfn,
4845 size, MEMMAP_EARLY); 4848 size, MEMMAP_EARLY);
4846 BUG_ON(ret); 4849 BUG_ON(ret);
4847 memmap_init(size, nid, j, zone_start_pfn); 4850 memmap_init(size, nid, j, zone_start_pfn);
4848 zone_start_pfn += size; 4851 zone_start_pfn += size;
4849 } 4852 }
4850 } 4853 }
4851 4854
4852 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4855 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4853 { 4856 {
4854 /* Skip empty nodes */ 4857 /* Skip empty nodes */
4855 if (!pgdat->node_spanned_pages) 4858 if (!pgdat->node_spanned_pages)
4856 return; 4859 return;
4857 4860
4858 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4861 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4859 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4862 /* ia64 gets its own node_mem_map, before this, without bootmem */
4860 if (!pgdat->node_mem_map) { 4863 if (!pgdat->node_mem_map) {
4861 unsigned long size, start, end; 4864 unsigned long size, start, end;
4862 struct page *map; 4865 struct page *map;
4863 4866
4864 /* 4867 /*
4865 * The zone's endpoints aren't required to be MAX_ORDER 4868 * The zone's endpoints aren't required to be MAX_ORDER
4866 * aligned, but the node_mem_map endpoints must be, in order 4869 * aligned, but the node_mem_map endpoints must be, in order
4867 * for the buddy allocator to function correctly. 4870 * for the buddy allocator to function correctly.
4868 */ 4871 */
4869 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4872 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4870 end = pgdat_end_pfn(pgdat); 4873 end = pgdat_end_pfn(pgdat);
4871 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4874 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4872 size = (end - start) * sizeof(struct page); 4875 size = (end - start) * sizeof(struct page);
4873 map = alloc_remap(pgdat->node_id, size); 4876 map = alloc_remap(pgdat->node_id, size);
4874 if (!map) 4877 if (!map)
4875 map = alloc_bootmem_node_nopanic(pgdat, size); 4878 map = alloc_bootmem_node_nopanic(pgdat, size);
4876 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4879 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4877 } 4880 }
4878 #ifndef CONFIG_NEED_MULTIPLE_NODES 4881 #ifndef CONFIG_NEED_MULTIPLE_NODES
4879 /* 4882 /*
4880 * With no DISCONTIG, the global mem_map is just set as node 0's 4883 * With no DISCONTIG, the global mem_map is just set as node 0's
4881 */ 4884 */
4882 if (pgdat == NODE_DATA(0)) { 4885 if (pgdat == NODE_DATA(0)) {
4883 mem_map = NODE_DATA(0)->node_mem_map; 4886 mem_map = NODE_DATA(0)->node_mem_map;
4884 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4887 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4885 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4888 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4886 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4889 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4887 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4890 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4888 } 4891 }
4889 #endif 4892 #endif
4890 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4893 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4891 } 4894 }
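/*
 * A small sketch of the MAX_ORDER alignment of the node_mem_map endpoints,
 * assuming MAX_ORDER_NR_PAGES = 1024 and a 64-byte struct page; the node
 * span is invented for the example.
 */
#include <stdio.h>

#define EX_MAX_ORDER_NR_PAGES	1024UL	/* assumed MAX_ORDER = 11 */

int main(void)
{
	unsigned long node_start_pfn = 0x1234, node_end_pfn = 0x5678;
	unsigned long start, end;

	start = node_start_pfn & ~(EX_MAX_ORDER_NR_PAGES - 1);
	end = (node_end_pfn + EX_MAX_ORDER_NR_PAGES - 1) &
	      ~(EX_MAX_ORDER_NR_PAGES - 1);

	/* prints: mem_map covers PFNs [0x1000, 0x5800), 1179648 bytes */
	printf("mem_map covers PFNs [%#lx, %#lx), %lu bytes\n",
	       start, end, (end - start) * 64UL);
	return 0;
}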
4892 4895
4893 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4896 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4894 unsigned long node_start_pfn, unsigned long *zholes_size) 4897 unsigned long node_start_pfn, unsigned long *zholes_size)
4895 { 4898 {
4896 pg_data_t *pgdat = NODE_DATA(nid); 4899 pg_data_t *pgdat = NODE_DATA(nid);
4897 unsigned long start_pfn = 0; 4900 unsigned long start_pfn = 0;
4898 unsigned long end_pfn = 0; 4901 unsigned long end_pfn = 0;
4899 4902
4900 /* pg_data_t should be reset to zero when it's allocated */ 4903 /* pg_data_t should be reset to zero when it's allocated */
4901 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4904 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4902 4905
4903 pgdat->node_id = nid; 4906 pgdat->node_id = nid;
4904 pgdat->node_start_pfn = node_start_pfn; 4907 pgdat->node_start_pfn = node_start_pfn;
4905 if (node_state(nid, N_MEMORY)) 4908 if (node_state(nid, N_MEMORY))
4906 init_zone_allows_reclaim(nid); 4909 init_zone_allows_reclaim(nid);
4907 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4910 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4908 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4911 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4909 #endif 4912 #endif
4910 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4913 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4911 zones_size, zholes_size); 4914 zones_size, zholes_size);
4912 4915
4913 alloc_node_mem_map(pgdat); 4916 alloc_node_mem_map(pgdat);
4914 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4917 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4915 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4918 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4916 nid, (unsigned long)pgdat, 4919 nid, (unsigned long)pgdat,
4917 (unsigned long)pgdat->node_mem_map); 4920 (unsigned long)pgdat->node_mem_map);
4918 #endif 4921 #endif
4919 4922
4920 free_area_init_core(pgdat, start_pfn, end_pfn, 4923 free_area_init_core(pgdat, start_pfn, end_pfn,
4921 zones_size, zholes_size); 4924 zones_size, zholes_size);
4922 } 4925 }
4923 4926
4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4927 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4925 4928
4926 #if MAX_NUMNODES > 1 4929 #if MAX_NUMNODES > 1
4927 /* 4930 /*
4928 * Figure out the number of possible node ids. 4931 * Figure out the number of possible node ids.
4929 */ 4932 */
4930 void __init setup_nr_node_ids(void) 4933 void __init setup_nr_node_ids(void)
4931 { 4934 {
4932 unsigned int node; 4935 unsigned int node;
4933 unsigned int highest = 0; 4936 unsigned int highest = 0;
4934 4937
4935 for_each_node_mask(node, node_possible_map) 4938 for_each_node_mask(node, node_possible_map)
4936 highest = node; 4939 highest = node;
4937 nr_node_ids = highest + 1; 4940 nr_node_ids = highest + 1;
4938 } 4941 }
4939 #endif 4942 #endif
4940 4943
4941 /** 4944 /**
4942 * node_map_pfn_alignment - determine the maximum internode alignment 4945 * node_map_pfn_alignment - determine the maximum internode alignment
4943 * 4946 *
4944 * This function should be called after node map is populated and sorted. 4947 * This function should be called after node map is populated and sorted.
4945 * It calculates the maximum power of two alignment which can distinguish 4948 * It calculates the maximum power of two alignment which can distinguish
4946 * all the nodes. 4949 * all the nodes.
4947 * 4950 *
4948 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4951 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4949 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4952 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4950 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4953 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4951 * shifted, 1GiB is enough and this function will indicate so. 4954 * shifted, 1GiB is enough and this function will indicate so.
4952 * 4955 *
4953 * This is used to test whether pfn -> nid mapping of the chosen memory 4956 * This is used to test whether pfn -> nid mapping of the chosen memory
4954 * model has fine enough granularity to avoid incorrect mapping for the 4957 * model has fine enough granularity to avoid incorrect mapping for the
4955 * populated node map. 4958 * populated node map.
4956 * 4959 *
4957 * Returns the determined alignment in pfn's. 0 if there is no alignment 4960 * Returns the determined alignment in pfn's. 0 if there is no alignment
4958 * requirement (single node). 4961 * requirement (single node).
4959 */ 4962 */
4960 unsigned long __init node_map_pfn_alignment(void) 4963 unsigned long __init node_map_pfn_alignment(void)
4961 { 4964 {
4962 unsigned long accl_mask = 0, last_end = 0; 4965 unsigned long accl_mask = 0, last_end = 0;
4963 unsigned long start, end, mask; 4966 unsigned long start, end, mask;
4964 int last_nid = -1; 4967 int last_nid = -1;
4965 int i, nid; 4968 int i, nid;
4966 4969
4967 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4970 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4968 if (!start || last_nid < 0 || last_nid == nid) { 4971 if (!start || last_nid < 0 || last_nid == nid) {
4969 last_nid = nid; 4972 last_nid = nid;
4970 last_end = end; 4973 last_end = end;
4971 continue; 4974 continue;
4972 } 4975 }
4973 4976
4974 /* 4977 /*
4975 * Start with a mask granular enough to pin-point to the 4978 * Start with a mask granular enough to pin-point to the
4976 * start pfn and tick off bits one-by-one until it becomes 4979 * start pfn and tick off bits one-by-one until it becomes
4977 * too coarse to separate the current node from the last. 4980 * too coarse to separate the current node from the last.
4978 */ 4981 */
4979 mask = ~((1 << __ffs(start)) - 1); 4982 mask = ~((1 << __ffs(start)) - 1);
4980 while (mask && last_end <= (start & (mask << 1))) 4983 while (mask && last_end <= (start & (mask << 1)))
4981 mask <<= 1; 4984 mask <<= 1;
4982 4985
4983 /* accumulate all internode masks */ 4986 /* accumulate all internode masks */
4984 accl_mask |= mask; 4987 accl_mask |= mask;
4985 } 4988 }
4986 4989
4987 /* convert mask to number of pages */ 4990 /* convert mask to number of pages */
4988 return ~accl_mask + 1; 4991 return ~accl_mask + 1;
4989 } 4992 }
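/*
 * The 256MiB case from the comment above, replayed as a standalone sketch:
 * two 1 GiB nodes, both shifted up by 256 MiB, with 4 KiB pages. __ffs() is
 * stood in for by the GCC builtin __builtin_ctzl().
 */
#include <stdio.h>

int main(void)
{
	unsigned long starts[] = { 0x10000, 0x50000 };
	unsigned long ends[]   = { 0x50000, 0x90000 };
	int nids[] = { 0, 1 };
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1, i;

	for (i = 0; i < 2; i++) {
		unsigned long start = starts[i], mask;

		if (!start || last_nid < 0 || last_nid == nids[i]) {
			last_nid = nids[i];
			last_end = ends[i];
			continue;
		}
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;
		accl_mask |= mask;
	}
	/* prints 0x10000 PFNs, i.e. 256 MiB */
	printf("alignment = %#lx PFNs\n", ~accl_mask + 1);
	return 0;
}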
4990 4993
4991 /* Find the lowest pfn for a node */ 4994 /* Find the lowest pfn for a node */
4992 static unsigned long __init find_min_pfn_for_node(int nid) 4995 static unsigned long __init find_min_pfn_for_node(int nid)
4993 { 4996 {
4994 unsigned long min_pfn = ULONG_MAX; 4997 unsigned long min_pfn = ULONG_MAX;
4995 unsigned long start_pfn; 4998 unsigned long start_pfn;
4996 int i; 4999 int i;
4997 5000
4998 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5001 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4999 min_pfn = min(min_pfn, start_pfn); 5002 min_pfn = min(min_pfn, start_pfn);
5000 5003
5001 if (min_pfn == ULONG_MAX) { 5004 if (min_pfn == ULONG_MAX) {
5002 printk(KERN_WARNING 5005 printk(KERN_WARNING
5003 "Could not find start_pfn for node %d\n", nid); 5006 "Could not find start_pfn for node %d\n", nid);
5004 return 0; 5007 return 0;
5005 } 5008 }
5006 5009
5007 return min_pfn; 5010 return min_pfn;
5008 } 5011 }
5009 5012
5010 /** 5013 /**
5011 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5014 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5012 * 5015 *
5013 * It returns the minimum PFN based on information provided via 5016 * It returns the minimum PFN based on information provided via
5014 * add_active_range(). 5017 * add_active_range().
5015 */ 5018 */
5016 unsigned long __init find_min_pfn_with_active_regions(void) 5019 unsigned long __init find_min_pfn_with_active_regions(void)
5017 { 5020 {
5018 return find_min_pfn_for_node(MAX_NUMNODES); 5021 return find_min_pfn_for_node(MAX_NUMNODES);
5019 } 5022 }
5020 5023
5021 /* 5024 /*
5022 * early_calculate_totalpages() 5025 * early_calculate_totalpages()
5023 * Sum pages in active regions for movable zone. 5026 * Sum pages in active regions for movable zone.
5024 * Populate N_MEMORY for calculating usable_nodes. 5027 * Populate N_MEMORY for calculating usable_nodes.
5025 */ 5028 */
5026 static unsigned long __init early_calculate_totalpages(void) 5029 static unsigned long __init early_calculate_totalpages(void)
5027 { 5030 {
5028 unsigned long totalpages = 0; 5031 unsigned long totalpages = 0;
5029 unsigned long start_pfn, end_pfn; 5032 unsigned long start_pfn, end_pfn;
5030 int i, nid; 5033 int i, nid;
5031 5034
5032 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5035 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5033 unsigned long pages = end_pfn - start_pfn; 5036 unsigned long pages = end_pfn - start_pfn;
5034 5037
5035 totalpages += pages; 5038 totalpages += pages;
5036 if (pages) 5039 if (pages)
5037 node_set_state(nid, N_MEMORY); 5040 node_set_state(nid, N_MEMORY);
5038 } 5041 }
5039 return totalpages; 5042 return totalpages;
5040 } 5043 }
5041 5044
5042 /* 5045 /*
5043 * Find the PFN the Movable zone begins in each node. Kernel memory 5046 * Find the PFN the Movable zone begins in each node. Kernel memory
5044 * is spread evenly between nodes as long as the nodes have enough 5047 * is spread evenly between nodes as long as the nodes have enough
5045 * memory. When they don't, some nodes will have more kernelcore than 5048 * memory. When they don't, some nodes will have more kernelcore than
5046 * others 5049 * others
5047 */ 5050 */
5048 static void __init find_zone_movable_pfns_for_nodes(void) 5051 static void __init find_zone_movable_pfns_for_nodes(void)
5049 { 5052 {
5050 int i, nid; 5053 int i, nid;
5051 unsigned long usable_startpfn; 5054 unsigned long usable_startpfn;
5052 unsigned long kernelcore_node, kernelcore_remaining; 5055 unsigned long kernelcore_node, kernelcore_remaining;
5053 /* save the state before borrow the nodemask */ 5056 /* save the state before borrow the nodemask */
5054 nodemask_t saved_node_state = node_states[N_MEMORY]; 5057 nodemask_t saved_node_state = node_states[N_MEMORY];
5055 unsigned long totalpages = early_calculate_totalpages(); 5058 unsigned long totalpages = early_calculate_totalpages();
5056 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5059 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5057 5060
5058 /* 5061 /*
5059 * If movablecore was specified, calculate the corresponding size 5062 * If movablecore was specified, calculate the corresponding size
5060 * of kernelcore so that memory usable for 5063 * of kernelcore so that memory usable for
5061 * any allocation type is evenly spread. If both kernelcore 5064 * any allocation type is evenly spread. If both kernelcore
5062 * and movablecore are specified, then the value of kernelcore 5065 * and movablecore are specified, then the value of kernelcore
5063 * will be used for required_kernelcore if it's greater than 5066 * will be used for required_kernelcore if it's greater than
5064 * what movablecore would have allowed. 5067 * what movablecore would have allowed.
5065 */ 5068 */
5066 if (required_movablecore) { 5069 if (required_movablecore) {
5067 unsigned long corepages; 5070 unsigned long corepages;
5068 5071
5069 /* 5072 /*
5070 * Round-up so that ZONE_MOVABLE is at least as large as what 5073 * Round-up so that ZONE_MOVABLE is at least as large as what
5071 * was requested by the user 5074 * was requested by the user
5072 */ 5075 */
5073 required_movablecore = 5076 required_movablecore =
5074 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5077 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5075 corepages = totalpages - required_movablecore; 5078 corepages = totalpages - required_movablecore;
5076 5079
5077 required_kernelcore = max(required_kernelcore, corepages); 5080 required_kernelcore = max(required_kernelcore, corepages);
5078 } 5081 }
5079 5082
5080 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5083 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5081 if (!required_kernelcore) 5084 if (!required_kernelcore)
5082 goto out; 5085 goto out;
5083 5086
5084 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5087 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5085 find_usable_zone_for_movable(); 5088 find_usable_zone_for_movable();
5086 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5089 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5087 5090
5088 restart: 5091 restart:
5089 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5092 /* Spread kernelcore memory as evenly as possible throughout nodes */
5090 kernelcore_node = required_kernelcore / usable_nodes; 5093 kernelcore_node = required_kernelcore / usable_nodes;
5091 for_each_node_state(nid, N_MEMORY) { 5094 for_each_node_state(nid, N_MEMORY) {
5092 unsigned long start_pfn, end_pfn; 5095 unsigned long start_pfn, end_pfn;
5093 5096
5094 /* 5097 /*
5095 * Recalculate kernelcore_node if the division per node 5098 * Recalculate kernelcore_node if the division per node
5096 * now exceeds what is necessary to satisfy the requested 5099 * now exceeds what is necessary to satisfy the requested
5097 * amount of memory for the kernel 5100 * amount of memory for the kernel
5098 */ 5101 */
5099 if (required_kernelcore < kernelcore_node) 5102 if (required_kernelcore < kernelcore_node)
5100 kernelcore_node = required_kernelcore / usable_nodes; 5103 kernelcore_node = required_kernelcore / usable_nodes;
5101 5104
5102 /* 5105 /*
5103 * As the map is walked, we track how much memory is usable 5106 * As the map is walked, we track how much memory is usable
5104 * by the kernel using kernelcore_remaining. When it is 5107 * by the kernel using kernelcore_remaining. When it is
5105 * 0, the rest of the node is usable by ZONE_MOVABLE 5108 * 0, the rest of the node is usable by ZONE_MOVABLE
5106 */ 5109 */
5107 kernelcore_remaining = kernelcore_node; 5110 kernelcore_remaining = kernelcore_node;
5108 5111
5109 /* Go through each range of PFNs within this node */ 5112 /* Go through each range of PFNs within this node */
5110 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5113 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5111 unsigned long size_pages; 5114 unsigned long size_pages;
5112 5115
5113 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5116 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5114 if (start_pfn >= end_pfn) 5117 if (start_pfn >= end_pfn)
5115 continue; 5118 continue;
5116 5119
5117 /* Account for what is only usable for kernelcore */ 5120 /* Account for what is only usable for kernelcore */
5118 if (start_pfn < usable_startpfn) { 5121 if (start_pfn < usable_startpfn) {
5119 unsigned long kernel_pages; 5122 unsigned long kernel_pages;
5120 kernel_pages = min(end_pfn, usable_startpfn) 5123 kernel_pages = min(end_pfn, usable_startpfn)
5121 - start_pfn; 5124 - start_pfn;
5122 5125
5123 kernelcore_remaining -= min(kernel_pages, 5126 kernelcore_remaining -= min(kernel_pages,
5124 kernelcore_remaining); 5127 kernelcore_remaining);
5125 required_kernelcore -= min(kernel_pages, 5128 required_kernelcore -= min(kernel_pages,
5126 required_kernelcore); 5129 required_kernelcore);
5127 5130
5128 /* Continue if range is now fully accounted */ 5131 /* Continue if range is now fully accounted */
5129 if (end_pfn <= usable_startpfn) { 5132 if (end_pfn <= usable_startpfn) {
5130 5133
5131 /* 5134 /*
5132 * Push zone_movable_pfn to the end so 5135 * Push zone_movable_pfn to the end so
5133 * that if we have to rebalance 5136 * that if we have to rebalance
5134 * kernelcore across nodes, we will 5137 * kernelcore across nodes, we will
5135 * not double account here 5138 * not double account here
5136 */ 5139 */
5137 zone_movable_pfn[nid] = end_pfn; 5140 zone_movable_pfn[nid] = end_pfn;
5138 continue; 5141 continue;
5139 } 5142 }
5140 start_pfn = usable_startpfn; 5143 start_pfn = usable_startpfn;
5141 } 5144 }
5142 5145
5143 /* 5146 /*
5144 * The usable PFN range for ZONE_MOVABLE is from 5147 * The usable PFN range for ZONE_MOVABLE is from
5145 * start_pfn->end_pfn. Calculate size_pages as the 5148 * start_pfn->end_pfn. Calculate size_pages as the
5146 * number of pages used as kernelcore 5149 * number of pages used as kernelcore
5147 */ 5150 */
5148 size_pages = end_pfn - start_pfn; 5151 size_pages = end_pfn - start_pfn;
5149 if (size_pages > kernelcore_remaining) 5152 if (size_pages > kernelcore_remaining)
5150 size_pages = kernelcore_remaining; 5153 size_pages = kernelcore_remaining;
5151 zone_movable_pfn[nid] = start_pfn + size_pages; 5154 zone_movable_pfn[nid] = start_pfn + size_pages;
5152 5155
5153 /* 5156 /*
5154 * Some kernelcore has been met, update counts and 5157 * Some kernelcore has been met, update counts and
5155 * break if the kernelcore for this node has been 5158 * break if the kernelcore for this node has been
5156 * satisfied 5159 * satisfied
5157 */ 5160 */
5158 required_kernelcore -= min(required_kernelcore, 5161 required_kernelcore -= min(required_kernelcore,
5159 size_pages); 5162 size_pages);
5160 kernelcore_remaining -= size_pages; 5163 kernelcore_remaining -= size_pages;
5161 if (!kernelcore_remaining) 5164 if (!kernelcore_remaining)
5162 break; 5165 break;
5163 } 5166 }
5164 } 5167 }
5165 5168
5166 /* 5169 /*
5167 * If there is still required_kernelcore, we do another pass with one 5170 * If there is still required_kernelcore, we do another pass with one
5168 * less node in the count. This will push zone_movable_pfn[nid] further 5171 * less node in the count. This will push zone_movable_pfn[nid] further
5169 * along on the nodes that still have memory until kernelcore is 5172 * along on the nodes that still have memory until kernelcore is
5170 * satisfied 5173 * satisfied
5171 */ 5174 */
5172 usable_nodes--; 5175 usable_nodes--;
5173 if (usable_nodes && required_kernelcore > usable_nodes) 5176 if (usable_nodes && required_kernelcore > usable_nodes)
5174 goto restart; 5177 goto restart;
5175 5178
5176 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5179 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5177 for (nid = 0; nid < MAX_NUMNODES; nid++) 5180 for (nid = 0; nid < MAX_NUMNODES; nid++)
5178 zone_movable_pfn[nid] = 5181 zone_movable_pfn[nid] =
5179 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5182 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5180 5183
5181 out: 5184 out:
5182 /* restore the node_state */ 5185 /* restore the node_state */
5183 node_states[N_MEMORY] = saved_node_state; 5186 node_states[N_MEMORY] = saved_node_state;
5184 } 5187 }
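/*
 * A drastically simplified sketch of the even spread above: two identical
 * nodes and no holes, so each node keeps required_kernelcore / usable_nodes
 * pages and ZONE_MOVABLE starts right after them. The real code also handles
 * lowmem-only ranges, rebalancing passes and MAX_ORDER rounding. All numbers
 * here are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long node_start[] = { 0, 0x100000 };
	unsigned long required_kernelcore = 0x180000;	/* kernelcore= in pages */
	int usable_nodes = 2, nid;
	unsigned long kernelcore_node = required_kernelcore / usable_nodes;

	for (nid = 0; nid < usable_nodes; nid++) {
		unsigned long zone_movable_pfn = node_start[nid] + kernelcore_node;

		/* prints 0xc0000 for node 0 and 0x1c0000 for node 1 */
		printf("node %d: ZONE_MOVABLE starts at PFN %#lx\n",
		       nid, zone_movable_pfn);
	}
	return 0;
}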
5185 5188
5186 /* Any regular or high memory on that node? */ 5189 /* Any regular or high memory on that node? */
5187 static void check_for_memory(pg_data_t *pgdat, int nid) 5190 static void check_for_memory(pg_data_t *pgdat, int nid)
5188 { 5191 {
5189 enum zone_type zone_type; 5192 enum zone_type zone_type;
5190 5193
5191 if (N_MEMORY == N_NORMAL_MEMORY) 5194 if (N_MEMORY == N_NORMAL_MEMORY)
5192 return; 5195 return;
5193 5196
5194 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5197 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5195 struct zone *zone = &pgdat->node_zones[zone_type]; 5198 struct zone *zone = &pgdat->node_zones[zone_type];
5196 if (zone->present_pages) { 5199 if (zone->present_pages) {
5197 node_set_state(nid, N_HIGH_MEMORY); 5200 node_set_state(nid, N_HIGH_MEMORY);
5198 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5201 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5199 zone_type <= ZONE_NORMAL) 5202 zone_type <= ZONE_NORMAL)
5200 node_set_state(nid, N_NORMAL_MEMORY); 5203 node_set_state(nid, N_NORMAL_MEMORY);
5201 break; 5204 break;
5202 } 5205 }
5203 } 5206 }
5204 } 5207 }
5205 5208
5206 /** 5209 /**
5207 * free_area_init_nodes - Initialise all pg_data_t and zone data 5210 * free_area_init_nodes - Initialise all pg_data_t and zone data
5208 * @max_zone_pfn: an array of max PFNs for each zone 5211 * @max_zone_pfn: an array of max PFNs for each zone
5209 * 5212 *
5210 * This will call free_area_init_node() for each active node in the system. 5213 * This will call free_area_init_node() for each active node in the system.
5211 * Using the page ranges provided by add_active_range(), the size of each 5214 * Using the page ranges provided by add_active_range(), the size of each
5212 * zone in each node and their holes is calculated. If the maximum PFN 5215 * zone in each node and their holes is calculated. If the maximum PFN
5213 * between two adjacent zones match, it is assumed that the zone is empty. 5216 * between two adjacent zones match, it is assumed that the zone is empty.
5214 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5217 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5215 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5218 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5216 * starts where the previous one ended. For example, ZONE_DMA32 starts 5219 * starts where the previous one ended. For example, ZONE_DMA32 starts
5217 * at arch_max_dma_pfn. 5220 * at arch_max_dma_pfn.
5218 */ 5221 */
5219 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5222 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5220 { 5223 {
5221 unsigned long start_pfn, end_pfn; 5224 unsigned long start_pfn, end_pfn;
5222 int i, nid; 5225 int i, nid;
5223 5226
5224 /* Record where the zone boundaries are */ 5227 /* Record where the zone boundaries are */
5225 memset(arch_zone_lowest_possible_pfn, 0, 5228 memset(arch_zone_lowest_possible_pfn, 0,
5226 sizeof(arch_zone_lowest_possible_pfn)); 5229 sizeof(arch_zone_lowest_possible_pfn));
5227 memset(arch_zone_highest_possible_pfn, 0, 5230 memset(arch_zone_highest_possible_pfn, 0,
5228 sizeof(arch_zone_highest_possible_pfn)); 5231 sizeof(arch_zone_highest_possible_pfn));
5229 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5232 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5230 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5233 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5231 for (i = 1; i < MAX_NR_ZONES; i++) { 5234 for (i = 1; i < MAX_NR_ZONES; i++) {
5232 if (i == ZONE_MOVABLE) 5235 if (i == ZONE_MOVABLE)
5233 continue; 5236 continue;
5234 arch_zone_lowest_possible_pfn[i] = 5237 arch_zone_lowest_possible_pfn[i] =
5235 arch_zone_highest_possible_pfn[i-1]; 5238 arch_zone_highest_possible_pfn[i-1];
5236 arch_zone_highest_possible_pfn[i] = 5239 arch_zone_highest_possible_pfn[i] =
5237 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5240 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5238 } 5241 }
5239 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5242 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5240 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5243 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5241 5244
5242 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5245 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5243 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5246 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5244 find_zone_movable_pfns_for_nodes(); 5247 find_zone_movable_pfns_for_nodes();
5245 5248
5246 /* Print out the zone ranges */ 5249 /* Print out the zone ranges */
5247 printk("Zone ranges:\n"); 5250 printk("Zone ranges:\n");
5248 for (i = 0; i < MAX_NR_ZONES; i++) { 5251 for (i = 0; i < MAX_NR_ZONES; i++) {
5249 if (i == ZONE_MOVABLE) 5252 if (i == ZONE_MOVABLE)
5250 continue; 5253 continue;
5251 printk(KERN_CONT " %-8s ", zone_names[i]); 5254 printk(KERN_CONT " %-8s ", zone_names[i]);
5252 if (arch_zone_lowest_possible_pfn[i] == 5255 if (arch_zone_lowest_possible_pfn[i] ==
5253 arch_zone_highest_possible_pfn[i]) 5256 arch_zone_highest_possible_pfn[i])
5254 printk(KERN_CONT "empty\n"); 5257 printk(KERN_CONT "empty\n");
5255 else 5258 else
5256 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5259 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5257 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5260 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5258 (arch_zone_highest_possible_pfn[i] 5261 (arch_zone_highest_possible_pfn[i]
5259 << PAGE_SHIFT) - 1); 5262 << PAGE_SHIFT) - 1);
5260 } 5263 }
5261 5264
5262 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5265 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5263 printk("Movable zone start for each node\n"); 5266 printk("Movable zone start for each node\n");
5264 for (i = 0; i < MAX_NUMNODES; i++) { 5267 for (i = 0; i < MAX_NUMNODES; i++) {
5265 if (zone_movable_pfn[i]) 5268 if (zone_movable_pfn[i])
5266 printk(" Node %d: %#010lx\n", i, 5269 printk(" Node %d: %#010lx\n", i,
5267 zone_movable_pfn[i] << PAGE_SHIFT); 5270 zone_movable_pfn[i] << PAGE_SHIFT);
5268 } 5271 }
5269 5272
5270 /* Print out the early node map */ 5273 /* Print out the early node map */
5271 printk("Early memory node ranges\n"); 5274 printk("Early memory node ranges\n");
5272 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5275 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5273 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5276 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5274 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5277 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5275 5278
5276 /* Initialise every node */ 5279 /* Initialise every node */
5277 mminit_verify_pageflags_layout(); 5280 mminit_verify_pageflags_layout();
5278 setup_nr_node_ids(); 5281 setup_nr_node_ids();
5279 for_each_online_node(nid) { 5282 for_each_online_node(nid) {
5280 pg_data_t *pgdat = NODE_DATA(nid); 5283 pg_data_t *pgdat = NODE_DATA(nid);
5281 free_area_init_node(nid, NULL, 5284 free_area_init_node(nid, NULL,
5282 find_min_pfn_for_node(nid), NULL); 5285 find_min_pfn_for_node(nid), NULL);
5283 5286
5284 /* Any memory on that node */ 5287 /* Any memory on that node */
5285 if (pgdat->node_present_pages) 5288 if (pgdat->node_present_pages)
5286 node_set_state(nid, N_MEMORY); 5289 node_set_state(nid, N_MEMORY);
5287 check_for_memory(pgdat, nid); 5290 check_for_memory(pgdat, nid);
5288 } 5291 }
5289 } 5292 }
5290 5293
5291 static int __init cmdline_parse_core(char *p, unsigned long *core) 5294 static int __init cmdline_parse_core(char *p, unsigned long *core)
5292 { 5295 {
5293 unsigned long long coremem; 5296 unsigned long long coremem;
5294 if (!p) 5297 if (!p)
5295 return -EINVAL; 5298 return -EINVAL;
5296 5299
5297 coremem = memparse(p, &p); 5300 coremem = memparse(p, &p);
5298 *core = coremem >> PAGE_SHIFT; 5301 *core = coremem >> PAGE_SHIFT;
5299 5302
5300 /* Paranoid check that UL is enough for the coremem value */ 5303 /* Paranoid check that UL is enough for the coremem value */
5301 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5304 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5302 5305
5303 return 0; 5306 return 0;
5304 } 5307 }
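/*
 * memparse() is a kernel helper; as a rough userspace stand-in, the sketch
 * below parses a size with an optional K/M/G suffix and converts it to pages
 * the same way cmdline_parse_core() does, assuming 4 KiB pages.
 */
#include <stdio.h>
#include <stdlib.h>

#define EX_PAGE_SHIFT 12	/* assumed 4 KiB pages */

static unsigned long long parse_size(const char *p)
{
	char *end;
	unsigned long long v = strtoull(p, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	unsigned long long coremem = parse_size("512M");

	/* kernelcore=512M -> 131072 pages with 4 KiB pages */
	printf("kernelcore = %llu pages\n", coremem >> EX_PAGE_SHIFT);
	return 0;
}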
5305 5308
5306 /* 5309 /*
5307 * kernelcore=size sets the amount of memory for use for allocations that 5310 * kernelcore=size sets the amount of memory for use for allocations that
5308 * cannot be reclaimed or migrated. 5311 * cannot be reclaimed or migrated.
5309 */ 5312 */
5310 static int __init cmdline_parse_kernelcore(char *p) 5313 static int __init cmdline_parse_kernelcore(char *p)
5311 { 5314 {
5312 return cmdline_parse_core(p, &required_kernelcore); 5315 return cmdline_parse_core(p, &required_kernelcore);
5313 } 5316 }
5314 5317
5315 /* 5318 /*
5316 * movablecore=size sets the amount of memory for use for allocations that 5319 * movablecore=size sets the amount of memory for use for allocations that
5317 * can be reclaimed or migrated. 5320 * can be reclaimed or migrated.
5318 */ 5321 */
5319 static int __init cmdline_parse_movablecore(char *p) 5322 static int __init cmdline_parse_movablecore(char *p)
5320 { 5323 {
5321 return cmdline_parse_core(p, &required_movablecore); 5324 return cmdline_parse_core(p, &required_movablecore);
5322 } 5325 }
5323 5326
5324 early_param("kernelcore", cmdline_parse_kernelcore); 5327 early_param("kernelcore", cmdline_parse_kernelcore);
5325 early_param("movablecore", cmdline_parse_movablecore); 5328 early_param("movablecore", cmdline_parse_movablecore);
5326 5329
5327 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5330 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5328 5331
5329 void adjust_managed_page_count(struct page *page, long count) 5332 void adjust_managed_page_count(struct page *page, long count)
5330 { 5333 {
5331 spin_lock(&managed_page_count_lock); 5334 spin_lock(&managed_page_count_lock);
5332 page_zone(page)->managed_pages += count; 5335 page_zone(page)->managed_pages += count;
5333 totalram_pages += count; 5336 totalram_pages += count;
5334 #ifdef CONFIG_HIGHMEM 5337 #ifdef CONFIG_HIGHMEM
5335 if (PageHighMem(page)) 5338 if (PageHighMem(page))
5336 totalhigh_pages += count; 5339 totalhigh_pages += count;
5337 #endif 5340 #endif
5338 spin_unlock(&managed_page_count_lock); 5341 spin_unlock(&managed_page_count_lock);
5339 } 5342 }
5340 EXPORT_SYMBOL(adjust_managed_page_count); 5343 EXPORT_SYMBOL(adjust_managed_page_count);
5341 5344
5342 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5345 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5343 { 5346 {
5344 void *pos; 5347 void *pos;
5345 unsigned long pages = 0; 5348 unsigned long pages = 0;
5346 5349
5347 start = (void *)PAGE_ALIGN((unsigned long)start); 5350 start = (void *)PAGE_ALIGN((unsigned long)start);
5348 end = (void *)((unsigned long)end & PAGE_MASK); 5351 end = (void *)((unsigned long)end & PAGE_MASK);
5349 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5352 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5350 if ((unsigned int)poison <= 0xFF) 5353 if ((unsigned int)poison <= 0xFF)
5351 memset(pos, poison, PAGE_SIZE); 5354 memset(pos, poison, PAGE_SIZE);
5352 free_reserved_page(virt_to_page(pos)); 5355 free_reserved_page(virt_to_page(pos));
5353 } 5356 }
5354 5357
5355 if (pages && s) 5358 if (pages && s)
5356 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5359 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5357 s, pages << (PAGE_SHIFT - 10), start, end); 5360 s, pages << (PAGE_SHIFT - 10), start, end);
5358 5361
5359 return pages; 5362 return pages;
5360 } 5363 }
5361 EXPORT_SYMBOL(free_reserved_area); 5364 EXPORT_SYMBOL(free_reserved_area);
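To make the boundary handling in free_reserved_area() concrete, here is a small userspace sketch (not kernel code) that mirrors its arithmetic: the start address is rounded up to a page boundary, the end is rounded down, and the memset() is skipped for poison values outside 0..0xFF (for example -1). PAGE_SIZE is assumed to be 4096 purely for illustration.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

/* Mirror of the boundary/poison logic in free_reserved_area() above. */
static unsigned long count_reserved_pages(unsigned long start, unsigned long end, int poison)
{
	unsigned long pos, pages = 0;

	start = PAGE_ALIGN(start);	/* a partial first page is not freed */
	end &= PAGE_MASK;		/* a partial last page is not freed  */
	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
		if ((unsigned int)poison <= 0xFF)
			printf("would memset page at %#lx with %#x\n", pos, poison);
		/* the kernel would call free_reserved_page() here */
	}
	return pages;
}

int main(void)
{
	/* Hypothetical [start, end) range that is not page aligned. */
	unsigned long freed = count_reserved_pages(0x100010, 0x104800, -1);

	/* 0x100010 rounds up to 0x101000, 0x104800 rounds down to 0x104000:
	 * three full pages, and poison == -1 skips the memset entirely. */
	printf("pages freed: %lu\n", freed);
	return 0;
}

With the hypothetical range 0x100010..0x104800 only three whole pages fall inside it, and a poison value of -1 disables poisoning.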
5362 5365
5363 #ifdef CONFIG_HIGHMEM 5366 #ifdef CONFIG_HIGHMEM
5364 void free_highmem_page(struct page *page) 5367 void free_highmem_page(struct page *page)
5365 { 5368 {
5366 __free_reserved_page(page); 5369 __free_reserved_page(page);
5367 totalram_pages++; 5370 totalram_pages++;
5368 page_zone(page)->managed_pages++; 5371 page_zone(page)->managed_pages++;
5369 totalhigh_pages++; 5372 totalhigh_pages++;
5370 } 5373 }
5371 #endif 5374 #endif
5372 5375
5373 5376
5374 void __init mem_init_print_info(const char *str) 5377 void __init mem_init_print_info(const char *str)
5375 { 5378 {
5376 unsigned long physpages, codesize, datasize, rosize, bss_size; 5379 unsigned long physpages, codesize, datasize, rosize, bss_size;
5377 unsigned long init_code_size, init_data_size; 5380 unsigned long init_code_size, init_data_size;
5378 5381
5379 physpages = get_num_physpages(); 5382 physpages = get_num_physpages();
5380 codesize = _etext - _stext; 5383 codesize = _etext - _stext;
5381 datasize = _edata - _sdata; 5384 datasize = _edata - _sdata;
5382 rosize = __end_rodata - __start_rodata; 5385 rosize = __end_rodata - __start_rodata;
5383 bss_size = __bss_stop - __bss_start; 5386 bss_size = __bss_stop - __bss_start;
5384 init_data_size = __init_end - __init_begin; 5387 init_data_size = __init_end - __init_begin;
5385 init_code_size = _einittext - _sinittext; 5388 init_code_size = _einittext - _sinittext;
5386 5389
5387 /* 5390 /*
5388 * Detect special cases and adjust section sizes accordingly: 5391 * Detect special cases and adjust section sizes accordingly:
5389 * 1) .init.* may be embedded into .data sections 5392 * 1) .init.* may be embedded into .data sections
5390 * 2) .init.text.* may be out of [__init_begin, __init_end], 5393 * 2) .init.text.* may be out of [__init_begin, __init_end],
5391 * please refer to arch/tile/kernel/vmlinux.lds.S. 5394 * please refer to arch/tile/kernel/vmlinux.lds.S.
5392 * 3) .rodata.* may be embedded into .text or .data sections. 5395 * 3) .rodata.* may be embedded into .text or .data sections.
5393 */ 5396 */
5394 #define adj_init_size(start, end, size, pos, adj) \ 5397 #define adj_init_size(start, end, size, pos, adj) \
5395 do { \ 5398 do { \
5396 if (start <= pos && pos < end && size > adj) \ 5399 if (start <= pos && pos < end && size > adj) \
5397 size -= adj; \ 5400 size -= adj; \
5398 } while (0) 5401 } while (0)
5399 5402
5400 adj_init_size(__init_begin, __init_end, init_data_size, 5403 adj_init_size(__init_begin, __init_end, init_data_size,
5401 _sinittext, init_code_size); 5404 _sinittext, init_code_size);
5402 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5405 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5403 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5406 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5404 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5407 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5405 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5408 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5406 5409
5407 #undef adj_init_size 5410 #undef adj_init_size
5408 5411
5409 printk("Memory: %luK/%luK available " 5412 printk("Memory: %luK/%luK available "
5410 "(%luK kernel code, %luK rwdata, %luK rodata, " 5413 "(%luK kernel code, %luK rwdata, %luK rodata, "
5411 "%luK init, %luK bss, %luK reserved" 5414 "%luK init, %luK bss, %luK reserved"
5412 #ifdef CONFIG_HIGHMEM 5415 #ifdef CONFIG_HIGHMEM
5413 ", %luK highmem" 5416 ", %luK highmem"
5414 #endif 5417 #endif
5415 "%s%s)\n", 5418 "%s%s)\n",
5416 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5419 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5417 codesize >> 10, datasize >> 10, rosize >> 10, 5420 codesize >> 10, datasize >> 10, rosize >> 10,
5418 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5421 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5419 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5422 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5420 #ifdef CONFIG_HIGHMEM 5423 #ifdef CONFIG_HIGHMEM
5421 totalhigh_pages << (PAGE_SHIFT-10), 5424 totalhigh_pages << (PAGE_SHIFT-10),
5422 #endif 5425 #endif
5423 str ? ", " : "", str ? str : ""); 5426 str ? ", " : "", str ? str : "");
5424 } 5427 }
5425 5428
5426 /** 5429 /**
5427 * set_dma_reserve - set the specified number of pages reserved in the first zone 5430 * set_dma_reserve - set the specified number of pages reserved in the first zone
5428 * @new_dma_reserve: The number of pages to mark reserved 5431 * @new_dma_reserve: The number of pages to mark reserved
5429 * 5432 *
5430 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5433 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5431 * In the DMA zone, a significant percentage may be consumed by kernel image 5434 * In the DMA zone, a significant percentage may be consumed by kernel image
5432 * and other unfreeable allocations which can skew the watermarks badly. This 5435 * and other unfreeable allocations which can skew the watermarks badly. This
5433 * function may optionally be used to account for unfreeable pages in the 5436 * function may optionally be used to account for unfreeable pages in the
5434 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5437 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5435 * smaller per-cpu batchsize. 5438 * smaller per-cpu batchsize.
5436 */ 5439 */
5437 void __init set_dma_reserve(unsigned long new_dma_reserve) 5440 void __init set_dma_reserve(unsigned long new_dma_reserve)
5438 { 5441 {
5439 dma_reserve = new_dma_reserve; 5442 dma_reserve = new_dma_reserve;
5440 } 5443 }
5441 5444
5442 void __init free_area_init(unsigned long *zones_size) 5445 void __init free_area_init(unsigned long *zones_size)
5443 { 5446 {
5444 free_area_init_node(0, zones_size, 5447 free_area_init_node(0, zones_size,
5445 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5448 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5446 } 5449 }
5447 5450
5448 static int page_alloc_cpu_notify(struct notifier_block *self, 5451 static int page_alloc_cpu_notify(struct notifier_block *self,
5449 unsigned long action, void *hcpu) 5452 unsigned long action, void *hcpu)
5450 { 5453 {
5451 int cpu = (unsigned long)hcpu; 5454 int cpu = (unsigned long)hcpu;
5452 5455
5453 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5456 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5454 lru_add_drain_cpu(cpu); 5457 lru_add_drain_cpu(cpu);
5455 drain_pages(cpu); 5458 drain_pages(cpu);
5456 5459
5457 /* 5460 /*
5458 * Spill the event counters of the dead processor 5461 * Spill the event counters of the dead processor
5459 * into the current processor's event counters. 5462 * into the current processor's event counters.
5460 * This artificially elevates the count of the current 5463 * This artificially elevates the count of the current
5461 * processor. 5464 * processor.
5462 */ 5465 */
5463 vm_events_fold_cpu(cpu); 5466 vm_events_fold_cpu(cpu);
5464 5467
5465 /* 5468 /*
5466 * Zero the differential counters of the dead processor 5469 * Zero the differential counters of the dead processor
5467 * so that the vm statistics are consistent. 5470 * so that the vm statistics are consistent.
5468 * 5471 *
5469 * This is only okay since the processor is dead and cannot 5472 * This is only okay since the processor is dead and cannot
5470 * race with what we are doing. 5473 * race with what we are doing.
5471 */ 5474 */
5472 cpu_vm_stats_fold(cpu); 5475 cpu_vm_stats_fold(cpu);
5473 } 5476 }
5474 return NOTIFY_OK; 5477 return NOTIFY_OK;
5475 } 5478 }
5476 5479
5477 void __init page_alloc_init(void) 5480 void __init page_alloc_init(void)
5478 { 5481 {
5479 hotcpu_notifier(page_alloc_cpu_notify, 0); 5482 hotcpu_notifier(page_alloc_cpu_notify, 0);
5480 } 5483 }
5481 5484
5482 /* 5485 /*
5483 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5486 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5484 * or min_free_kbytes changes. 5487 * or min_free_kbytes changes.
5485 */ 5488 */
5486 static void calculate_totalreserve_pages(void) 5489 static void calculate_totalreserve_pages(void)
5487 { 5490 {
5488 struct pglist_data *pgdat; 5491 struct pglist_data *pgdat;
5489 unsigned long reserve_pages = 0; 5492 unsigned long reserve_pages = 0;
5490 enum zone_type i, j; 5493 enum zone_type i, j;
5491 5494
5492 for_each_online_pgdat(pgdat) { 5495 for_each_online_pgdat(pgdat) {
5493 for (i = 0; i < MAX_NR_ZONES; i++) { 5496 for (i = 0; i < MAX_NR_ZONES; i++) {
5494 struct zone *zone = pgdat->node_zones + i; 5497 struct zone *zone = pgdat->node_zones + i;
5495 unsigned long max = 0; 5498 unsigned long max = 0;
5496 5499
5497 /* Find valid and maximum lowmem_reserve in the zone */ 5500 /* Find valid and maximum lowmem_reserve in the zone */
5498 for (j = i; j < MAX_NR_ZONES; j++) { 5501 for (j = i; j < MAX_NR_ZONES; j++) {
5499 if (zone->lowmem_reserve[j] > max) 5502 if (zone->lowmem_reserve[j] > max)
5500 max = zone->lowmem_reserve[j]; 5503 max = zone->lowmem_reserve[j];
5501 } 5504 }
5502 5505
5503 /* we treat the high watermark as reserved pages. */ 5506 /* we treat the high watermark as reserved pages. */
5504 max += high_wmark_pages(zone); 5507 max += high_wmark_pages(zone);
5505 5508
5506 if (max > zone->managed_pages) 5509 if (max > zone->managed_pages)
5507 max = zone->managed_pages; 5510 max = zone->managed_pages;
5508 reserve_pages += max; 5511 reserve_pages += max;
5509 /* 5512 /*
5510 * Lowmem reserves are not available to 5513 * Lowmem reserves are not available to
5511 * GFP_HIGHUSER page cache allocations and 5514 * GFP_HIGHUSER page cache allocations and
5512 * kswapd tries to balance zones to their high 5515 * kswapd tries to balance zones to their high
5513 * watermark. As a result, neither should be 5516 * watermark. As a result, neither should be
5514 * regarded as dirtyable memory, to prevent a 5517 * regarded as dirtyable memory, to prevent a
5515 * situation where reclaim has to clean pages 5518 * situation where reclaim has to clean pages
5516 * in order to balance the zones. 5519 * in order to balance the zones.
5517 */ 5520 */
5518 zone->dirty_balance_reserve = max; 5521 zone->dirty_balance_reserve = max;
5519 } 5522 }
5520 } 5523 }
5521 dirty_balance_reserve = reserve_pages; 5524 dirty_balance_reserve = reserve_pages;
5522 totalreserve_pages = reserve_pages; 5525 totalreserve_pages = reserve_pages;
5523 } 5526 }
5524 5527
5525 /* 5528 /*
5526 * setup_per_zone_lowmem_reserve - called whenever 5529 * setup_per_zone_lowmem_reserve - called whenever
5527 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5530 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5528 * has a correct lowmem reserve value, so that an adequate number of 5531 * has a correct lowmem reserve value, so that an adequate number of
5529 * pages is left in the zone after a successful __alloc_pages(). 5532 * pages is left in the zone after a successful __alloc_pages().
5530 */ 5533 */
5531 static void setup_per_zone_lowmem_reserve(void) 5534 static void setup_per_zone_lowmem_reserve(void)
5532 { 5535 {
5533 struct pglist_data *pgdat; 5536 struct pglist_data *pgdat;
5534 enum zone_type j, idx; 5537 enum zone_type j, idx;
5535 5538
5536 for_each_online_pgdat(pgdat) { 5539 for_each_online_pgdat(pgdat) {
5537 for (j = 0; j < MAX_NR_ZONES; j++) { 5540 for (j = 0; j < MAX_NR_ZONES; j++) {
5538 struct zone *zone = pgdat->node_zones + j; 5541 struct zone *zone = pgdat->node_zones + j;
5539 unsigned long managed_pages = zone->managed_pages; 5542 unsigned long managed_pages = zone->managed_pages;
5540 5543
5541 zone->lowmem_reserve[j] = 0; 5544 zone->lowmem_reserve[j] = 0;
5542 5545
5543 idx = j; 5546 idx = j;
5544 while (idx) { 5547 while (idx) {
5545 struct zone *lower_zone; 5548 struct zone *lower_zone;
5546 5549
5547 idx--; 5550 idx--;
5548 5551
5549 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5552 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5550 sysctl_lowmem_reserve_ratio[idx] = 1; 5553 sysctl_lowmem_reserve_ratio[idx] = 1;
5551 5554
5552 lower_zone = pgdat->node_zones + idx; 5555 lower_zone = pgdat->node_zones + idx;
5553 lower_zone->lowmem_reserve[j] = managed_pages / 5556 lower_zone->lowmem_reserve[j] = managed_pages /
5554 sysctl_lowmem_reserve_ratio[idx]; 5557 sysctl_lowmem_reserve_ratio[idx];
5555 managed_pages += lower_zone->managed_pages; 5558 managed_pages += lower_zone->managed_pages;
5556 } 5559 }
5557 } 5560 }
5558 } 5561 }
5559 5562
5560 /* update totalreserve_pages */ 5563 /* update totalreserve_pages */
5561 calculate_totalreserve_pages(); 5564 calculate_totalreserve_pages();
5562 } 5565 }
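A worked example may help here. The sketch below is standalone userspace C, not kernel code; it runs the same loop over a hypothetical node with three zones and illustrative reserve ratios (256 for DMA, 32 otherwise), showing how each lower zone holds back a slice of the memory that higher-zone allocations could otherwise consume.

#include <stdio.h>

#define NR_ZONES 3

/* Hypothetical per-zone sizes in pages and illustrative reserve ratios;
 * the real values come from the zone layout and sysctl_lowmem_reserve_ratio. */
static const char *zone_name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static unsigned long managed[NR_ZONES] = { 4000, 200000, 300000 };
static int ratio[NR_ZONES] = { 256, 32, 32 };

static unsigned long lowmem_reserve[NR_ZONES][NR_ZONES];

/* Same walk as setup_per_zone_lowmem_reserve() above. */
static void setup_lowmem_reserve(void)
{
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long managed_pages = managed[j];

		lowmem_reserve[j][j] = 0;
		idx = j;
		while (idx) {
			idx--;
			if (ratio[idx] < 1)
				ratio[idx] = 1;
			/* pages zone 'idx' keeps back from allocations that
			 * could otherwise have been satisfied from zone 'j' */
			lowmem_reserve[idx][j] = managed_pages / ratio[idx];
			managed_pages += managed[idx];
		}
	}
}

int main(void)
{
	int i, j;

	setup_lowmem_reserve();
	for (i = 0; i < NR_ZONES; i++) {
		printf("%-8s:", zone_name[i]);
		for (j = 0; j < NR_ZONES; j++)
			printf(" %8lu", lowmem_reserve[i][j]);
		printf("\n");
	}
	return 0;
}

With these numbers the DMA zone reserves 781 pages against Normal allocations and 1953 pages against HighMem allocations, while Normal reserves 9375 pages against HighMem.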
5563 5566
5564 static void __setup_per_zone_wmarks(void) 5567 static void __setup_per_zone_wmarks(void)
5565 { 5568 {
5566 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5569 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5567 unsigned long lowmem_pages = 0; 5570 unsigned long lowmem_pages = 0;
5568 struct zone *zone; 5571 struct zone *zone;
5569 unsigned long flags; 5572 unsigned long flags;
5570 5573
5571 /* Calculate total number of !ZONE_HIGHMEM pages */ 5574 /* Calculate total number of !ZONE_HIGHMEM pages */
5572 for_each_zone(zone) { 5575 for_each_zone(zone) {
5573 if (!is_highmem(zone)) 5576 if (!is_highmem(zone))
5574 lowmem_pages += zone->managed_pages; 5577 lowmem_pages += zone->managed_pages;
5575 } 5578 }
5576 5579
5577 for_each_zone(zone) { 5580 for_each_zone(zone) {
5578 u64 tmp; 5581 u64 tmp;
5579 5582
5580 spin_lock_irqsave(&zone->lock, flags); 5583 spin_lock_irqsave(&zone->lock, flags);
5581 tmp = (u64)pages_min * zone->managed_pages; 5584 tmp = (u64)pages_min * zone->managed_pages;
5582 do_div(tmp, lowmem_pages); 5585 do_div(tmp, lowmem_pages);
5583 if (is_highmem(zone)) { 5586 if (is_highmem(zone)) {
5584 /* 5587 /*
5585 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5588 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5586 * need highmem pages, so cap pages_min to a small 5589 * need highmem pages, so cap pages_min to a small
5587 * value here. 5590 * value here.
5588 * 5591 *
5589 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5592 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5590 * deltas control async page reclaim, and so should 5593 * deltas control async page reclaim, and so should
5591 * not be capped for highmem. 5594 * not be capped for highmem.
5592 */ 5595 */
5593 unsigned long min_pages; 5596 unsigned long min_pages;
5594 5597
5595 min_pages = zone->managed_pages / 1024; 5598 min_pages = zone->managed_pages / 1024;
5596 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5599 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5597 zone->watermark[WMARK_MIN] = min_pages; 5600 zone->watermark[WMARK_MIN] = min_pages;
5598 } else { 5601 } else {
5599 /* 5602 /*
5600 * If it's a lowmem zone, reserve a number of pages 5603 * If it's a lowmem zone, reserve a number of pages
5601 * proportionate to the zone's size. 5604 * proportionate to the zone's size.
5602 */ 5605 */
5603 zone->watermark[WMARK_MIN] = tmp; 5606 zone->watermark[WMARK_MIN] = tmp;
5604 } 5607 }
5605 5608
5606 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5609 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5607 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5610 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5608 5611
5609 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5612 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5610 high_wmark_pages(zone) - 5613 high_wmark_pages(zone) -
5611 low_wmark_pages(zone) - 5614 low_wmark_pages(zone) -
5612 zone_page_state(zone, NR_ALLOC_BATCH)); 5615 zone_page_state(zone, NR_ALLOC_BATCH));
5613 5616
5614 setup_zone_migrate_reserve(zone); 5617 setup_zone_migrate_reserve(zone);
5615 spin_unlock_irqrestore(&zone->lock, flags); 5618 spin_unlock_irqrestore(&zone->lock, flags);
5616 } 5619 }
5617 5620
5618 /* update totalreserve_pages */ 5621 /* update totalreserve_pages */
5619 calculate_totalreserve_pages(); 5622 calculate_totalreserve_pages();
5620 } 5623 }
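For lowmem zones the arithmetic above splits pages_min across zones in proportion to their size and then derives the low and high marks from the same per-zone share. The standalone sketch below reruns that calculation for two hypothetical zones, assuming 4K pages and min_free_kbytes = 4096; it deliberately omits the highmem clamp and the NR_ALLOC_BATCH update.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages for the illustration */

struct zone_demo {
	const char *name;
	unsigned long managed_pages;
	unsigned long wmark_min, wmark_low, wmark_high;
};

int main(void)
{
	/* Hypothetical zone sizes; min_free_kbytes as if autotuned to 4096k. */
	struct zone_demo zones[] = {
		{ "DMA",    4000 },
		{ "Normal", 250000 },
	};
	unsigned long min_free_kbytes = 4096;
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	unsigned int i;

	for (i = 0; i < 2; i++)
		lowmem_pages += zones[i].managed_pages;

	/* Same proportional split as __setup_per_zone_wmarks() above. */
	for (i = 0; i < 2; i++) {
		unsigned long long tmp;

		tmp = (unsigned long long)pages_min * zones[i].managed_pages;
		tmp /= lowmem_pages;			/* do_div() in the kernel */
		zones[i].wmark_min  = tmp;
		zones[i].wmark_low  = tmp + (tmp >> 2);	/* min + 25% */
		zones[i].wmark_high = tmp + (tmp >> 1);	/* min + 50% */
		printf("%-6s min=%lu low=%lu high=%lu\n", zones[i].name,
		       zones[i].wmark_min, zones[i].wmark_low, zones[i].wmark_high);
	}
	return 0;
}

It prints min=16 low=20 high=24 for the small DMA zone and min=1007 low=1258 high=1510 for the Normal zone.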
5621 5624
5622 /** 5625 /**
5623 * setup_per_zone_wmarks - called when min_free_kbytes changes 5626 * setup_per_zone_wmarks - called when min_free_kbytes changes
5624 * or when memory is hot-{added|removed} 5627 * or when memory is hot-{added|removed}
5625 * 5628 *
5626 * Ensures that the watermark[min,low,high] values for each zone are set 5629 * Ensures that the watermark[min,low,high] values for each zone are set
5627 * correctly with respect to min_free_kbytes. 5630 * correctly with respect to min_free_kbytes.
5628 */ 5631 */
5629 void setup_per_zone_wmarks(void) 5632 void setup_per_zone_wmarks(void)
5630 { 5633 {
5631 mutex_lock(&zonelists_mutex); 5634 mutex_lock(&zonelists_mutex);
5632 __setup_per_zone_wmarks(); 5635 __setup_per_zone_wmarks();
5633 mutex_unlock(&zonelists_mutex); 5636 mutex_unlock(&zonelists_mutex);
5634 } 5637 }
5635 5638
5636 /* 5639 /*
5637 * The inactive anon list should be small enough that the VM never has to 5640 * The inactive anon list should be small enough that the VM never has to
5638 * do too much work, but large enough that each inactive page has a chance 5641 * do too much work, but large enough that each inactive page has a chance
5639 * to be referenced again before it is swapped out. 5642 * to be referenced again before it is swapped out.
5640 * 5643 *
5641 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5644 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5642 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5645 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5643 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5646 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5644 * the anonymous pages are kept on the inactive list. 5647 * the anonymous pages are kept on the inactive list.
5645 * 5648 *
5646 * total target max 5649 * total target max
5647 * memory ratio inactive anon 5650 * memory ratio inactive anon
5648 * ------------------------------------- 5651 * -------------------------------------
5649 * 10MB 1 5MB 5652 * 10MB 1 5MB
5650 * 100MB 1 50MB 5653 * 100MB 1 50MB
5651 * 1GB 3 250MB 5654 * 1GB 3 250MB
5652 * 10GB 10 0.9GB 5655 * 10GB 10 0.9GB
5653 * 100GB 31 3GB 5656 * 100GB 31 3GB
5654 * 1TB 101 10GB 5657 * 1TB 101 10GB
5655 * 10TB 320 32GB 5658 * 10TB 320 32GB
5656 */ 5659 */
5657 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5660 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5658 { 5661 {
5659 unsigned int gb, ratio; 5662 unsigned int gb, ratio;
5660 5663
5661 /* Zone size in gigabytes */ 5664 /* Zone size in gigabytes */
5662 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5665 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5663 if (gb) 5666 if (gb)
5664 ratio = int_sqrt(10 * gb); 5667 ratio = int_sqrt(10 * gb);
5665 else 5668 else
5666 ratio = 1; 5669 ratio = 1;
5667 5670
5668 zone->inactive_ratio = ratio; 5671 zone->inactive_ratio = ratio;
5669 } 5672 }
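The table above follows directly from int_sqrt(10 * gb). The standalone check below uses a naive integer square root in place of the kernel's int_sqrt() and assumes 4K pages; the zone sizes are picked to match rows of the table, e.g. a 100GB zone gives int_sqrt(1000) = 31.

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4K pages assumed for the illustration */

/* Simple integer square root standing in for the kernel's int_sqrt(). */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Hypothetical zone sizes in GB, matching rows of the table above. */
	unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long managed_pages = sizes_gb[i] << (30 - PAGE_SHIFT);
		unsigned long gb = managed_pages >> (30 - PAGE_SHIFT);
		unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

		printf("%6luGB zone -> inactive_ratio %lu\n", sizes_gb[i], ratio);
	}
	return 0;
}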
5670 5673
5671 static void __meminit setup_per_zone_inactive_ratio(void) 5674 static void __meminit setup_per_zone_inactive_ratio(void)
5672 { 5675 {
5673 struct zone *zone; 5676 struct zone *zone;
5674 5677
5675 for_each_zone(zone) 5678 for_each_zone(zone)
5676 calculate_zone_inactive_ratio(zone); 5679 calculate_zone_inactive_ratio(zone);
5677 } 5680 }
5678 5681
5679 /* 5682 /*
5680 * Initialise min_free_kbytes. 5683 * Initialise min_free_kbytes.
5681 * 5684 *
5682 * For small machines we want it small (128k min). For large machines 5685 * For small machines we want it small (128k min). For large machines
5683 * we want it large (64MB max). But it is not linear, because network 5686 * we want it large (64MB max). But it is not linear, because network
5684 * bandwidth does not increase linearly with machine size. We use 5687 * bandwidth does not increase linearly with machine size. We use
5685 * 5688 *
5686 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5689 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5687 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5690 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5688 * 5691 *
5689 * which yields 5692 * which yields
5690 * 5693 *
5691 * 16MB: 512k 5694 * 16MB: 512k
5692 * 32MB: 724k 5695 * 32MB: 724k
5693 * 64MB: 1024k 5696 * 64MB: 1024k
5694 * 128MB: 1448k 5697 * 128MB: 1448k
5695 * 256MB: 2048k 5698 * 256MB: 2048k
5696 * 512MB: 2896k 5699 * 512MB: 2896k
5697 * 1024MB: 4096k 5700 * 1024MB: 4096k
5698 * 2048MB: 5792k 5701 * 2048MB: 5792k
5699 * 4096MB: 8192k 5702 * 4096MB: 8192k
5700 * 8192MB: 11584k 5703 * 8192MB: 11584k
5701 * 16384MB: 16384k 5704 * 16384MB: 16384k
5702 */ 5705 */
5703 int __meminit init_per_zone_wmark_min(void) 5706 int __meminit init_per_zone_wmark_min(void)
5704 { 5707 {
5705 unsigned long lowmem_kbytes; 5708 unsigned long lowmem_kbytes;
5706 int new_min_free_kbytes; 5709 int new_min_free_kbytes;
5707 5710
5708 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5711 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5709 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5712 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5710 5713
5711 if (new_min_free_kbytes > user_min_free_kbytes) { 5714 if (new_min_free_kbytes > user_min_free_kbytes) {
5712 min_free_kbytes = new_min_free_kbytes; 5715 min_free_kbytes = new_min_free_kbytes;
5713 if (min_free_kbytes < 128) 5716 if (min_free_kbytes < 128)
5714 min_free_kbytes = 128; 5717 min_free_kbytes = 128;
5715 if (min_free_kbytes > 65536) 5718 if (min_free_kbytes > 65536)
5716 min_free_kbytes = 65536; 5719 min_free_kbytes = 65536;
5717 } else { 5720 } else {
5718 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5721 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5719 new_min_free_kbytes, user_min_free_kbytes); 5722 new_min_free_kbytes, user_min_free_kbytes);
5720 } 5723 }
5721 setup_per_zone_wmarks(); 5724 setup_per_zone_wmarks();
5722 refresh_zone_stat_thresholds(); 5725 refresh_zone_stat_thresholds();
5723 setup_per_zone_lowmem_reserve(); 5726 setup_per_zone_lowmem_reserve();
5724 setup_per_zone_inactive_ratio(); 5727 setup_per_zone_inactive_ratio();
5725 return 0; 5728 return 0;
5726 } 5729 }
5727 module_init(init_per_zone_wmark_min) 5730 module_init(init_per_zone_wmark_min)
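To check one row of the table against the code above: with roughly 1GB of lowmem, lowmem_kbytes is about 1048576, so new_min_free_kbytes = int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096k, matching the 1024MB entry. The result is then clamped to the [128k, 65536k] range and only applied when it is larger than any value the user already set via /proc/sys/vm/min_free_kbytes.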
5728 5731
5729 /* 5732 /*
5730 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5733 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5731 * that we can call two helper functions whenever min_free_kbytes 5734 * that we can call two helper functions whenever min_free_kbytes
5732 * changes. 5735 * changes.
5733 */ 5736 */
5734 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5737 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5735 void __user *buffer, size_t *length, loff_t *ppos) 5738 void __user *buffer, size_t *length, loff_t *ppos)
5736 { 5739 {
5737 int rc; 5740 int rc;
5738 5741
5739 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5740 if (rc) 5743 if (rc)
5741 return rc; 5744 return rc;
5742 5745
5743 if (write) { 5746 if (write) {
5744 user_min_free_kbytes = min_free_kbytes; 5747 user_min_free_kbytes = min_free_kbytes;
5745 setup_per_zone_wmarks(); 5748 setup_per_zone_wmarks();
5746 } 5749 }
5747 return 0; 5750 return 0;
5748 } 5751 }
5749 5752
5750 #ifdef CONFIG_NUMA 5753 #ifdef CONFIG_NUMA
5751 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5754 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5752 void __user *buffer, size_t *length, loff_t *ppos) 5755 void __user *buffer, size_t *length, loff_t *ppos)
5753 { 5756 {
5754 struct zone *zone; 5757 struct zone *zone;
5755 int rc; 5758 int rc;
5756 5759
5757 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5760 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5758 if (rc) 5761 if (rc)
5759 return rc; 5762 return rc;
5760 5763
5761 for_each_zone(zone) 5764 for_each_zone(zone)
5762 zone->min_unmapped_pages = (zone->managed_pages * 5765 zone->min_unmapped_pages = (zone->managed_pages *
5763 sysctl_min_unmapped_ratio) / 100; 5766 sysctl_min_unmapped_ratio) / 100;
5764 return 0; 5767 return 0;
5765 } 5768 }
5766 5769
5767 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5770 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5768 void __user *buffer, size_t *length, loff_t *ppos) 5771 void __user *buffer, size_t *length, loff_t *ppos)
5769 { 5772 {
5770 struct zone *zone; 5773 struct zone *zone;
5771 int rc; 5774 int rc;
5772 5775
5773 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5776 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5774 if (rc) 5777 if (rc)
5775 return rc; 5778 return rc;
5776 5779
5777 for_each_zone(zone) 5780 for_each_zone(zone)
5778 zone->min_slab_pages = (zone->managed_pages * 5781 zone->min_slab_pages = (zone->managed_pages *
5779 sysctl_min_slab_ratio) / 100; 5782 sysctl_min_slab_ratio) / 100;
5780 return 0; 5783 return 0;
5781 } 5784 }
5782 #endif 5785 #endif
5783 5786
5784 /* 5787 /*
5785 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5788 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5786 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5789 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5787 * whenever sysctl_lowmem_reserve_ratio changes. 5790 * whenever sysctl_lowmem_reserve_ratio changes.
5788 * 5791 *
5789 * The reserve ratio has no relation to the minimum watermarks. The 5792 * The reserve ratio has no relation to the minimum watermarks. The
5790 * lowmem reserve ratio is only meaningful as a function of the 5793 * lowmem reserve ratio is only meaningful as a function of the
5791 * boot-time zone sizes. 5794 * boot-time zone sizes.
5792 */ 5795 */
5793 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5796 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5794 void __user *buffer, size_t *length, loff_t *ppos) 5797 void __user *buffer, size_t *length, loff_t *ppos)
5795 { 5798 {
5796 proc_dointvec_minmax(table, write, buffer, length, ppos); 5799 proc_dointvec_minmax(table, write, buffer, length, ppos);
5797 setup_per_zone_lowmem_reserve(); 5800 setup_per_zone_lowmem_reserve();
5798 return 0; 5801 return 0;
5799 } 5802 }
5800 5803
5801 /* 5804 /*
5802 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5805 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5803 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5806 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5804 * pagelist can hold before it gets flushed back to the buddy allocator. 5807 * pagelist can hold before it gets flushed back to the buddy allocator.
5805 */ 5808 */
5806 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5809 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5807 void __user *buffer, size_t *length, loff_t *ppos) 5810 void __user *buffer, size_t *length, loff_t *ppos)
5808 { 5811 {
5809 struct zone *zone; 5812 struct zone *zone;
5810 int old_percpu_pagelist_fraction; 5813 int old_percpu_pagelist_fraction;
5811 int ret; 5814 int ret;
5812 5815
5813 mutex_lock(&pcp_batch_high_lock); 5816 mutex_lock(&pcp_batch_high_lock);
5814 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5817 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5815 5818
5816 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5819 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5817 if (!write || ret < 0) 5820 if (!write || ret < 0)
5818 goto out; 5821 goto out;
5819 5822
5820 /* Sanity checking to avoid pcp imbalance */ 5823 /* Sanity checking to avoid pcp imbalance */
5821 if (percpu_pagelist_fraction && 5824 if (percpu_pagelist_fraction &&
5822 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5825 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5823 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5826 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5824 ret = -EINVAL; 5827 ret = -EINVAL;
5825 goto out; 5828 goto out;
5826 } 5829 }
5827 5830
5828 /* No change? */ 5831 /* No change? */
5829 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5832 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5830 goto out; 5833 goto out;
5831 5834
5832 for_each_populated_zone(zone) { 5835 for_each_populated_zone(zone) {
5833 unsigned int cpu; 5836 unsigned int cpu;
5834 5837
5835 for_each_possible_cpu(cpu) 5838 for_each_possible_cpu(cpu)
5836 pageset_set_high_and_batch(zone, 5839 pageset_set_high_and_batch(zone,
5837 per_cpu_ptr(zone->pageset, cpu)); 5840 per_cpu_ptr(zone->pageset, cpu));
5838 } 5841 }
5839 out: 5842 out:
5840 mutex_unlock(&pcp_batch_high_lock); 5843 mutex_unlock(&pcp_batch_high_lock);
5841 return ret; 5844 return ret;
5842 } 5845 }
5843 5846
5844 int hashdist = HASHDIST_DEFAULT; 5847 int hashdist = HASHDIST_DEFAULT;
5845 5848
5846 #ifdef CONFIG_NUMA 5849 #ifdef CONFIG_NUMA
5847 static int __init set_hashdist(char *str) 5850 static int __init set_hashdist(char *str)
5848 { 5851 {
5849 if (!str) 5852 if (!str)
5850 return 0; 5853 return 0;
5851 hashdist = simple_strtoul(str, &str, 0); 5854 hashdist = simple_strtoul(str, &str, 0);
5852 return 1; 5855 return 1;
5853 } 5856 }
5854 __setup("hashdist=", set_hashdist); 5857 __setup("hashdist=", set_hashdist);
5855 #endif 5858 #endif
5856 5859
5857 /* 5860 /*
5858 * allocate a large system hash table from bootmem 5861 * allocate a large system hash table from bootmem
5859 * - it is assumed that the hash table must contain an exact power-of-2 5862 * - it is assumed that the hash table must contain an exact power-of-2
5860 * quantity of entries 5863 * quantity of entries
5861 * - limit is the number of hash buckets, not the total allocation size 5864 * - limit is the number of hash buckets, not the total allocation size
5862 */ 5865 */
5863 void *__init alloc_large_system_hash(const char *tablename, 5866 void *__init alloc_large_system_hash(const char *tablename,
5864 unsigned long bucketsize, 5867 unsigned long bucketsize,
5865 unsigned long numentries, 5868 unsigned long numentries,
5866 int scale, 5869 int scale,
5867 int flags, 5870 int flags,
5868 unsigned int *_hash_shift, 5871 unsigned int *_hash_shift,
5869 unsigned int *_hash_mask, 5872 unsigned int *_hash_mask,
5870 unsigned long low_limit, 5873 unsigned long low_limit,
5871 unsigned long high_limit) 5874 unsigned long high_limit)
5872 { 5875 {
5873 unsigned long long max = high_limit; 5876 unsigned long long max = high_limit;
5874 unsigned long log2qty, size; 5877 unsigned long log2qty, size;
5875 void *table = NULL; 5878 void *table = NULL;
5876 5879
5877 /* allow the kernel cmdline to have a say */ 5880 /* allow the kernel cmdline to have a say */
5878 if (!numentries) { 5881 if (!numentries) {
5879 /* round applicable memory size up to nearest megabyte */ 5882 /* round applicable memory size up to nearest megabyte */
5880 numentries = nr_kernel_pages; 5883 numentries = nr_kernel_pages;
5881 5884
5882 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5885 /* It isn't necessary when PAGE_SIZE >= 1MB */
5883 if (PAGE_SHIFT < 20) 5886 if (PAGE_SHIFT < 20)
5884 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5887 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5885 5888
5886 /* limit to 1 bucket per 2^scale bytes of low memory */ 5889 /* limit to 1 bucket per 2^scale bytes of low memory */
5887 if (scale > PAGE_SHIFT) 5890 if (scale > PAGE_SHIFT)
5888 numentries >>= (scale - PAGE_SHIFT); 5891 numentries >>= (scale - PAGE_SHIFT);
5889 else 5892 else
5890 numentries <<= (PAGE_SHIFT - scale); 5893 numentries <<= (PAGE_SHIFT - scale);
5891 5894
5892 /* Make sure we've got at least a 0-order allocation. */ 5895 /* Make sure we've got at least a 0-order allocation. */
5893 if (unlikely(flags & HASH_SMALL)) { 5896 if (unlikely(flags & HASH_SMALL)) {
5894 /* Makes no sense without HASH_EARLY */ 5897 /* Makes no sense without HASH_EARLY */
5895 WARN_ON(!(flags & HASH_EARLY)); 5898 WARN_ON(!(flags & HASH_EARLY));
5896 if (!(numentries >> *_hash_shift)) { 5899 if (!(numentries >> *_hash_shift)) {
5897 numentries = 1UL << *_hash_shift; 5900 numentries = 1UL << *_hash_shift;
5898 BUG_ON(!numentries); 5901 BUG_ON(!numentries);
5899 } 5902 }
5900 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5903 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5901 numentries = PAGE_SIZE / bucketsize; 5904 numentries = PAGE_SIZE / bucketsize;
5902 } 5905 }
5903 numentries = roundup_pow_of_two(numentries); 5906 numentries = roundup_pow_of_two(numentries);
5904 5907
5905 /* limit allocation size to 1/16 total memory by default */ 5908 /* limit allocation size to 1/16 total memory by default */
5906 if (max == 0) { 5909 if (max == 0) {
5907 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5910 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5908 do_div(max, bucketsize); 5911 do_div(max, bucketsize);
5909 } 5912 }
5910 max = min(max, 0x80000000ULL); 5913 max = min(max, 0x80000000ULL);
5911 5914
5912 if (numentries < low_limit) 5915 if (numentries < low_limit)
5913 numentries = low_limit; 5916 numentries = low_limit;
5914 if (numentries > max) 5917 if (numentries > max)
5915 numentries = max; 5918 numentries = max;
5916 5919
5917 log2qty = ilog2(numentries); 5920 log2qty = ilog2(numentries);
5918 5921
5919 do { 5922 do {
5920 size = bucketsize << log2qty; 5923 size = bucketsize << log2qty;
5921 if (flags & HASH_EARLY) 5924 if (flags & HASH_EARLY)
5922 table = alloc_bootmem_nopanic(size); 5925 table = alloc_bootmem_nopanic(size);
5923 else if (hashdist) 5926 else if (hashdist)
5924 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5927 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5925 else { 5928 else {
5926 /* 5929 /*
5927 * If bucketsize is not a power of two, we may free 5930 * If bucketsize is not a power of two, we may free
5928 * some pages at the end of the hash table, which 5931 * some pages at the end of the hash table, which
5929 * alloc_pages_exact() does automatically. 5932 * alloc_pages_exact() does automatically.
5930 */ 5933 */
5931 if (get_order(size) < MAX_ORDER) { 5934 if (get_order(size) < MAX_ORDER) {
5932 table = alloc_pages_exact(size, GFP_ATOMIC); 5935 table = alloc_pages_exact(size, GFP_ATOMIC);
5933 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5936 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5934 } 5937 }
5935 } 5938 }
5936 } while (!table && size > PAGE_SIZE && --log2qty); 5939 } while (!table && size > PAGE_SIZE && --log2qty);
5937 5940
5938 if (!table) 5941 if (!table)
5939 panic("Failed to allocate %s hash table\n", tablename); 5942 panic("Failed to allocate %s hash table\n", tablename);
5940 5943
5941 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5944 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5942 tablename, 5945 tablename,
5943 (1UL << log2qty), 5946 (1UL << log2qty),
5944 ilog2(size) - PAGE_SHIFT, 5947 ilog2(size) - PAGE_SHIFT,
5945 size); 5948 size);
5946 5949
5947 if (_hash_shift) 5950 if (_hash_shift)
5948 *_hash_shift = log2qty; 5951 *_hash_shift = log2qty;
5949 if (_hash_mask) 5952 if (_hash_mask)
5950 *_hash_mask = (1 << log2qty) - 1; 5953 *_hash_mask = (1 << log2qty) - 1;
5951 5954
5952 return table; 5955 return table;
5953 } 5956 }
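To see how the sizing logic above plays out end to end, the following standalone sketch reruns the arithmetic with made-up inputs (about 8GB worth of kernel pages, 16-byte buckets, scale = 14); the helpers are simplified stand-ins for the kernel's round_up(), roundup_pow_of_two() and ilog2(), and the numbers are purely illustrative.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

static unsigned long ilog2(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

static unsigned long round_up(unsigned long x, unsigned long m)
{
	return ((x + m - 1) / m) * m;
}

int main(void)
{
	/* Hypothetical inputs: ~8GB of kernel pages, 16-byte buckets,
	 * one bucket per 2^14 bytes of low memory (scale = 14). */
	unsigned long nr_kernel_pages = 2000000;
	unsigned long nr_all_pages = 2097152;
	unsigned long bucketsize = 16, scale = 14;
	unsigned long numentries, log2qty, size;
	unsigned long long max;

	numentries = nr_kernel_pages;
	numentries = round_up(numentries, (1 << 20) / PAGE_SIZE);	/* to 1MB */
	numentries >>= (scale - PAGE_SHIFT);	/* 1 bucket per 2^scale bytes */
	numentries = roundup_pow_of_two(numentries);

	/* default cap: 1/16 of memory, expressed in buckets */
	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);
	size = bucketsize << log2qty;
	printf("entries: %lu (%lu bytes, 2^%lu buckets)\n",
	       numentries, size, log2qty);
	return 0;
}

For these inputs it settles on 524288 buckets, i.e. an 8MB table, comfortably below the 1/16-of-memory cap.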
5954 5957
5955 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5958 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5956 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5959 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5957 unsigned long pfn) 5960 unsigned long pfn)
5958 { 5961 {
5959 #ifdef CONFIG_SPARSEMEM 5962 #ifdef CONFIG_SPARSEMEM
5960 return __pfn_to_section(pfn)->pageblock_flags; 5963 return __pfn_to_section(pfn)->pageblock_flags;
5961 #else 5964 #else
5962 return zone->pageblock_flags; 5965 return zone->pageblock_flags;
5963 #endif /* CONFIG_SPARSEMEM */ 5966 #endif /* CONFIG_SPARSEMEM */
5964 } 5967 }
5965 5968
5966 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5969 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5967 { 5970 {
5968 #ifdef CONFIG_SPARSEMEM 5971 #ifdef CONFIG_SPARSEMEM
5969 pfn &= (PAGES_PER_SECTION-1); 5972 pfn &= (PAGES_PER_SECTION-1);
5970 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5973 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5971 #else 5974 #else
5972 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5975 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5973 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5976 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5974 #endif /* CONFIG_SPARSEMEM */ 5977 #endif /* CONFIG_SPARSEMEM */
5975 } 5978 }
5976 5979
5977 /** 5980 /**
5978 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 5981 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
5979 * @page: The page within the block of interest 5982 * @page: The page within the block of interest
5980 * @end_bitidx: The last bit of interest 5983 * @end_bitidx: The last bit of interest
5981 * @mask: mask of bits to retrieve 5984 * @mask: mask of bits to retrieve
5982 * returns pageblock_bits flags 5985 * returns pageblock_bits flags
5983 */ 5986 */
5984 unsigned long get_pageblock_flags_mask(struct page *page, 5987 unsigned long get_pageblock_flags_mask(struct page *page,
5985 unsigned long end_bitidx, 5988 unsigned long end_bitidx,
5986 unsigned long mask) 5989 unsigned long mask)
5987 { 5990 {
5988 struct zone *zone; 5991 struct zone *zone;
5989 unsigned long *bitmap; 5992 unsigned long *bitmap;
5990 unsigned long pfn, bitidx, word_bitidx; 5993 unsigned long pfn, bitidx, word_bitidx;
5991 unsigned long word; 5994 unsigned long word;
5992 5995
5993 zone = page_zone(page); 5996 zone = page_zone(page);
5994 pfn = page_to_pfn(page); 5997 pfn = page_to_pfn(page);
5995 bitmap = get_pageblock_bitmap(zone, pfn); 5998 bitmap = get_pageblock_bitmap(zone, pfn);
5996 bitidx = pfn_to_bitidx(zone, pfn); 5999 bitidx = pfn_to_bitidx(zone, pfn);
5997 word_bitidx = bitidx / BITS_PER_LONG; 6000 word_bitidx = bitidx / BITS_PER_LONG;
5998 bitidx &= (BITS_PER_LONG-1); 6001 bitidx &= (BITS_PER_LONG-1);
5999 6002
6000 word = bitmap[word_bitidx]; 6003 word = bitmap[word_bitidx];
6001 bitidx += end_bitidx; 6004 bitidx += end_bitidx;
6002 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6005 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6003 } 6006 }
6004 6007
6005 /** 6008 /**
6006 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6009 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6007 * @page: The page within the block of interest 6010 * @page: The page within the block of interest
6008 * @flags: The flags to set 6011 * @flags: The flags to set
6009 * @end_bitidx: The last bit of interest 6012 * @end_bitidx: The last bit of interest
6010 * @mask: mask of bits to set 6013 * @mask: mask of bits to set
6011 */ 6014 */
6012 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6015 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6013 unsigned long end_bitidx, 6016 unsigned long end_bitidx,
6014 unsigned long mask) 6017 unsigned long mask)
6015 { 6018 {
6016 struct zone *zone; 6019 struct zone *zone;
6017 unsigned long *bitmap; 6020 unsigned long *bitmap;
6018 unsigned long pfn, bitidx, word_bitidx; 6021 unsigned long pfn, bitidx, word_bitidx;
6019 unsigned long old_word, word; 6022 unsigned long old_word, word;
6020 6023
6021 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6024 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6022 6025
6023 zone = page_zone(page); 6026 zone = page_zone(page);
6024 pfn = page_to_pfn(page); 6027 pfn = page_to_pfn(page);
6025 bitmap = get_pageblock_bitmap(zone, pfn); 6028 bitmap = get_pageblock_bitmap(zone, pfn);
6026 bitidx = pfn_to_bitidx(zone, pfn); 6029 bitidx = pfn_to_bitidx(zone, pfn);
6027 word_bitidx = bitidx / BITS_PER_LONG; 6030 word_bitidx = bitidx / BITS_PER_LONG;
6028 bitidx &= (BITS_PER_LONG-1); 6031 bitidx &= (BITS_PER_LONG-1);
6029 6032
6030 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6033 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6031 6034
6032 bitidx += end_bitidx; 6035 bitidx += end_bitidx;
6033 mask <<= (BITS_PER_LONG - bitidx - 1); 6036 mask <<= (BITS_PER_LONG - bitidx - 1);
6034 flags <<= (BITS_PER_LONG - bitidx - 1); 6037 flags <<= (BITS_PER_LONG - bitidx - 1);
6035 6038
6036 word = ACCESS_ONCE(bitmap[word_bitidx]); 6039 word = ACCESS_ONCE(bitmap[word_bitidx]);
6037 for (;;) { 6040 for (;;) {
6038 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6041 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6039 if (word == old_word) 6042 if (word == old_word)
6040 break; 6043 break;
6041 word = old_word; 6044 word = old_word;
6042 } 6045 }
6043 } 6046 }
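The loop above is the classic lock-free read-modify-write: read the word, attempt a cmpxchg of the masked update, and retry if another CPU changed the word in the meantime. The standalone C11 sketch below demonstrates the same retry pattern; it uses a plain LSB-based field offset for simplicity, whereas the kernel code above anchors the shift at the other end of the word, and the 3-bit field at offset 8 is purely hypothetical.

#include <stdatomic.h>
#include <stdio.h>

/* Same lock-free read-modify-write pattern as set_pageblock_flags_mask()
 * above, expressed with C11 atomics: replace the bits selected by 'mask'
 * with 'flags' without taking a lock, retrying if another thread raced. */
static void set_bits_masked(_Atomic unsigned long *word_p,
			    unsigned long flags, unsigned long mask)
{
	unsigned long word = atomic_load(word_p);

	while (!atomic_compare_exchange_weak(word_p, &word,
					     (word & ~mask) | flags))
		;	/* 'word' was reloaded with the current value; retry */
}

int main(void)
{
	/* One 64-bit word packing several 4-bit pageblock fields (illustrative). */
	_Atomic unsigned long bitmap_word = 0;

	/* Set a hypothetical 3-bit field at bit offset 8 to the value 5. */
	set_bits_masked(&bitmap_word, 5UL << 8, 0x7UL << 8);
	printf("word = %#lx\n", (unsigned long)atomic_load(&bitmap_word));
	return 0;
}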
6044 6047
6045 /* 6048 /*
6046 * This function checks whether the pageblock includes unmovable pages or not. 6049 * This function checks whether the pageblock includes unmovable pages or not.
6047 * If @count is not zero, it is okay to include fewer than @count unmovable pages. 6050 * If @count is not zero, it is okay to include fewer than @count unmovable pages.
6048 * 6051 *
6049 * A PageLRU check without isolation or lru_lock could race, so a 6052 * A PageLRU check without isolation or lru_lock could race, so a
6050 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't 6053 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't
6051 * expect this function to be exact. 6054 * expect this function to be exact.
6052 */ 6055 */
6053 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6056 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6054 bool skip_hwpoisoned_pages) 6057 bool skip_hwpoisoned_pages)
6055 { 6058 {
6056 unsigned long pfn, iter, found; 6059 unsigned long pfn, iter, found;
6057 int mt; 6060 int mt;
6058 6061
6059 /* 6062 /*
6060 * To avoid noisy results, lru_add_drain_all() should be called first. 6063 * To avoid noisy results, lru_add_drain_all() should be called first.
6061 * A ZONE_MOVABLE zone never contains unmovable pages. 6064 * A ZONE_MOVABLE zone never contains unmovable pages.
6062 */ 6065 */
6063 if (zone_idx(zone) == ZONE_MOVABLE) 6066 if (zone_idx(zone) == ZONE_MOVABLE)
6064 return false; 6067 return false;
6065 mt = get_pageblock_migratetype(page); 6068 mt = get_pageblock_migratetype(page);
6066 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6069 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6067 return false; 6070 return false;
6068 6071
6069 pfn = page_to_pfn(page); 6072 pfn = page_to_pfn(page);
6070 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6073 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6071 unsigned long check = pfn + iter; 6074 unsigned long check = pfn + iter;
6072 6075
6073 if (!pfn_valid_within(check)) 6076 if (!pfn_valid_within(check))
6074 continue; 6077 continue;
6075 6078
6076 page = pfn_to_page(check); 6079 page = pfn_to_page(check);
6077 6080
6078 /* 6081 /*
6079 * Hugepages are not in LRU lists, but they're movable. 6082 * Hugepages are not in LRU lists, but they're movable.
6080 * We need not scan over tail pages because we don't 6083 * We need not scan over tail pages because we don't
6081 * handle each tail page individually in migration. 6084 * handle each tail page individually in migration.
6082 */ 6085 */
6083 if (PageHuge(page)) { 6086 if (PageHuge(page)) {
6084 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6087 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6085 continue; 6088 continue;
6086 } 6089 }
6087 6090
6088 /* 6091 /*
6089 * We can't use page_count without pinning the page 6092 * We can't use page_count without pinning the page
6090 * because another CPU can free the compound page. 6093 * because another CPU can free the compound page.
6091 * This check already skips compound tails of THP 6094 * This check already skips compound tails of THP
6092 * because their page->_count is zero at all times. 6095 * because their page->_count is zero at all times.
6093 */ 6096 */
6094 if (!atomic_read(&page->_count)) { 6097 if (!atomic_read(&page->_count)) {
6095 if (PageBuddy(page)) 6098 if (PageBuddy(page))
6096 iter += (1 << page_order(page)) - 1; 6099 iter += (1 << page_order(page)) - 1;
6097 continue; 6100 continue;
6098 } 6101 }
6099 6102
6100 /* 6103 /*
6101 * The HWPoisoned page may not be in the buddy system, and 6104 * The HWPoisoned page may not be in the buddy system, and
6102 * its page_count() is not 0. 6105 * its page_count() is not 0.
6103 */ 6106 */
6104 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6107 if (skip_hwpoisoned_pages && PageHWPoison(page))
6105 continue; 6108 continue;
6106 6109
6107 if (!PageLRU(page)) 6110 if (!PageLRU(page))
6108 found++; 6111 found++;
6109 /* 6112 /*
6110 * If there are RECLAIMABLE pages, we need to check them. 6113 * If there are RECLAIMABLE pages, we need to check them.
6111 * But for now, memory offline itself doesn't call shrink_slab() 6114 * But for now, memory offline itself doesn't call shrink_slab()
6112 * and this still needs to be fixed. 6115 * and this still needs to be fixed.
6113 */ 6116 */
6114 /* 6117 /*
6115 * If the page is not RAM, page_count() should be 0. 6118 * If the page is not RAM, page_count() should be 0.
6116 * We don't need further checks. This is a _used_, non-movable page. 6119 * We don't need further checks. This is a _used_, non-movable page.
6117 * 6120 *
6118 * The problematic thing here is PG_reserved pages. PG_reserved 6121 * The problematic thing here is PG_reserved pages. PG_reserved
6119 * is set to both of a memory hole page and a _used_ kernel 6122 * is set to both of a memory hole page and a _used_ kernel
6120 * page at boot. 6123 * page at boot.
6121 */ 6124 */
6122 if (found > count) 6125 if (found > count)
6123 return true; 6126 return true;
6124 } 6127 }
6125 return false; 6128 return false;
6126 } 6129 }
6127 6130
6128 bool is_pageblock_removable_nolock(struct page *page) 6131 bool is_pageblock_removable_nolock(struct page *page)
6129 { 6132 {
6130 struct zone *zone; 6133 struct zone *zone;
6131 unsigned long pfn; 6134 unsigned long pfn;
6132 6135
6133 /* 6136 /*
6134 * We have to be careful here because we are iterating over memory 6137 * We have to be careful here because we are iterating over memory
6135 * sections which are not zone aware so we might end up outside of 6138 * sections which are not zone aware so we might end up outside of
6136 * the zone but still within the section. 6139 * the zone but still within the section.
6137 * We have to take care about the node as well. If the node is offline 6140 * We have to take care about the node as well. If the node is offline
6138 * its NODE_DATA will be NULL - see page_zone. 6141 * its NODE_DATA will be NULL - see page_zone.
6139 */ 6142 */
6140 if (!node_online(page_to_nid(page))) 6143 if (!node_online(page_to_nid(page)))
6141 return false; 6144 return false;
6142 6145
6143 zone = page_zone(page); 6146 zone = page_zone(page);
6144 pfn = page_to_pfn(page); 6147 pfn = page_to_pfn(page);
6145 if (!zone_spans_pfn(zone, pfn)) 6148 if (!zone_spans_pfn(zone, pfn))
6146 return false; 6149 return false;
6147 6150
6148 return !has_unmovable_pages(zone, page, 0, true); 6151 return !has_unmovable_pages(zone, page, 0, true);
6149 } 6152 }
6150 6153
6151 #ifdef CONFIG_CMA 6154 #ifdef CONFIG_CMA
6152 6155
6153 static unsigned long pfn_max_align_down(unsigned long pfn) 6156 static unsigned long pfn_max_align_down(unsigned long pfn)
6154 { 6157 {
6155 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6158 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6156 pageblock_nr_pages) - 1); 6159 pageblock_nr_pages) - 1);
6157 } 6160 }
6158 6161
6159 static unsigned long pfn_max_align_up(unsigned long pfn) 6162 static unsigned long pfn_max_align_up(unsigned long pfn)
6160 { 6163 {
6161 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6164 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6162 pageblock_nr_pages)); 6165 pageblock_nr_pages));
6163 } 6166 }
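On a common x86-64 configuration MAX_ORDER_NR_PAGES is 1024 and pageblock_nr_pages is 512, so the larger value, 1024 pfns, wins; treat both numbers as illustrative since they depend on the kernel configuration. A minimal sketch of what the two helpers return for an arbitrary pfn:

#include <stdio.h>

int main(void)
{
	/* Illustrative alignment: max(MAX_ORDER_NR_PAGES, pageblock_nr_pages),
	 * taken here as 1024 pfns as on a typical x86-64 build. */
	unsigned long max_align = 1024;
	unsigned long pfn = 0x12345;

	unsigned long down = pfn & ~(max_align - 1);			/* pfn_max_align_down() */
	unsigned long up = (pfn + max_align - 1) & ~(max_align - 1);	/* pfn_max_align_up()   */

	printf("pfn %#lx -> [%#lx, %#lx)\n", pfn, down, up);	/* 0x12000, 0x12400 */
	return 0;
}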
6164 6167
6165 /* [start, end) must belong to a single zone. */ 6168 /* [start, end) must belong to a single zone. */
6166 static int __alloc_contig_migrate_range(struct compact_control *cc, 6169 static int __alloc_contig_migrate_range(struct compact_control *cc,
6167 unsigned long start, unsigned long end) 6170 unsigned long start, unsigned long end)
6168 { 6171 {
6169 /* This function is based on compact_zone() from compaction.c. */ 6172 /* This function is based on compact_zone() from compaction.c. */
6170 unsigned long nr_reclaimed; 6173 unsigned long nr_reclaimed;
6171 unsigned long pfn = start; 6174 unsigned long pfn = start;
6172 unsigned int tries = 0; 6175 unsigned int tries = 0;
6173 int ret = 0; 6176 int ret = 0;
6174 6177
6175 migrate_prep(); 6178 migrate_prep();
6176 6179
6177 while (pfn < end || !list_empty(&cc->migratepages)) { 6180 while (pfn < end || !list_empty(&cc->migratepages)) {
6178 if (fatal_signal_pending(current)) { 6181 if (fatal_signal_pending(current)) {
6179 ret = -EINTR; 6182 ret = -EINTR;
6180 break; 6183 break;
6181 } 6184 }
6182 6185
6183 if (list_empty(&cc->migratepages)) { 6186 if (list_empty(&cc->migratepages)) {
6184 cc->nr_migratepages = 0; 6187 cc->nr_migratepages = 0;
6185 pfn = isolate_migratepages_range(cc->zone, cc, 6188 pfn = isolate_migratepages_range(cc->zone, cc,
6186 pfn, end, true); 6189 pfn, end, true);
6187 if (!pfn) { 6190 if (!pfn) {
6188 ret = -EINTR; 6191 ret = -EINTR;
6189 break; 6192 break;
6190 } 6193 }
6191 tries = 0; 6194 tries = 0;
6192 } else if (++tries == 5) { 6195 } else if (++tries == 5) {
6193 ret = ret < 0 ? ret : -EBUSY; 6196 ret = ret < 0 ? ret : -EBUSY;
6194 break; 6197 break;
6195 } 6198 }
6196 6199
6197 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6200 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6198 &cc->migratepages); 6201 &cc->migratepages);
6199 cc->nr_migratepages -= nr_reclaimed; 6202 cc->nr_migratepages -= nr_reclaimed;
6200 6203
6201 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6204 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6202 NULL, 0, cc->mode, MR_CMA); 6205 NULL, 0, cc->mode, MR_CMA);
6203 } 6206 }
6204 if (ret < 0) { 6207 if (ret < 0) {
6205 putback_movable_pages(&cc->migratepages); 6208 putback_movable_pages(&cc->migratepages);
6206 return ret; 6209 return ret;
6207 } 6210 }
6208 return 0; 6211 return 0;
6209 } 6212 }
6210 6213
6211 /** 6214 /**
6212 * alloc_contig_range() -- tries to allocate given range of pages 6215 * alloc_contig_range() -- tries to allocate given range of pages
6213 * @start: start PFN to allocate 6216 * @start: start PFN to allocate
6214 * @end: one-past-the-last PFN to allocate 6217 * @end: one-past-the-last PFN to allocate
6215 * @migratetype: migratetype of the underlying pageblocks (either 6218 * @migratetype: migratetype of the underlying pageblocks (either
6216 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6219 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6217 * in range must have the same migratetype and it must 6220 * in range must have the same migratetype and it must
6218 * be either of the two. 6221 * be either of the two.
6219 * 6222 *
6220 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6223 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6221 * aligned, however it's the caller's responsibility to guarantee that 6224 * aligned, however it's the caller's responsibility to guarantee that
6222 * we are the only thread that changes migrate type of pageblocks the 6225 * we are the only thread that changes migrate type of pageblocks the
6223 * pages fall in. 6226 * pages fall in.
6224 * 6227 *
6225 * The PFN range must belong to a single zone. 6228 * The PFN range must belong to a single zone.
6226 * 6229 *
6227 * Returns zero on success or negative error code. On success all 6230 * Returns zero on success or negative error code. On success all
6228 * pages whose PFN is in [start, end) are allocated for the caller and 6231 * pages whose PFN is in [start, end) are allocated for the caller and
6229 * need to be freed with free_contig_range(). 6232 * need to be freed with free_contig_range().
6230 */ 6233 */
6231 int alloc_contig_range(unsigned long start, unsigned long end, 6234 int alloc_contig_range(unsigned long start, unsigned long end,
6232 unsigned migratetype) 6235 unsigned migratetype)
6233 { 6236 {
6234 unsigned long outer_start, outer_end; 6237 unsigned long outer_start, outer_end;
6235 int ret = 0, order; 6238 int ret = 0, order;
6236 6239
6237 struct compact_control cc = { 6240 struct compact_control cc = {
6238 .nr_migratepages = 0, 6241 .nr_migratepages = 0,
6239 .order = -1, 6242 .order = -1,
6240 .zone = page_zone(pfn_to_page(start)), 6243 .zone = page_zone(pfn_to_page(start)),
6241 .mode = MIGRATE_SYNC, 6244 .mode = MIGRATE_SYNC,
6242 .ignore_skip_hint = true, 6245 .ignore_skip_hint = true,
6243 }; 6246 };
6244 INIT_LIST_HEAD(&cc.migratepages); 6247 INIT_LIST_HEAD(&cc.migratepages);
6245 6248
6246 /* 6249 /*
6247 * What we do here is we mark all pageblocks in range as 6250 * What we do here is we mark all pageblocks in range as
6248 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6251 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6249 * have different sizes, and due to the way the page allocator 6252 * have different sizes, and due to the way the page allocator
6250 * works, we align the range to the biggest of the two sizes so 6253 * works, we align the range to the biggest of the two sizes so
6251 * that the page allocator won't try to merge buddies from 6254 * that the page allocator won't try to merge buddies from
6252 * different pageblocks and change MIGRATE_ISOLATE to some 6255 * different pageblocks and change MIGRATE_ISOLATE to some
6253 * other migration type. 6256 * other migration type.
6254 * 6257 *
6255 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6258 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6256 * migrate the pages from an unaligned range (i.e. pages that 6259 * migrate the pages from an unaligned range (i.e. pages that
6257 * we are interested in). This will put all the pages in 6260 * we are interested in). This will put all the pages in
6258 * range back to the page allocator as MIGRATE_ISOLATE. 6261 * range back to the page allocator as MIGRATE_ISOLATE.
6259 * 6262 *
6260 * When this is done, we take the pages in range from the page 6263 * When this is done, we take the pages in range from the page
6261 * allocator, removing them from the buddy system. This way 6264 * allocator, removing them from the buddy system. This way
6262 * the page allocator will never consider using them. 6265 * the page allocator will never consider using them.
6263 * 6266 *
6264 * This lets us mark the pageblocks back as 6267 * This lets us mark the pageblocks back as
6265 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6268 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6266 * aligned range but not in the unaligned, original range are 6269 * aligned range but not in the unaligned, original range are
6267 * put back to the page allocator so that the buddy system can use them. 6270 * put back to the page allocator so that the buddy system can use them.
6268 */ 6271 */
6269 6272
6270 ret = start_isolate_page_range(pfn_max_align_down(start), 6273 ret = start_isolate_page_range(pfn_max_align_down(start),
6271 pfn_max_align_up(end), migratetype, 6274 pfn_max_align_up(end), migratetype,
6272 false); 6275 false);
6273 if (ret) 6276 if (ret)
6274 return ret; 6277 return ret;
6275 6278
6276 ret = __alloc_contig_migrate_range(&cc, start, end); 6279 ret = __alloc_contig_migrate_range(&cc, start, end);
6277 if (ret) 6280 if (ret)
6278 goto done; 6281 goto done;
6279 6282
6280 /* 6283 /*
6281 * Pages from [start, end) are within MAX_ORDER_NR_PAGES 6284 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
6282 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6285 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6283 * more, all pages in [start, end) are free in page allocator. 6286 * more, all pages in [start, end) are free in page allocator.
6284 * What we are going to do is allocate all pages from 6287 * What we are going to do is allocate all pages from
6285 * [start, end) (that is, remove them from the page allocator). 6288 * [start, end) (that is, remove them from the page allocator).
6286 * 6289 *
6287 * The only problem is that pages at the beginning and at the 6290 * The only problem is that pages at the beginning and at the
6288 * end of the interesting range may not be aligned with pages that 6291 * end of the interesting range may not be aligned with pages that
6289 * the page allocator holds, i.e. they can be part of higher order 6292 * the page allocator holds, i.e. they can be part of higher order
6290 * pages. Because of this, we reserve the bigger range and 6293 * pages. Because of this, we reserve the bigger range and
6291 * once this is done free the pages we are not interested in. 6294 * once this is done free the pages we are not interested in.
6292 * 6295 *
6293 * We don't have to hold zone->lock here because the pages are 6296 * We don't have to hold zone->lock here because the pages are
6294 * isolated and thus won't get removed from the buddy freelists. 6297 * isolated and thus won't get removed from the buddy freelists.
6295 */ 6298 */
6296 6299
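	/*
	 * Drain the per-CPU caches first: lru_add_drain_all() flushes the
	 * per-CPU LRU pagevecs and drain_all_pages() returns pages held on
	 * the per-CPU free lists to the buddy allocator, so every free page
	 * in the range is back on a freelist before we test and grab it.
	 */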
6297 lru_add_drain_all(); 6300 lru_add_drain_all();
6298 drain_all_pages(); 6301 drain_all_pages();
6299 6302
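	/*
	 * 'start' may sit in the middle of a higher-order free page.  Walk
	 * up the orders, aligning outer_start down at each step, until we
	 * land on a page that is actually on a buddy freelist; that gives
	 * the order-aligned start of the free block containing 'start'.
	 */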
6300 order = 0; 6303 order = 0;
6301 outer_start = start; 6304 outer_start = start;
6302 while (!PageBuddy(pfn_to_page(outer_start))) { 6305 while (!PageBuddy(pfn_to_page(outer_start))) {
6303 if (++order >= MAX_ORDER) { 6306 if (++order >= MAX_ORDER) {
6304 ret = -EBUSY; 6307 ret = -EBUSY;
6305 goto done; 6308 goto done;
6306 } 6309 }
6307 outer_start &= ~0UL << order; 6310 outer_start &= ~0UL << order;
6308 } 6311 }
6309 6312
6310 /* Make sure the range is really isolated. */ 6313 /* Make sure the range is really isolated. */
6311 if (test_pages_isolated(outer_start, end, false)) { 6314 if (test_pages_isolated(outer_start, end, false)) {
6312 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6315 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6313 outer_start, end); 6316 outer_start, end);
6314 ret = -EBUSY; 6317 ret = -EBUSY;
6315 goto done; 6318 goto done;
6316 } 6319 }
6317 6320
6318 6321
6319 /* Grab isolated pages from freelists. */ 6322 /* Grab isolated pages from freelists. */
6320 outer_end = isolate_freepages_range(&cc, outer_start, end); 6323 outer_end = isolate_freepages_range(&cc, outer_start, end);
6321 if (!outer_end) { 6324 if (!outer_end) {
6322 ret = -EBUSY; 6325 ret = -EBUSY;
6323 goto done; 6326 goto done;
6324 } 6327 }
6325 6328
6326 /* Free head and tail (if any) */ 6329 /* Free head and tail (if any) */
6327 if (start != outer_start) 6330 if (start != outer_start)
6328 free_contig_range(outer_start, start - outer_start); 6331 free_contig_range(outer_start, start - outer_start);
6329 if (end != outer_end) 6332 if (end != outer_end)
6330 free_contig_range(end, outer_end - end); 6333 free_contig_range(end, outer_end - end);
6331 6334
6332 done: 6335 done:
6333 undo_isolate_page_range(pfn_max_align_down(start), 6336 undo_isolate_page_range(pfn_max_align_down(start),
6334 pfn_max_align_up(end), migratetype); 6337 pfn_max_align_up(end), migratetype);
6335 return ret; 6338 return ret;
6336 } 6339 }
6337 6340
6338 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6341 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6339 { 6342 {
6340 unsigned int count = 0; 6343 unsigned int count = 0;
6341 6344
6342 for (; nr_pages--; pfn++) { 6345 for (; nr_pages--; pfn++) {
6343 struct page *page = pfn_to_page(pfn); 6346 struct page *page = pfn_to_page(pfn);
6344 6347
6345 count += page_count(page) != 1; 6348 count += page_count(page) != 1;
6346 __free_page(page); 6349 __free_page(page);
6347 } 6350 }
6348 WARN(count != 0, "%d pages are still in use!\n", count); 6351 WARN(count != 0, "%d pages are still in use!\n", count);
6349 } 6352 }
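/*
 * A minimal sketch of how a hypothetical CMA-style caller might pair
 * alloc_contig_range() with free_contig_range().  The example names are
 * made up; it assumes the PFN range lies in MIGRATE_CMA pageblocks that
 * were set up at init time, and it leaves any retry policy to the caller.
 */
static struct page *example_alloc_contig_pages(unsigned long pfn,
					       unsigned nr_pages)
{
	/* Isolate the range, migrate movable users out, grab the pages. */
	if (alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA))
		return NULL;

	return pfn_to_page(pfn);
}

static void example_free_contig_pages(struct page *page, unsigned nr_pages)
{
	/* Give every page in the range back to the buddy allocator. */
	free_contig_range(page_to_pfn(page), nr_pages);
}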
6350 #endif 6353 #endif
6351 6354
6352 #ifdef CONFIG_MEMORY_HOTPLUG 6355 #ifdef CONFIG_MEMORY_HOTPLUG
6353 /* 6356 /*
6354 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6357 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6355 * page high values need to be recalculated. 6358 * page high values need to be recalculated.
6356 */ 6359 */
6357 void __meminit zone_pcp_update(struct zone *zone) 6360 void __meminit zone_pcp_update(struct zone *zone)
6358 { 6361 {
6359 unsigned cpu; 6362 unsigned cpu;
6360 mutex_lock(&pcp_batch_high_lock); 6363 mutex_lock(&pcp_batch_high_lock);
6361 for_each_possible_cpu(cpu) 6364 for_each_possible_cpu(cpu)
6362 pageset_set_high_and_batch(zone, 6365 pageset_set_high_and_batch(zone,
6363 per_cpu_ptr(zone->pageset, cpu)); 6366 per_cpu_ptr(zone->pageset, cpu));
6364 mutex_unlock(&pcp_batch_high_lock); 6367 mutex_unlock(&pcp_batch_high_lock);
6365 } 6368 }
6366 #endif 6369 #endif
6367 6370
6368 void zone_pcp_reset(struct zone *zone) 6371 void zone_pcp_reset(struct zone *zone)
6369 { 6372 {
6370 unsigned long flags; 6373 unsigned long flags;
6371 int cpu; 6374 int cpu;
6372 struct per_cpu_pageset *pset; 6375 struct per_cpu_pageset *pset;
6373 6376
6374 /* avoid races with drain_pages() */ 6377 /* avoid races with drain_pages() */
6375 local_irq_save(flags); 6378 local_irq_save(flags);
6376 if (zone->pageset != &boot_pageset) { 6379 if (zone->pageset != &boot_pageset) {
6377 for_each_online_cpu(cpu) { 6380 for_each_online_cpu(cpu) {
6378 pset = per_cpu_ptr(zone->pageset, cpu); 6381 pset = per_cpu_ptr(zone->pageset, cpu);
6379 drain_zonestat(zone, pset); 6382 drain_zonestat(zone, pset);
6380 } 6383 }
6381 free_percpu(zone->pageset); 6384 free_percpu(zone->pageset);
6382 zone->pageset = &boot_pageset; 6385 zone->pageset = &boot_pageset;
6383 } 6386 }
6384 local_irq_restore(flags); 6387 local_irq_restore(flags);
6385 } 6388 }
6386 6389
6387 #ifdef CONFIG_MEMORY_HOTREMOVE 6390 #ifdef CONFIG_MEMORY_HOTREMOVE
6388 /* 6391 /*
6389 * All pages in the range must be isolated before calling this. 6392 * All pages in the range must be isolated before calling this.
6390 */ 6393 */
6391 void 6394 void
6392 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6395 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6393 { 6396 {
6394 struct page *page; 6397 struct page *page;
6395 struct zone *zone; 6398 struct zone *zone;
6396 int order, i; 6399 int order, i;
6397 unsigned long pfn; 6400 unsigned long pfn;
6398 unsigned long flags; 6401 unsigned long flags;
6399 /* find the first valid pfn */ 6402 /* find the first valid pfn */
6400 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6403 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6401 if (pfn_valid(pfn)) 6404 if (pfn_valid(pfn))
6402 break; 6405 break;
6403 if (pfn == end_pfn) 6406 if (pfn == end_pfn)
6404 return; 6407 return;
6405 zone = page_zone(pfn_to_page(pfn)); 6408 zone = page_zone(pfn_to_page(pfn));
6406 spin_lock_irqsave(&zone->lock, flags); 6409 spin_lock_irqsave(&zone->lock, flags);
6407 pfn = start_pfn; 6410 pfn = start_pfn;
6408 while (pfn < end_pfn) { 6411 while (pfn < end_pfn) {
6409 if (!pfn_valid(pfn)) { 6412 if (!pfn_valid(pfn)) {
6410 pfn++; 6413 pfn++;
6411 continue; 6414 continue;
6412 } 6415 }
6413 page = pfn_to_page(pfn); 6416 page = pfn_to_page(pfn);
6414 /* 6417 /*
6415 * The HWPoisoned page may not be in the buddy system, and 6418 * The HWPoisoned page may not be in the buddy system, and
6416 * its page_count() is not 0. 6419 * its page_count() is not 0.
6417 */ 6420 */
6418 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6421 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6419 pfn++; 6422 pfn++;
6420 SetPageReserved(page); 6423 SetPageReserved(page);
6421 continue; 6424 continue;
6422 } 6425 }
6423 6426
6424 BUG_ON(page_count(page)); 6427 BUG_ON(page_count(page));
6425 BUG_ON(!PageBuddy(page)); 6428 BUG_ON(!PageBuddy(page));
6426 order = page_order(page); 6429 order = page_order(page);
6427 #ifdef CONFIG_DEBUG_VM 6430 #ifdef CONFIG_DEBUG_VM
6428 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6431 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6429 pfn, 1 << order, end_pfn); 6432 pfn, 1 << order, end_pfn);
6430 #endif 6433 #endif
6431 list_del(&page->lru); 6434 list_del(&page->lru);
6432 rmv_page_order(page); 6435 rmv_page_order(page);
6433 zone->free_area[order].nr_free--; 6436 zone->free_area[order].nr_free--;
6434 for (i = 0; i < (1 << order); i++) 6437 for (i = 0; i < (1 << order); i++)
6435 SetPageReserved((page+i)); 6438 SetPageReserved((page+i));
6436 pfn += (1 << order); 6439 pfn += (1 << order);
6437 } 6440 }
6438 spin_unlock_irqrestore(&zone->lock, flags); 6441 spin_unlock_irqrestore(&zone->lock, flags);
6439 } 6442 }
6440 #endif 6443 #endif
6441 6444
6442 #ifdef CONFIG_MEMORY_FAILURE 6445 #ifdef CONFIG_MEMORY_FAILURE
6443 bool is_free_buddy_page(struct page *page) 6446 bool is_free_buddy_page(struct page *page)
6444 { 6447 {
6445 struct zone *zone = page_zone(page); 6448 struct zone *zone = page_zone(page);
6446 unsigned long pfn = page_to_pfn(page); 6449 unsigned long pfn = page_to_pfn(page);
6447 unsigned long flags; 6450 unsigned long flags;
6448 int order; 6451 int order;
6449 6452
6450 spin_lock_irqsave(&zone->lock, flags); 6453 spin_lock_irqsave(&zone->lock, flags);
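	/*
	 * For each order, look at the order-aligned head of the block that
	 * contains this page: if that head is a free buddy page of at
	 * least this order, our page lies inside a free buddy page.
	 */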
6451 for (order = 0; order < MAX_ORDER; order++) { 6454 for (order = 0; order < MAX_ORDER; order++) {
6452 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6455 struct page *page_head = page - (pfn & ((1 << order) - 1));
6453 6456
6454 if (PageBuddy(page_head) && page_order(page_head) >= order) 6457 if (PageBuddy(page_head) && page_order(page_head) >= order)
6455 break; 6458 break;
6456 } 6459 }
6457 spin_unlock_irqrestore(&zone->lock, flags); 6460 spin_unlock_irqrestore(&zone->lock, flags);
6458 6461
6459 return order < MAX_ORDER; 6462 return order < MAX_ORDER;
6460 } 6463 }
6461 #endif 6464 #endif
6462 6465
6463 static const struct trace_print_flags pageflag_names[] = { 6466 static const struct trace_print_flags pageflag_names[] = {
6464 {1UL << PG_locked, "locked" }, 6467 {1UL << PG_locked, "locked" },
6465 {1UL << PG_error, "error" }, 6468 {1UL << PG_error, "error" },
6466 {1UL << PG_referenced, "referenced" }, 6469 {1UL << PG_referenced, "referenced" },
6467 {1UL << PG_uptodate, "uptodate" }, 6470 {1UL << PG_uptodate, "uptodate" },
6468 {1UL << PG_dirty, "dirty" }, 6471 {1UL << PG_dirty, "dirty" },
6469 {1UL << PG_lru, "lru" }, 6472 {1UL << PG_lru, "lru" },
6470 {1UL << PG_active, "active" }, 6473 {1UL << PG_active, "active" },
6471 {1UL << PG_slab, "slab" }, 6474 {1UL << PG_slab, "slab" },
6472 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6475 {1UL << PG_owner_priv_1, "owner_priv_1" },
6473 {1UL << PG_arch_1, "arch_1" }, 6476 {1UL << PG_arch_1, "arch_1" },
6474 {1UL << PG_reserved, "reserved" }, 6477 {1UL << PG_reserved, "reserved" },
6475 {1UL << PG_private, "private" }, 6478 {1UL << PG_private, "private" },
6476 {1UL << PG_private_2, "private_2" }, 6479 {1UL << PG_private_2, "private_2" },
6477 {1UL << PG_writeback, "writeback" }, 6480 {1UL << PG_writeback, "writeback" },
6478 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6481 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6479 {1UL << PG_head, "head" }, 6482 {1UL << PG_head, "head" },
6480 {1UL << PG_tail, "tail" }, 6483 {1UL << PG_tail, "tail" },
6481 #else 6484 #else
6482 {1UL << PG_compound, "compound" }, 6485 {1UL << PG_compound, "compound" },
6483 #endif 6486 #endif
6484 {1UL << PG_swapcache, "swapcache" }, 6487 {1UL << PG_swapcache, "swapcache" },
6485 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6488 {1UL << PG_mappedtodisk, "mappedtodisk" },
6486 {1UL << PG_reclaim, "reclaim" }, 6489 {1UL << PG_reclaim, "reclaim" },
6487 {1UL << PG_swapbacked, "swapbacked" }, 6490 {1UL << PG_swapbacked, "swapbacked" },
6488 {1UL << PG_unevictable, "unevictable" }, 6491 {1UL << PG_unevictable, "unevictable" },
6489 #ifdef CONFIG_MMU 6492 #ifdef CONFIG_MMU
6490 {1UL << PG_mlocked, "mlocked" }, 6493 {1UL << PG_mlocked, "mlocked" },
6491 #endif 6494 #endif
6492 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6495 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6493 {1UL << PG_uncached, "uncached" }, 6496 {1UL << PG_uncached, "uncached" },
6494 #endif 6497 #endif
6495 #ifdef CONFIG_MEMORY_FAILURE 6498 #ifdef CONFIG_MEMORY_FAILURE
6496 {1UL << PG_hwpoison, "hwpoison" }, 6499 {1UL << PG_hwpoison, "hwpoison" },
6497 #endif 6500 #endif
6498 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6501 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6499 {1UL << PG_compound_lock, "compound_lock" }, 6502 {1UL << PG_compound_lock, "compound_lock" },
6500 #endif 6503 #endif
6501 }; 6504 };
6502 6505
6503 static void dump_page_flags(unsigned long flags) 6506 static void dump_page_flags(unsigned long flags)
6504 { 6507 {
6505 const char *delim = ""; 6508 const char *delim = "";
6506 unsigned long mask; 6509 unsigned long mask;
6507 int i; 6510 int i;
6508 6511
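	/* Every bit in enum pageflags must have an entry in pageflag_names. */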
6509 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6512 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6510 6513
6511 printk(KERN_ALERT "page flags: %#lx(", flags); 6514 printk(KERN_ALERT "page flags: %#lx(", flags);
6512 6515
6513 /* remove zone id */ 6516 /* remove zone id */
6514 flags &= (1UL << NR_PAGEFLAGS) - 1; 6517 flags &= (1UL << NR_PAGEFLAGS) - 1;
6515 6518
6516 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6519 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6517 6520
6518 mask = pageflag_names[i].mask; 6521 mask = pageflag_names[i].mask;
6519 if ((flags & mask) != mask) 6522 if ((flags & mask) != mask)
6520 continue; 6523 continue;
6521 6524
6522 flags &= ~mask; 6525 flags &= ~mask;
6523 printk("%s%s", delim, pageflag_names[i].name); 6526 printk("%s%s", delim, pageflag_names[i].name);
6524 delim = "|"; 6527 delim = "|";
6525 } 6528 }
6526 6529
6527 /* check for left over flags */ 6530 /* check for left over flags */
6528 if (flags) 6531 if (flags)
6529 printk("%s%#lx", delim, flags); 6532 printk("%s%#lx", delim, flags);
6530 6533
6531 printk(")\n"); 6534 printk(")\n");
6532 } 6535 }
6533 6536
6534 void dump_page(struct page *page) 6537 void dump_page(struct page *page)
6535 { 6538 {
6536 printk(KERN_ALERT 6539 printk(KERN_ALERT
6537 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6540 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",