mm/page_alloc.c

  /*
   *  linux/mm/page_alloc.c
   *
   *  Manages the free list, the system allocates free pages here.
   *  Note that kmalloc() lives in slab.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
  #include <linux/stddef.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/interrupt.h>
  #include <linux/pagemap.h>
  #include <linux/jiffies.h>
  #include <linux/bootmem.h>
  #include <linux/memblock.h>
  #include <linux/compiler.h>
  #include <linux/kernel.h>
  #include <linux/kmemcheck.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
  #include <linux/oom.h>
  #include <linux/notifier.h>
  #include <linux/topology.h>
  #include <linux/sysctl.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
  #include <linux/vmstat.h>
  #include <linux/mempolicy.h>
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
  #include <linux/pfn.h>
  #include <linux/backing-dev.h>
  #include <linux/fault-inject.h>
  #include <linux/page-isolation.h>
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
  #include <linux/compaction.h>
  #include <trace/events/kmem.h>
  #include <linux/ftrace_event.h>
  #include <linux/memcontrol.h>
  #include <linux/prefetch.h>
  #include <linux/migrate.h>
  #include <linux/page-debug-flags.h>
  #include <linux/hugetlb.h>
  #include <linux/sched/rt.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  #include "internal.h"
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
  EXPORT_PER_CPU_SYMBOL(numa_node);
  #endif
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
   * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
   * defined in <linux/topology.h>.
   */
  DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
  EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  #endif
  /*
   * Array of node states.
   */
  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
  	[N_POSSIBLE] = NODE_MASK_ALL,
  	[N_ONLINE] = { { [0] = 1UL } },
  #ifndef CONFIG_NUMA
  	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
  #ifdef CONFIG_HIGHMEM
  	[N_HIGH_MEMORY] = { { [0] = 1UL } },
  #endif
  #ifdef CONFIG_MOVABLE_NODE
  	[N_MEMORY] = { { [0] = 1UL } },
  #endif
  	[N_CPU] = { { [0] = 1UL } },
  #endif	/* NUMA */
  };
  EXPORT_SYMBOL(node_states);
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
  /*
   * When calculating the number of globally allowed dirty pages, there
   * is a certain number of per-zone reserves that should not be
   * considered dirtyable memory.  This is the sum of those reserves
   * over all existing zones that contribute dirtyable memory.
   */
  unsigned long dirty_balance_reserve __read_mostly;
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

  #ifdef CONFIG_PM_SLEEP
  /*
   * The following functions are used by the suspend/hibernate code to temporarily
   * change gfp_allowed_mask in order to avoid using I/O during memory allocations
   * while devices are suspended.  To avoid races with the suspend/hibernate code,
   * they should always be called with pm_mutex held (gfp_allowed_mask also should
   * only be modified with pm_mutex held, unless the suspend/hibernate code is
   * guaranteed not to run in parallel with that modification).
   */
  
  static gfp_t saved_gfp_mask;
  
  void pm_restore_gfp_mask(void)
  {
  	WARN_ON(!mutex_is_locked(&pm_mutex));
  	if (saved_gfp_mask) {
  		gfp_allowed_mask = saved_gfp_mask;
  		saved_gfp_mask = 0;
  	}
  }
  void pm_restrict_gfp_mask(void)
  {
  	WARN_ON(!mutex_is_locked(&pm_mutex));
  	WARN_ON(saved_gfp_mask);
  	saved_gfp_mask = gfp_allowed_mask;
  	gfp_allowed_mask &= ~GFP_IOFS;
  }
  
  bool pm_suspended_storage(void)
  {
  	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
  		return false;
  	return true;
  }
  #endif /* CONFIG_PM_SLEEP */
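
/*
 * Illustrative calling pattern, sketched only from the locking rule stated
 * in the comment above (not code copied from the suspend core):
 *
 *	mutex_lock(&pm_mutex);
 *	pm_restrict_gfp_mask();		// no GFP_IOFS allocations from here on
 *	... suspend devices, write or read the hibernation image ...
 *	pm_restore_gfp_mask();		// I/O-backed allocations allowed again
 *	mutex_unlock(&pm_mutex);
 */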
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  int pageblock_order __read_mostly;
  #endif
  static void __free_pages_ok(struct page *page, unsigned int order);

  /*
   * results with 256, 32 in the lowmem_reserve sysctl:
   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
   *	1G machine -> (16M dma, 784M normal, 224M high)
   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
   *
   * TBD: should special case ZONE_DMA32 machines here - in those we normally
   * don't need any ZONE_NORMAL reservation
   */
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  #ifdef CONFIG_ZONE_DMA
  	 256,
  #endif
  #ifdef CONFIG_ZONE_DMA32
  	 256,
  #endif
  #ifdef CONFIG_HIGHMEM
  	 32,
  #endif
  	 32,
  };
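
/*
 * Worked example of the ratios above (illustrative, using the 1G split from
 * the comment: 16M DMA, 784M Normal, 224M HighMem):
 *	a NORMAL allocation keeps 784M/256 ~= 3M of ZONE_DMA in reserve;
 *	a HIGHMEM allocation keeps 224M/32 = 7M of ZONE_NORMAL in reserve
 *	and (224M+784M)/256 ~= 4M of ZONE_DMA in reserve.
 */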
  
  EXPORT_SYMBOL(totalram_pages);

  static char * const zone_names[MAX_NR_ZONES] = {
  #ifdef CONFIG_ZONE_DMA
  	 "DMA",
  #endif
  #ifdef CONFIG_ZONE_DMA32
  	 "DMA32",
  #endif
  	 "Normal",
  #ifdef CONFIG_HIGHMEM
  	 "HighMem",
  #endif
  	 "Movable",
  };
  int min_free_kbytes = 1024;
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
  static unsigned long __meminitdata dma_reserve;

  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __initdata required_kernelcore;
  static unsigned long __initdata required_movablecore;
  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
  
  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
  int movable_zone;
  EXPORT_SYMBOL(movable_zone);
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

  #if MAX_NUMNODES > 1
  int nr_node_ids __read_mostly = MAX_NUMNODES;
  int nr_online_nodes __read_mostly = 1;
  EXPORT_SYMBOL(nr_node_ids);
  EXPORT_SYMBOL(nr_online_nodes);
  #endif
  int page_group_by_mobility_disabled __read_mostly;
  void set_pageblock_migratetype(struct page *page, int migratetype)
  {
  
  	if (unlikely(page_group_by_mobility_disabled))
  		migratetype = MIGRATE_UNMOVABLE;
  	set_pageblock_flags_group(page, (unsigned long)migratetype,
  					PB_migrate, PB_migrate_end);
  }
  bool oom_killer_disabled __read_mostly;
  #ifdef CONFIG_DEBUG_VM
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
  {
  	int ret = 0;
  	unsigned seq;
  	unsigned long pfn = page_to_pfn(page);
  	unsigned long sp, start_pfn;

  	do {
  		seq = zone_span_seqbegin(zone);
  		start_pfn = zone->zone_start_pfn;
  		sp = zone->spanned_pages;
  		if (!zone_spans_pfn(zone, pfn))
  			ret = 1;
  	} while (zone_span_seqretry(zone, seq));
  	if (ret)
		pr_err("page %lu outside zone [ %lu - %lu ]\n",
  			pfn, start_pfn, start_pfn + sp);
  	return ret;
  }
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
  	if (!pfn_valid_within(page_to_pfn(page)))
  		return 0;
  	if (zone != page_zone(page))
  		return 0;
  
  	return 1;
  }
  /*
   * Temporary debugging check for pages not lying within a given zone.
   */
  static int bad_range(struct zone *zone, struct page *page)
  {
  	if (page_outside_zone_boundaries(zone, page))
  		return 1;
  	if (!page_is_consistent(zone, page))
  		return 1;
  	return 0;
  }
  #else
  static inline int bad_range(struct zone *zone, struct page *page)
  {
  	return 0;
  }
  #endif
  static void bad_page(struct page *page)
  {
  	static unsigned long resume;
  	static unsigned long nr_shown;
  	static unsigned long nr_unshown;
  	/* Don't complain about poisoned pages */
  	if (PageHWPoison(page)) {
  		page_mapcount_reset(page); /* remove PageBuddy */
  		return;
  	}
  	/*
  	 * Allow a burst of 60 reports, then keep quiet for that minute;
  	 * or allow a steady drip of one report per second.
  	 */
  	if (nr_shown == 60) {
  		if (time_before(jiffies, resume)) {
  			nr_unshown++;
  			goto out;
  		}
  		if (nr_unshown) {
1e9e63650   Hugh Dickins   badpage: KERN_ALE...
290
291
292
			printk(KERN_ALERT
			      "BUG: Bad page state: %lu messages suppressed\n",
  				nr_unshown);
  			nr_unshown = 0;
  		}
  		nr_shown = 0;
  	}
  	if (nr_shown++ == 0)
  		resume = jiffies + 60 * HZ;
	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
  		current->comm, page_to_pfn(page));
  	dump_page(page);

  	print_modules();
  	dump_stack();
  out:
  	/* Leave bad fields for debug, except PageBuddy could make trouble */
  	page_mapcount_reset(page); /* remove PageBuddy */
  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  }
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
   * The first PAGE_SIZE page is called the "head page".
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
   * All pages have PG_compound set.  All tail pages have their ->first_page
   * pointing at the head page.
   *
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
   */
  
  static void free_compound_page(struct page *page)
  {
  	__free_pages_ok(page, compound_order(page));
  }
  void prep_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  
  	set_compound_page_dtor(page, free_compound_page);
  	set_compound_order(page, order);
  	__SetPageHead(page);
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
  		__SetPageTail(p);
  		set_page_count(p, 0);
  		p->first_page = page;
  	}
  }
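
/*
 * Example (illustrative): prep_compound_page(page, 2) builds an order-2
 * compound page out of four pages: the head page gets PG_head, a compound
 * order of 2 and free_compound_page() as its destructor, while page+1,
 * page+2 and page+3 each get PG_tail, a zero refcount and ->first_page
 * pointing back at the head.
 */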
  /* update __split_huge_page_refcount if you change this function */
  static int destroy_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  	int bad = 0;

  	if (unlikely(compound_order(page) != order)) {
  		bad_page(page);
  		bad++;
  	}

  	__ClearPageHead(page);

  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;

  		if (unlikely(!PageTail(p) || (p->first_page != page))) {
  			bad_page(page);
  			bad++;
  		}
  		__ClearPageTail(p);
  	}
  
  	return bad;
  }

  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  	/*
  	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
  	 * and __GFP_HIGHMEM from hard or soft interrupt context.
  	 */
  	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
  	for (i = 0; i < (1 << order); i++)
  		clear_highpage(page + i);
  }
  #ifdef CONFIG_DEBUG_PAGEALLOC
  unsigned int _debug_guardpage_minorder;
  
  static int __init debug_guardpage_minorder_setup(char *buf)
  {
  	unsigned long res;
  
  	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
  		return 0;
  	}
  	_debug_guardpage_minorder = res;
	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
  	return 0;
  }
  __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
  
  static inline void set_page_guard_flag(struct page *page)
  {
  	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
  }
  
  static inline void clear_page_guard_flag(struct page *page)
  {
  	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
  }
  #else
  static inline void set_page_guard_flag(struct page *page) { }
  static inline void clear_page_guard_flag(struct page *page) { }
  #endif
  static inline void set_page_order(struct page *page, int order)
  {
  	set_page_private(page, order);
  	__SetPageBuddy(page);
  }
  
  static inline void rmv_page_order(struct page *page)
  {
  	__ClearPageBuddy(page);
  	set_page_private(page, 0);
  }
  
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
   *
   * 1) Any buddy B1 will have an order O twin B2 which satisfies
   * the following equation:
   *     B2 = B1 ^ (1 << O)
   * For example, if the starting buddy (buddy2) is #8 its order
   * 1 buddy is #10:
   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
   *
   * 2) Any buddy B will have an order O+1 parent P which
   * satisfies the following equation:
   *     P = B & ~(1 << O)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
  static inline unsigned long
  __find_buddy_index(unsigned long page_idx, unsigned int order)
  {
  	return page_idx ^ (1 << order);
  }
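
/*
 * Worked example (illustrative): for the order-2 block at page_idx 12,
 * __find_buddy_index(12, 2) == 12 ^ (1 << 2) == 8, so its buddy is the
 * order-2 block at index 8; the combined order-3 block starts at
 * buddy_idx & page_idx == 8 & 12 == 8.
 */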
  
  /*
   * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
   * (a) the buddy is not in a hole &&
   * (b) the buddy is in the buddy system &&
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
   * For recording whether a page is in the buddy system, we set ->_mapcount -2.
   * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
  static inline int page_is_buddy(struct page *page, struct page *buddy,
  								int order)
  {
  	if (!pfn_valid_within(page_to_pfn(buddy)))
  		return 0;

  	if (page_zone_id(page) != page_zone_id(buddy))
  		return 0;
  	if (page_is_guard(buddy) && page_order(buddy) == order) {
  		VM_BUG_ON(page_count(buddy) != 0);
  		return 1;
  	}
  	if (PageBuddy(buddy) && page_order(buddy) == order) {
  		VM_BUG_ON(page_count(buddy) != 0);
  		return 1;
  	}
  	return 0;
  }
  
  /*
   * Freeing function for a buddy system allocator.
   *
   * The concept of a buddy system is to maintain direct-mapped table
   * (containing bit values) for memory blocks of various "orders".
   * The bottom level table contains the map for the smallest allocatable
   * units of memory (here, pages), and each level above it describes
   * pairs of units from the levels below, hence, "buddies".
   * At a high level, all that happens here is marking the table entry
   * at the bottom level available, and propagating the changes upward
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
   * free pages of length of (1 << order) and marked with _mapcount -2. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were
   * free, the remainder of the region must be split into blocks.
   * If a block is freed, and its buddy is also free, then this
   * triggers coalescing into a block of larger size.
   *
   * -- nyc
   */
  static inline void __free_one_page(struct page *page,
  		struct zone *zone, unsigned int order,
  		int migratetype)
  {
  	unsigned long page_idx;
  	unsigned long combined_idx;
  	unsigned long uninitialized_var(buddy_idx);
  	struct page *buddy;

  	VM_BUG_ON(!zone_is_initialized(zone));
  	if (unlikely(PageCompound(page)))
  		if (unlikely(destroy_compound_page(page, order)))
  			return;

  	VM_BUG_ON(migratetype == -1);
  	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
  	VM_BUG_ON(page_idx & ((1 << order) - 1));
  	VM_BUG_ON(bad_range(zone, page));

  	while (order < MAX_ORDER-1) {
  		buddy_idx = __find_buddy_index(page_idx, order);
  		buddy = page + (buddy_idx - page_idx);
  		if (!page_is_buddy(page, buddy, order))
  			break;
  		/*
  		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
  		 * merge with it and move up one order.
  		 */
  		if (page_is_guard(buddy)) {
  			clear_page_guard_flag(buddy);
  			set_page_private(page, 0);
  			__mod_zone_freepage_state(zone, 1 << order,
  						  migratetype);
  		} else {
  			list_del(&buddy->lru);
  			zone->free_area[order].nr_free--;
  			rmv_page_order(buddy);
  		}
  		combined_idx = buddy_idx & page_idx;
  		page = page + (combined_idx - page_idx);
  		page_idx = combined_idx;
  		order++;
  	}
  	set_page_order(page, order);
  
  	/*
  	 * If this is not the largest possible page, check if the buddy
  	 * of the next-highest order is free. If it is, it's possible
  	 * that pages are being freed that will coalesce soon. In case,
  	 * that is happening, add the free page to the tail of the list
  	 * so it's less likely to be used soon and more likely to be merged
  	 * as a higher order page
  	 */
  	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
  		struct page *higher_page, *higher_buddy;
  		combined_idx = buddy_idx & page_idx;
  		higher_page = page + (combined_idx - page_idx);
  		buddy_idx = __find_buddy_index(combined_idx, order + 1);
  		higher_buddy = higher_page + (buddy_idx - combined_idx);
  		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
  			list_add_tail(&page->lru,
  				&zone->free_area[order].free_list[migratetype]);
  			goto out;
  		}
  	}
  
  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
  out:
  	zone->free_area[order].nr_free++;
  }
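
/*
 * Merge walkthrough (illustrative): freeing an order-0 page at page_idx 5
 * first looks at buddy_idx 5 ^ 1 == 4; if that page is a free order-0
 * buddy it is removed from its free list and the pair becomes an order-1
 * block at combined_idx 4 & 5 == 4.  The next buddy is 4 ^ 2 == 6; if the
 * order-1 block at 6 is also free the result is an order-2 block at 4,
 * and so on until the buddy test fails or MAX_ORDER-1 is reached.
 */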
  static inline int free_pages_check(struct page *page)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(atomic_read(&page->_count) != 0) |
  		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
  		(mem_cgroup_bad_page_check(page)))) {
  		bad_page(page);
  		return 1;
  	}
  	page_nid_reset_last(page);
  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	return 0;
  }
  
  /*
   * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free.
   *
   * If the zone was previously in an "all pages pinned" state then look to
   * see if this freeing clears that state.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
  static void free_pcppages_bulk(struct zone *zone, int count,
  					struct per_cpu_pages *pcp)
  {
  	int migratetype = 0;
  	int batch_free = 0;
  	int to_free = count;

  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;

  	while (to_free) {
  		struct page *page;
  		struct list_head *list;
  
  		/*
  		 * Remove pages from lists in a round-robin fashion. A
  		 * batch_free count is maintained that is incremented when an
  		 * empty list is encountered.  This is so more pages are freed
  		 * off fuller lists instead of spinning excessively around empty
  		 * lists
  		 */
  		do {
  			batch_free++;
  			if (++migratetype == MIGRATE_PCPTYPES)
  				migratetype = 0;
  			list = &pcp->lists[migratetype];
  		} while (list_empty(list));

  		/* This is the only non-empty list. Free them all. */
  		if (batch_free == MIGRATE_PCPTYPES)
  			batch_free = to_free;
  		do {
  			int mt;	/* migratetype of the to-be-freed page */
  			page = list_entry(list->prev, struct page, lru);
  			/* must delete as __free_one_page list manipulates */
  			list_del(&page->lru);
  			mt = get_freepage_migratetype(page);
  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  			__free_one_page(page, zone, 0, mt);
  			trace_mm_page_pcpu_drain(page, 0, mt);
  			if (likely(!is_migrate_isolate_page(page))) {
  				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
  				if (is_migrate_cma(mt))
  					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
  			}
  		} while (--to_free && --batch_free && !list_empty(list));
  	}
  	spin_unlock(&zone->lock);
  }
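
/*
 * Round-robin example (illustrative): with MIGRATE_PCPTYPES == 3, if the
 * RECLAIMABLE pcp list is empty but the MOVABLE list is not, two lists are
 * visited before a page is found, so up to two pages are freed from the
 * MOVABLE list before the scan moves on; fuller lists therefore give up
 * more pages per pass.  Once only one list is left non-empty, batch_free
 * is bumped to to_free and the remaining pages all come from that list.
 */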
  static void free_one_page(struct zone *zone, struct page *page, int order,
  				int migratetype)
  {
  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;

  	__free_one_page(page, zone, order, migratetype);
  	if (unlikely(!is_migrate_isolate(migratetype)))
  		__mod_zone_freepage_state(zone, 1 << order, migratetype);
  	spin_unlock(&zone->lock);
  }
  static bool free_pages_prepare(struct page *page, unsigned int order)
  {
  	int i;
  	int bad = 0;

  	trace_mm_page_free(page, order);
  	kmemcheck_free_shadow(page, order);
  	if (PageAnon(page))
  		page->mapping = NULL;
  	for (i = 0; i < (1 << order); i++)
  		bad += free_pages_check(page + i);
  	if (bad)
  		return false;

  	if (!PageHighMem(page)) {
  		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
  		debug_check_no_obj_freed(page_address(page),
  					   PAGE_SIZE << order);
  	}
  	arch_free_page(page, order);
  	kernel_map_pages(page, 1 << order, 0);

  	return true;
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
  	unsigned long flags;
  	int migratetype;
  
  	if (!free_pages_prepare(page, order))
  		return;
  	local_irq_save(flags);
  	__count_vm_events(PGFREE, 1 << order);
  	migratetype = get_pageblock_migratetype(page);
  	set_freepage_migratetype(page, migratetype);
  	free_one_page(page_zone(page), page, order, migratetype);
  	local_irq_restore(flags);
  }
  /*
   * Read access to zone->managed_pages is safe because it's unsigned long,
   * but we still need to serialize writers. Currently all callers of
   * __free_pages_bootmem() except put_page_bootmem() should only be used
   * at boot time. So for shorter boot time, we shift the burden to
   * put_page_bootmem() to serialize writers.
   */
  void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
  {
  	unsigned int nr_pages = 1 << order;
  	unsigned int loop;

  	prefetchw(page);
  	for (loop = 0; loop < nr_pages; loop++) {
  		struct page *p = &page[loop];
  
  		if (loop + 1 < nr_pages)
  			prefetchw(p + 1);
  		__ClearPageReserved(p);
  		set_page_count(p, 0);
  	}

  	page_zone(page)->managed_pages += 1 << order;
  	set_page_refcounted(page);
  	__free_pages(page, order);
  }
  #ifdef CONFIG_CMA
  /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
  void __init init_cma_reserved_pageblock(struct page *page)
  {
  	unsigned i = pageblock_nr_pages;
  	struct page *p = page;
  
  	do {
  		__ClearPageReserved(p);
  		set_page_count(p, 0);
  	} while (++p, --i);
  
  	set_page_refcounted(page);
  	set_pageblock_migratetype(page, MIGRATE_CMA);
  	__free_pages(page, pageblock_order);
  	totalram_pages += pageblock_nr_pages;
  #ifdef CONFIG_HIGHMEM
  	if (PageHighMem(page))
  		totalhigh_pages += pageblock_nr_pages;
  #endif
  }
  #endif
  
  /*
   * The order of subdivision here is critical for the IO subsystem.
   * Please do not alter this order without good reasons and regression
   * testing. Specifically, as large blocks of memory are subdivided,
   * the order in which smaller blocks are delivered depends on the order
   * they're subdivided in this function. This is the primary factor
   * influencing the order in which pages are delivered to the IO
   * subsystem according to empirical testing, and this is also justified
   * by considering the behavior of a buddy system containing a single
   * large block of memory acted on by a series of small allocations.
   * This behavior is a critical factor in sglist merging's success.
   *
   * -- nyc
   */
  static inline void expand(struct zone *zone, struct page *page,
  	int low, int high, struct free_area *area,
  	int migratetype)
  {
  	unsigned long size = 1 << high;
  
  	while (high > low) {
  		area--;
  		high--;
  		size >>= 1;
  		VM_BUG_ON(bad_range(zone, &page[size]));
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
  		if (high < debug_guardpage_minorder()) {
  			/*
			 * Mark as guard pages (or page), which allows them to be
			 * merged back into the allocator when the buddy is freed.
			 * The corresponding page table entries are not touched;
			 * the pages simply stay not present in the virtual address space.
  			 */
  			INIT_LIST_HEAD(&page[size].lru);
  			set_page_guard_flag(&page[size]);
  			set_page_private(&page[size], high);
  			/* Guard pages are not available for any usage */
  			__mod_zone_freepage_state(zone, -(1 << high),
  						  migratetype);
  			continue;
  		}
  #endif
  		list_add(&page[size].lru, &area->free_list[migratetype]);
  		area->nr_free++;
  		set_page_order(&page[size], high);
  	}
  }
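
/*
 * Split example (illustrative): if an order-0 request is served from an
 * order-3 block, expand(zone, page, 0, 3, area, mt) gives back the unused
 * halves as it walks down: page+4 goes onto the order-2 free list, page+2
 * onto the order-1 list and page+1 onto the order-0 list, leaving only
 * page itself for the caller.
 */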
  /*
   * This page is about to be returned from the page allocator
   */
  static inline int check_new_page(struct page *page)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(atomic_read(&page->_count) != 0)  |
  		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
  		(mem_cgroup_bad_page_check(page)))) {
  		bad_page(page);
  		return 1;
  	}
  	return 0;
  }
  
  static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  
  	for (i = 0; i < (1 << order); i++) {
  		struct page *p = page + i;
  		if (unlikely(check_new_page(p)))
  			return 1;
  	}

  	set_page_private(page, 0);
  	set_page_refcounted(page);
  
  	arch_alloc_page(page, order);
  	kernel_map_pages(page, 1 << order, 1);
  
  	if (gfp_flags & __GFP_ZERO)
  		prep_zero_page(page, order, gfp_flags);
  
  	if (order && (gfp_flags & __GFP_COMP))
  		prep_compound_page(page, order);
  	return 0;
  }
  /*
   * Go through the free lists for the given migratetype and remove
   * the smallest available page from the freelists
   */
  static inline
  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  						int migratetype)
  {
  	unsigned int current_order;
  	struct free_area * area;
  	struct page *page;
  
  	/* Find a page of the appropriate size in the preferred list */
  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  		area = &(zone->free_area[current_order]);
  		if (list_empty(&area->free_list[migratetype]))
  			continue;
  
  		page = list_entry(area->free_list[migratetype].next,
  							struct page, lru);
  		list_del(&page->lru);
  		rmv_page_order(page);
  		area->nr_free--;
  		expand(zone, page, order, current_order, area, migratetype);
  		return page;
  	}
  
  	return NULL;
  }
  /*
   * This array describes the order lists are fallen back to when
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][4] = {
  	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
  	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
  #ifdef CONFIG_CMA
  	[MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
  	[MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
  #else
  	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
  #endif
  	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
  #ifdef CONFIG_MEMORY_ISOLATION
  	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
  #endif
  };
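
/*
 * Fallback example (illustrative): an UNMOVABLE allocation whose own free
 * lists are empty tries RECLAIMABLE, then MOVABLE, then the MIGRATE_RESERVE
 * blocks; with CONFIG_CMA, a MOVABLE allocation tries MIGRATE_CMA first,
 * but CMA pageblocks are never handed out for the other types.
 */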
  /*
   * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a pageblock
   * boundary. If alignment is required, use move_freepages_block()
   */
  int move_freepages(struct zone *zone,
  			  struct page *start_page, struct page *end_page,
  			  int migratetype)
  {
  	struct page *page;
  	unsigned long order;
  	int pages_moved = 0;
  
  #ifndef CONFIG_HOLES_IN_ZONE
  	/*
  	 * page_zone is not safe to call in this context when
  	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
  	 * anyway as we check zone boundaries in move_freepages_block().
  	 * Remove at a later date when no bug reports exist related to
  	 * grouping pages by mobility
  	 */
  	BUG_ON(page_zone(start_page) != page_zone(end_page));
  #endif
  
  	for (page = start_page; page <= end_page;) {
  		/* Make sure we are not inadvertently changing nodes */
  		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
  		if (!pfn_valid_within(page_to_pfn(page))) {
  			page++;
  			continue;
  		}
  
  		if (!PageBuddy(page)) {
  			page++;
  			continue;
  		}
  
  		order = page_order(page);
  		list_move(&page->lru,
  			  &zone->free_area[order].free_list[migratetype]);
  		set_freepage_migratetype(page, migratetype);
  		page += 1 << order;
  		pages_moved += 1 << order;
  	}
  	return pages_moved;
  }
  int move_freepages_block(struct zone *zone, struct page *page,
  				int migratetype)
  {
  	unsigned long start_pfn, end_pfn;
  	struct page *start_page, *end_page;
  
  	start_pfn = page_to_pfn(page);
  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
  	start_page = pfn_to_page(start_pfn);
  	end_page = start_page + pageblock_nr_pages - 1;
  	end_pfn = start_pfn + pageblock_nr_pages - 1;
  
  	/* Do not cross zone boundaries */
  	if (!zone_spans_pfn(zone, start_pfn))
  		start_page = page;
  	if (!zone_spans_pfn(zone, end_pfn))
  		return 0;
  
  	return move_freepages(zone, start_page, end_page, migratetype);
  }
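
/*
 * Alignment example (illustrative, assuming pageblock_order == 9 and thus
 * pageblock_nr_pages == 512): for a page at pfn 1234, start_pfn is rounded
 * down to 1234 & ~511 == 1024 and end_pfn becomes 1024 + 511 == 1535, so
 * the whole surrounding pageblock is moved in one call.
 */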
  static void change_pageblock_range(struct page *pageblock_page,
  					int start_order, int migratetype)
  {
  	int nr_pageblocks = 1 << (start_order - pageblock_order);
  
  	while (nr_pageblocks--) {
  		set_pageblock_migratetype(pageblock_page, migratetype);
  		pageblock_page += pageblock_nr_pages;
  	}
  }
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
  {
  	struct free_area * area;
  	int current_order;
  	struct page *page;
  	int migratetype, i;
  
  	/* Find the largest possible block of pages in the other list */
  	for (current_order = MAX_ORDER-1; current_order >= order;
  						--current_order) {
  		for (i = 0;; i++) {
  			migratetype = fallbacks[start_migratetype][i];
  			/* MIGRATE_RESERVE handled later if necessary */
  			if (migratetype == MIGRATE_RESERVE)
  				break;

  			area = &(zone->free_area[current_order]);
  			if (list_empty(&area->free_list[migratetype]))
  				continue;
  
  			page = list_entry(area->free_list[migratetype].next,
  					struct page, lru);
  			area->nr_free--;
  
  			/*
  			 * If breaking a large block of pages, move all free
  			 * pages to the preferred allocation list. If falling
  			 * back for a reclaimable kernel allocation, be more
  			 * aggressive about taking ownership of free pages
  			 *
  			 * On the other hand, never change migration
  			 * type of MIGRATE_CMA pageblocks nor move CMA
  			 * pages on different free lists. We don't
  			 * want unmovable pages to be allocated from
  			 * MIGRATE_CMA areas.
b2a0ac887   Mel Gorman   Split the free li...
1000
  			 */
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1001
1002
1003
1004
1005
  			if (!is_migrate_cma(migratetype) &&
  			    (unlikely(current_order >= pageblock_order / 2) ||
  			     start_migratetype == MIGRATE_RECLAIMABLE ||
  			     page_group_by_mobility_disabled)) {
  				int pages;
46dafbca2   Mel Gorman   Be more agressive...
1006
1007
1008
1009
  				pages = move_freepages_block(zone, page,
  								start_migratetype);
  
  				/* Claim the whole block if over half of it is free */
dd5d241ea   Mel Gorman   page-allocator: a...
1010
1011
  				if (pages >= (1 << (pageblock_order-1)) ||
  						page_group_by_mobility_disabled)
46dafbca2   Mel Gorman   Be more agressive...
1012
1013
  					set_pageblock_migratetype(page,
  								start_migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1014
  				migratetype = start_migratetype;
c361be55b   Mel Gorman   Move free pages b...
1015
  			}
b2a0ac887   Mel Gorman   Split the free li...
1016
1017
1018
1019
  
  			/* Remove the page from the freelists */
  			list_del(&page->lru);
  			rmv_page_order(page);
b2a0ac887   Mel Gorman   Split the free li...
1020

2f66a68f3   Mel Gorman   page-allocator: c...
1021
  			/* Take ownership for orders >= pageblock_order */
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1022
1023
  			if (current_order >= pageblock_order &&
  			    !is_migrate_cma(migratetype))
2f66a68f3   Mel Gorman   page-allocator: c...
1024
  				change_pageblock_range(page, current_order,
b2a0ac887   Mel Gorman   Split the free li...
1025
  							start_migratetype);
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1026
1027
1028
  			expand(zone, page, order, current_order, area,
  			       is_migrate_cma(migratetype)
  			     ? migratetype : start_migratetype);
e0fff1bd1   Mel Gorman   tracing, page-all...
1029
1030
1031
  
  			trace_mm_page_alloc_extfrag(page, order, current_order,
  				start_migratetype, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1032
1033
1034
  			return page;
  		}
  	}
728ec980f   Mel Gorman   page allocator: i...
1035
  	return NULL;
b2a0ac887   Mel Gorman   Split the free li...
1036
  }
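
/*
 * Illustrative sketch, not part of page_alloc.c: the nested loops in
 * __rmqueue_fallback() above search from the largest order downwards and,
 * for each order, walk the fallback types for the requested migratetype.
 * A minimal stand-alone model of that search order, using a made-up
 * three-type fallback table (the real table lives elsewhere in this file):
 */
#include <stdbool.h>
#include <stdio.h>

#define SK_MAX_ORDER 11
enum { SK_UNMOVABLE, SK_RECLAIMABLE, SK_MOVABLE, SK_NR_TYPES };

/* Assumed fallback preference per type, for illustration only */
static const int sk_fallbacks[SK_NR_TYPES][2] = {
	[SK_UNMOVABLE]   = { SK_RECLAIMABLE, SK_MOVABLE },
	[SK_RECLAIMABLE] = { SK_UNMOVABLE,   SK_MOVABLE },
	[SK_MOVABLE]     = { SK_RECLAIMABLE, SK_UNMOVABLE },
};

/* Pretend free-list occupancy: does free_area[order] hold pages of a type? */
static bool sk_has_free[SK_MAX_ORDER][SK_NR_TYPES];

static int sk_find_fallback(int order, int start_type, int *found_order)
{
	int cur, i;

	for (cur = SK_MAX_ORDER - 1; cur >= order; cur--) {
		for (i = 0; i < 2; i++) {
			int mt = sk_fallbacks[start_type][i];

			if (sk_has_free[cur][mt]) {
				*found_order = cur;
				return mt;	/* steal from this list */
			}
		}
	}
	return -1;	/* nothing to fall back to */
}

int main(void)
{
	int found_order, mt;

	sk_has_free[9][SK_MOVABLE] = true;	/* only a large movable block */
	mt = sk_find_fallback(3, SK_UNMOVABLE, &found_order);
	/* prints "stole migratetype 2 at order 9" */
	printf("stole migratetype %d at order %d\n", mt, found_order);
	return 0;
}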
56fd56b86   Mel Gorman   Bias the location...
1037
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1038
1039
1040
   * Do the hard work of removing an element from the buddy allocator.
   * Call me with the zone->lock already held.
   */
b2a0ac887   Mel Gorman   Split the free li...
1041
1042
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
  						int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1043
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1044
  	struct page *page;
728ec980f   Mel Gorman   page allocator: i...
1045
  retry_reserve:
56fd56b86   Mel Gorman   Bias the location...
1046
  	page = __rmqueue_smallest(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1047

728ec980f   Mel Gorman   page allocator: i...
1048
  	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
56fd56b86   Mel Gorman   Bias the location...
1049
  		page = __rmqueue_fallback(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1050

728ec980f   Mel Gorman   page allocator: i...
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
  		/*
  		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
  		 * is used because __rmqueue_smallest is an inline function
  		 * and we want just one call site
  		 */
  		if (!page) {
  			migratetype = MIGRATE_RESERVE;
  			goto retry_reserve;
  		}
  	}
0d3d062a6   Mel Gorman   tracing, page-all...
1061
  	trace_mm_page_alloc_zone_locked(page, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1062
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1063
  }
5f63b720b   Michal Nazarewicz   mm: page_alloc: r...
1064
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1065
1066
1067
1068
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
   * Returns the number of new pages which were placed at *list.
   */
5f63b720b   Michal Nazarewicz   mm: page_alloc: r...
1069
  static int rmqueue_bulk(struct zone *zone, unsigned int order,
b2a0ac887   Mel Gorman   Split the free li...
1070
  			unsigned long count, struct list_head *list,
e084b2d95   Mel Gorman   page-allocator: p...
1071
  			int migratetype, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1072
  {
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1073
  	int mt = migratetype, i;
5f63b720b   Michal Nazarewicz   mm: page_alloc: r...
1074

c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1075
  	spin_lock(&zone->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1076
  	for (i = 0; i < count; ++i) {
b2a0ac887   Mel Gorman   Split the free li...
1077
  		struct page *page = __rmqueue(zone, order, migratetype);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
1078
  		if (unlikely(page == NULL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1079
  			break;
81eabcbe0   Mel Gorman   mm: fix page allo...
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
  
  		/*
  		 * Split buddy pages returned by expand() are received here
		 * in physical page order. The page is added to the caller's
		 * list and the list head then moves forward. From the caller's
		 * perspective, the linked list is ordered by page number in
  		 * some conditions. This is useful for IO devices that can
  		 * merge IO requests if the physical pages are ordered
  		 * properly.
  		 */
e084b2d95   Mel Gorman   page-allocator: p...
1090
1091
1092
1093
  		if (likely(cold == 0))
  			list_add(&page->lru, list);
  		else
  			list_add_tail(&page->lru, list);
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1094
1095
  		if (IS_ENABLED(CONFIG_CMA)) {
  			mt = get_pageblock_migratetype(page);
194159fbc   Minchan Kim   mm: remove MIGRAT...
1096
  			if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1097
1098
  				mt = migratetype;
  		}
b12c4ad14   Minchan Kim   mm: page_alloc: u...
1099
  		set_freepage_migratetype(page, mt);
81eabcbe0   Mel Gorman   mm: fix page allo...
1100
  		list = &page->lru;
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
1101
1102
1103
  		if (is_migrate_cma(mt))
  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
  					      -(1 << order));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1104
  	}
f2260e6b1   Mel Gorman   page allocator: u...
1105
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1106
  	spin_unlock(&zone->lock);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
1107
  	return i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1108
  }
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1109
  #ifdef CONFIG_NUMA
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
1110
  /*
4037d4522   Christoph Lameter   Move remote node ...
1111
1112
1113
1114
   * Called from the vmstat counter updater to drain pagesets of this
   * currently executing processor on remote nodes after they have
   * expired.
   *
879336c39   Christoph Lameter   [PATCH] drain_nod...
1115
1116
   * Note that this function must be called with the thread pinned to
   * a single processor.
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
1117
   */
4037d4522   Christoph Lameter   Move remote node ...
1118
  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1119
  {
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1120
  	unsigned long flags;
4037d4522   Christoph Lameter   Move remote node ...
1121
  	int to_drain;
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1122

4037d4522   Christoph Lameter   Move remote node ...
1123
1124
1125
1126
1127
  	local_irq_save(flags);
  	if (pcp->count >= pcp->batch)
  		to_drain = pcp->batch;
  	else
  		to_drain = pcp->count;
2a13515c3   KOSAKI Motohiro   mm: clear pages_s...
1128
1129
1130
1131
  	if (to_drain > 0) {
  		free_pcppages_bulk(zone, to_drain, pcp);
  		pcp->count -= to_drain;
  	}
4037d4522   Christoph Lameter   Move remote node ...
1132
  	local_irq_restore(flags);
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1133
1134
  }
  #endif
9f8f21725   Christoph Lameter   Page allocator: c...
1135
1136
1137
1138
1139
1140
1141
1142
  /*
   * Drain pages of the indicated processor.
   *
 * The processor must either be the current processor, with the
 * thread pinned to it, or a processor that is not online.
   */
  static void drain_pages(unsigned int cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1143
  {
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1144
  	unsigned long flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1145
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1146

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
1147
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1148
  		struct per_cpu_pageset *pset;
3dfa5721f   Christoph Lameter   Page allocator: g...
1149
  		struct per_cpu_pages *pcp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1150

99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1151
1152
  		local_irq_save(flags);
  		pset = per_cpu_ptr(zone->pageset, cpu);
3dfa5721f   Christoph Lameter   Page allocator: g...
1153
1154
  
  		pcp = &pset->pcp;
2ff754fa8   David Rientjes   mm: clear pages_s...
1155
1156
1157
1158
  		if (pcp->count) {
  			free_pcppages_bulk(zone, pcp->count, pcp);
  			pcp->count = 0;
  		}
3dfa5721f   Christoph Lameter   Page allocator: g...
1159
  		local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1160
1161
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1162

9f8f21725   Christoph Lameter   Page allocator: c...
1163
1164
1165
1166
1167
1168
1169
1170
1171
  /*
   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
   */
  void drain_local_pages(void *arg)
  {
  	drain_pages(smp_processor_id());
  }
  
  /*
74046494e   Gilad Ben-Yossef   mm: only IPI CPUs...
1172
1173
1174
1175
1176
1177
1178
   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
   *
   * Note that this code is protected against sending an IPI to an offline
   * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
   * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
   * nothing keeps CPUs from showing up after we populated the cpumask and
   * before the call to on_each_cpu_mask().
9f8f21725   Christoph Lameter   Page allocator: c...
1179
1180
1181
   */
  void drain_all_pages(void)
  {
74046494e   Gilad Ben-Yossef   mm: only IPI CPUs...
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
  	int cpu;
  	struct per_cpu_pageset *pcp;
  	struct zone *zone;
  
  	/*
	 * Allocate in the BSS so we won't require allocation in
	 * the direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
  	 */
  	static cpumask_t cpus_with_pcps;
  
  	/*
	 * We don't care about racing with CPU hotplug events:
	 * the offline notification will cause the notified cpu
	 * to drain its own pcps, and on_each_cpu_mask() disables
	 * preemption as part of its processing
  	 */
  	for_each_online_cpu(cpu) {
  		bool has_pcps = false;
  		for_each_populated_zone(zone) {
  			pcp = per_cpu_ptr(zone->pageset, cpu);
  			if (pcp->pcp.count) {
  				has_pcps = true;
  				break;
  			}
  		}
  		if (has_pcps)
  			cpumask_set_cpu(cpu, &cpus_with_pcps);
  		else
  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
  	}
  	on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
9f8f21725   Christoph Lameter   Page allocator: c...
1213
  }
296699de6   Rafael J. Wysocki   Introduce CONFIG_...
1214
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1215
1216
1217
  
  void mark_free_pages(struct zone *zone)
  {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1218
1219
  	unsigned long pfn, max_zone_pfn;
  	unsigned long flags;
b2a0ac887   Mel Gorman   Split the free li...
1220
  	int order, t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1221
1222
1223
1224
1225
1226
  	struct list_head *curr;
  
  	if (!zone->spanned_pages)
  		return;
  
  	spin_lock_irqsave(&zone->lock, flags);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1227

108bcc96e   Cody P Schafer   mm: add & use zon...
1228
  	max_zone_pfn = zone_end_pfn(zone);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1229
1230
1231
  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
  		if (pfn_valid(pfn)) {
  			struct page *page = pfn_to_page(pfn);
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1232
1233
  			if (!swsusp_page_is_forbidden(page))
  				swsusp_unset_page_free(page);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1234
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1235

b2a0ac887   Mel Gorman   Split the free li...
1236
1237
  	for_each_migratetype_order(order, t) {
  		list_for_each(curr, &zone->free_area[order].free_list[t]) {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1238
  			unsigned long i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1239

f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1240
1241
  			pfn = page_to_pfn(list_entry(curr, struct page, lru));
  			for (i = 0; i < (1UL << order); i++)
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1242
  				swsusp_set_page_free(pfn_to_page(pfn + i));
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1243
  		}
b2a0ac887   Mel Gorman   Split the free li...
1244
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1245
1246
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
e2c55dc87   Mel Gorman   Drain per-cpu lis...
1247
#endif /* CONFIG_HIBERNATION */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1248
1249
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1250
   * Free a 0-order page
fc91668ea   Li Hong   mm: remove free_h...
1251
   * cold == 1 ? free a cold page : free a hot page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1252
   */
fc91668ea   Li Hong   mm: remove free_h...
1253
  void free_hot_cold_page(struct page *page, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1254
1255
1256
1257
  {
  	struct zone *zone = page_zone(page);
  	struct per_cpu_pages *pcp;
  	unsigned long flags;
5f8dcc212   Mel Gorman   page-allocator: s...
1258
  	int migratetype;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1259

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
1260
  	if (!free_pages_prepare(page, 0))
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1261
  		return;
5f8dcc212   Mel Gorman   page-allocator: s...
1262
  	migratetype = get_pageblock_migratetype(page);
b12c4ad14   Minchan Kim   mm: page_alloc: u...
1263
  	set_freepage_migratetype(page, migratetype);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1264
  	local_irq_save(flags);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1265
  	__count_vm_event(PGFREE);
da456f14d   Mel Gorman   page allocator: d...
1266

5f8dcc212   Mel Gorman   page-allocator: s...
1267
1268
1269
1270
1271
1272
1273
1274
  	/*
  	 * We only track unmovable, reclaimable and movable on pcp lists.
  	 * Free ISOLATE pages back to the allocator because they are being
	 * offlined, but treat RESERVE as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
	 * excessively into the page allocator.
  	 */
  	if (migratetype >= MIGRATE_PCPTYPES) {
194159fbc   Minchan Kim   mm: remove MIGRAT...
1275
  		if (unlikely(is_migrate_isolate(migratetype))) {
5f8dcc212   Mel Gorman   page-allocator: s...
1276
1277
1278
1279
1280
  			free_one_page(zone, page, 0, migratetype);
  			goto out;
  		}
  		migratetype = MIGRATE_MOVABLE;
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1281
  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
1282
  	if (cold)
5f8dcc212   Mel Gorman   page-allocator: s...
1283
  		list_add_tail(&page->lru, &pcp->lists[migratetype]);
3dfa5721f   Christoph Lameter   Page allocator: g...
1284
  	else
5f8dcc212   Mel Gorman   page-allocator: s...
1285
  		list_add(&page->lru, &pcp->lists[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1286
  	pcp->count++;
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1287
  	if (pcp->count >= pcp->high) {
5f8dcc212   Mel Gorman   page-allocator: s...
1288
  		free_pcppages_bulk(zone, pcp->batch, pcp);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1289
1290
  		pcp->count -= pcp->batch;
  	}
5f8dcc212   Mel Gorman   page-allocator: s...
1291
1292
  
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1293
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1294
  }
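
/*
 * Illustrative sketch, not part of page_alloc.c: the pcp->high/pcp->batch
 * handling at the end of free_hot_cold_page() above keeps the per-cpu list
 * bounded by spilling one batch back to the buddy lists when the high mark
 * is reached. Stand-alone model with assumed high=186, batch=31:
 */
#include <stdio.h>

struct sk_pcp {
	int count;
	int high;
	int batch;
};

/* Account one freed page; return how many pages were spilled back */
static int sk_free_to_pcp(struct sk_pcp *pcp)
{
	int spilled = 0;

	pcp->count++;
	if (pcp->count >= pcp->high) {
		spilled = pcp->batch;	/* free_pcppages_bulk() in the real code */
		pcp->count -= pcp->batch;
	}
	return spilled;
}

int main(void)
{
	struct sk_pcp pcp = { .count = 185, .high = 186, .batch = 31 };
	int spilled = sk_free_to_pcp(&pcp);

	/* one more free hits the high mark: 31 pages spill, count drops to 155 */
	printf("spilled %d, pcp count now %d\n", spilled, pcp.count);
	return 0;
}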
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1295
  /*
cc59850ef   Konstantin Khlebnikov   mm: add free_hot_...
1296
1297
1298
1299
1300
1301
1302
   * Free a list of 0-order pages
   */
  void free_hot_cold_page_list(struct list_head *list, int cold)
  {
  	struct page *page, *next;
  
  	list_for_each_entry_safe(page, next, list, lru) {
b413d48aa   Konstantin Khlebnikov   mm-tracepoint: re...
1303
  		trace_mm_page_free_batched(page, cold);
cc59850ef   Konstantin Khlebnikov   mm: add free_hot_...
1304
1305
1306
1307
1308
  		free_hot_cold_page(page, cold);
  	}
  }
  
  /*
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
   * split_page takes a non-compound higher-order page, and splits it into
 * n (1<<order) sub-pages: page[0..n-1]
   * Each sub-page must be freed individually.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  void split_page(struct page *page, unsigned int order)
  {
  	int i;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1319
1320
  	VM_BUG_ON(PageCompound(page));
  	VM_BUG_ON(!page_count(page));
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1321
1322
1323
1324
1325
1326
1327
1328
1329
  
  #ifdef CONFIG_KMEMCHECK
  	/*
  	 * Split shadow pages too, because free(page[0]) would
  	 * otherwise free the whole shadow.
  	 */
  	if (kmemcheck_page_is_tracked(page))
  		split_page(virt_to_page(page[0].shadow), order);
  #endif
7835e98b2   Nick Piggin   [PATCH] remove se...
1330
1331
  	for (i = 1; i < (1 << order); i++)
  		set_page_refcounted(page + i);
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1332
  }
5853ff23c   K. Y. Srinivasan   mm: export split_...
1333
  EXPORT_SYMBOL_GPL(split_page);
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1334

8fb74b9fb   Mel Gorman   mm: compaction: p...
1335
  static int __isolate_free_page(struct page *page, unsigned int order)
748446bb6   Mel Gorman   mm: compaction: m...
1336
  {
748446bb6   Mel Gorman   mm: compaction: m...
1337
1338
  	unsigned long watermark;
  	struct zone *zone;
2139cbe62   Bartlomiej Zolnierkiewicz   cma: fix counting...
1339
  	int mt;
748446bb6   Mel Gorman   mm: compaction: m...
1340
1341
1342
1343
  
  	BUG_ON(!PageBuddy(page));
  
  	zone = page_zone(page);
2e30abd17   Marek Szyprowski   mm: cma: skip wat...
1344
  	mt = get_pageblock_migratetype(page);
748446bb6   Mel Gorman   mm: compaction: m...
1345

194159fbc   Minchan Kim   mm: remove MIGRAT...
1346
  	if (!is_migrate_isolate(mt)) {
2e30abd17   Marek Szyprowski   mm: cma: skip wat...
1347
1348
1349
1350
  		/* Obey watermarks as if the page was being allocated */
  		watermark = low_wmark_pages(zone) + (1 << order);
  		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
  			return 0;
8fb74b9fb   Mel Gorman   mm: compaction: p...
1351
  		__mod_zone_freepage_state(zone, -(1UL << order), mt);
2e30abd17   Marek Szyprowski   mm: cma: skip wat...
1352
  	}
748446bb6   Mel Gorman   mm: compaction: m...
1353
1354
1355
1356
1357
  
  	/* Remove page from free list */
  	list_del(&page->lru);
  	zone->free_area[order].nr_free--;
  	rmv_page_order(page);
2139cbe62   Bartlomiej Zolnierkiewicz   cma: fix counting...
1358

8fb74b9fb   Mel Gorman   mm: compaction: p...
1359
	/* Set the pageblock if the isolated page is at least half of a pageblock */
748446bb6   Mel Gorman   mm: compaction: m...
1360
1361
  	if (order >= pageblock_order - 1) {
  		struct page *endpage = page + (1 << order) - 1;
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1362
1363
  		for (; page < endpage; page += pageblock_nr_pages) {
  			int mt = get_pageblock_migratetype(page);
194159fbc   Minchan Kim   mm: remove MIGRAT...
1364
  			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
1365
1366
1367
  				set_pageblock_migratetype(page,
  							  MIGRATE_MOVABLE);
  		}
748446bb6   Mel Gorman   mm: compaction: m...
1368
  	}
8fb74b9fb   Mel Gorman   mm: compaction: p...
1369
  	return 1UL << order;
1fb3f8ca0   Mel Gorman   mm: compaction: c...
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
  }
  
  /*
   * Similar to split_page except the page is already free. As this is only
   * being used for migration, the migratetype of the block also changes.
   * As this is called with interrupts disabled, the caller is responsible
 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
   * are enabled.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  int split_free_page(struct page *page)
  {
  	unsigned int order;
  	int nr_pages;
1fb3f8ca0   Mel Gorman   mm: compaction: c...
1386
  	order = page_order(page);
8fb74b9fb   Mel Gorman   mm: compaction: p...
1387
  	nr_pages = __isolate_free_page(page, order);
1fb3f8ca0   Mel Gorman   mm: compaction: c...
1388
1389
1390
1391
1392
1393
1394
  	if (!nr_pages)
  		return 0;
  
  	/* Split into individual pages */
  	set_page_refcounted(page);
  	split_page(page, order);
  	return nr_pages;
748446bb6   Mel Gorman   mm: compaction: m...
1395
1396
1397
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1398
1399
1400
1401
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
   * or two.
   */
0a15c3e9f   Mel Gorman   page allocator: i...
1402
1403
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1404
1405
  			struct zone *zone, int order, gfp_t gfp_flags,
  			int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1406
1407
  {
  	unsigned long flags;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1408
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1409
  	int cold = !!(gfp_flags & __GFP_COLD);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1410
  again:
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1411
  	if (likely(order == 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1412
  		struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
1413
  		struct list_head *list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1414

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1415
  		local_irq_save(flags);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1416
1417
  		pcp = &this_cpu_ptr(zone->pageset)->pcp;
  		list = &pcp->lists[migratetype];
5f8dcc212   Mel Gorman   page-allocator: s...
1418
  		if (list_empty(list)) {
535131e69   Mel Gorman   Choose pages from...
1419
  			pcp->count += rmqueue_bulk(zone, 0,
5f8dcc212   Mel Gorman   page-allocator: s...
1420
  					pcp->batch, list,
e084b2d95   Mel Gorman   page-allocator: p...
1421
  					migratetype, cold);
5f8dcc212   Mel Gorman   page-allocator: s...
1422
  			if (unlikely(list_empty(list)))
6fb332fab   Shaohua Li   memory hotplug: e...
1423
  				goto failed;
535131e69   Mel Gorman   Choose pages from...
1424
  		}
b92a6edd4   Mel Gorman   Add a configure o...
1425

5f8dcc212   Mel Gorman   page-allocator: s...
1426
1427
1428
1429
  		if (cold)
  			page = list_entry(list->prev, struct page, lru);
  		else
  			page = list_entry(list->next, struct page, lru);
b92a6edd4   Mel Gorman   Add a configure o...
1430
1431
  		list_del(&page->lru);
  		pcp->count--;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1432
  	} else {
dab48dab3   Andrew Morton   page-allocator: w...
1433
1434
1435
1436
1437
1438
1439
1440
  		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
  			/*
  			 * __GFP_NOFAIL is not to be used in new code.
  			 *
  			 * All __GFP_NOFAIL callers should be fixed so that they
  			 * properly detect and handle allocation failures.
  			 *
  			 * We most definitely don't want callers attempting to
4923abf9f   Linus Torvalds   Don't warn about ...
1441
  			 * allocate greater than order-1 page units with
dab48dab3   Andrew Morton   page-allocator: w...
1442
1443
  			 * __GFP_NOFAIL.
  			 */
4923abf9f   Linus Torvalds   Don't warn about ...
1444
  			WARN_ON_ONCE(order > 1);
dab48dab3   Andrew Morton   page-allocator: w...
1445
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1446
  		spin_lock_irqsave(&zone->lock, flags);
b2a0ac887   Mel Gorman   Split the free li...
1447
  		page = __rmqueue(zone, order, migratetype);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1448
1449
1450
  		spin_unlock(&zone->lock);
  		if (!page)
  			goto failed;
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
1451
1452
  		__mod_zone_freepage_state(zone, -(1 << order),
  					  get_pageblock_migratetype(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1453
  	}
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1454
  	__count_zone_vm_events(PGALLOC, zone, 1 << order);
78afd5612   Andi Kleen   mm: add __GFP_OTH...
1455
  	zone_statistics(preferred_zone, zone, gfp_flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1456
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1457

725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1458
  	VM_BUG_ON(bad_range(zone, page));
17cf44064   Nick Piggin   [PATCH] mm: clean...
1459
  	if (prep_new_page(page, order, gfp_flags))
a74609faf   Nick Piggin   [PATCH] mm: page_...
1460
  		goto again;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1461
  	return page;
a74609faf   Nick Piggin   [PATCH] mm: page_...
1462
1463
1464
  
  failed:
  	local_irq_restore(flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1465
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1466
  }
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1467
  #ifdef CONFIG_FAIL_PAGE_ALLOC
b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1468
  static struct {
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1469
1470
1471
1472
  	struct fault_attr attr;
  
  	u32 ignore_gfp_highmem;
  	u32 ignore_gfp_wait;
54114994f   Akinobu Mita   fault-injection: ...
1473
  	u32 min_order;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1474
1475
  } fail_page_alloc = {
  	.attr = FAULT_ATTR_INITIALIZER,
6b1b60f41   Don Mullis   [PATCH] fault-inj...
1476
1477
  	.ignore_gfp_wait = 1,
  	.ignore_gfp_highmem = 1,
54114994f   Akinobu Mita   fault-injection: ...
1478
  	.min_order = 1,
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1479
1480
1481
1482
1483
1484
1485
  };
  
  static int __init setup_fail_page_alloc(char *str)
  {
  	return setup_fault_attr(&fail_page_alloc.attr, str);
  }
  __setup("fail_page_alloc=", setup_fail_page_alloc);
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1486
  static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1487
  {
54114994f   Akinobu Mita   fault-injection: ...
1488
  	if (order < fail_page_alloc.min_order)
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1489
  		return false;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1490
  	if (gfp_mask & __GFP_NOFAIL)
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1491
  		return false;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1492
  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1493
  		return false;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1494
  	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1495
  		return false;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1496
1497
1498
1499
1500
1501
1502
1503
  
  	return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  
  static int __init fail_page_alloc_debugfs(void)
  {
f4ae40a6a   Al Viro   switch debugfs to...
1504
  	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1505
  	struct dentry *dir;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1506

dd48c085c   Akinobu Mita   fault-injection: ...
1507
1508
1509
1510
  	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
  					&fail_page_alloc.attr);
  	if (IS_ERR(dir))
  		return PTR_ERR(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1511

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
  	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
  				&fail_page_alloc.ignore_gfp_wait))
  		goto fail;
  	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
  				&fail_page_alloc.ignore_gfp_highmem))
  		goto fail;
  	if (!debugfs_create_u32("min-order", mode, dir,
  				&fail_page_alloc.min_order))
  		goto fail;
  
  	return 0;
  fail:
dd48c085c   Akinobu Mita   fault-injection: ...
1524
  	debugfs_remove_recursive(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1525

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1526
  	return -ENOMEM;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1527
1528
1529
1530
1531
1532
1533
  }
  
  late_initcall(fail_page_alloc_debugfs);
  
  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  
  #else /* CONFIG_FAIL_PAGE_ALLOC */
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1534
  static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1535
  {
deaf386ee   Gavin Shan   mm/buddy: cleanup...
1536
  	return false;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1537
1538
1539
  }
  
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1540
  /*
88f5acf88   Mel Gorman   mm: page allocato...
1541
   * Return true if free pages are above 'mark'. This takes into account the order
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1542
1543
   * of the allocation.
   */
88f5acf88   Mel Gorman   mm: page allocato...
1544
1545
  static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags, long free_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1546
1547
  {
	/* free_pages may go negative - that's OK */
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
1548
  	long min = mark;
2cfed0752   Minchan Kim   mm: fix free page...
1549
  	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1550
  	int o;
026b08147   Tomasz Stanislawski   mm/page_alloc.c: ...
1551
  	long free_cma = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1552

df0a6daa0   Michal Hocko   mm: fix off-by-tw...
1553
  	free_pages -= (1 << order) - 1;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1554
  	if (alloc_flags & ALLOC_HIGH)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1555
  		min -= min / 2;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1556
  	if (alloc_flags & ALLOC_HARDER)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1557
  		min -= min / 4;
d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
1558
1559
1560
  #ifdef CONFIG_CMA
  	/* If allocation can't use CMA areas don't use free CMA pages */
  	if (!(alloc_flags & ALLOC_CMA))
026b08147   Tomasz Stanislawski   mm/page_alloc.c: ...
1561
  		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
1562
  #endif
026b08147   Tomasz Stanislawski   mm/page_alloc.c: ...
1563
1564
  
  	if (free_pages - free_cma <= min + lowmem_reserve)
88f5acf88   Mel Gorman   mm: page allocato...
1565
  		return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1566
1567
1568
1569
1570
1571
1572
1573
  	for (o = 0; o < order; o++) {
  		/* At the next order, this order's pages become unavailable */
  		free_pages -= z->free_area[o].nr_free << o;
  
  		/* Require fewer higher order pages to be free */
  		min >>= 1;
  
  		if (free_pages <= min)
88f5acf88   Mel Gorman   mm: page allocato...
1574
  			return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1575
  	}
88f5acf88   Mel Gorman   mm: page allocato...
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
  	return true;
  }
  
  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  					zone_page_state(z, NR_FREE_PAGES));
  }
  
  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  								free_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1596
  }
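
/*
 * Illustrative sketch, not part of page_alloc.c: the per-order loop in
 * __zone_watermark_ok() above discounts the pages of every lower order and
 * halves the required reserve at each step. A stand-alone version of just
 * that arithmetic, with made-up free counts and the lowmem_reserve and CMA
 * terms omitted for brevity:
 */
#include <stdbool.h>
#include <stdio.h>

static bool sketch_watermark_ok(int order, long mark, long free_pages,
				const long *nr_free)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min)
		return false;

	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		free_pages -= nr_free[o] << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* hypothetical zone: 400 order-0, 100 order-1, 20 order-2 blocks free */
	const long nr_free[3] = { 400, 100, 20 };
	long total = 400 + 100 * 2 + 20 * 4;	/* 680 free pages in all */

	/*
	 * An order-3 request against mark 128: 673 after the bias, then
	 * 273 vs 64, 73 vs 32, -7 vs 16 -- the check fails (prints 0)
	 * because nothing of order 3 or above is actually free.
	 */
	printf("order-3 ok: %d\n", sketch_watermark_ok(3, 128, total, nr_free));
	return 0;
}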
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1597
1598
1599
1600
1601
1602
  #ifdef CONFIG_NUMA
  /*
   * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
   * skip over zones that are not allowed by the cpuset, or that have
   * been recently (in last second) found to be nearly full.  See further
   * comments in mmzone.h.  Reduces cache footprint of zonelist scans
183ff22bb   Simon Arlott   spelling fixes: mm/
1603
   * that have to skip over a lot of full or unallowed zones.
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1604
1605
1606
   *
   * If the zonelist cache is present in the passed in zonelist, then
   * returns a pointer to the allowed node mask (either the current
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
1607
   * tasks mems_allowed, or node_states[N_MEMORY].)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
   *
   * If the zonelist cache is not available for this zonelist, does
   * nothing and returns NULL.
   *
   * If the fullzones BITMAP in the zonelist cache is stale (more than
   * a second since last zap'd) then we zap it out (clear its bits.)
   *
   * We hold off even calling zlc_setup, until after we've checked the
   * first zone in the zonelist, on the theory that most allocations will
   * be satisfied from that first zone, so best to examine that zone as
   * quickly as we can.
   */
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	nodemask_t *allowednodes;	/* zonelist_cache approximation */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return NULL;
f05111f50   S.ÇaÄŸlar Onur   mm/page_alloc.c: ...
1628
  	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1629
1630
1631
1632
1633
1634
  		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  		zlc->last_full_zap = jiffies;
  	}
  
  	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
  					&cpuset_current_mems_allowed :
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
1635
  					&node_states[N_MEMORY];
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
  	return allowednodes;
  }
  
  /*
   * Given 'z' scanning a zonelist, run a couple of quick checks to see
   * if it is worth looking at further for free memory:
   *  1) Check that the zone isn't thought to be full (doesn't have its
   *     bit set in the zonelist_cache fullzones BITMAP).
 *  2) Check that the zone's node (obtained from the zonelist_cache
   *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
   * Return true (non-zero) if zone is worth looking at further, or
   * else return false (zero) if it is not.
   *
   * This check -ignores- the distinction between various watermarks,
   * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
   * found to be full for any variation of these watermarks, it will
   * be considered full for up to one second by all requests, unless
   * we are so low on memory on all allowed nodes that we are forced
   * into the second scan of the zonelist.
   *
   * In the second scan we ignore this zonelist cache and exactly
 * apply the watermarks to all zones, even if it is slower to do so.
   * We are low on memory in the second scan, and should leave no stone
   * unturned looking for a free page.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1661
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1662
1663
1664
1665
1666
1667
1668
1669
1670
  						nodemask_t *allowednodes)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  	int n;				/* node that zone *z is on */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return 1;
dd1a239f6   Mel Gorman   mm: have zonelist...
1671
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
  	n = zlc->z_to_n[i];
  
  	/* This zone is worth trying if it is allowed but not full */
  	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
  }
  
  /*
   * Given 'z' scanning a zonelist, set the corresponding bit in
   * zlc->fullzones, so that subsequent attempts to allocate a page
   * from that zone don't waste time re-examining it.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1683
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1684
1685
1686
1687
1688
1689
1690
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
dd1a239f6   Mel Gorman   mm: have zonelist...
1691
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1692
1693
1694
  
  	set_bit(i, zlc->fullzones);
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
  /*
   * clear all zones full, called after direct reclaim makes progress so that
   * a zone that was recently full is not skipped over for up to a second
   */
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
  
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  }
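
/*
 * Illustrative sketch, not part of page_alloc.c: the zonelist cache used by
 * zlc_zone_worth_trying()/zlc_mark_zone_full() above is, at its core, a
 * bitmap of recently-full zones plus a timestamp that invalidates the whole
 * bitmap after about a second. Stand-alone model with a fake tick counter
 * (SK_HZ is an assumed ticks-per-second value):
 */
#include <stdbool.h>
#include <stdio.h>

#define SK_HZ 100UL

struct sk_zlc {
	unsigned long last_full_zap;	/* tick of the last bitmap clear */
	unsigned long long fullzones;	/* one bit per zonelist index */
};

static void sk_maybe_zap(struct sk_zlc *zlc, unsigned long now)
{
	if (now - zlc->last_full_zap > SK_HZ) {
		zlc->fullzones = 0;
		zlc->last_full_zap = now;
	}
}

static bool sk_zone_worth_trying(const struct sk_zlc *zlc, int idx)
{
	return !(zlc->fullzones & (1ULL << idx));
}

static void sk_mark_zone_full(struct sk_zlc *zlc, int idx)
{
	zlc->fullzones |= 1ULL << idx;
}

int main(void)
{
	struct sk_zlc zlc = { .last_full_zap = 0, .fullzones = 0 };

	sk_mark_zone_full(&zlc, 2);
	printf("t=10:  zone 2 worth trying? %d\n", sk_zone_worth_trying(&zlc, 2));

	sk_maybe_zap(&zlc, 150);	/* more than SK_HZ ticks later: zap bitmap */
	printf("t=150: zone 2 worth trying? %d\n", sk_zone_worth_trying(&zlc, 2));
	return 0;
}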
957f822a0   David Rientjes   mm, numa: reclaim...
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
  	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
  }
  
  static void __paginginit init_zone_allows_reclaim(int nid)
  {
  	int i;
  
  	for_each_online_node(i)
6b187d026   David Rientjes   mm, numa: avoid s...
1719
  		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
957f822a0   David Rientjes   mm, numa: reclaim...
1720
  			node_set(i, NODE_DATA(nid)->reclaim_nodes);
6b187d026   David Rientjes   mm, numa: avoid s...
1721
  		else
957f822a0   David Rientjes   mm, numa: reclaim...
1722
  			zone_reclaim_mode = 1;
957f822a0   David Rientjes   mm, numa: reclaim...
1723
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1724
1725
1726
1727
1728
1729
  #else	/* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	return NULL;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1730
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1731
1732
1733
1734
  				nodemask_t *allowednodes)
  {
  	return 1;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1735
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1736
1737
  {
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1738
1739
1740
1741
  
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  }
957f822a0   David Rientjes   mm, numa: reclaim...
1742
1743
1744
1745
1746
1747
1748
1749
1750
  
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
  	return true;
  }
  
  static inline void init_zone_allows_reclaim(int nid)
  {
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1751
  #endif	/* CONFIG_NUMA */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1752
  /*
0798e5193   Paul Jackson   [PATCH] memory pa...
1753
   * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1754
1755
1756
   * a page.
   */
  static struct page *
19770b326   Mel Gorman   mm: filter based ...
1757
  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
5117f45d1   Mel Gorman   page allocator: c...
1758
  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
3dd282669   Mel Gorman   page allocator: c...
1759
  		struct zone *preferred_zone, int migratetype)
753ee7289   Martin Hicks   [PATCH] VM: early...
1760
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1761
  	struct zoneref *z;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1762
  	struct page *page = NULL;
54a6eb5c4   Mel Gorman   mm: use two zonel...
1763
  	int classzone_idx;
5117f45d1   Mel Gorman   page allocator: c...
1764
  	struct zone *zone;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1765
1766
1767
  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
  	int zlc_active = 0;		/* set if using zonelist_cache */
  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
54a6eb5c4   Mel Gorman   mm: use two zonel...
1768

19770b326   Mel Gorman   mm: filter based ...
1769
  	classzone_idx = zone_idx(preferred_zone);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1770
  zonelist_scan:
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1771
  	/*
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1772
  	 * Scan zonelist, looking for a zone with enough free.
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1773
1774
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
19770b326   Mel Gorman   mm: filter based ...
1775
1776
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  						high_zoneidx, nodemask) {
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
1777
  		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1778
1779
  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1780
  		if ((alloc_flags & ALLOC_CPUSET) &&
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
1781
  			!cpuset_zone_allowed_softwall(zone, gfp_mask))
cd38b115d   Mel Gorman   mm: page allocato...
1782
  				continue;
a756cf590   Johannes Weiner   mm: try to distri...
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
  		/*
  		 * When allocating a page cache page for writing, we
  		 * want to get it from a zone that is within its dirty
  		 * limit, such that no single zone holds more than its
  		 * proportional share of globally allowed dirty pages.
  		 * The dirty limits take into account the zone's
  		 * lowmem reserves and high watermark so that kswapd
  		 * should be able to balance it without having to
  		 * write pages from its LRU list.
  		 *
  		 * This may look like it could increase pressure on
  		 * lower zones by failing allocations in higher zones
  		 * before they are full.  But the pages that do spill
  		 * over are limited as the lower zones are protected
  		 * by this very same mechanism.  It should not become
  		 * a practical burden to them.
  		 *
  		 * XXX: For now, allow allocations to potentially
  		 * exceed the per-zone dirty limit in the slowpath
  		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
  		 * which is important when on a NUMA setup the allowed
  		 * zones are together not big enough to reach the
  		 * global limit.  The proper fix for these situations
  		 * will require awareness of zones in the
  		 * dirty-throttling and the flusher threads.
  		 */
  		if ((alloc_flags & ALLOC_WMARK_LOW) &&
  		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
  			goto this_zone_full;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1812

418589663   Mel Gorman   page allocator: u...
1813
  		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1814
  		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
3148890bf   Nick Piggin   [PATCH] mm: __all...
1815
  			unsigned long mark;
fa5e084e4   Mel Gorman   vmscan: do not un...
1816
  			int ret;
418589663   Mel Gorman   page allocator: u...
1817
  			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
fa5e084e4   Mel Gorman   vmscan: do not un...
1818
1819
1820
  			if (zone_watermark_ok(zone, order, mark,
  				    classzone_idx, alloc_flags))
  				goto try_this_zone;
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
1821
1822
  			if (IS_ENABLED(CONFIG_NUMA) &&
  					!did_zlc_setup && nr_online_nodes > 1) {
cd38b115d   Mel Gorman   mm: page allocato...
1823
1824
1825
1826
1827
1828
1829
1830
1831
  				/*
  				 * we do zlc_setup if there are multiple nodes
  				 * and before considering the first zone allowed
  				 * by the cpuset.
  				 */
  				allowednodes = zlc_setup(zonelist, alloc_flags);
  				zlc_active = 1;
  				did_zlc_setup = 1;
  			}
957f822a0   David Rientjes   mm, numa: reclaim...
1832
1833
  			if (zone_reclaim_mode == 0 ||
  			    !zone_allows_reclaim(preferred_zone, zone))
fa5e084e4   Mel Gorman   vmscan: do not un...
1834
  				goto this_zone_full;
cd38b115d   Mel Gorman   mm: page allocato...
1835
1836
1837
1838
  			/*
  			 * As we may have just activated ZLC, check if the first
  			 * eligible zone has failed zone_reclaim recently.
  			 */
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
1839
  			if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
cd38b115d   Mel Gorman   mm: page allocato...
1840
1841
  				!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1842
1843
1844
1845
  			ret = zone_reclaim(zone, gfp_mask, order);
  			switch (ret) {
  			case ZONE_RECLAIM_NOSCAN:
  				/* did not scan */
cd38b115d   Mel Gorman   mm: page allocato...
1846
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1847
1848
  			case ZONE_RECLAIM_FULL:
  				/* scanned but unreclaimable */
cd38b115d   Mel Gorman   mm: page allocato...
1849
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1850
1851
  			default:
  				/* did we reclaim enough */
fed2719e7   Mel Gorman   mm: page_alloc: a...
1852
  				if (zone_watermark_ok(zone, order, mark,
fa5e084e4   Mel Gorman   vmscan: do not un...
1853
  						classzone_idx, alloc_flags))
fed2719e7   Mel Gorman   mm: page_alloc: a...
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
  					goto try_this_zone;
  
  				/*
  				 * Failed to reclaim enough to meet watermark.
  				 * Only mark the zone full if checking the min
  				 * watermark or if we failed to reclaim just
  				 * 1<<order pages or else the page allocator
  				 * fastpath will prematurely mark zones full
  				 * when the watermark is between the low and
  				 * min watermarks.
  				 */
  				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
  				    ret == ZONE_RECLAIM_SOME)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1867
  					goto this_zone_full;
fed2719e7   Mel Gorman   mm: page_alloc: a...
1868
1869
  
  				continue;
0798e5193   Paul Jackson   [PATCH] memory pa...
1870
  			}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1871
  		}
fa5e084e4   Mel Gorman   vmscan: do not un...
1872
  try_this_zone:
3dd282669   Mel Gorman   page allocator: c...
1873
1874
  		page = buffered_rmqueue(preferred_zone, zone, order,
  						gfp_mask, migratetype);
0798e5193   Paul Jackson   [PATCH] memory pa...
1875
  		if (page)
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1876
  			break;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1877
  this_zone_full:
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
1878
  		if (IS_ENABLED(CONFIG_NUMA))
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1879
  			zlc_mark_zone_full(zonelist, z);
54a6eb5c4   Mel Gorman   mm: use two zonel...
1880
  	}
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1881

e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
1882
  	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1883
1884
1885
1886
  		/* Disable zlc cache for second zonelist scan */
  		zlc_active = 0;
  		goto zonelist_scan;
  	}
b121186ab   Alex Shi   mm: correct page-...
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
  
  	if (page)
  		/*
  		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
  		 * necessary to allocate the page. The expectation is
  		 * that the caller is taking steps that will free more
  		 * memory. The caller should avoid the page being used
  		 * for !PFMEMALLOC purposes.
  		 */
  		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1897
  	return page;
753ee7289   Martin Hicks   [PATCH] VM: early...
1898
  }
29423e77c   David Rientjes   oom: suppress sho...
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
  /*
   * Large machines with many possible nodes should not always dump per-node
   * meminfo in irq context.
   */
  static inline bool should_suppress_show_mem(void)
  {
  	bool ret = false;
  
  #if NODES_SHIFT > 8
  	ret = in_interrupt();
  #endif
  	return ret;
  }
a238ab5b0   Dave Hansen   mm: break out pag...
1912
1913
1914
1915
1916
1917
  static DEFINE_RATELIMIT_STATE(nopage_rs,
  		DEFAULT_RATELIMIT_INTERVAL,
  		DEFAULT_RATELIMIT_BURST);
  
  void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
  {
a238ab5b0   Dave Hansen   mm: break out pag...
1918
  	unsigned int filter = SHOW_MEM_FILTER_NODES;
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
1919
1920
  	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
  	    debug_guardpage_minorder() > 0)
a238ab5b0   Dave Hansen   mm: break out pag...
1921
1922
1923
  		return;
  
  	/*
4b59e6c47   David Rientjes   mm, show_mem: sup...
1924
1925
1926
1927
1928
1929
1930
  	 * Walking all memory to count page types is very expensive and should
  	 * be inhibited in non-blockable contexts.
  	 */
  	if (!(gfp_mask & __GFP_WAIT))
  		filter |= SHOW_MEM_FILTER_PAGE_COUNT;
  
  	/*
a238ab5b0   Dave Hansen   mm: break out pag...
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
  	 * This documents exceptions given to allocations in certain
  	 * contexts that are allowed to allocate outside current's set
  	 * of allowed nodes.
  	 */
  	if (!(gfp_mask & __GFP_NOMEMALLOC))
  		if (test_thread_flag(TIF_MEMDIE) ||
  		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
  			filter &= ~SHOW_MEM_FILTER_NODES;
  	if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
  		filter &= ~SHOW_MEM_FILTER_NODES;
  
  	if (fmt) {
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1943
1944
  		struct va_format vaf;
  		va_list args;
a238ab5b0   Dave Hansen   mm: break out pag...
1945
  		va_start(args, fmt);
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1946
1947
1948
1949
1950
  
  		vaf.fmt = fmt;
  		vaf.va = &args;
  
  		pr_warn("%pV", &vaf);
a238ab5b0   Dave Hansen   mm: break out pag...
1951
1952
  		va_end(args);
  	}
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1953
1954
1955
  	pr_warn("%s: page allocation failure: order:%d, mode:0x%x
  ",
  		current->comm, order, gfp_mask);
a238ab5b0   Dave Hansen   mm: break out pag...
1956
1957
1958
1959
1960
  
  	dump_stack();
  	if (!should_suppress_show_mem())
  		show_mem(filter);
  }
11e33f6a5   Mel Gorman   page allocator: b...
1961
1962
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
f90ac3982   Mel Gorman   mm: avoid liveloc...
1963
  				unsigned long did_some_progress,
11e33f6a5   Mel Gorman   page allocator: b...
1964
  				unsigned long pages_reclaimed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1965
  {
11e33f6a5   Mel Gorman   page allocator: b...
1966
1967
1968
  	/* Do not loop if specifically requested */
  	if (gfp_mask & __GFP_NORETRY)
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1969

f90ac3982   Mel Gorman   mm: avoid liveloc...
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
  	/* Always retry if specifically requested */
  	if (gfp_mask & __GFP_NOFAIL)
  		return 1;
  
  	/*
  	 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
  	 * making forward progress without invoking OOM. Suspend also disables
  	 * storage devices so kswapd will not help. Bail if we are suspending.
  	 */
  	if (!did_some_progress && pm_suspended_storage())
  		return 0;
11e33f6a5   Mel Gorman   page allocator: b...
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
  	/*
  	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
  	 * means __GFP_NOFAIL, but that may not be true in other
  	 * implementations.
  	 */
  	if (order <= PAGE_ALLOC_COSTLY_ORDER)
  		return 1;
  
  	/*
  	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
  	 * specified, then we retry until we no longer reclaim any pages
  	 * (above), or we've reclaimed an order of pages at least as
  	 * large as the allocation's order. In both cases, if the
  	 * allocation still fails, we stop retrying.
  	 */
  	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
  		return 1;
cf40bd16f   Nick Piggin   lockdep: annotate...
1998

11e33f6a5   Mel Gorman   page allocator: b...
1999
2000
  	return 0;
  }
933e312e7   Akinobu Mita   [PATCH] fault-inj...
2001

11e33f6a5   Mel Gorman   page allocator: b...
2002
2003
2004
  static inline struct page *
  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2005
2006
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
2007
2008
2009
2010
  {
  	struct page *page;
  
  	/* Acquire the OOM killer lock for the zones in zonelist */
ff321feac   Minchan Kim   mm: rename try_se...
2011
  	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
11e33f6a5   Mel Gorman   page allocator: b...
2012
  		schedule_timeout_uninterruptible(1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2013
2014
  		return NULL;
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
2015

11e33f6a5   Mel Gorman   page allocator: b...
2016
2017
2018
2019
2020
2021
2022
  	/*
	 * Go through the zonelist yet one more time, keeping a very high
	 * watermark here. This is only to catch a parallel oom killing; we
	 * must fail if we're still under heavy pressure.
  	 */
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  		order, zonelist, high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
2023
  		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
2024
  		preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2025
  	if (page)
11e33f6a5   Mel Gorman   page allocator: b...
2026
  		goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
2027
2028
2029
2030
  	if (!(gfp_mask & __GFP_NOFAIL)) {
  		/* The OOM killer will not help higher order allocs */
  		if (order > PAGE_ALLOC_COSTLY_ORDER)
  			goto out;
03668b3ce   David Rientjes   oom: avoid oom ki...
2031
2032
2033
  		/* The OOM killer does not needlessly kill tasks for lowmem */
  		if (high_zoneidx < ZONE_NORMAL)
  			goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
  		/*
  		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
  		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
  		 * The caller should handle page allocation failure by itself if
  		 * it specifies __GFP_THISNODE.
  		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
  		 */
  		if (gfp_mask & __GFP_THISNODE)
  			goto out;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2044
  	/* Exhausted what can be done so it's blamo time */
08ab9b10d   David Rientjes   mm, oom: force oo...
2045
  	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
11e33f6a5   Mel Gorman   page allocator: b...
2046
2047
2048
2049
2050
  
  out:
  	clear_zonelist_oom(zonelist, gfp_mask);
  	return page;
  }
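/*
 * try_set_zonelist_oom()/clear_zonelist_oom() act as a per-zonelist
 * lock: only one task at a time may invoke the OOM killer for a given
 * zonelist, and losers back off for a jiffy and return NULL so the
 * allocation is retried after the victim has (hopefully) exited.
 */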
56de7263f   Mel Gorman   mm: compaction: d...
2051
2052
2053
2054
2055
2056
  #ifdef CONFIG_COMPACTION
  /* Try memory compaction for high-order allocations before reclaim */
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2057
  	int migratetype, bool sync_migration,
c67fe3752   Mel Gorman   mm: compaction: A...
2058
  	bool *contended_compaction, bool *deferred_compaction,
66199712e   Mel Gorman   mm: page allocato...
2059
  	unsigned long *did_some_progress)
56de7263f   Mel Gorman   mm: compaction: d...
2060
  {
66199712e   Mel Gorman   mm: page allocato...
2061
  	if (!order)
56de7263f   Mel Gorman   mm: compaction: d...
2062
  		return NULL;
aff622495   Rik van Riel   vmscan: only defe...
2063
  	if (compaction_deferred(preferred_zone, order)) {
66199712e   Mel Gorman   mm: page allocato...
2064
2065
2066
  		*deferred_compaction = true;
  		return NULL;
  	}
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2067
  	current->flags |= PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
2068
  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
c67fe3752   Mel Gorman   mm: compaction: A...
2069
  						nodemask, sync_migration,
8fb74b9fb   Mel Gorman   mm: compaction: p...
2070
  						contended_compaction);
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2071
  	current->flags &= ~PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
2072

1fb3f8ca0   Mel Gorman   mm: compaction: c...
2073
  	if (*did_some_progress != COMPACT_SKIPPED) {
8fb74b9fb   Mel Gorman   mm: compaction: p...
2074
  		struct page *page;
56de7263f   Mel Gorman   mm: compaction: d...
2075
2076
2077
2078
2079
2080
  		/* Page migration frees to the PCP lists but we want merging */
  		drain_pages(get_cpu());
  		put_cpu();
  
  		page = get_page_from_freelist(gfp_mask, nodemask,
  				order, zonelist, high_zoneidx,
cfd19c5a9   Mel Gorman   mm: only set page...
2081
2082
  				alloc_flags & ~ALLOC_NO_WATERMARKS,
  				preferred_zone, migratetype);
56de7263f   Mel Gorman   mm: compaction: d...
2083
  		if (page) {
62997027c   Mel Gorman   mm: compaction: c...
2084
  			preferred_zone->compact_blockskip_flush = false;
4f92e2586   Mel Gorman   mm: compaction: d...
2085
2086
  			preferred_zone->compact_considered = 0;
  			preferred_zone->compact_defer_shift = 0;
aff622495   Rik van Riel   vmscan: only defe...
2087
2088
  			if (order >= preferred_zone->compact_order_failed)
  				preferred_zone->compact_order_failed = order + 1;
56de7263f   Mel Gorman   mm: compaction: d...
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
  			count_vm_event(COMPACTSUCCESS);
  			return page;
  		}
  
  		/*
  		 * It's bad if compaction run occurs and fails.
  		 * The most likely reason is that pages exist,
  		 * but not enough to satisfy watermarks.
  		 */
  		count_vm_event(COMPACTFAIL);
66199712e   Mel Gorman   mm: page allocato...
2099
2100
2101
2102
2103
2104
  
  		/*
  		 * As async compaction considers a subset of pageblocks, only
  		 * defer if the failure was a sync compaction failure.
  		 */
  		if (sync_migration)
aff622495   Rik van Riel   vmscan: only defe...
2105
  			defer_compaction(preferred_zone, order);
56de7263f   Mel Gorman   mm: compaction: d...
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
  
  		cond_resched();
  	}
  
  	return NULL;
  }
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2117
  	int migratetype, bool sync_migration,
c67fe3752   Mel Gorman   mm: compaction: A...
2118
  	bool *contended_compaction, bool *deferred_compaction,
66199712e   Mel Gorman   mm: page allocato...
2119
  	unsigned long *did_some_progress)
56de7263f   Mel Gorman   mm: compaction: d...
2120
2121
2122
2123
  {
  	return NULL;
  }
  #endif /* CONFIG_COMPACTION */
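/*
 * When CONFIG_COMPACTION is disabled the stub above simply returns NULL,
 * so callers in the slowpath need no #ifdefs of their own and fall
 * straight through to direct reclaim.
 */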
bba907108   Marek Szyprowski   mm: extract recla...
2124
2125
2126
2127
  /* Perform direct synchronous page reclaim */
  static int
  __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
  		  nodemask_t *nodemask)
11e33f6a5   Mel Gorman   page allocator: b...
2128
  {
11e33f6a5   Mel Gorman   page allocator: b...
2129
  	struct reclaim_state reclaim_state;
bba907108   Marek Szyprowski   mm: extract recla...
2130
  	int progress;
11e33f6a5   Mel Gorman   page allocator: b...
2131
2132
2133
2134
2135
  
  	cond_resched();
  
  	/* We now go into synchronous reclaim */
  	cpuset_memory_pressure_bump();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2136
  	current->flags |= PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
2137
2138
  	lockdep_set_current_reclaim_state(gfp_mask);
  	reclaim_state.reclaimed_slab = 0;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2139
  	current->reclaim_state = &reclaim_state;
11e33f6a5   Mel Gorman   page allocator: b...
2140

bba907108   Marek Szyprowski   mm: extract recla...
2141
  	progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
11e33f6a5   Mel Gorman   page allocator: b...
2142

c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2143
  	current->reclaim_state = NULL;
11e33f6a5   Mel Gorman   page allocator: b...
2144
  	lockdep_clear_current_reclaim_state();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2145
  	current->flags &= ~PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
2146
2147
  
  	cond_resched();
bba907108   Marek Szyprowski   mm: extract recla...
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
  	return progress;
  }
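/*
 * PF_MEMALLOC is set around try_to_free_pages() so that any allocation
 * performed while reclaiming can dip into reserves and, together with
 * the PF_MEMALLOC check in the slowpath, cannot recurse back into
 * direct reclaim.
 */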
  
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
  	int migratetype, unsigned long *did_some_progress)
  {
  	struct page *page = NULL;
  	bool drained = false;
  
  	*did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
  					       nodemask);
9ee493ce0   Mel Gorman   mm: page allocato...
2163
2164
  	if (unlikely(!(*did_some_progress)))
  		return NULL;
11e33f6a5   Mel Gorman   page allocator: b...
2165

76d3fbf8f   Mel Gorman   mm: page allocato...
2166
  	/* After successful reclaim, reconsider all zones for allocation */
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
2167
  	if (IS_ENABLED(CONFIG_NUMA))
76d3fbf8f   Mel Gorman   mm: page allocato...
2168
  		zlc_clear_zones_full(zonelist);
9ee493ce0   Mel Gorman   mm: page allocato...
2169
2170
  retry:
  	page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2171
  					zonelist, high_zoneidx,
cfd19c5a9   Mel Gorman   mm: only set page...
2172
2173
  					alloc_flags & ~ALLOC_NO_WATERMARKS,
  					preferred_zone, migratetype);
9ee493ce0   Mel Gorman   mm: page allocato...
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
  
  	/*
  	 * If an allocation failed after direct reclaim, it could be because
  	 * pages are pinned on the per-cpu lists. Drain them and try again
  	 */
  	if (!page && !drained) {
  		drain_all_pages();
  		drained = true;
  		goto retry;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2184
2185
  	return page;
  }
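/*
 * Note that the retry after drain_all_pages() above still clears
 * ALLOC_NO_WATERMARKS, so pages freed by direct reclaim are handed out
 * with the normal watermarks enforced and the emergency reserves stay
 * intact.
 */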
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2186
  /*
11e33f6a5   Mel Gorman   page allocator: b...
2187
2188
   * This is called in the allocator slow-path if the allocation request is of
   * sufficient urgency to ignore watermarks and take other desperate measures
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2189
   */
11e33f6a5   Mel Gorman   page allocator: b...
2190
2191
2192
  static inline struct page *
  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2193
2194
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
2195
2196
2197
2198
2199
  {
  	struct page *page;
  
  	do {
  		page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2200
  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
3dd282669   Mel Gorman   page allocator: c...
2201
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2202
2203
  
  		if (!page && gfp_mask & __GFP_NOFAIL)
0e093d997   Mel Gorman   writeback: do not...
2204
  			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
11e33f6a5   Mel Gorman   page allocator: b...
2205
2206
2207
2208
2209
2210
2211
  	} while (!page && (gfp_mask & __GFP_NOFAIL));
  
  	return page;
  }
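/*
 * With __GFP_NOFAIL the loop above can spin indefinitely; the only
 * throttling is wait_iff_congested(), which sleeps for up to HZ/50
 * while the preferred zone is marked congested.
 */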
  
  static inline
  void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
995047488   Mel Gorman   mm: kswapd: stop ...
2212
2213
  						enum zone_type high_zoneidx,
  						enum zone_type classzone_idx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2214
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2215
2216
  	struct zoneref *z;
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2217

11e33f6a5   Mel Gorman   page allocator: b...
2218
  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
995047488   Mel Gorman   mm: kswapd: stop ...
2219
  		wakeup_kswapd(zone, order, classzone_idx);
11e33f6a5   Mel Gorman   page allocator: b...
2220
  }
cf40bd16f   Nick Piggin   lockdep: annotate...
2221

341ce06f6   Peter Zijlstra   page allocator: c...
2222
2223
2224
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
341ce06f6   Peter Zijlstra   page allocator: c...
2225
2226
  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2227

a56f57ff9   Mel Gorman   page allocator: r...
2228
  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
e6223a3b1   Namhyung Kim   mm: add casts to/...
2229
  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
2230

341ce06f6   Peter Zijlstra   page allocator: c...
2231
2232
2233
2234
2235
2236
  	/*
  	 * The caller may dip into page reserves a bit more if the caller
  	 * cannot run direct reclaim, or if the caller has realtime scheduling
  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
  	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
  	 */
e6223a3b1   Namhyung Kim   mm: add casts to/...
2237
  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2238

341ce06f6   Peter Zijlstra   page allocator: c...
2239
  	if (!wait) {
5c3240d92   Andrea Arcangeli   thp: don't alloc ...
2240
2241
2242
2243
2244
2245
  		/*
  		 * Not worth trying to allocate harder for
  		 * __GFP_NOMEMALLOC even if it can't schedule.
  		 */
		if (!(gfp_mask & __GFP_NOMEMALLOC))
  			alloc_flags |= ALLOC_HARDER;
523b94585   Christoph Lameter   Memoryless nodes:...
2246
  		/*
341ce06f6   Peter Zijlstra   page allocator: c...
2247
2248
  		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
523b94585   Christoph Lameter   Memoryless nodes:...
2249
  		 */
341ce06f6   Peter Zijlstra   page allocator: c...
2250
  		alloc_flags &= ~ALLOC_CPUSET;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2251
  	} else if (unlikely(rt_task(current)) && !in_interrupt())
341ce06f6   Peter Zijlstra   page allocator: c...
2252
  		alloc_flags |= ALLOC_HARDER;
b37f1dd0f   Mel Gorman   mm: introduce __G...
2253
2254
2255
  	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
  		if (gfp_mask & __GFP_MEMALLOC)
  			alloc_flags |= ALLOC_NO_WATERMARKS;
907aed48f   Mel Gorman   mm: allow PF_MEMA...
2256
2257
2258
2259
2260
  		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
  			alloc_flags |= ALLOC_NO_WATERMARKS;
  		else if (!in_interrupt() &&
  				((current->flags & PF_MEMALLOC) ||
  				 unlikely(test_thread_flag(TIF_MEMDIE))))
341ce06f6   Peter Zijlstra   page allocator: c...
2261
  			alloc_flags |= ALLOC_NO_WATERMARKS;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2262
  	}
d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
2263
2264
2265
2266
  #ifdef CONFIG_CMA
  	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
  		alloc_flags |= ALLOC_CMA;
  #endif
341ce06f6   Peter Zijlstra   page allocator: c...
2267
2268
  	return alloc_flags;
  }
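/*
 * Example: a plain GFP_ATOMIC request (__GFP_HIGH, no __GFP_WAIT),
 * assuming the caller is not itself in reclaim, comes out of the helper
 * above as ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER with ALLOC_CPUSET
 * cleared, i.e. it may dig deeper into the watermarks and is not
 * confined to the current cpuset.
 */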
072bb0aa5   Mel Gorman   mm: sl[au]b: add ...
2269
2270
  bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
  {
b37f1dd0f   Mel Gorman   mm: introduce __G...
2271
  	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
072bb0aa5   Mel Gorman   mm: sl[au]b: add ...
2272
  }
11e33f6a5   Mel Gorman   page allocator: b...
2273
2274
2275
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2276
2277
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
2278
2279
2280
2281
2282
2283
  {
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
  	struct page *page = NULL;
  	int alloc_flags;
  	unsigned long pages_reclaimed = 0;
  	unsigned long did_some_progress;
77f1fe6b0   Mel Gorman   mm: migration: al...
2284
  	bool sync_migration = false;
66199712e   Mel Gorman   mm: page allocato...
2285
  	bool deferred_compaction = false;
c67fe3752   Mel Gorman   mm: compaction: A...
2286
  	bool contended_compaction = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2287

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
2288
  	/*
72807a74c   Mel Gorman   page allocator: s...
2289
2290
2291
2292
2293
  	 * In the slowpath, we sanity check order to avoid ever trying to
  	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
  	 * be using allocators in order of preference for an area that is
  	 * too large.
  	 */
1fc28b70f   Mel Gorman   page-allocator: a...
2294
2295
  	if (order >= MAX_ORDER) {
  		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
72807a74c   Mel Gorman   page allocator: s...
2296
  		return NULL;
1fc28b70f   Mel Gorman   page-allocator: a...
2297
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2298

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
2299
2300
2301
2302
2303
2304
2305
2306
  	/*
  	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
  	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
  	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
  	 * using a larger set of nodes after it has established that the
  	 * allowed per node queues are empty and that nodes are
  	 * over allocated.
  	 */
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
2307
2308
  	if (IS_ENABLED(CONFIG_NUMA) &&
  			(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
2309
  		goto nopage;
cc4a68514   Mel Gorman   page allocator: a...
2310
  restart:
caf491916   Linus Torvalds   Revert "revert "R...
2311
2312
2313
  	if (!(gfp_mask & __GFP_NO_KSWAPD))
  		wake_all_kswapd(order, zonelist, high_zoneidx,
  						zone_idx(preferred_zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2314

9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2315
  	/*
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2316
2317
2318
  	 * OK, we're below the kswapd watermark and have kicked background
  	 * reclaim. Now things get more complex, so set up alloc_flags according
  	 * to how we want to proceed.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2319
  	 */
341ce06f6   Peter Zijlstra   page allocator: c...
2320
  	alloc_flags = gfp_to_alloc_flags(gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2321

f33261d75   David Rientjes   mm: fix deferred ...
2322
2323
2324
2325
2326
2327
2328
  	/*
  	 * Find the true preferred zone if the allocation is unconstrained by
  	 * cpusets.
  	 */
  	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
  		first_zones_zonelist(zonelist, high_zoneidx, NULL,
  					&preferred_zone);
cfa54a0fc   Andrew Barry   mm/page_alloc.c: ...
2329
  rebalance:
341ce06f6   Peter Zijlstra   page allocator: c...
2330
  	/* This is the last chance, in general, before the goto nopage. */
19770b326   Mel Gorman   mm: filter based ...
2331
  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
341ce06f6   Peter Zijlstra   page allocator: c...
2332
2333
  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
  			preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2334
2335
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2336

11e33f6a5   Mel Gorman   page allocator: b...
2337
  	/* Allocate without watermarks if the context allows */
341ce06f6   Peter Zijlstra   page allocator: c...
2338
  	if (alloc_flags & ALLOC_NO_WATERMARKS) {
183f6371a   Mel Gorman   mm: ignore mempol...
2339
2340
2341
2342
2343
2344
  		/*
  		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
  		 * the allocation is high priority and these type of
  		 * allocations are system rather than user orientated
  		 */
  		zonelist = node_zonelist(numa_node_id(), gfp_mask);
341ce06f6   Peter Zijlstra   page allocator: c...
2345
2346
2347
  		page = __alloc_pages_high_priority(gfp_mask, order,
  				zonelist, high_zoneidx, nodemask,
  				preferred_zone, migratetype);
cfd19c5a9   Mel Gorman   mm: only set page...
2348
  		if (page) {
341ce06f6   Peter Zijlstra   page allocator: c...
2349
  			goto got_pg;
cfd19c5a9   Mel Gorman   mm: only set page...
2350
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2351
2352
2353
2354
2355
  	}
  
  	/* Atomic allocations - we can't balance anything */
  	if (!wait)
  		goto nopage;
341ce06f6   Peter Zijlstra   page allocator: c...
2356
  	/* Avoid recursion of direct reclaim */
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2357
  	if (current->flags & PF_MEMALLOC)
341ce06f6   Peter Zijlstra   page allocator: c...
2358
  		goto nopage;
6583bb64f   David Rientjes   mm: avoid endless...
2359
2360
2361
  	/* Avoid allocations with no watermarks from looping endlessly */
  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  		goto nopage;
77f1fe6b0   Mel Gorman   mm: migration: al...
2362
2363
2364
2365
  	/*
  	 * Try direct compaction. The first pass is asynchronous. Subsequent
  	 * attempts after direct reclaim are synchronous
  	 */
56de7263f   Mel Gorman   mm: compaction: d...
2366
2367
2368
2369
  	page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2370
  					migratetype, sync_migration,
c67fe3752   Mel Gorman   mm: compaction: A...
2371
  					&contended_compaction,
66199712e   Mel Gorman   mm: page allocato...
2372
2373
  					&deferred_compaction,
  					&did_some_progress);
56de7263f   Mel Gorman   mm: compaction: d...
2374
2375
  	if (page)
  		goto got_pg;
c6a140bf1   Andrea Arcangeli   mm/compaction: re...
2376
  	sync_migration = true;
56de7263f   Mel Gorman   mm: compaction: d...
2377

31f8d42d4   Linus Torvalds   Revert "mm: avoid...
2378
2379
2380
2381
2382
2383
2384
  	/*
  	 * If compaction is deferred for high-order allocations, it is because
	 * sync compaction recently failed. If this is the case and the caller
  	 * requested a movable allocation that does not heavily disrupt the
  	 * system then fail the allocation instead of entering direct reclaim.
  	 */
  	if ((deferred_compaction || contended_compaction) &&
caf491916   Linus Torvalds   Revert "revert "R...
2385
  						(gfp_mask & __GFP_NO_KSWAPD))
31f8d42d4   Linus Torvalds   Revert "mm: avoid...
2386
  		goto nopage;
66199712e   Mel Gorman   mm: page allocato...
2387

11e33f6a5   Mel Gorman   page allocator: b...
2388
2389
2390
2391
  	/* Try direct reclaim and then allocating */
  	page = __alloc_pages_direct_reclaim(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
5117f45d1   Mel Gorman   page allocator: c...
2392
  					alloc_flags, preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
2393
  					migratetype, &did_some_progress);
11e33f6a5   Mel Gorman   page allocator: b...
2394
2395
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2396

e33c3b5e1   David Rientjes   cpusets: update m...
2397
  	/*
11e33f6a5   Mel Gorman   page allocator: b...
2398
2399
  	 * If we failed to make any progress reclaiming, then we are
  	 * running out of options and have to consider going OOM
e33c3b5e1   David Rientjes   cpusets: update m...
2400
  	 */
11e33f6a5   Mel Gorman   page allocator: b...
2401
2402
  	if (!did_some_progress) {
  		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
2403
2404
  			if (oom_killer_disabled)
  				goto nopage;
29fd66d28   David Rientjes   mm, coredump: fai...
2405
2406
2407
2408
  			/* Coredumps can quickly deplete all memory reserves */
  			if ((current->flags & PF_DUMPCORE) &&
  			    !(gfp_mask & __GFP_NOFAIL))
  				goto nopage;
11e33f6a5   Mel Gorman   page allocator: b...
2409
2410
  			page = __alloc_pages_may_oom(gfp_mask, order,
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2411
2412
  					nodemask, preferred_zone,
  					migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2413
2414
  			if (page)
  				goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2415

03668b3ce   David Rientjes   oom: avoid oom ki...
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
  			if (!(gfp_mask & __GFP_NOFAIL)) {
  				/*
  				 * The oom killer is not called for high-order
  				 * allocations that may fail, so if no progress
  				 * is being made, there are no other options and
  				 * retrying is unlikely to help.
  				 */
  				if (order > PAGE_ALLOC_COSTLY_ORDER)
  					goto nopage;
  				/*
  				 * The oom killer is not called for lowmem
  				 * allocations to prevent needlessly killing
  				 * innocent tasks.
  				 */
  				if (high_zoneidx < ZONE_NORMAL)
  					goto nopage;
  			}
e2c55dc87   Mel Gorman   Drain per-cpu lis...
2433

ff0ceb9de   David Rientjes   oom: serialize ou...
2434
2435
  			goto restart;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2436
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2437
  	/* Check if we should retry the allocation */
a41f24ea9   Nishanth Aravamudan   page allocator: s...
2438
  	pages_reclaimed += did_some_progress;
f90ac3982   Mel Gorman   mm: avoid liveloc...
2439
2440
  	if (should_alloc_retry(gfp_mask, order, did_some_progress,
  						pages_reclaimed)) {
11e33f6a5   Mel Gorman   page allocator: b...
2441
  		/* Wait for some write requests to complete then retry */
0e093d997   Mel Gorman   writeback: do not...
2442
  		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2443
  		goto rebalance;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
  	} else {
  		/*
		 * High-order allocations do not necessarily loop after
		 * direct reclaim, and reclaim/compaction depends on compaction
		 * being called after reclaim, so call it directly if necessary.
  		 */
  		page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2454
  					migratetype, sync_migration,
c67fe3752   Mel Gorman   mm: compaction: A...
2455
  					&contended_compaction,
66199712e   Mel Gorman   mm: page allocato...
2456
2457
  					&deferred_compaction,
  					&did_some_progress);
3e7d34497   Mel Gorman   mm: vmscan: recla...
2458
2459
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2460
2461
2462
  	}
  
  nopage:
a238ab5b0   Dave Hansen   mm: break out pag...
2463
  	warn_alloc_failed(gfp_mask, order, NULL);
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2464
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2465
  got_pg:
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2466
2467
  	if (kmemcheck_enabled)
  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
11e33f6a5   Mel Gorman   page allocator: b...
2468

072bb0aa5   Mel Gorman   mm: sl[au]b: add ...
2469
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2470
  }
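/*
 * Overall order of the slowpath above: wake kswapd, retry with relaxed
 * alloc_flags, try ignoring watermarks if the context allows, try async
 * compaction, then direct reclaim, then the OOM killer, and finally
 * either loop back (should_alloc_retry) or make one last attempt at
 * sync compaction.
 */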
11e33f6a5   Mel Gorman   page allocator: b...
2471
2472
2473
2474
2475
2476
2477
2478
2479
  
  /*
   * This is the 'heart' of the zoned buddy allocator.
   */
  struct page *
  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  			struct zonelist *zonelist, nodemask_t *nodemask)
  {
  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5117f45d1   Mel Gorman   page allocator: c...
2480
  	struct zone *preferred_zone;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2481
  	struct page *page = NULL;
3dd282669   Mel Gorman   page allocator: c...
2482
  	int migratetype = allocflags_to_migratetype(gfp_mask);
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2483
  	unsigned int cpuset_mems_cookie;
d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
2484
  	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
6a1a0d3b6   Glauber Costa   mm: allocate kern...
2485
  	struct mem_cgroup *memcg = NULL;
11e33f6a5   Mel Gorman   page allocator: b...
2486

dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
2487
  	gfp_mask &= gfp_allowed_mask;
11e33f6a5   Mel Gorman   page allocator: b...
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
  	lockdep_trace_alloc(gfp_mask);
  
  	might_sleep_if(gfp_mask & __GFP_WAIT);
  
  	if (should_fail_alloc_page(gfp_mask, order))
  		return NULL;
  
  	/*
  	 * Check the zones suitable for the gfp_mask contain at least one
  	 * valid zone. It's possible to have an empty zonelist as a result
  	 * of GFP_THISNODE and a memoryless node
  	 */
  	if (unlikely(!zonelist->_zonerefs->zone))
  		return NULL;
6a1a0d3b6   Glauber Costa   mm: allocate kern...
2502
2503
2504
2505
2506
2507
  	/*
  	 * Will only have any effect when __GFP_KMEMCG is set.  This is
  	 * verified in the (always inline) callee
  	 */
  	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
  		return NULL;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2508
2509
  retry_cpuset:
  	cpuset_mems_cookie = get_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2510
  	/* The preferred zone is used for statistics later */
f33261d75   David Rientjes   mm: fix deferred ...
2511
2512
2513
  	first_zones_zonelist(zonelist, high_zoneidx,
  				nodemask ? : &cpuset_current_mems_allowed,
  				&preferred_zone);
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2514
2515
  	if (!preferred_zone)
  		goto out;
5117f45d1   Mel Gorman   page allocator: c...
2516

d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
2517
2518
2519
2520
  #ifdef CONFIG_CMA
  	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
  		alloc_flags |= ALLOC_CMA;
  #endif
5117f45d1   Mel Gorman   page allocator: c...
2521
  	/* First allocation attempt */
11e33f6a5   Mel Gorman   page allocator: b...
2522
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
d95ea5d18   Bartlomiej Zolnierkiewicz   cma: fix watermar...
2523
  			zonelist, high_zoneidx, alloc_flags,
3dd282669   Mel Gorman   page allocator: c...
2524
  			preferred_zone, migratetype);
21caf2fc1   Ming Lei   mm: teach mm by c...
2525
2526
2527
2528
2529
2530
2531
  	if (unlikely(!page)) {
  		/*
  		 * Runtime PM, block IO and its error handling path
  		 * can deadlock because I/O on the device might not
  		 * complete.
  		 */
  		gfp_mask = memalloc_noio_flags(gfp_mask);
11e33f6a5   Mel Gorman   page allocator: b...
2532
  		page = __alloc_pages_slowpath(gfp_mask, order,
5117f45d1   Mel Gorman   page allocator: c...
2533
  				zonelist, high_zoneidx, nodemask,
3dd282669   Mel Gorman   page allocator: c...
2534
  				preferred_zone, migratetype);
21caf2fc1   Ming Lei   mm: teach mm by c...
2535
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2536

4b4f278c0   Mel Gorman   tracing, page-all...
2537
  	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
  
  out:
  	/*
  	 * When updating a task's mems_allowed, it is possible to race with
  	 * parallel threads in such a way that an allocation can fail while
  	 * the mask is being updated. If a page allocation is about to fail,
  	 * check if the cpuset changed during allocation and if so, retry.
  	 */
  	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
  		goto retry_cpuset;
6a1a0d3b6   Glauber Costa   mm: allocate kern...
2548
  	memcg_kmem_commit_charge(page, memcg, order);
11e33f6a5   Mel Gorman   page allocator: b...
2549
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2550
  }
d239171e4   Mel Gorman   page allocator: r...
2551
  EXPORT_SYMBOL(__alloc_pages_nodemask);
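/*
 * Every public page allocation interface -- alloc_pages(),
 * alloc_pages_node(), __get_free_pages() and friends -- eventually
 * funnels into __alloc_pages_nodemask() above.
 */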
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2552
2553
2554
2555
  
  /*
   * Common helper functions.
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
2556
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2557
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2558
2559
2560
2561
2562
2563
2564
  	struct page *page;
  
  	/*
  	 * __get_free_pages() returns a 32-bit address, which cannot represent
  	 * a highmem page
  	 */
  	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2565
2566
2567
2568
2569
  	page = alloc_pages(gfp_mask, order);
  	if (!page)
  		return 0;
  	return (unsigned long) page_address(page);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2570
  EXPORT_SYMBOL(__get_free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2571
  unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2572
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2573
  	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2574
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2575
  EXPORT_SYMBOL(get_zeroed_page);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2576
  void __free_pages(struct page *page, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2577
  {
b5810039a   Nick Piggin   [PATCH] core remo...
2578
  	if (put_page_testzero(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2579
  		if (order == 0)
fc91668ea   Li Hong   mm: remove free_h...
2580
  			free_hot_cold_page(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2581
2582
2583
2584
2585
2586
  		else
  			__free_pages_ok(page, order);
  	}
  }
  
  EXPORT_SYMBOL(__free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2587
  void free_pages(unsigned long addr, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2588
2589
  {
  	if (addr != 0) {
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
2590
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2591
2592
2593
2594
2595
  		__free_pages(virt_to_page((void *)addr), order);
  	}
  }
  
  EXPORT_SYMBOL(free_pages);
6a1a0d3b6   Glauber Costa   mm: allocate kern...
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
  /*
   * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
   * pages allocated with __GFP_KMEMCG.
   *
   * Those pages are accounted to a particular memcg, embedded in the
   * corresponding page_cgroup. To avoid adding a hit in the allocator to search
   * for that information only to find out that it is NULL for users who have no
   * interest in that whatsoever, we provide these functions.
   *
   * The caller knows better which flags it relies on.
   */
  void __free_memcg_kmem_pages(struct page *page, unsigned int order)
  {
  	memcg_kmem_uncharge_pages(page, order);
  	__free_pages(page, order);
  }
  
  void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
  {
  	if (addr != 0) {
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
  		__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
  	}
  }
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
  static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
  {
  	if (addr) {
  		unsigned long alloc_end = addr + (PAGE_SIZE << order);
  		unsigned long used = addr + PAGE_ALIGN(size);
  
  		split_page(virt_to_page((void *)addr), order);
  		while (used < alloc_end) {
  			free_page(used);
  			used += PAGE_SIZE;
  		}
  	}
  	return (void *)addr;
  }
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * This function is similar to alloc_pages(), except that it allocates the
   * minimum number of pages to satisfy the request.  alloc_pages() can only
   * allocate memory in power-of-two pages.
   *
   * This function is also limited by MAX_ORDER.
   *
   * Memory allocated by this function must be released by free_pages_exact().
   */
  void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
  {
  	unsigned int order = get_order(size);
  	unsigned long addr;
  
  	addr = __get_free_pages(gfp_mask, order);
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2653
  	return make_alloc_exact(addr, order, size);
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2654
2655
2656
2657
  }
  EXPORT_SYMBOL(alloc_pages_exact);
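/*
 * Illustrative use (assuming 4K pages): alloc_pages_exact(20 * 1024,
 * GFP_KERNEL) allocates an order-3 block (32K), splits it, and hands the
 * trailing 12K straight back to the buddy allocator, so only the five
 * pages actually needed stay allocated. Pair it with
 * free_pages_exact(ptr, 20 * 1024) to release the buffer.
 */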
  
  /**
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2658
2659
   * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
   *			   pages on a node.
b5e6ab589   Randy Dunlap   mm: fix kernel-do...
2660
   * @nid: the preferred node ID where memory should be allocated
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * Like alloc_pages_exact(), but try to allocate on node nid first before falling
   * back.
   * Note this is not alloc_pages_exact_node() which allocates on a specific node,
   * but is not exact.
   */
  void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
  	unsigned order = get_order(size);
  	struct page *p = alloc_pages_node(nid, gfp_mask, order);
  	if (!p)
  		return NULL;
  	return make_alloc_exact((unsigned long)page_address(p), order, size);
  }
  EXPORT_SYMBOL(alloc_pages_exact_nid);
  
  /**
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
   * @size: size of allocation, same value as passed to alloc_pages_exact().
   *
   * Release the memory allocated by a previous call to alloc_pages_exact.
   */
  void free_pages_exact(void *virt, size_t size)
  {
  	unsigned long addr = (unsigned long)virt;
  	unsigned long end = addr + PAGE_ALIGN(size);
  
  	while (addr < end) {
  		free_page(addr);
  		addr += PAGE_SIZE;
  	}
  }
  EXPORT_SYMBOL(free_pages_exact);
e0fb58152   Zhang Yanfei   mm: accurately do...
2697
2698
2699
2700
2701
2702
2703
2704
2705
  /**
   * nr_free_zone_pages - count number of pages beyond high watermark
   * @offset: The zone index of the highest zone
   *
 * nr_free_zone_pages() counts the number of pages which are beyond the
   * high watermark within all zones at or below a given zone index.  For each
   * zone, the number of pages is calculated as:
 *     managed_pages - high_pages
   */
ebec3862f   Zhang Yanfei   mm: fix return ty...
2706
  static unsigned long nr_free_zone_pages(int offset)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2707
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2708
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2709
  	struct zone *zone;
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2710
  	/* Just pick one node, since fallback list is circular */
ebec3862f   Zhang Yanfei   mm: fix return ty...
2711
  	unsigned long sum = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2712

0e88460da   Mel Gorman   mm: introduce nod...
2713
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2714

54a6eb5c4   Mel Gorman   mm: use two zonel...
2715
  	for_each_zone_zonelist(zone, z, zonelist, offset) {
b40da0494   Jiang Liu   mm: use zone->pre...
2716
  		unsigned long size = zone->managed_pages;
418589663   Mel Gorman   page allocator: u...
2717
  		unsigned long high = high_wmark_pages(zone);
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2718
2719
  		if (size > high)
  			sum += size - high;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2720
2721
2722
2723
  	}
  
  	return sum;
  }
e0fb58152   Zhang Yanfei   mm: accurately do...
2724
2725
2726
2727
2728
  /**
   * nr_free_buffer_pages - count number of pages beyond high watermark
   *
   * nr_free_buffer_pages() counts the number of pages which are beyond the high
   * watermark within ZONE_DMA and ZONE_NORMAL.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2729
   */
ebec3862f   Zhang Yanfei   mm: fix return ty...
2730
  unsigned long nr_free_buffer_pages(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2731
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
2732
  	return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2733
  }
c2f1a551d   Meelap Shah   knfsd: nfsd4: var...
2734
  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2735

e0fb58152   Zhang Yanfei   mm: accurately do...
2736
2737
2738
2739
2740
  /**
   * nr_free_pagecache_pages - count number of pages beyond high watermark
   *
   * nr_free_pagecache_pages() counts the number of pages which are beyond the
   * high watermark within all zones.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2741
   */
ebec3862f   Zhang Yanfei   mm: fix return ty...
2742
  unsigned long nr_free_pagecache_pages(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2743
  {
2a1e274ac   Mel Gorman   Create the ZONE_M...
2744
  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2745
  }
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2746
2747
  
  static inline void show_node(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2748
  {
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
2749
  	if (IS_ENABLED(CONFIG_NUMA))
25ba77c14   Andy Whitcroft   [PATCH] numa node...
2750
  		printk("Node %d ", zone_to_nid(zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2751
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2752

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2753
2754
2755
2756
  void si_meminfo(struct sysinfo *val)
  {
  	val->totalram = totalram_pages;
  	val->sharedram = 0;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2757
  	val->freeram = global_page_state(NR_FREE_PAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2758
  	val->bufferram = nr_blockdev_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2759
2760
  	val->totalhigh = totalhigh_pages;
  	val->freehigh = nr_free_highpages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
  	val->mem_unit = PAGE_SIZE;
  }
  
  EXPORT_SYMBOL(si_meminfo);
  
  #ifdef CONFIG_NUMA
  void si_meminfo_node(struct sysinfo *val, int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  
  	val->totalram = pgdat->node_present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2772
  	val->freeram = node_page_state(nid, NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2773
  #ifdef CONFIG_HIGHMEM
b40da0494   Jiang Liu   mm: use zone->pre...
2774
  	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2775
2776
  	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
  			NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2777
2778
2779
2780
  #else
  	val->totalhigh = 0;
  	val->freehigh = 0;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2781
2782
2783
  	val->mem_unit = PAGE_SIZE;
  }
  #endif
ddd588b5d   David Rientjes   oom: suppress nod...
2784
  /*
7bf02ea22   David Rientjes   arch, mm: filter ...
2785
2786
   * Determine whether the node should be displayed or not, depending on whether
   * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
ddd588b5d   David Rientjes   oom: suppress nod...
2787
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2788
  bool skip_free_areas_node(unsigned int flags, int nid)
ddd588b5d   David Rientjes   oom: suppress nod...
2789
2790
  {
  	bool ret = false;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2791
  	unsigned int cpuset_mems_cookie;
ddd588b5d   David Rientjes   oom: suppress nod...
2792
2793
2794
  
  	if (!(flags & SHOW_MEM_FILTER_NODES))
  		goto out;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2795
2796
2797
2798
  	do {
  		cpuset_mems_cookie = get_mems_allowed();
  		ret = !node_isset(nid, cpuset_current_mems_allowed);
  	} while (!put_mems_allowed(cpuset_mems_cookie));
ddd588b5d   David Rientjes   oom: suppress nod...
2799
2800
2801
  out:
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2802
  #define K(x) ((x) << (PAGE_SHIFT-10))
377e4f167   Rabin Vincent   mm: show migratio...
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
  static void show_migration_types(unsigned char type)
  {
  	static const char types[MIGRATE_TYPES] = {
  		[MIGRATE_UNMOVABLE]	= 'U',
  		[MIGRATE_RECLAIMABLE]	= 'E',
  		[MIGRATE_MOVABLE]	= 'M',
  		[MIGRATE_RESERVE]	= 'R',
  #ifdef CONFIG_CMA
  		[MIGRATE_CMA]		= 'C',
  #endif
194159fbc   Minchan Kim   mm: remove MIGRAT...
2813
  #ifdef CONFIG_MEMORY_ISOLATION
377e4f167   Rabin Vincent   mm: show migratio...
2814
  		[MIGRATE_ISOLATE]	= 'I',
194159fbc   Minchan Kim   mm: remove MIGRAT...
2815
  #endif
377e4f167   Rabin Vincent   mm: show migratio...
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
  	};
  	char tmp[MIGRATE_TYPES + 1];
  	char *p = tmp;
  	int i;
  
  	for (i = 0; i < MIGRATE_TYPES; i++) {
  		if (type & (1 << i))
  			*p++ = types[i];
  	}
  
  	*p = '\0';
  	printk("(%s) ", tmp);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2829
2830
2831
2832
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
ddd588b5d   David Rientjes   oom: suppress nod...
2833
2834
   * Suppresses nodes that are not allowed by current's cpuset if
   * SHOW_MEM_FILTER_NODES is passed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2835
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2836
  void show_free_areas(unsigned int filter)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2837
  {
c72419138   Jes Sorensen   [PATCH] Condense ...
2838
  	int cpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2839
  	struct zone *zone;
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2840
  	for_each_populated_zone(zone) {
7bf02ea22   David Rientjes   arch, mm: filter ...
2841
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2842
  			continue;
c72419138   Jes Sorensen   [PATCH] Condense ...
2843
2844
2845
  		show_node(zone);
  		printk("%s per-cpu:
  ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2846

6b482c677   Dave Jones   [PATCH] Don't pri...
2847
  		for_each_online_cpu(cpu) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2848
  			struct per_cpu_pageset *pageset;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2849
  			pageset = per_cpu_ptr(zone->pageset, cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2850

3dfa5721f   Christoph Lameter   Page allocator: g...
2851
2852
2853
2854
  			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d
  ",
  			       cpu, pageset->pcp.high,
  			       pageset->pcp.batch, pageset->pcp.count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2855
2856
  		}
  	}
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2857
2858
2859
2860
  	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu
  "
  		" active_file:%lu inactive_file:%lu isolated_file:%lu
  "
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2861
  		" unevictable:%lu"
b76146ed1   Andrew Morton   revert "mm: oom a...
2862
2863
  		" dirty:%lu writeback:%lu unstable:%lu
  "
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2864
2865
  		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu
  "
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
2866
2867
2868
2869
  		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu
  "
  		" free_cma:%lu
  ",
4f98a2fee   Rik van Riel   vmscan: split LRU...
2870
  		global_page_state(NR_ACTIVE_ANON),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2871
  		global_page_state(NR_INACTIVE_ANON),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2872
2873
  		global_page_state(NR_ISOLATED_ANON),
  		global_page_state(NR_ACTIVE_FILE),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2874
  		global_page_state(NR_INACTIVE_FILE),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2875
  		global_page_state(NR_ISOLATED_FILE),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2876
  		global_page_state(NR_UNEVICTABLE),
b1e7a8fd8   Christoph Lameter   [PATCH] zoned vm ...
2877
  		global_page_state(NR_FILE_DIRTY),
ce866b34a   Christoph Lameter   [PATCH] zoned vm ...
2878
  		global_page_state(NR_WRITEBACK),
fd39fc856   Christoph Lameter   [PATCH] zoned vm ...
2879
  		global_page_state(NR_UNSTABLE_NFS),
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2880
  		global_page_state(NR_FREE_PAGES),
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2881
2882
  		global_page_state(NR_SLAB_RECLAIMABLE),
  		global_page_state(NR_SLAB_UNRECLAIMABLE),
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
2883
  		global_page_state(NR_FILE_MAPPED),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2884
  		global_page_state(NR_SHMEM),
a25700a53   Andrew Morton   [PATCH] mm: show ...
2885
  		global_page_state(NR_PAGETABLE),
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
2886
2887
  		global_page_state(NR_BOUNCE),
  		global_page_state(NR_FREE_CMA_PAGES));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2888

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2889
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2890
  		int i;
7bf02ea22   David Rientjes   arch, mm: filter ...
2891
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2892
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2893
2894
2895
2896
2897
2898
  		show_node(zone);
  		printk("%s"
  			" free:%lukB"
  			" min:%lukB"
  			" low:%lukB"
  			" high:%lukB"
4f98a2fee   Rik van Riel   vmscan: split LRU...
2899
2900
2901
2902
  			" active_anon:%lukB"
  			" inactive_anon:%lukB"
  			" active_file:%lukB"
  			" inactive_file:%lukB"
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2903
  			" unevictable:%lukB"
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2904
2905
  			" isolated(anon):%lukB"
  			" isolated(file):%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2906
  			" present:%lukB"
9feedc9d8   Jiang Liu   mm: introduce new...
2907
  			" managed:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2908
2909
2910
2911
  			" mlocked:%lukB"
  			" dirty:%lukB"
  			" writeback:%lukB"
  			" mapped:%lukB"
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2912
  			" shmem:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2913
2914
  			" slab_reclaimable:%lukB"
  			" slab_unreclaimable:%lukB"
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2915
  			" kernel_stack:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2916
2917
2918
  			" pagetables:%lukB"
  			" unstable:%lukB"
  			" bounce:%lukB"
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
2919
  			" free_cma:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2920
  			" writeback_tmp:%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2921
2922
2923
2924
2925
  			" pages_scanned:%lu"
  			" all_unreclaimable? %s"
  			"
  ",
  			zone->name,
88f5acf88   Mel Gorman   mm: page allocato...
2926
  			K(zone_page_state(zone, NR_FREE_PAGES)),
418589663   Mel Gorman   page allocator: u...
2927
2928
2929
  			K(min_wmark_pages(zone)),
  			K(low_wmark_pages(zone)),
  			K(high_wmark_pages(zone)),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2930
2931
2932
2933
  			K(zone_page_state(zone, NR_ACTIVE_ANON)),
  			K(zone_page_state(zone, NR_INACTIVE_ANON)),
  			K(zone_page_state(zone, NR_ACTIVE_FILE)),
  			K(zone_page_state(zone, NR_INACTIVE_FILE)),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2934
  			K(zone_page_state(zone, NR_UNEVICTABLE)),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2935
2936
  			K(zone_page_state(zone, NR_ISOLATED_ANON)),
  			K(zone_page_state(zone, NR_ISOLATED_FILE)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2937
  			K(zone->present_pages),
9feedc9d8   Jiang Liu   mm: introduce new...
2938
  			K(zone->managed_pages),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2939
2940
2941
2942
  			K(zone_page_state(zone, NR_MLOCK)),
  			K(zone_page_state(zone, NR_FILE_DIRTY)),
  			K(zone_page_state(zone, NR_WRITEBACK)),
  			K(zone_page_state(zone, NR_FILE_MAPPED)),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2943
  			K(zone_page_state(zone, NR_SHMEM)),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2944
2945
  			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
  			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2946
2947
  			zone_page_state(zone, NR_KERNEL_STACK) *
  				THREAD_SIZE / 1024,
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2948
2949
2950
  			K(zone_page_state(zone, NR_PAGETABLE)),
  			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
  			K(zone_page_state(zone, NR_BOUNCE)),
d1ce749a0   Bartlomiej Zolnierkiewicz   cma: count free C...
2951
  			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2952
  			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2953
  			zone->pages_scanned,
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
2954
  			(zone->all_unreclaimable ? "yes" : "no")
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2955
2956
2957
2958
2959
2960
2961
  			);
  		printk("lowmem_reserve[]:");
  		for (i = 0; i < MAX_NR_ZONES; i++)
  			printk(" %lu", zone->lowmem_reserve[i]);
  		printk("
  ");
  	}
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2962
  	for_each_populated_zone(zone) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2963
		unsigned long nr[MAX_ORDER], flags, order, total = 0;
377e4f167   Rabin Vincent   mm: show migratio...
2964
  		unsigned char types[MAX_ORDER];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2965

7bf02ea22   David Rientjes   arch, mm: filter ...
2966
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2967
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2968
2969
  		show_node(zone);
  		printk("%s: ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2970
2971
2972
  
  		spin_lock_irqsave(&zone->lock, flags);
  		for (order = 0; order < MAX_ORDER; order++) {
377e4f167   Rabin Vincent   mm: show migratio...
2973
2974
2975
2976
  			struct free_area *area = &zone->free_area[order];
  			int type;
  
  			nr[order] = area->nr_free;
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2977
  			total += nr[order] << order;
377e4f167   Rabin Vincent   mm: show migratio...
2978
2979
2980
2981
2982
2983
  
  			types[order] = 0;
  			for (type = 0; type < MIGRATE_TYPES; type++) {
  				if (!list_empty(&area->free_list[type]))
  					types[order] |= 1 << type;
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2984
2985
  		}
  		spin_unlock_irqrestore(&zone->lock, flags);
377e4f167   Rabin Vincent   mm: show migratio...
2986
  		for (order = 0; order < MAX_ORDER; order++) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2987
  			printk("%lu*%lukB ", nr[order], K(1UL) << order);
377e4f167   Rabin Vincent   mm: show migratio...
2988
2989
2990
  			if (nr[order])
  				show_migration_types(types[order]);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2991
2992
2993
  		printk("= %lukB
  ", K(total));
  	}
949f7ec57   David Rientjes   mm, hugetlb: incl...
2994
  	hugetlb_show_meminfo();
e6f3602d2   Larry Woodman   Include count of ...
2995
2996
  	printk("%ld total pagecache pages
  ", global_page_state(NR_FILE_PAGES));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2997
2998
  	show_swap_cache_info();
  }
19770b326   Mel Gorman   mm: filter based ...
2999
3000
3001
3002
3003
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
  	zoneref->zone = zone;
  	zoneref->zone_idx = zone_idx(zone);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3004
3005
  /*
   * Builds allocation fallback zone lists.
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
3006
3007
   *
   * Add all populated zones of a node to the zonelist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3008
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3009
3010
  static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
  				int nr_zones, enum zone_type zone_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3011
  {
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
3012
  	struct zone *zone;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
3013
  	BUG_ON(zone_type >= MAX_NR_ZONES);
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
3014
  	zone_type++;
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
3015
3016
  
  	do {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
3017
  		zone_type--;
070f80326   Christoph Lameter   [PATCH] build_zon...
3018
  		zone = pgdat->node_zones + zone_type;
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
3019
  		if (populated_zone(zone)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
3020
3021
  			zoneref_set_zone(zone,
  				&zonelist->_zonerefs[nr_zones++]);
070f80326   Christoph Lameter   [PATCH] build_zon...
3022
  			check_highest_zone(zone_type);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3023
  		}
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
3024

2f6726e54   Christoph Lameter   [PATCH] Apply typ...
3025
  	} while (zone_type);
070f80326   Christoph Lameter   [PATCH] build_zon...
3026
  	return nr_zones;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3027
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
  
  /*
   *  zonelist_order:
   *  0 = automatic detection of better ordering.
   *  1 = order by ([node] distance, -zonetype)
   *  2 = order by (-zonetype, [node] distance)
   *
   *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
   *  the same zonelist. So only NUMA can configure this param.
   */
  #define ZONELIST_ORDER_DEFAULT  0
  #define ZONELIST_ORDER_NODE     1
  #define ZONELIST_ORDER_ZONE     2
  
  /* zonelist order in the kernel.
   * set_zonelist_order() will set this to NODE or ZONE.
   */
  static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
  static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
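  /*
   * Rough illustration, assuming a hypothetical two-node box where each node
   * has both a Normal and a DMA32 zone:
   *
   *   Node order: node0-Normal, node0-DMA32, node1-Normal, node1-DMA32
   *   Zone order: node0-Normal, node1-Normal, node0-DMA32, node1-DMA32
   *
   * Node order keeps allocations local but may exhaust the local low zone;
   * zone order preserves low zones at the cost of going off-node sooner.
   */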
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3047
  #ifdef CONFIG_NUMA
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
  /* The value the user specified, possibly changed by config */
  static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  /* string for sysctl */
  #define NUMA_ZONELIST_ORDER_LEN	16
  char numa_zonelist_order[16] = "default";
  
  /*
   * Interface for configuring zonelist ordering.
   * Command line option "numa_zonelist_order"
   *	= "[dD]efault"	- default, automatic configuration.
   *	= "[nN]ode"	- order by node locality, then by zone within node
   *	= "[zZ]one"	- order by zone, then by locality within zone
   */
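  /*
   * For example (assuming a typical NUMA config), zone ordering can be forced
   * with "numa_zonelist_order=zone" on the kernel command line, or at runtime
   * via /proc/sys/vm/numa_zonelist_order.
   */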
  
  static int __parse_numa_zonelist_order(char *s)
  {
  	if (*s == 'd' || *s == 'D') {
  		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  	} else if (*s == 'n' || *s == 'N') {
  		user_zonelist_order = ZONELIST_ORDER_NODE;
  	} else if (*s == 'z' || *s == 'Z') {
  		user_zonelist_order = ZONELIST_ORDER_ZONE;
  	} else {
  		printk(KERN_WARNING
  			"Ignoring invalid numa_zonelist_order value:  "
  			"%s
  ", s);
  		return -EINVAL;
  	}
  	return 0;
  }
  
  static __init int setup_numa_zonelist_order(char *s)
  {
ecb256f81   Volodymyr G. Lukiianyk   mm: set correct n...
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
  	int ret;
  
  	if (!s)
  		return 0;
  
  	ret = __parse_numa_zonelist_order(s);
  	if (ret == 0)
  		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
  
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3092
3093
3094
3095
3096
3097
3098
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
  /*
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
3099
  		void __user *buffer, size_t *length,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3100
3101
3102
3103
  		loff_t *ppos)
  {
  	char saved_string[NUMA_ZONELIST_ORDER_LEN];
  	int ret;
443c6f145   Andi Kleen   SYSCTL: Add a mut...
3104
  	static DEFINE_MUTEX(zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3105

443c6f145   Andi Kleen   SYSCTL: Add a mut...
3106
  	mutex_lock(&zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3107
  	if (write)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
3108
  		strcpy(saved_string, (char*)table->data);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
3109
  	ret = proc_dostring(table, write, buffer, length, ppos);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3110
  	if (ret)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
3111
  		goto out;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3112
3113
3114
3115
3116
3117
3118
3119
3120
  	if (write) {
  		int oldval = user_zonelist_order;
  		if (__parse_numa_zonelist_order((char*)table->data)) {
  			/*
  			 * bogus value.  restore saved string
  			 */
  			strncpy((char*)table->data, saved_string,
  				NUMA_ZONELIST_ORDER_LEN);
  			user_zonelist_order = oldval;
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3121
3122
  		} else if (oldval != user_zonelist_order) {
  			mutex_lock(&zonelists_mutex);
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3123
  			build_all_zonelists(NULL, NULL);
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3124
3125
  			mutex_unlock(&zonelists_mutex);
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3126
  	}
443c6f145   Andi Kleen   SYSCTL: Add a mut...
3127
3128
3129
  out:
  	mutex_unlock(&zl_order_mutex);
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3130
  }
62bc62a87   Christoph Lameter   page allocator: u...
3131
  #define MAX_NODE_LOAD (nr_online_nodes)
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3132
  static int node_load[MAX_NUMNODES];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3133
  /**
4dc3b16ba   Pavel Pisa   [PATCH] DocBook: ...
3134
   * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
   * @node: node whose fallback list we're appending
   * @used_node_mask: nodemask_t of already used nodes
   *
   * We use a number of factors to determine which is the next node that should
   * appear on a given node's fallback list.  The node should not have appeared
   * already in @node's fallback list, and it should be the next closest node
   * according to the distance array (which contains arbitrary distance values
   * from each node to each node in the system), and should also prefer nodes
   * with no CPUs, since presumably they'll have very little allocation pressure
   * on them otherwise.
   * It returns -1 if no node is found.
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3147
  static int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3148
  {
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
3149
  	int n, val;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3150
  	int min_val = INT_MAX;
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
3151
  	int best_node = NUMA_NO_NODE;
a70f73028   Rusty Russell   cpumask: replace ...
3152
  	const struct cpumask *tmp = cpumask_of_node(0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3153

4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
3154
3155
3156
3157
3158
  	/* Use the local node if we haven't already */
  	if (!node_isset(node, *used_node_mask)) {
  		node_set(node, *used_node_mask);
  		return node;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3159

4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
3160
  	for_each_node_state(n, N_MEMORY) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3161
3162
3163
3164
  
  		/* Don't want a node to appear more than once */
  		if (node_isset(n, *used_node_mask))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3165
3166
  		/* Use the distance array to find the distance */
  		val = node_distance(node, n);
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
3167
3168
  		/* Penalize nodes under us ("prefer the next node") */
  		val += (n < node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3169
  		/* Give preference to headless and unused nodes */
a70f73028   Rusty Russell   cpumask: replace ...
3170
3171
  		tmp = cpumask_of_node(n);
  		if (!cpumask_empty(tmp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
  		val += node_load[n];
  
  		if (val < min_val) {
  			min_val = val;
  			best_node = n;
  		}
  	}
  
  	if (best_node >= 0)
  		node_set(best_node, *used_node_mask);
  
  	return best_node;
  }
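  /*
   * Sketch of the scoring above, for a made-up multi-node system: a candidate
   * node's value starts at node_distance(node, n), gets +1 if it is numbered
   * below @node and a PENALTY_FOR_NODE_WITH_CPUS bump if it has CPUs, and is
   * then scaled by MAX_NODE_LOAD * MAX_NUMNODES before node_load[n] is added,
   * so distance dominates and load only breaks ties.  The lowest value wins.
   */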
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3189
3190
3191
3192
3193
3194
3195
  
  /*
   * Build zonelists ordered by node and zones within node.
   * This results in maximum locality--normal zone overflows into local
   * DMA zone, if any--but risks exhausting DMA zone.
   */
  static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3196
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3197
  	int j;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3198
  	struct zonelist *zonelist;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3199

54a6eb5c4   Mel Gorman   mm: use two zonel...
3200
  	zonelist = &pgdat->node_zonelists[0];
dd1a239f6   Mel Gorman   mm: have zonelist...
3201
  	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
54a6eb5c4   Mel Gorman   mm: use two zonel...
3202
3203
3204
  		;
  	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
3205
3206
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3207
3208
3209
  }
  
  /*
523b94585   Christoph Lameter   Memoryless nodes:...
3210
3211
3212
3213
   * Build gfp_thisnode zonelists
   */
  static void build_thisnode_zonelists(pg_data_t *pgdat)
  {
523b94585   Christoph Lameter   Memoryless nodes:...
3214
3215
  	int j;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
3216
3217
  	zonelist = &pgdat->node_zonelists[1];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
3218
3219
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
523b94585   Christoph Lameter   Memoryless nodes:...
3220
3221
3222
  }
  
  /*
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3223
3224
3225
3226
3227
3228
3229
3230
3231
   * Build zonelists ordered by zone and nodes within zones.
   * This results in conserving DMA zone[s] until all Normal memory is
   * exhausted, but results in overflowing to remote node while memory
   * may still exist in local DMA zone.
   */
  static int node_order[MAX_NUMNODES];
  
  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3232
3233
3234
3235
  	int pos, j, node;
  	int zone_type;		/* needs to be signed */
  	struct zone *z;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
3236
3237
3238
3239
3240
3241
3242
  	zonelist = &pgdat->node_zonelists[0];
  	pos = 0;
  	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
  		for (j = 0; j < nr_nodes; j++) {
  			node = node_order[j];
  			z = &NODE_DATA(node)->node_zones[zone_type];
  			if (populated_zone(z)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
3243
3244
  				zoneref_set_zone(z,
  					&zonelist->_zonerefs[pos++]);
54a6eb5c4   Mel Gorman   mm: use two zonel...
3245
  				check_highest_zone(zone_type);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3246
3247
  			}
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3248
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
3249
3250
  	zonelist->_zonerefs[pos].zone = NULL;
  	zonelist->_zonerefs[pos].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3251
3252
3253
3254
3255
3256
3257
3258
3259
  }
  
  static int default_zonelist_order(void)
  {
  	int nid, zone_type;
  	unsigned long low_kmem_size,total_size;
  	struct zone *z;
  	int average_size;
  	/*
883931612   Thomas Weber   Fix typos in comm...
3260
  	 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3261
3262
  	 * If they are really small and used heavily, the system can fall
  	 * into OOM very easily.
e325c90ff   David Rientjes   mm: default to no...
3263
  	 * This function detects ZONE_DMA/DMA32 size and configures zone order.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
  	 */
  	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
  	low_kmem_size = 0;
  	total_size = 0;
  	for_each_online_node(nid) {
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
e325c90ff   David Rientjes   mm: default to no...
3275
3276
3277
3278
3279
3280
3281
3282
3283
  			} else if (zone_type == ZONE_NORMAL) {
  				/*
  				 * If any node has only lowmem, then node order
  				 * is preferred to allow kernel allocations
  				 * locally; otherwise, they can easily infringe
  				 * on other nodes when there is an abundance of
  				 * lowmem available to allocate from.
  				 */
  				return ZONELIST_ORDER_NODE;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
  			}
  		}
  	}
  	if (!low_kmem_size ||  /* there is no DMA area. */
  	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
  		return ZONELIST_ORDER_NODE;
  	/*
  	 * Look into each node's config.
  	 * If there is a node whose DMA/DMA32 memory occupies a very large
  	 * share of its local memory, NODE_ORDER may be suitable.
  	 */
37b07e416   Lee Schermerhorn   memoryless nodes:...
3295
  	average_size = total_size /
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
3296
  				(nodes_weight(node_states[N_MEMORY]) + 1);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
  	for_each_online_node(nid) {
  		low_kmem_size = 0;
  		total_size = 0;
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
  			}
  		}
  		if (low_kmem_size &&
  		    total_size > average_size && /* ignore small node */
  		    low_kmem_size > total_size * 70/100)
  			return ZONELIST_ORDER_NODE;
  	}
  	return ZONELIST_ORDER_ZONE;
  }
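  /*
   * Illustration of the per-node heuristic above, with made-up numbers: a node
   * with 1GB of memory of which 800MB sits in ZONE_DMA32 exceeds the 70%
   * low-memory threshold, so (provided the node is not below average size) the
   * automatic default picks node ordering, since most of that node's memory is
   * low memory anyway and zone ordering would mostly spill to remote nodes.
   */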
  
  static void set_zonelist_order(void)
  {
  	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
  		current_zonelist_order = default_zonelist_order();
  	else
  		current_zonelist_order = user_zonelist_order;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
  {
  	int j, node, load;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3328
  	nodemask_t used_mask;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3329
3330
3331
  	int local_node, prev_node;
  	struct zonelist *zonelist;
  	int order = current_zonelist_order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3332
3333
  
  	/* initialize zonelists */
523b94585   Christoph Lameter   Memoryless nodes:...
3334
  	for (i = 0; i < MAX_ZONELISTS; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3335
  		zonelist = pgdat->node_zonelists + i;
dd1a239f6   Mel Gorman   mm: have zonelist...
3336
3337
  		zonelist->_zonerefs[0].zone = NULL;
  		zonelist->_zonerefs[0].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3338
3339
3340
3341
  	}
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
62bc62a87   Christoph Lameter   page allocator: u...
3342
  	load = nr_online_nodes;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3343
3344
  	prev_node = local_node;
  	nodes_clear(used_mask);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3345

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3346
3347
  	memset(node_order, 0, sizeof(node_order));
  	j = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3348
3349
3350
3351
3352
3353
  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
  		/*
  		 * We don't want to pressure a particular node.
  		 * So add a penalty to the first node in the same
  		 * distance group to make it round-robin.
  		 */
957f822a0   David Rientjes   mm, numa: reclaim...
3354
3355
  		if (node_distance(local_node, node) !=
  		    node_distance(local_node, prev_node))
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3356
  			node_load[node] = load;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3357
3358
  		prev_node = node;
  		load--;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3359
3360
3361
3362
3363
  		if (order == ZONELIST_ORDER_NODE)
  			build_zonelists_in_node_order(pgdat, node);
  		else
  			node_order[j++] = node;	/* remember order */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3364

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3365
3366
3367
  	if (order == ZONELIST_ORDER_ZONE) {
  		/* calculate node order -- i.e., DMA last! */
  		build_zonelists_in_zone_order(pgdat, j);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3368
  	}
523b94585   Christoph Lameter   Memoryless nodes:...
3369
3370
  
  	build_thisnode_zonelists(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3371
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3372
  /* Construct the zonelist performance cache - see further mmzone.h */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3373
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3374
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
3375
3376
  	struct zonelist *zonelist;
  	struct zonelist_cache *zlc;
dd1a239f6   Mel Gorman   mm: have zonelist...
3377
  	struct zoneref *z;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3378

54a6eb5c4   Mel Gorman   mm: use two zonel...
3379
3380
3381
  	zonelist = &pgdat->node_zonelists[0];
  	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
dd1a239f6   Mel Gorman   mm: have zonelist...
3382
3383
  	for (z = zonelist->_zonerefs; z->zone; z++)
  		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3384
  }
7aac78988   Lee Schermerhorn   numa: introduce n...
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
   * I.e., first node id of first zone in arg node's generic zonelist.
   * Used for initializing percpu 'numa_mem', which is used primarily
   * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
   */
  int local_memory_node(int node)
  {
  	struct zone *zone;
  
  	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
  				   gfp_zone(GFP_KERNEL),
  				   NULL,
  				   &zone);
  	return zone->node;
  }
  #endif
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3403

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3404
  #else	/* CONFIG_NUMA */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3405
3406
3407
3408
3409
3410
  static void set_zonelist_order(void)
  {
  	current_zonelist_order = ZONELIST_ORDER_ZONE;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3411
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
3412
  	int node, local_node;
54a6eb5c4   Mel Gorman   mm: use two zonel...
3413
3414
  	enum zone_type j;
  	struct zonelist *zonelist;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3415
3416
  
  	local_node = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3417

54a6eb5c4   Mel Gorman   mm: use two zonel...
3418
3419
  	zonelist = &pgdat->node_zonelists[0];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3420

54a6eb5c4   Mel Gorman   mm: use two zonel...
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
  	/*
  	 * Now we build the zonelist so that it contains the zones
  	 * of all the other nodes.
  	 * We don't want to pressure a particular node, so when
  	 * building the zones for node N, we make sure that the
  	 * zones coming right after the local ones are those from
  	 * node N+1 (modulo N)
  	 */
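  	/*
  	 * For example, with four online nodes and local_node == 2, the remote
  	 * zones are appended in the order 3, 0, 1.
  	 */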
  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3434
  	}
54a6eb5c4   Mel Gorman   mm: use two zonel...
3435
3436
3437
3438
3439
3440
  	for (node = 0; node < local_node; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
3441
3442
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3443
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3444
  /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3445
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3446
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
3447
  	pgdat->node_zonelists[0].zlcache_ptr = NULL;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3448
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3449
  #endif	/* CONFIG_NUMA */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
  /*
   * Boot pageset table. One per cpu which is going to be used for all
   * zones and all nodes. The parameters will be set in such a way
   * that an item put on a list will immediately be handed over to
   * the buddy list. This is safe since pageset manipulation is done
   * with interrupts disabled.
   *
   * The boot_pagesets must be kept even after bootup is complete for
   * unused processors and/or zones. They do play a role for bootstrapping
   * hotplugged processors.
   *
   * zoneinfo_show() and maybe other functions do
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
   */
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
1f522509c   Haicheng Li   mem-hotplug: avoi...
3467
  static void setup_zone_pageset(struct zone *zone);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3468

4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3469
3470
3471
3472
3473
  /*
   * Global mutex to protect against size modification of zonelists
   * as well as to serialize pageset setup for the new populated zone.
   */
  DEFINE_MUTEX(zonelists_mutex);
9b1a4d383   Rusty Russell   stop_machine: Wea...
3474
  /* Return value is an int, just for stop_machine() */
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
3475
  static int __build_all_zonelists(void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3476
  {
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3477
  	int nid;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3478
  	int cpu;
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3479
  	pg_data_t *self = data;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3480

7f9cfb310   Bo Liu   mm: build_zonelis...
3481
3482
3483
  #ifdef CONFIG_NUMA
  	memset(node_load, 0, sizeof(node_load));
  #endif
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3484
3485
3486
3487
3488
  
  	if (self && !node_online(self->node_id)) {
  		build_zonelists(self);
  		build_zonelist_cache(self);
  	}
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3489
  	for_each_online_node(nid) {
7ea1530ab   Christoph Lameter   Memoryless nodes:...
3490
3491
3492
3493
  		pg_data_t *pgdat = NODE_DATA(nid);
  
  		build_zonelists(pgdat);
  		build_zonelist_cache(pgdat);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3494
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
  
  	/*
  	 * Initialize the boot_pagesets that are going to be used
  	 * for bootstrapping processors. The real pagesets for
  	 * each zone will be allocated later when the per cpu
  	 * allocator is available.
  	 *
  	 * boot_pagesets are used also for bootstrapping offline
  	 * cpus if the system is already booted because the pagesets
  	 * are needed to initialize allocators on a specific cpu too.
  	 * F.e. the percpu allocator needs the page allocator which
  	 * needs the percpu allocator in order to allocate its pagesets
  	 * (a chicken-egg dilemma).
  	 */
7aac78988   Lee Schermerhorn   numa: introduce n...
3509
  	for_each_possible_cpu(cpu) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3510
  		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
7aac78988   Lee Schermerhorn   numa: introduce n...
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  		/*
  		 * We now know the "local memory node" for each node--
  		 * i.e., the node of the first zone in the generic zonelist.
  		 * Set up numa_mem percpu variable for on-line cpus.  During
  		 * boot, only the boot cpu should be on-line;  we'll init the
  		 * secondary cpus' numa_mem as they come on-line.  During
  		 * node/memory hotplug, we'll fixup all on-line cpus.
  		 */
  		if (cpu_online(cpu))
  			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
  #endif
  	}
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3524
3525
  	return 0;
  }
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3526
3527
3528
3529
  /*
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3530
  void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3531
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3532
  	set_zonelist_order();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3533
  	if (system_state == SYSTEM_BOOTING) {
423b41d77   Randy Dunlap   [PATCH] mm/page_a...
3534
  		__build_all_zonelists(NULL);
68ad8df42   Mel Gorman   mm: print out the...
3535
  		mminit_verify_zonelist();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3536
3537
  		cpuset_init_current_mems_allowed();
  	} else {
183ff22bb   Simon Arlott   spelling fixes: mm/
3538
  		/* we have to stop all cpus to guarantee there is no user
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3539
  		   of zonelist */
e9959f0f3   KAMEZAWA Hiroyuki   mm/page_alloc.c: ...
3540
  #ifdef CONFIG_MEMORY_HOTPLUG
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3541
3542
  		if (zone)
  			setup_zone_pageset(zone);
e9959f0f3   KAMEZAWA Hiroyuki   mm/page_alloc.c: ...
3543
  #endif
9adb62a5d   Jiang Liu   mm/hotplug: corre...
3544
  		stop_machine(__build_all_zonelists, pgdat, NULL);
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3545
3546
  		/* cpuset refresh routine should be here */
  	}
bd1e22b8e   Andrew Morton   [PATCH] initialis...
3547
  	vm_total_pages = nr_free_pagecache_pages();
9ef9acb05   Mel Gorman   Do not group page...
3548
3549
3550
3551
3552
3553
3554
  	/*
  	 * Disable grouping by mobility if the number of pages in the
  	 * system is too low to allow the mechanism to work. It would be
  	 * more accurate, but expensive to check per-zone. This check is
  	 * made on memory-hotadd so a system can start with mobility
  	 * disabled and enable it later
  	 */
d9c234005   Mel Gorman   Do not depend on ...
3555
  	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb05   Mel Gorman   Do not group page...
3556
3557
3558
3559
3560
3561
3562
  		page_group_by_mobility_disabled = 1;
  	else
  		page_group_by_mobility_disabled = 0;
  
  	printk("Built %i zonelists in %s order, mobility grouping %s.  "
  		"Total pages: %ld
  ",
62bc62a87   Christoph Lameter   page allocator: u...
3563
  			nr_online_nodes,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3564
  			zonelist_order_name[current_zonelist_order],
9ef9acb05   Mel Gorman   Do not group page...
3565
  			page_group_by_mobility_disabled ? "off" : "on",
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3566
3567
3568
3569
3570
  			vm_total_pages);
  #ifdef CONFIG_NUMA
  	printk("Policy zone: %s
  ", zone_names[policy_zone]);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
  }
  
  /*
   * Helper functions to size the waitqueue hash table.
   * Essentially these want to choose hash table sizes sufficiently
   * large so that collisions trying to wait on pages are rare.
   * But in fact, the number of active page waitqueues on typical
   * systems is ridiculously low, less than 200. So this is even
   * conservative, even though it seems large.
   *
   * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
   * waitqueues, i.e. the size of the waitq table given the number of pages.
   */
  #define PAGES_PER_WAITQUEUE	256
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3585
  #ifndef CONFIG_MEMORY_HOTPLUG
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3586
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
  {
  	unsigned long size = 1;
  
  	pages /= PAGES_PER_WAITQUEUE;
  
  	while (size < pages)
  		size <<= 1;
  
  	/*
  	 * Once we have dozens or even hundreds of threads sleeping
  	 * on IO we've got bigger problems than wait queue collision.
  	 * Limit the size of the wait table to a reasonable size.
  	 */
  	size = min(size, 4096UL);
  
  	return max(size, 4UL);
  }
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
  #else
  /*
   * A zone's size might be changed by hot-add, so it is not possible to determine
   * a suitable size for its wait_table.  So we use the maximum size now.
   *
   * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
   *
   *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
   *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
   *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
   *
   * The maximum entries are prepared when a zone's memory is (512K + 256) pages
   * or more by the traditional way. (See above).  It equals:
   *
   *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
   *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
   *    powerpc (64K page size)             : =  (32G +16M)byte.
   */
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
  	return 4096UL;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
  
  /*
   * This is an integer logarithm so that shifts can be used later
   * to extract the more random high bits from the multiplicative
   * hash function before the remainder is taken.
   */
  static inline unsigned long wait_table_bits(unsigned long size)
  {
  	return ffz(~size);
  }
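  /*
   * Worked example (without memory hotplug), assuming a 1GB zone of 4K pages,
   * i.e. 262144 pages: 262144 / PAGES_PER_WAITQUEUE = 1024, already a power of
   * two and within the [4, 4096] clamp, so the zone gets 1024 wait queues and
   * wait_table_bits(1024) = ffz(~1024) = 10.
   */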
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
56fd56b86   Mel Gorman   Bias the location...
3639
  /*
6d3163ce8   Arve Hjønnevåg   mm: check if any ...
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
   * Check if a pageblock contains reserved pages
   */
  static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
  {
  	unsigned long pfn;
  
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
  		if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
  			return 1;
  	}
  	return 0;
  }
  
  /*
d9c234005   Mel Gorman   Do not depend on ...
3654
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
418589663   Mel Gorman   page allocator: u...
3655
3656
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
   * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
56fd56b86   Mel Gorman   Bias the location...
3657
3658
3659
3660
3661
   * higher will lead to a bigger reserve which will get freed as contiguous
   * blocks as reclaim kicks in
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3662
  	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
56fd56b86   Mel Gorman   Bias the location...
3663
  	struct page *page;
78986a678   Mel Gorman   page-allocator: l...
3664
3665
  	unsigned long block_migratetype;
  	int reserve;
56fd56b86   Mel Gorman   Bias the location...
3666

d02156388   Michal Hocko   mm: Ensure that p...
3667
3668
3669
3670
3671
3672
  	/*
  	 * Get the start pfn, end pfn and the number of blocks to reserve
  	 * We have to be careful to be aligned to pageblock_nr_pages to
  	 * make sure that we always check pfn_valid for the first page in
  	 * the block.
  	 */
56fd56b86   Mel Gorman   Bias the location...
3673
  	start_pfn = zone->zone_start_pfn;
108bcc96e   Cody P Schafer   mm: add & use zon...
3674
  	end_pfn = zone_end_pfn(zone);
d02156388   Michal Hocko   mm: Ensure that p...
3675
  	start_pfn = roundup(start_pfn, pageblock_nr_pages);
418589663   Mel Gorman   page allocator: u...
3676
  	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
d9c234005   Mel Gorman   Do not depend on ...
3677
  							pageblock_order;
56fd56b86   Mel Gorman   Bias the location...
3678

78986a678   Mel Gorman   page-allocator: l...
3679
3680
3681
3682
3683
3684
3685
3686
  	/*
  	 * Reserve blocks are generally in place to help high-order atomic
  	 * allocations that are short-lived. A min_free_kbytes value that
  	 * would result in more than 2 reserve blocks for atomic allocations
  	 * is assumed to be in place to help anti-fragmentation for the
  	 * future allocation of hugepages at runtime.
  	 */
  	reserve = min(2, reserve);
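  	/*
  	 * E.g., assuming 4K pages, order-9 pageblocks (2MB) and a zone min
  	 * watermark of around 4000 pages, roundup(4000, 512) >> 9 = 8, which
  	 * the clamp above reduces to 2 reserved pageblocks.
  	 */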
d9c234005   Mel Gorman   Do not depend on ...
3687
  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
56fd56b86   Mel Gorman   Bias the location...
3688
3689
3690
  		if (!pfn_valid(pfn))
  			continue;
  		page = pfn_to_page(pfn);
344c790e3   Adam Litke   mm: make setup_zo...
3691
3692
3693
  		/* Watch out for overlapping nodes */
  		if (page_to_nid(page) != zone_to_nid(zone))
  			continue;
56fd56b86   Mel Gorman   Bias the location...
3694
  		block_migratetype = get_pageblock_migratetype(page);
938929f14   Mel Gorman   mm: reduce the am...
3695
3696
3697
3698
3699
3700
3701
3702
3703
  		/* Only test what is necessary when the reserves are not met */
  		if (reserve > 0) {
  			/*
  			 * Blocks with reserved pages will never free, skip
  			 * them.
  			 */
  			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
  			if (pageblock_is_reserved(pfn, block_end_pfn))
  				continue;
56fd56b86   Mel Gorman   Bias the location...
3704

938929f14   Mel Gorman   mm: reduce the am...
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
  			/* If this block is reserved, account for it */
  			if (block_migratetype == MIGRATE_RESERVE) {
  				reserve--;
  				continue;
  			}
  
  			/* Suitable for reserving if this block is movable */
  			if (block_migratetype == MIGRATE_MOVABLE) {
  				set_pageblock_migratetype(page,
  							MIGRATE_RESERVE);
  				move_freepages_block(zone, page,
  							MIGRATE_RESERVE);
  				reserve--;
  				continue;
  			}
56fd56b86   Mel Gorman   Bias the location...
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
  		}
  
  		/*
  		 * If the reserve is met and this is a previous reserved block,
  		 * take it back
  		 */
  		if (block_migratetype == MIGRATE_RESERVE) {
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  			move_freepages_block(zone, page, MIGRATE_MOVABLE);
  		}
  	}
  }
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
3732

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3733
3734
3735
3736
3737
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
   * done. Non-atomic initialization, single-pass.
   */
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3738
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3739
  		unsigned long start_pfn, enum memmap_context context)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3740
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3741
  	struct page *page;
29751f699   Andy Whitcroft   [PATCH] sparsemem...
3742
3743
  	unsigned long end_pfn = start_pfn + size;
  	unsigned long pfn;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3744
  	struct zone *z;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3745

22b31eec6   Hugh Dickins   badpage: vm_norma...
3746
3747
  	if (highest_memmap_pfn < end_pfn - 1)
  		highest_memmap_pfn = end_pfn - 1;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3748
  	z = &NODE_DATA(nid)->node_zones[zone];
cbe8dd4af   Greg Ungerer   [PATCH] memmap_in...
3749
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
  		/*
  		 * There can be holes in boot-time mem_map[]s
  		 * handed to this function.  They do not
  		 * exist on hotplugged memory.
  		 */
  		if (context == MEMMAP_EARLY) {
  			if (!early_pfn_valid(pfn))
  				continue;
  			if (!early_pfn_in_nid(pfn, nid))
  				continue;
  		}
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3761
3762
  		page = pfn_to_page(pfn);
  		set_page_links(page, zone, nid, pfn);
708614e61   Mel Gorman   mm: verify the pa...
3763
  		mminit_verify_page_links(page, zone, nid, pfn);
7835e98b2   Nick Piggin   [PATCH] remove se...
3764
  		init_page_count(page);
22b751c3d   Mel Gorman   mm: rename page s...
3765
3766
  		page_mapcount_reset(page);
  		page_nid_reset_last(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3767
  		SetPageReserved(page);
b2a0ac887   Mel Gorman   Split the free li...
3768
3769
3770
3771
3772
  		/*
  		 * Mark the block movable so that blocks are reserved for
  		 * movable at startup. This will force kernel allocations
  		 * to reserve their blocks rather than leaking throughout
  		 * the address space during boot when many long-lived
56fd56b86   Mel Gorman   Bias the location...
3773
3774
3775
  		 * kernel allocations are made. Later some blocks near
  		 * the start are marked MIGRATE_RESERVE by
  		 * setup_zone_migrate_reserve()
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3776
3777
3778
3779
3780
  		 *
  		 * The bitmap is created for the zone's valid pfn range, but the
  		 * memmap can be created for invalid pages (for alignment).
  		 * Check here so we do not call set_pageblock_migratetype()
  		 * against a pfn outside the zone.
b2a0ac887   Mel Gorman   Split the free li...
3781
  		 */
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3782
  		if ((z->zone_start_pfn <= pfn)
108bcc96e   Cody P Schafer   mm: add & use zon...
3783
  		    && (pfn < zone_end_pfn(z))
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3784
  		    && !(pfn & (pageblock_nr_pages - 1)))
56fd56b86   Mel Gorman   Bias the location...
3785
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
b2a0ac887   Mel Gorman   Split the free li...
3786

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3787
3788
3789
3790
  		INIT_LIST_HEAD(&page->lru);
  #ifdef WANT_PAGE_VIRTUAL
  		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
  		if (!is_highmem_idx(zone))
3212c6be2   Bob Picco   [PATCH] fix WANT_...
3791
  			set_page_address(page, __va(pfn << PAGE_SHIFT));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3792
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3793
3794
  	}
  }
1e548deb5   Andi Kleen   page allocator: r...
3795
  static void __meminit zone_init_free_lists(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3796
  {
b2a0ac887   Mel Gorman   Split the free li...
3797
3798
3799
  	int order, t;
  	for_each_migratetype_order(order, t) {
  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3800
3801
3802
3803
3804
3805
  		zone->free_area[order].nr_free = 0;
  	}
  }
  
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3806
  	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3807
  #endif
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
3808
  static int __meminit zone_batchsize(struct zone *zone)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3809
  {
3a6be87fd   David Howells   nommu: clamp zone...
3810
  #ifdef CONFIG_MMU
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3811
3812
3813
3814
  	int batch;
  
  	/*
  	 * The per-cpu-pages pools are set to around 1000th of the
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3815
  	 * size of the zone.  But no more than 1/2 of a meg.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3816
3817
3818
  	 *
  	 * OK, so we don't know how big the cache is.  So guess.
  	 */
b40da0494   Jiang Liu   mm: use zone->pre...
3819
  	batch = zone->managed_pages / 1024;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3820
3821
  	if (batch * PAGE_SIZE > 512 * 1024)
  		batch = (512 * 1024) / PAGE_SIZE;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3822
3823
3824
3825
3826
  	batch /= 4;		/* We effectively *= 4 below */
  	if (batch < 1)
  		batch = 1;
  
  	/*
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3827
3828
3829
  	 * Clamp the batch to a 2^n - 1 value. Having a power
  	 * of 2 value was found to be more likely to have
  	 * suboptimal cache aliasing properties in some cases.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3830
  	 *
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3831
3832
3833
3834
  	 * For example if 2 tasks are alternately allocating
  	 * batches of pages, one task can end up with a lot
  	 * of pages of one half of the possible page colors
  	 * and the other with pages of the other colors.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3835
  	 */
9155203a5   David Howells   mm: use roundown_...
3836
  	batch = rounddown_pow_of_two(batch + batch/2) - 1;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3837

e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3838
  	return batch;
3a6be87fd   David Howells   nommu: clamp zone...
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
  
  #else
  	/* The deferral and batching of frees should be suppressed under NOMMU
  	 * conditions.
  	 *
  	 * The problem is that NOMMU needs to be able to allocate large chunks
  	 * of contiguous memory as there's no hardware page translation to
  	 * assemble apparent contiguous memory from discontiguous pages.
  	 *
  	 * Queueing large contiguous runs of pages for batching, however,
  	 * causes the pages to actually be freed in smaller chunks.  As there
  	 * can be a significant delay between the individual batches being
  	 * recycled, this leads to the once large chunks of space being
  	 * fragmented and becoming unavailable for high-order allocations.
  	 */
  	return 0;
  #endif
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3856
  }
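  /*
   * Worked example, assuming 4K pages and a zone with 262144 managed pages
   * (1GB): 262144/1024 = 256, clamped to 512K/4K = 128, divided by 4 gives 32,
   * and rounddown_pow_of_two(32 + 16) - 1 yields a batch of 31.  setup_pageset()
   * below then sets pcp->high to 6 * 31 = 186 pages.
   */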
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3857
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3858
3859
  {
  	struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
3860
  	int migratetype;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3861

1c6fe9465   Magnus Damm   [PATCH] NUMA: bro...
3862
  	memset(p, 0, sizeof(*p));
3dfa5721f   Christoph Lameter   Page allocator: g...
3863
  	pcp = &p->pcp;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3864
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3865
3866
  	pcp->high = 6 * batch;
  	pcp->batch = max(1UL, 1 * batch);
5f8dcc212   Mel Gorman   page-allocator: s...
3867
3868
  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3869
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3870
3871
3872
3873
3874
3875
3876
3877
3878
  /*
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
  
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  				unsigned long high)
  {
  	struct per_cpu_pages *pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
3879
  	pcp = &p->pcp;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3880
3881
3882
3883
3884
  	pcp->high = high;
  	pcp->batch = max(1UL, high/4);
  	if ((high/4) > (PAGE_SHIFT * 8))
  		pcp->batch = PAGE_SHIFT * 8;
  }
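  /*
   * Worked example, assuming /proc/sys/vm/percpu_pagelist_fraction is set to 8
   * on a zone with 262144 managed pages: setup_zone_pageset() passes
   * high = 262144 / 8 = 32768, and the cap above limits the batch to
   * PAGE_SHIFT * 8 (96 with 4K pages) instead of high/4.
   */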
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
3885
  static void __meminit setup_zone_pageset(struct zone *zone)
319774e25   Wu Fengguang   mem-hotplug: sepa...
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
  {
  	int cpu;
  
  	zone->pageset = alloc_percpu(struct per_cpu_pageset);
  
  	for_each_possible_cpu(cpu) {
  		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
  
  		setup_pageset(pcp, zone_batchsize(zone));
  
  		if (percpu_pagelist_fraction)
  			setup_pagelist_highmark(pcp,
b40da0494   Jiang Liu   mm: use zone->pre...
3898
  				(zone->managed_pages /
319774e25   Wu Fengguang   mem-hotplug: sepa...
3899
3900
3901
  					percpu_pagelist_fraction));
  	}
  }
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3902
  /*
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3903
3904
   * Allocate per cpu pagesets and initialize them.
   * Before this call only boot pagesets were available.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3905
   */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3906
  void __init setup_per_cpu_pageset(void)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3907
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3908
  	struct zone *zone;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3909

319774e25   Wu Fengguang   mem-hotplug: sepa...
3910
3911
  	for_each_populated_zone(zone)
  		setup_zone_pageset(zone);
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3912
  }
577a32f62   Sam Ravnborg   mm: fix section m...
3913
  static noinline __init_refok
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3914
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3915
3916
3917
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3918
  	size_t alloc_size;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3919
3920
3921
3922
3923
  
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3924
3925
3926
3927
  	zone->wait_table_hash_nr_entries =
  		 wait_table_hash_nr_entries(zone_size_pages);
  	zone->wait_table_bits =
  		wait_table_bits(zone->wait_table_hash_nr_entries);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3928
3929
  	alloc_size = zone->wait_table_hash_nr_entries
  					* sizeof(wait_queue_head_t);
cd94b9dbf   Heiko Carstens   memory hotplug: f...
3930
  	if (!slab_is_available()) {
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3931
  		zone->wait_table = (wait_queue_head_t *)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
3932
  			alloc_bootmem_node_nopanic(pgdat, alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
  	} else {
  		/*
  		 * This case means that a zone whose size was 0 gets new memory
  		 * via memory hot-add.
  		 * But it may be the case that a new node was hot-added.  In
  		 * this case vmalloc() will not be able to use this new node's
  		 * memory - this wait_table must be initialized to use this new
  		 * node itself as well.
  		 * To use this new node's memory, further consideration will be
  		 * necessary.
  		 */
8691f3a72   Jesper Juhl   mm: no need to ca...
3944
  		zone->wait_table = vmalloc(alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3945
3946
3947
  	}
  	if (!zone->wait_table)
  		return -ENOMEM;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3948

02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3949
  	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3950
  		init_waitqueue_head(zone->wait_table + i);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3951
3952
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3953
  }
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3954
  static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3955
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3956
3957
3958
3959
3960
3961
  	/*
  	 * per cpu subsystem is not up at this point. The following code
  	 * relies on the ability of the linker to provide the
  	 * offset of a (static) per cpu variable into the per cpu area.
  	 */
  	zone->pageset = &boot_pageset;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3962

f5335c0f1   Anton Blanchard   [PATCH] quieten z...
3963
  	if (zone->present_pages)
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3964
3965
3966
3967
  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u
  ",
  			zone->name, zone->present_pages,
  					 zone_batchsize(zone));
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3968
  }
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
3969
  int __meminit init_currently_empty_zone(struct zone *zone,
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3970
  					unsigned long zone_start_pfn,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3971
3972
  					unsigned long size,
  					enum memmap_context context)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3973
3974
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3975
3976
3977
3978
  	int ret;
  	ret = zone_wait_table_init(zone, size);
  	if (ret)
  		return ret;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3979
  	pgdat->nr_zones = zone_idx(zone) + 1;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3980
  	zone->zone_start_pfn = zone_start_pfn;
708614e61   Mel Gorman   mm: verify the pa...
3981
3982
3983
3984
3985
3986
  	mminit_dprintk(MMINIT_TRACE, "memmap_init",
  			"Initialising map node %d zone %lu pfns %lu -> %lu
  ",
  			pgdat->node_id,
  			(unsigned long)zone_idx(zone),
  			zone_start_pfn, (zone_start_pfn + size));
1e548deb5   Andi Kleen   page allocator: r...
3987
  	zone_init_free_lists(zone);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3988
3989
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3990
  }
0ee332c14   Tejun Heo   memblock: Kill ea...
3991
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
c713216de   Mel Gorman   [PATCH] Introduce...
3992
3993
3994
3995
3996
3997
3998
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   * Architectures may implement their own version but if add_active_range()
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3999
  int __meminit __early_pfn_to_nid(unsigned long pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
4000
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4001
4002
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
7c243c716   Russ Anderson   mm: speedup in __...
4003
4004
4005
4006
4007
4008
4009
4010
4011
  	/*
  	 * NOTE: The following SMP-unsafe globals are only used early in boot
  	 * when the kernel is running single-threaded.
  	 */
  	static unsigned long __meminitdata last_start_pfn, last_end_pfn;
  	static int __meminitdata last_nid;
  
  	if (last_start_pfn <= pfn && pfn < last_end_pfn)
  		return last_nid;
c713216de   Mel Gorman   [PATCH] Introduce...
4012

c13291a53   Tejun Heo   bootmem: Use for_...
4013
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7c243c716   Russ Anderson   mm: speedup in __...
4014
4015
4016
4017
  		if (start_pfn <= pfn && pfn < end_pfn) {
  			last_start_pfn = start_pfn;
  			last_end_pfn = end_pfn;
  			last_nid = nid;
c13291a53   Tejun Heo   bootmem: Use for_...
4018
  			return nid;
7c243c716   Russ Anderson   mm: speedup in __...
4019
  		}
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
4020
4021
  	/* This is a memory hole */
  	return -1;
c713216de   Mel Gorman   [PATCH] Introduce...
4022
4023
  }
  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
4024
4025
  int __meminit early_pfn_to_nid(unsigned long pfn)
  {
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
4026
4027
4028
4029
4030
4031
4032
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0)
  		return nid;
  	/* just returns 0 */
  	return 0;
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
4033
  }
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
  #ifdef CONFIG_NODES_SPAN_OTHER_NODES
  bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  {
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0 && nid != node)
  		return false;
  	return true;
  }
  #endif
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
4045

c713216de   Mel Gorman   [PATCH] Introduce...
4046
4047
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4048
4049
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
c713216de   Mel Gorman   [PATCH] Introduce...
4050
4051
4052
4053
4054
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
   * function may be used instead of calling free_bootmem() manually.
   */
c13291a53   Tejun Heo   bootmem: Use for_...
4055
  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
cc2898943   Yinghai Lu   mm: Move early_no...
4056
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4057
4058
  	unsigned long start_pfn, end_pfn;
  	int i, this_nid;
edbe7d23b   Yinghai Lu   memblock: Add fin...
4059

c13291a53   Tejun Heo   bootmem: Use for_...
4060
4061
4062
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
  		start_pfn = min(start_pfn, max_low_pfn);
  		end_pfn = min(end_pfn, max_low_pfn);
edbe7d23b   Yinghai Lu   memblock: Add fin...
4063

c13291a53   Tejun Heo   bootmem: Use for_...
4064
4065
4066
4067
  		if (start_pfn < end_pfn)
  			free_bootmem_node(NODE_DATA(this_nid),
  					  PFN_PHYS(start_pfn),
  					  (end_pfn - start_pfn) << PAGE_SHIFT);
edbe7d23b   Yinghai Lu   memblock: Add fin...
4068
  	}
edbe7d23b   Yinghai Lu   memblock: Add fin...
4069
  }
edbe7d23b   Yinghai Lu   memblock: Add fin...
4070

c713216de   Mel Gorman   [PATCH] Introduce...
4071
4072
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4073
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
c713216de   Mel Gorman   [PATCH] Introduce...
4074
4075
4076
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4077
   * function may be used instead of calling memory_present() manually.
c713216de   Mel Gorman   [PATCH] Introduce...
4078
4079
4080
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4081
4082
  	unsigned long start_pfn, end_pfn;
  	int i, this_nid;
c713216de   Mel Gorman   [PATCH] Introduce...
4083

c13291a53   Tejun Heo   bootmem: Use for_...
4084
4085
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
  		memory_present(this_nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4086
4087
4088
4089
  }
  
  /**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4090
4091
4092
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216de   Mel Gorman   [PATCH] Introduce...
4093
4094
4095
4096
   *
   * It returns the start and end page frame of a node based on information
   * provided by an arch calling add_active_range(). If called for a node
   * with no available memory, the start and end
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4097
   * PFNs will be 0.
c713216de   Mel Gorman   [PATCH] Introduce...
4098
   */
a3142c8e1   Yasunori Goto   Fix section misma...
4099
  void __meminit get_pfn_range_for_nid(unsigned int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4100
4101
  			unsigned long *start_pfn, unsigned long *end_pfn)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4102
  	unsigned long this_start_pfn, this_end_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4103
  	int i;
c13291a53   Tejun Heo   bootmem: Use for_...
4104

c713216de   Mel Gorman   [PATCH] Introduce...
4105
4106
  	*start_pfn = -1UL;
  	*end_pfn = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
4107
4108
4109
  	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
  		*start_pfn = min(*start_pfn, this_start_pfn);
  		*end_pfn = max(*end_pfn, this_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4110
  	}
633c0666b   Christoph Lameter   Memoryless nodes:...
4111
  	if (*start_pfn == -1UL)
c713216de   Mel Gorman   [PATCH] Introduce...
4112
  		*start_pfn = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
4113
4114
4115
  }
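
  /*
   * Minimal usage sketch (hypothetical caller, not taken from this file):
   *
   *	unsigned long start_pfn, end_pfn;
   *
   *	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
   *	pr_debug("node %d spans PFNs [%lu, %lu)\n", nid, start_pfn, end_pfn);
   *
   * A node with no registered memory comes back with start_pfn == end_pfn == 0.
   */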
  
  /*
2a1e274ac   Mel Gorman   Create the ZONE_M...
4116
4117
4118
4119
   * This finds a zone that can be used for ZONE_MOVABLE pages. The
   * assumption is made that zones within a node are ordered in monotonic
   * increasing memory addresses so that the "highest" populated zone is used
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4120
  static void __init find_usable_zone_for_movable(void)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
  {
  	int zone_index;
  	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
  		if (zone_index == ZONE_MOVABLE)
  			continue;
  
  		if (arch_zone_highest_possible_pfn[zone_index] >
  				arch_zone_lowest_possible_pfn[zone_index])
  			break;
  	}
  
  	VM_BUG_ON(zone_index == -1);
  	movable_zone = zone_index;
  }
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
25985edce   Lucas De Marchi   Fix common misspe...
4138
   * because it is sized independently of architecture. Unlike the other zones,
2a1e274ac   Mel Gorman   Create the ZONE_M...
4139
4140
4141
4142
4143
4144
4145
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
   * provided by the architecture for a given node by using the end of the
   * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
   * zones within a node are in order of monotonically increasing memory addresses.
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4146
  static void __meminit adjust_zone_range_for_zone_movable(int nid,
2a1e274ac   Mel Gorman   Create the ZONE_M...
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
  					unsigned long zone_type,
  					unsigned long node_start_pfn,
  					unsigned long node_end_pfn,
  					unsigned long *zone_start_pfn,
  					unsigned long *zone_end_pfn)
  {
  	/* Only adjust if ZONE_MOVABLE is on this node */
  	if (zone_movable_pfn[nid]) {
  		/* Size ZONE_MOVABLE */
  		if (zone_type == ZONE_MOVABLE) {
  			*zone_start_pfn = zone_movable_pfn[nid];
  			*zone_end_pfn = min(node_end_pfn,
  				arch_zone_highest_possible_pfn[movable_zone]);
  
  		/* Adjust for ZONE_MOVABLE starting within this range */
  		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
  				*zone_end_pfn > zone_movable_pfn[nid]) {
  			*zone_end_pfn = zone_movable_pfn[nid];
  
  		/* Check if this whole range is within ZONE_MOVABLE */
  		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
  			*zone_start_pfn = *zone_end_pfn;
  	}
  }
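
  /*
   * Worked example (illustrative numbers only): suppose a node spans PFNs
   * 0x10000-0x80000, ZONE_NORMAL is the highest usable zone and
   * zone_movable_pfn[nid] == 0x60000. ZONE_NORMAL's end is then pulled back
   * from 0x80000 to 0x60000, ZONE_MOVABLE becomes 0x60000-0x80000, and any
   * zone lying entirely above zone_movable_pfn[nid] collapses to an empty
   * range (start == end).
   */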
  
  /*
c713216de   Mel Gorman   [PATCH] Introduce...
4173
4174
4175
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
6ea6e6887   Paul Mundt   mm: more __memini...
4176
  static unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	/* Get the start and end of the node and zone */
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2a1e274ac   Mel Gorman   Create the ZONE_M...
4187
4188
4189
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  				node_start_pfn, node_end_pfn,
  				&zone_start_pfn, &zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
  
  	/* Check that this node has pages within the zone's required range */
  	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
  		return 0;
  
  	/* Move the zone boundaries inside the node if necessary */
  	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
  	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
  
  	/* Return the spanned pages */
  	return zone_end_pfn - zone_start_pfn;
  }
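
  /*
   * Example of the clamping above (made-up numbers, ignoring the
   * ZONE_MOVABLE adjustment): if the node spans PFNs 4096-262144 and the
   * architecture limits this zone to PFNs 0-65536, the zone boundaries are
   * moved inside the node and the function returns 65536 - 4096 = 61440
   * spanned pages, holes included.
   */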
  
  /*
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4205
   * then all holes in the requested range will be accounted for.
c713216de   Mel Gorman   [PATCH] Introduce...
4206
   */
329962503   Yinghai Lu   x86: Fix checking...
4207
  unsigned long __meminit __absent_pages_in_range(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4208
4209
4210
  				unsigned long range_start_pfn,
  				unsigned long range_end_pfn)
  {
96e907d13   Tejun Heo   bootmem: Reimplem...
4211
4212
4213
  	unsigned long nr_absent = range_end_pfn - range_start_pfn;
  	unsigned long start_pfn, end_pfn;
  	int i;
c713216de   Mel Gorman   [PATCH] Introduce...
4214

96e907d13   Tejun Heo   bootmem: Reimplem...
4215
4216
4217
4218
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
  		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
  		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
  		nr_absent -= end_pfn - start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4219
  	}
96e907d13   Tejun Heo   bootmem: Reimplem...
4220
  	return nr_absent;
c713216de   Mel Gorman   [PATCH] Introduce...
4221
4222
4223
4224
4225
4226
4227
  }
  
  /**
   * absent_pages_in_range - Return number of page frames in holes within a range
   * @start_pfn: The start PFN to start searching for holes
   * @end_pfn: The end PFN to stop searching for holes
   *
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4228
   * It returns the number of page frames in memory holes within a range.
c713216de   Mel Gorman   [PATCH] Introduce...
4229
4230
4231
4232
4233
4234
4235
4236
   */
  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  							unsigned long end_pfn)
  {
  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
  }
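
  /*
   * Example (hypothetical layout): with registered ranges 0-0x1000 and
   * 0x2000-0x3000, absent_pages_in_range(0, 0x3000) starts from 0x3000
   * candidate pages, subtracts the 0x1000 pages of each registered range,
   * and reports the 0x1000-page hole between them.
   */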
  
  /* Return the number of page frames in holes in a zone on a node */
6ea6e6887   Paul Mundt   mm: more __memini...
4237
  static unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4238
4239
4240
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
96e907d13   Tejun Heo   bootmem: Reimplem...
4241
4242
  	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
  	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
9c7cd6877   Mel Gorman   [PATCH] Account f...
4243
4244
4245
4246
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
96e907d13   Tejun Heo   bootmem: Reimplem...
4247
4248
  	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
  	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
9c7cd6877   Mel Gorman   [PATCH] Account f...
4249

2a1e274ac   Mel Gorman   Create the ZONE_M...
4250
4251
4252
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  			node_start_pfn, node_end_pfn,
  			&zone_start_pfn, &zone_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
4253
  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4254
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
4255

0ee332c14   Tejun Heo   memblock: Kill ea...
4256
  #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6ea6e6887   Paul Mundt   mm: more __memini...
4257
  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4258
4259
4260
4261
4262
  					unsigned long zone_type,
  					unsigned long *zones_size)
  {
  	return zones_size[zone_type];
  }
6ea6e6887   Paul Mundt   mm: more __memini...
4263
  static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4264
4265
4266
4267
4268
4269
4270
4271
  						unsigned long zone_type,
  						unsigned long *zholes_size)
  {
  	if (!zholes_size)
  		return 0;
  
  	return zholes_size[zone_type];
  }
20e6926dc   Yinghai Lu   x86, ACPI, mm: Re...
4272

0ee332c14   Tejun Heo   memblock: Kill ea...
4273
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
4274

a3142c8e1   Yasunori Goto   Fix section misma...
4275
  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
c713216de   Mel Gorman   [PATCH] Introduce...
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
  	unsigned long realtotalpages, totalpages = 0;
  	enum zone_type i;
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
  								zones_size);
  	pgdat->node_spanned_pages = totalpages;
  
  	realtotalpages = totalpages;
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		realtotalpages -=
  			zone_absent_pages_in_node(pgdat->node_id, i,
  								zholes_size);
  	pgdat->node_present_pages = realtotalpages;
  	printk(KERN_DEBUG "On node %d totalpages: %lu
  ", pgdat->node_id,
  							realtotalpages);
  }
835c134ec   Mel Gorman   Add a bitmap that...
4296
4297
4298
  #ifndef CONFIG_SPARSEMEM
  /*
   * Calculate the size of the zone->blockflags rounded to an unsigned long
d9c234005   Mel Gorman   Do not depend on ...
4299
4300
   * Start by making sure zonesize is a multiple of pageblock_order by rounding
   * up. Then use NR_PAGEBLOCK_BITS bits per pageblock, finally
835c134ec   Mel Gorman   Add a bitmap that...
4301
4302
4303
   * round what is now in bits to nearest long in bits, then return it in
   * bytes.
   */
7c45512df   Linus Torvalds   mm: fix pageblock...
4304
  static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
835c134ec   Mel Gorman   Add a bitmap that...
4305
4306
  {
  	unsigned long usemapsize;
7c45512df   Linus Torvalds   mm: fix pageblock...
4307
  	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
d9c234005   Mel Gorman   Do not depend on ...
4308
4309
  	usemapsize = roundup(zonesize, pageblock_nr_pages);
  	usemapsize = usemapsize >> pageblock_order;
835c134ec   Mel Gorman   Add a bitmap that...
4310
4311
4312
4313
4314
4315
4316
  	usemapsize *= NR_PAGEBLOCK_BITS;
  	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
  
  	return usemapsize / 8;
  }
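
  /*
   * Worked example (assuming pageblock_order == 9, i.e. pageblock_nr_pages ==
   * 512, NR_PAGEBLOCK_BITS == 4 and 64-bit longs): a zone of 1,048,576 pages
   * starting on a pageblock boundary covers 1,048,576 / 512 = 2048
   * pageblocks, needing 2048 * 4 = 8192 bits, which is already a multiple of
   * 64 and therefore 8192 / 8 = 1024 bytes of pageblock_flags.
   */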
  
  static void __init setup_usemap(struct pglist_data *pgdat,
7c45512df   Linus Torvalds   mm: fix pageblock...
4317
4318
4319
  				struct zone *zone,
  				unsigned long zone_start_pfn,
  				unsigned long zonesize)
835c134ec   Mel Gorman   Add a bitmap that...
4320
  {
7c45512df   Linus Torvalds   mm: fix pageblock...
4321
  	unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
835c134ec   Mel Gorman   Add a bitmap that...
4322
  	zone->pageblock_flags = NULL;
58a01a457   Julia Lawall   mm/page_alloc.c: ...
4323
  	if (usemapsize)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4324
4325
  		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
  								   usemapsize);
835c134ec   Mel Gorman   Add a bitmap that...
4326
4327
  }
  #else
7c45512df   Linus Torvalds   mm: fix pageblock...
4328
4329
  static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
  				unsigned long zone_start_pfn, unsigned long zonesize) {}
835c134ec   Mel Gorman   Add a bitmap that...
4330
  #endif /* CONFIG_SPARSEMEM */
d9c234005   Mel Gorman   Do not depend on ...
4331
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
ba72cb8cb   Mel Gorman   Fix boot problem ...
4332

d9c234005   Mel Gorman   Do not depend on ...
4333
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
ca57df79d   Xishi Qiu   mm: setup pageblo...
4334
  void __init set_pageblock_order(void)
d9c234005   Mel Gorman   Do not depend on ...
4335
  {
955c1cd74   Andrew Morton   mm/page_alloc.c: ...
4336
  	unsigned int order;
d9c234005   Mel Gorman   Do not depend on ...
4337
4338
4339
  	/* Check that pageblock_nr_pages has not already been setup */
  	if (pageblock_order)
  		return;
955c1cd74   Andrew Morton   mm/page_alloc.c: ...
4340
4341
4342
4343
  	if (HPAGE_SHIFT > PAGE_SHIFT)
  		order = HUGETLB_PAGE_ORDER;
  	else
  		order = MAX_ORDER - 1;
d9c234005   Mel Gorman   Do not depend on ...
4344
4345
  	/*
  	 * Assume the largest contiguous order of interest is a huge page.
955c1cd74   Andrew Morton   mm/page_alloc.c: ...
4346
4347
  	 * This value may be variable depending on boot parameters on IA64 and
  	 * powerpc.
d9c234005   Mel Gorman   Do not depend on ...
4348
4349
4350
4351
  	 */
  	pageblock_order = order;
  }
  #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
ba72cb8cb   Mel Gorman   Fix boot problem ...
4352
4353
  /*
   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
955c1cd74   Andrew Morton   mm/page_alloc.c: ...
4354
4355
4356
   * is unused as pageblock_order is set at compile-time. See
   * include/linux/pageblock-flags.h for the values of pageblock_order based on
   * the kernel config
ba72cb8cb   Mel Gorman   Fix boot problem ...
4357
   */
ca57df79d   Xishi Qiu   mm: setup pageblo...
4358
  void __init set_pageblock_order(void)
ba72cb8cb   Mel Gorman   Fix boot problem ...
4359
  {
ba72cb8cb   Mel Gorman   Fix boot problem ...
4360
  }
d9c234005   Mel Gorman   Do not depend on ...
4361
4362
  
  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
01cefaef4   Jiang Liu   mm: provide more ...
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
  static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  						   unsigned long present_pages)
  {
  	unsigned long pages = spanned_pages;
  
  	/*
  	 * Provide a more accurate estimation if there are holes within
  	 * the zone and SPARSEMEM is in use. If there are holes within the
  	 * zone, each populated memory region may cost us one or two extra
  	 * memmap pages due to alignment because memmap pages for each
   * populated region may not be naturally aligned on a page boundary.
  	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
  	 */
  	if (spanned_pages > present_pages + (present_pages >> 4) &&
  	    IS_ENABLED(CONFIG_SPARSEMEM))
  		pages = present_pages;
  
  	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
  }
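
  /*
   * Example of the heuristic above (illustrative numbers, 4KB pages and a
   * 64-byte struct page assumed): a zone spanning 1,000,000 PFNs with only
   * 500,000 present is treated as sparse under SPARSEMEM because
   * 1,000,000 > 500,000 + 31,250, so the estimate is based on the present
   * pages: roughly 500,000 * 64 / 4096 ~= 7813 memmap pages.
   */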
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4382
4383
4384
4385
4386
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
6527af5d1   Minchan Kim   mm: remove redund...
4387
4388
   *
   * NOTE: pgdat should get zeroed by caller.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4389
   */
b5a0e0113   Alexander van Heukelum   Solve section mis...
4390
  static void __paginginit free_area_init_core(struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4391
4392
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
4393
  	enum zone_type j;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4394
  	int nid = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4395
  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4396
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4397

208d54e55   Dave Hansen   [PATCH] memory ho...
4398
  	pgdat_resize_init(pgdat);
8177a420e   Andrea Arcangeli   mm: numa: Structu...
4399
4400
4401
4402
4403
  #ifdef CONFIG_NUMA_BALANCING
  	spin_lock_init(&pgdat->numabalancing_migrate_lock);
  	pgdat->numabalancing_migrate_nr_pages = 0;
  	pgdat->numabalancing_migrate_next_window = jiffies;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4404
  	init_waitqueue_head(&pgdat->kswapd_wait);
5515061d2   Mel Gorman   mm: throttle dire...
4405
  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
4406
  	pgdat_page_cgroup_init(pgdat);
5f63b720b   Michal Nazarewicz   mm: page_alloc: r...
4407

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4408
4409
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
9feedc9d8   Jiang Liu   mm: introduce new...
4410
  		unsigned long size, realsize, freesize, memmap_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4411

c713216de   Mel Gorman   [PATCH] Introduce...
4412
  		size = zone_spanned_pages_in_node(nid, j, zones_size);
9feedc9d8   Jiang Liu   mm: introduce new...
4413
  		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
c713216de   Mel Gorman   [PATCH] Introduce...
4414
  								zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4415

0e0b864e0   Mel Gorman   [PATCH] Account f...
4416
  		/*
9feedc9d8   Jiang Liu   mm: introduce new...
4417
  		 * Adjust freesize so that it accounts for how much memory
0e0b864e0   Mel Gorman   [PATCH] Account f...
4418
4419
4420
  		 * is used by this zone for memmap. This affects the watermark
  		 * and per-cpu initialisations
  		 */
01cefaef4   Jiang Liu   mm: provide more ...
4421
  		memmap_pages = calc_memmap_size(size, realsize);
9feedc9d8   Jiang Liu   mm: introduce new...
4422
4423
  		if (freesize >= memmap_pages) {
  			freesize -= memmap_pages;
5594c8c81   Yinghai Lu   mm: print out mem...
4424
4425
4426
4427
4428
  			if (memmap_pages)
  				printk(KERN_DEBUG
  				       "  %s zone: %lu pages used for memmap
  ",
  				       zone_names[j], memmap_pages);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4429
4430
  		} else
  			printk(KERN_WARNING
9feedc9d8   Jiang Liu   mm: introduce new...
4431
4432
4433
  				"  %s zone: %lu pages exceeds freesize %lu
  ",
  				zone_names[j], memmap_pages, freesize);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4434

6267276f3   Christoph Lameter   [PATCH] optional ...
4435
  		/* Account for reserved pages */
9feedc9d8   Jiang Liu   mm: introduce new...
4436
4437
  		if (j == 0 && freesize > dma_reserve) {
  			freesize -= dma_reserve;
d903ef9f3   Yinghai Lu   mm: print out mem...
4438
4439
  			printk(KERN_DEBUG "  %s zone: %lu pages reserved
  ",
6267276f3   Christoph Lameter   [PATCH] optional ...
4440
  					zone_names[0], dma_reserve);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4441
  		}
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
4442
  		if (!is_highmem_idx(j))
9feedc9d8   Jiang Liu   mm: introduce new...
4443
  			nr_kernel_pages += freesize;
01cefaef4   Jiang Liu   mm: provide more ...
4444
4445
4446
  		/* Charge for highmem memmap if there are enough kernel pages */
  		else if (nr_kernel_pages > memmap_pages * 2)
  			nr_kernel_pages -= memmap_pages;
9feedc9d8   Jiang Liu   mm: introduce new...
4447
  		nr_all_pages += freesize;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4448
4449
  
  		zone->spanned_pages = size;
306f2e9ee   Jiang Liu   mm: set zone->pre...
4450
  		zone->present_pages = realsize;
9feedc9d8   Jiang Liu   mm: introduce new...
4451
4452
4453
4454
4455
4456
  		/*
  		 * Set an approximate value for lowmem here, it will be adjusted
  		 * when the bootmem allocator frees pages into the buddy system.
  		 * And all highmem pages will be managed by the buddy system.
  		 */
  		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4457
  #ifdef CONFIG_NUMA
d5f541ed6   Christoph Lameter   [PATCH] Add node ...
4458
  		zone->node = nid;
9feedc9d8   Jiang Liu   mm: introduce new...
4459
  		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4460
  						/ 100;
9feedc9d8   Jiang Liu   mm: introduce new...
4461
  		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4462
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4463
4464
4465
  		zone->name = zone_names[j];
  		spin_lock_init(&zone->lock);
  		spin_lock_init(&zone->lru_lock);
bdc8cb984   Dave Hansen   [PATCH] memory ho...
4466
  		zone_seqlock_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4467
  		zone->zone_pgdat = pgdat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4468

ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4469
  		zone_pcp_init(zone);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
4470
  		lruvec_init(&zone->lruvec);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4471
4472
  		if (!size)
  			continue;
955c1cd74   Andrew Morton   mm/page_alloc.c: ...
4473
  		set_pageblock_order();
7c45512df   Linus Torvalds   mm: fix pageblock...
4474
  		setup_usemap(pgdat, zone, zone_start_pfn, size);
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
4475
4476
  		ret = init_currently_empty_zone(zone, zone_start_pfn,
  						size, MEMMAP_EARLY);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4477
  		BUG_ON(ret);
76cdd58e5   Heiko Carstens   memory_hotplug: a...
4478
  		memmap_init(size, nid, j, zone_start_pfn);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4479
  		zone_start_pfn += size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4480
4481
  	}
  }
577a32f62   Sam Ravnborg   mm: fix section m...
4482
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4483
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4484
4485
4486
  	/* Skip empty nodes */
  	if (!pgdat->node_spanned_pages)
  		return;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4487
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4488
4489
  	/* ia64 gets its own node_mem_map, before this, without bootmem */
  	if (!pgdat->node_mem_map) {
e984bb43f   Bob Picco   [PATCH] Align the...
4490
  		unsigned long size, start, end;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4491
  		struct page *map;
e984bb43f   Bob Picco   [PATCH] Align the...
4492
4493
4494
4495
4496
4497
  		/*
  		 * The zone's endpoints aren't required to be MAX_ORDER
   * aligned, but the node_mem_map endpoints must be, in order
  		 * for the buddy allocator to function correctly.
  		 */
  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
108bcc96e   Cody P Schafer   mm: add & use zon...
4498
  		end = pgdat_end_pfn(pgdat);
e984bb43f   Bob Picco   [PATCH] Align the...
4499
4500
  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
  		size =  (end - start) * sizeof(struct page);
6f167ec72   Dave Hansen   [PATCH] sparsemem...
4501
4502
  		map = alloc_remap(pgdat->node_id, size);
  		if (!map)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4503
  			map = alloc_bootmem_node_nopanic(pgdat, size);
e984bb43f   Bob Picco   [PATCH] Align the...
4504
  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4505
  	}
12d810c1b   Roman Zippel   m68k: discontinuo...
4506
  #ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4507
4508
4509
  	/*
  	 * With no DISCONTIG, the global mem_map is just set as node 0's
  	 */
c713216de   Mel Gorman   [PATCH] Introduce...
4510
  	if (pgdat == NODE_DATA(0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4511
  		mem_map = NODE_DATA(0)->node_mem_map;
0ee332c14   Tejun Heo   memblock: Kill ea...
4512
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
c713216de   Mel Gorman   [PATCH] Introduce...
4513
  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
467bc461d   Thomas Bogendoerfer   Fix crash with FL...
4514
  			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
0ee332c14   Tejun Heo   memblock: Kill ea...
4515
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
4516
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4517
  #endif
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4518
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4519
  }
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4520
4521
  void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  		unsigned long node_start_pfn, unsigned long *zholes_size)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4522
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4523
  	pg_data_t *pgdat = NODE_DATA(nid);
88fdf75d1   Minchan Kim   mm: warn if pg_da...
4524
  	/* pg_data_t should be reset to zero when it's allocated */
8783b6e2b   Linus Torvalds   mm: remove node_s...
4525
  	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
88fdf75d1   Minchan Kim   mm: warn if pg_da...
4526

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4527
4528
  	pgdat->node_id = nid;
  	pgdat->node_start_pfn = node_start_pfn;
957f822a0   David Rientjes   mm, numa: reclaim...
4529
  	init_zone_allows_reclaim(nid);
c713216de   Mel Gorman   [PATCH] Introduce...
4530
  	calculate_node_totalpages(pgdat, zones_size, zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4531
4532
  
  	alloc_node_mem_map(pgdat);
e8c27ac91   Yinghai Lu   x86, numa, 32-bit...
4533
4534
4535
4536
4537
4538
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
  	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx
  ",
  		nid, (unsigned long)pgdat,
  		(unsigned long)pgdat->node_mem_map);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4539
4540
4541
  
  	free_area_init_core(pgdat, zones_size, zholes_size);
  }
0ee332c14   Tejun Heo   memblock: Kill ea...
4542
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
418508c13   Miklos Szeredi   fix unused setup_...
4543
4544
4545
4546
4547
  
  #if MAX_NUMNODES > 1
  /*
   * Figure out the number of possible node ids.
   */
f9872caf0   Cody P Schafer   page_alloc: make ...
4548
  void __init setup_nr_node_ids(void)
418508c13   Miklos Szeredi   fix unused setup_...
4549
4550
4551
4552
4553
4554
4555
4556
  {
  	unsigned int node;
  	unsigned int highest = 0;
  
  	for_each_node_mask(node, node_possible_map)
  		highest = node;
  	nr_node_ids = highest + 1;
  }
418508c13   Miklos Szeredi   fix unused setup_...
4557
  #endif
c713216de   Mel Gorman   [PATCH] Introduce...
4558
  /**
1e01979c8   Tejun Heo   x86, numa: Implem...
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
   * node_map_pfn_alignment - determine the maximum internode alignment
   *
   * This function should be called after node map is populated and sorted.
   * It calculates the maximum power of two alignment which can distinguish
   * all the nodes.
   *
   * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
   * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
   * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
   * shifted, 1GiB is enough and this function will indicate so.
   *
   * This is used to test whether pfn -> nid mapping of the chosen memory
   * model has fine enough granularity to avoid incorrect mapping for the
   * populated node map.
   *
   * Returns the determined alignment in pfn's.  0 if there is no alignment
   * requirement (single node).
   */
  unsigned long __init node_map_pfn_alignment(void)
  {
  	unsigned long accl_mask = 0, last_end = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
4580
  	unsigned long start, end, mask;
1e01979c8   Tejun Heo   x86, numa: Implem...
4581
  	int last_nid = -1;
c13291a53   Tejun Heo   bootmem: Use for_...
4582
  	int i, nid;
1e01979c8   Tejun Heo   x86, numa: Implem...
4583

c13291a53   Tejun Heo   bootmem: Use for_...
4584
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1e01979c8   Tejun Heo   x86, numa: Implem...
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
  		if (!start || last_nid < 0 || last_nid == nid) {
  			last_nid = nid;
  			last_end = end;
  			continue;
  		}
  
  		/*
  		 * Start with a mask granular enough to pin-point to the
  		 * start pfn and tick off bits one-by-one until it becomes
  		 * too coarse to separate the current node from the last.
  		 */
  		mask = ~((1 << __ffs(start)) - 1);
  		while (mask && last_end <= (start & (mask << 1)))
  			mask <<= 1;
  
  		/* accumulate all internode masks */
  		accl_mask |= mask;
  	}
  
  	/* convert mask to number of pages */
  	return ~accl_mask + 1;
  }
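
  /*
   * Numeric sketch of the mask walk above (4KB pages, made-up layout): with
   * node 0 covering PFNs [0, 0x40000) and node 1 covering [0x50000, 0x80000),
   * the mask is widened from 2^16 pages up to 2^18 pages before it would stop
   * separating node 1's start from node 0's end, so the function returns
   * 0x40000 pages, i.e. a 1GiB pfn -> nid granularity is still fine here.
   */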
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4607
  /* Find the lowest pfn for a node */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4608
  static unsigned long __init find_min_pfn_for_node(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
4609
  {
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4610
  	unsigned long min_pfn = ULONG_MAX;
c13291a53   Tejun Heo   bootmem: Use for_...
4611
4612
  	unsigned long start_pfn;
  	int i;
1abbfb412   Mel Gorman   [PATCH] x86_64: f...
4613

c13291a53   Tejun Heo   bootmem: Use for_...
4614
4615
  	for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
  		min_pfn = min(min_pfn, start_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4616

a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4617
4618
  	if (min_pfn == ULONG_MAX) {
  		printk(KERN_WARNING
2bc0d2615   Paul Jackson   x86 boot: more co...
4619
4620
  			"Could not find start_pfn for node %d
  ", nid);
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4621
4622
4623
4624
  		return 0;
  	}
  
  	return min_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4625
4626
4627
4628
4629
4630
  }
  
  /**
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4631
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
4632
4633
4634
4635
4636
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
  	return find_min_pfn_for_node(MAX_NUMNODES);
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4637
4638
4639
  /*
   * early_calculate_totalpages()
   * Sum pages in active regions for movable zone.
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4640
   * Populate N_MEMORY for calculating usable_nodes.
37b07e416   Lee Schermerhorn   memoryless nodes:...
4641
   */
484f51f82   Adrian Bunk   mm/page_alloc.c: ...
4642
  static unsigned long __init early_calculate_totalpages(void)
7e63efef8   Mel Gorman   Add a movablecore...
4643
  {
7e63efef8   Mel Gorman   Add a movablecore...
4644
  	unsigned long totalpages = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
4645
4646
4647
4648
4649
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
  
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
  		unsigned long pages = end_pfn - start_pfn;
7e63efef8   Mel Gorman   Add a movablecore...
4650

37b07e416   Lee Schermerhorn   memoryless nodes:...
4651
4652
  		totalpages += pages;
  		if (pages)
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4653
  			node_set_state(nid, N_MEMORY);
37b07e416   Lee Schermerhorn   memoryless nodes:...
4654
4655
  	}
    	return totalpages;
7e63efef8   Mel Gorman   Add a movablecore...
4656
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4657
4658
4659
4660
4661
4662
  /*
   * Find the PFN the Movable zone begins in each node. Kernel memory
   * is spread evenly between nodes as long as the nodes have enough
   * memory. When they don't, some nodes will have more kernelcore than
   * others
   */
b224ef856   Kautuk Consul   page_alloc: remov...
4663
  static void __init find_zone_movable_pfns_for_nodes(void)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4664
4665
4666
4667
  {
  	int i, nid;
  	unsigned long usable_startpfn;
  	unsigned long kernelcore_node, kernelcore_remaining;
66918dcdf   Yinghai Lu   x86: only clear n...
4668
  	/* save the state before borrow the nodemask */
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4669
  	nodemask_t saved_node_state = node_states[N_MEMORY];
37b07e416   Lee Schermerhorn   memoryless nodes:...
4670
  	unsigned long totalpages = early_calculate_totalpages();
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4671
  	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4672

7e63efef8   Mel Gorman   Add a movablecore...
4673
4674
4675
4676
4677
4678
4679
4680
4681
  	/*
  	 * If movablecore was specified, calculate what size of
  	 * kernelcore that corresponds so that memory usable for
  	 * any allocation type is evenly spread. If both kernelcore
  	 * and movablecore are specified, then the value of kernelcore
  	 * will be used for required_kernelcore if it's greater than
  	 * what movablecore would have allowed.
  	 */
  	if (required_movablecore) {
7e63efef8   Mel Gorman   Add a movablecore...
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
  		unsigned long corepages;
  
  		/*
  		 * Round-up so that ZONE_MOVABLE is at least as large as what
  		 * was requested by the user
  		 */
  		required_movablecore =
  			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
  		corepages = totalpages - required_movablecore;
  
  		required_kernelcore = max(required_kernelcore, corepages);
  	}
20e6926dc   Yinghai Lu   x86, ACPI, mm: Re...
4694
4695
  	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
  	if (!required_kernelcore)
66918dcdf   Yinghai Lu   x86: only clear n...
4696
  		goto out;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4697
4698
  
  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
20e6926dc   Yinghai Lu   x86, ACPI, mm: Re...
4699
  	find_usable_zone_for_movable();
2a1e274ac   Mel Gorman   Create the ZONE_M...
4700
4701
4702
4703
4704
  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  
  restart:
  	/* Spread kernelcore memory as evenly as possible throughout nodes */
  	kernelcore_node = required_kernelcore / usable_nodes;
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4705
  	for_each_node_state(nid, N_MEMORY) {
c13291a53   Tejun Heo   bootmem: Use for_...
4706
  		unsigned long start_pfn, end_pfn;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
  		/*
  		 * Recalculate kernelcore_node if the division per node
  		 * now exceeds what is necessary to satisfy the requested
  		 * amount of memory for the kernel
  		 */
  		if (required_kernelcore < kernelcore_node)
  			kernelcore_node = required_kernelcore / usable_nodes;
  
  		/*
  		 * As the map is walked, we track how much memory is usable
  		 * by the kernel using kernelcore_remaining. When it is
  		 * 0, the rest of the node is usable by ZONE_MOVABLE
  		 */
  		kernelcore_remaining = kernelcore_node;
  
  		/* Go through each range of PFNs within this node */
c13291a53   Tejun Heo   bootmem: Use for_...
4723
  		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4724
  			unsigned long size_pages;
c13291a53   Tejun Heo   bootmem: Use for_...
4725
  			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
  			if (start_pfn >= end_pfn)
  				continue;
  
  			/* Account for what is only usable for kernelcore */
  			if (start_pfn < usable_startpfn) {
  				unsigned long kernel_pages;
  				kernel_pages = min(end_pfn, usable_startpfn)
  								- start_pfn;
  
  				kernelcore_remaining -= min(kernel_pages,
  							kernelcore_remaining);
  				required_kernelcore -= min(kernel_pages,
  							required_kernelcore);
  
  				/* Continue if range is now fully accounted */
  				if (end_pfn <= usable_startpfn) {
  
  					/*
  					 * Push zone_movable_pfn to the end so
  					 * that if we have to rebalance
  					 * kernelcore across nodes, we will
  					 * not double account here
  					 */
  					zone_movable_pfn[nid] = end_pfn;
  					continue;
  				}
  				start_pfn = usable_startpfn;
  			}
  
  			/*
  			 * The usable PFN range for ZONE_MOVABLE is from
  			 * start_pfn->end_pfn. Calculate size_pages as the
  			 * number of pages used as kernelcore
  			 */
  			size_pages = end_pfn - start_pfn;
  			if (size_pages > kernelcore_remaining)
  				size_pages = kernelcore_remaining;
  			zone_movable_pfn[nid] = start_pfn + size_pages;
  
  			/*
  			 * Some kernelcore has been met, update counts and
  			 * break if the kernelcore for this node has been
  			 * satisfied
  			 */
  			required_kernelcore -= min(required_kernelcore,
  								size_pages);
  			kernelcore_remaining -= size_pages;
  			if (!kernelcore_remaining)
  				break;
  		}
  	}
  
  	/*
  	 * If there is still required_kernelcore, we do another pass with one
  	 * less node in the count. This will push zone_movable_pfn[nid] further
  	 * along on the nodes that still have memory until kernelcore is
  	 * satisfied
  	 */
  	usable_nodes--;
  	if (usable_nodes && required_kernelcore > usable_nodes)
  		goto restart;
  
  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
  	for (nid = 0; nid < MAX_NUMNODES; nid++)
  		zone_movable_pfn[nid] =
  			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
66918dcdf   Yinghai Lu   x86: only clear n...
4792

20e6926dc   Yinghai Lu   x86, ACPI, mm: Re...
4793
  out:
66918dcdf   Yinghai Lu   x86: only clear n...
4794
  	/* restore the node_state */
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4795
  	node_states[N_MEMORY] = saved_node_state;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4796
  }
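
  /*
   * Rough worked example (hypothetical two-node machine): with kernelcore=4G
   * and two 4GiB nodes, kernelcore_node starts at 2GiB per node, so
   * zone_movable_pfn[] ends up roughly 2GiB into each node and the upper
   * halves become ZONE_MOVABLE. If node 1 only had 1GiB, the restart pass
   * would push node 0's boundary further up to make up the shortfall.
   */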
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4797
4798
  /* Any regular or high memory on that node ? */
  static void check_for_memory(pg_data_t *pgdat, int nid)
37b07e416   Lee Schermerhorn   memoryless nodes:...
4799
  {
37b07e416   Lee Schermerhorn   memoryless nodes:...
4800
  	enum zone_type zone_type;
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4801
4802
4803
4804
  	if (N_MEMORY == N_NORMAL_MEMORY)
  		return;
  
  	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
37b07e416   Lee Schermerhorn   memoryless nodes:...
4805
  		struct zone *zone = &pgdat->node_zones[zone_type];
d0048b0e5   Bob Liu   page_alloc: break...
4806
  		if (zone->present_pages) {
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4807
4808
4809
4810
  			node_set_state(nid, N_HIGH_MEMORY);
  			if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
  			    zone_type <= ZONE_NORMAL)
  				node_set_state(nid, N_NORMAL_MEMORY);
d0048b0e5   Bob Liu   page_alloc: break...
4811
4812
  			break;
  		}
37b07e416   Lee Schermerhorn   memoryless nodes:...
4813
  	}
37b07e416   Lee Schermerhorn   memoryless nodes:...
4814
  }
c713216de   Mel Gorman   [PATCH] Introduce...
4815
4816
  /**
   * free_area_init_nodes - Initialise all pg_data_t and zone data
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4817
   * @max_zone_pfn: an array of max PFNs for each zone
c713216de   Mel Gorman   [PATCH] Introduce...
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
   *
   * This will call free_area_init_node() for each active node in the system.
   * Using the page ranges provided by add_active_range(), the size of each
   * zone in each node and their holes is calculated. If the maximum PFN
   * between two adjacent zones match, it is assumed that the zone is empty.
   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
   * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
   * starts where the previous one ended. For example, ZONE_DMA32 starts
   * at arch_max_dma_pfn.
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4830
4831
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4832

c713216de   Mel Gorman   [PATCH] Introduce...
4833
4834
4835
4836
4837
4838
4839
4840
  	/* Record where the zone boundaries are */
  	memset(arch_zone_lowest_possible_pfn, 0,
  				sizeof(arch_zone_lowest_possible_pfn));
  	memset(arch_zone_highest_possible_pfn, 0,
  				sizeof(arch_zone_highest_possible_pfn));
  	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
  	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
  	for (i = 1; i < MAX_NR_ZONES; i++) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4841
4842
  		if (i == ZONE_MOVABLE)
  			continue;
c713216de   Mel Gorman   [PATCH] Introduce...
4843
4844
4845
4846
4847
  		arch_zone_lowest_possible_pfn[i] =
  			arch_zone_highest_possible_pfn[i-1];
  		arch_zone_highest_possible_pfn[i] =
  			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4848
4849
4850
4851
4852
  	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
  
  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
b224ef856   Kautuk Consul   page_alloc: remov...
4853
  	find_zone_movable_pfns_for_nodes();
c713216de   Mel Gorman   [PATCH] Introduce...
4854

c713216de   Mel Gorman   [PATCH] Introduce...
4855
  	/* Print out the zone ranges */
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4856
4857
  	printk("Zone ranges:
  ");
2a1e274ac   Mel Gorman   Create the ZONE_M...
4858
4859
4860
  	for (i = 0; i < MAX_NR_ZONES; i++) {
  		if (i == ZONE_MOVABLE)
  			continue;
155cbfc80   Kay Sievers   mm: use KERN_CONT...
4861
  		printk(KERN_CONT "  %-8s ", zone_names[i]);
72f0ba025   David Rientjes   mm: suppress pfn ...
4862
4863
  		if (arch_zone_lowest_possible_pfn[i] ==
  				arch_zone_highest_possible_pfn[i])
155cbfc80   Kay Sievers   mm: use KERN_CONT...
4864
4865
  			printk(KERN_CONT "empty
  ");
72f0ba025   David Rientjes   mm: suppress pfn ...
4866
  		else
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4867
4868
4869
4870
4871
  			printk(KERN_CONT "[mem %0#10lx-%0#10lx]
  ",
  				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
  				(arch_zone_highest_possible_pfn[i]
  					<< PAGE_SHIFT) - 1);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4872
4873
4874
  	}
  
  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4875
4876
  	printk("Movable zone start for each node
  ");
2a1e274ac   Mel Gorman   Create the ZONE_M...
4877
4878
  	for (i = 0; i < MAX_NUMNODES; i++) {
  		if (zone_movable_pfn[i])
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4879
4880
4881
  			printk("  Node %d: %#010lx
  ", i,
  			       zone_movable_pfn[i] << PAGE_SHIFT);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4882
  	}
c713216de   Mel Gorman   [PATCH] Introduce...
4883

f2d52fe51   Wanpeng Li   mm/memblock: clea...
4884
  	/* Print out the early node map */
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4885
4886
  	printk("Early memory node ranges
  ");
c13291a53   Tejun Heo   bootmem: Use for_...
4887
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
a62e2f4f5   Bjorn Helgaas   mm: print physica...
4888
4889
4890
  		printk("  node %3d: [mem %#010lx-%#010lx]
  ", nid,
  		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
c713216de   Mel Gorman   [PATCH] Introduce...
4891
4892
  
  	/* Initialise every node */
708614e61   Mel Gorman   mm: verify the pa...
4893
  	mminit_verify_pageflags_layout();
8ef828668   Christoph Lameter   [PATCH] slab: red...
4894
  	setup_nr_node_ids();
c713216de   Mel Gorman   [PATCH] Introduce...
4895
4896
  	for_each_online_node(nid) {
  		pg_data_t *pgdat = NODE_DATA(nid);
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4897
  		free_area_init_node(nid, NULL,
c713216de   Mel Gorman   [PATCH] Introduce...
4898
  				find_min_pfn_for_node(nid), NULL);
37b07e416   Lee Schermerhorn   memoryless nodes:...
4899
4900
4901
  
  		/* Any memory on that node */
  		if (pgdat->node_present_pages)
4b0ef1fe8   Lai Jiangshan   page_alloc: use N...
4902
4903
  			node_set_state(nid, N_MEMORY);
  		check_for_memory(pgdat, nid);
c713216de   Mel Gorman   [PATCH] Introduce...
4904
4905
  	}
  }
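
  /*
   * Sketch of a caller (simplified arch setup code): the architecture only
   * fills in the exclusive upper PFN of each populated zone and lets this
   * function derive all boundaries:
   *
   *	unsigned long max_zone_pfns[MAX_NR_ZONES];
   *
   *	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
   *	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
   *	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
   *	free_area_init_nodes(max_zone_pfns);
   *
   * MAX_DMA_PFN and max_low_pfn stand in for arch-specific limits here.
   */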
2a1e274ac   Mel Gorman   Create the ZONE_M...
4906

7e63efef8   Mel Gorman   Add a movablecore...
4907
  static int __init cmdline_parse_core(char *p, unsigned long *core)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4908
4909
4910
4911
4912
4913
  {
  	unsigned long long coremem;
  	if (!p)
  		return -EINVAL;
  
  	coremem = memparse(p, &p);
7e63efef8   Mel Gorman   Add a movablecore...
4914
  	*core = coremem >> PAGE_SHIFT;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4915

7e63efef8   Mel Gorman   Add a movablecore...
4916
  	/* Paranoid check that UL is enough for the coremem value */
2a1e274ac   Mel Gorman   Create the ZONE_M...
4917
4918
4919
4920
  	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
  
  	return 0;
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4921

7e63efef8   Mel Gorman   Add a movablecore...
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
  /*
   * kernelcore=size sets the amount of memory available for allocations that
   * cannot be reclaimed or migrated.
   */
  static int __init cmdline_parse_kernelcore(char *p)
  {
  	return cmdline_parse_core(p, &required_kernelcore);
  }
  
  /*
   * movablecore=size sets the amount of memory available for allocations that
   * can be reclaimed or migrated.
   */
  static int __init cmdline_parse_movablecore(char *p)
  {
  	return cmdline_parse_core(p, &required_movablecore);
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4939
  early_param("kernelcore", cmdline_parse_kernelcore);
7e63efef8   Mel Gorman   Add a movablecore...
4940
  early_param("movablecore", cmdline_parse_movablecore);
ed7ed3651   Mel Gorman   handle kernelcore...
4941

0ee332c14   Tejun Heo   memblock: Kill ea...
4942
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
4943

69afade72   Jiang Liu   mm: introduce com...
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
  unsigned long free_reserved_area(unsigned long start, unsigned long end,
  				 int poison, char *s)
  {
  	unsigned long pages, pos;
  
  	pos = start = PAGE_ALIGN(start);
  	end &= PAGE_MASK;
  	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
  		if (poison)
  			memset((void *)pos, poison, PAGE_SIZE);
bb3ec6b08   Ralf Baechle   mm: Fix virt_to_p...
4954
  		free_reserved_page(virt_to_page((void *)pos));
69afade72   Jiang Liu   mm: introduce com...
4955
4956
4957
4958
4959
4960
4961
4962
4963
  	}
  
  	if (pages && s)
  		pr_info("Freeing %s memory: %ldK (%lx - %lx)
  ",
  			s, pages << (PAGE_SHIFT - 10), start, end);
  
  	return pages;
  }
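
  /*
   * Usage sketch (assumed caller, modelled on init-memory freeing): release
   * the pages backing a reserved [start, end) kernel virtual range and log it:
   *
   *	free_reserved_area((unsigned long)&__init_begin,
   *			   (unsigned long)&__init_end,
   *			   POISON_FREE_INITMEM, "unused kernel");
   *
   * Passing 0 for poison skips the memset; the return value is the number of
   * pages handed back to the page allocator.
   */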
cfa11e08e   Jiang Liu   mm: introduce fre...
4964
4965
4966
4967
4968
4969
4970
4971
  #ifdef	CONFIG_HIGHMEM
  void free_highmem_page(struct page *page)
  {
  	__free_reserved_page(page);
  	totalram_pages++;
  	totalhigh_pages++;
  }
  #endif
0e0b864e0   Mel Gorman   [PATCH] Account f...
4972
  /**
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4973
4974
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
0e0b864e0   Mel Gorman   [PATCH] Account f...
4975
4976
4977
4978
   *
   * The per-cpu batchsize and zone watermarks are determined by present_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4979
4980
4981
   * function may optionally be used to account for unfreeable pages in the
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
0e0b864e0   Mel Gorman   [PATCH] Account f...
4982
4983
4984
4985
4986
   */
  void __init set_dma_reserve(unsigned long new_dma_reserve)
  {
  	dma_reserve = new_dma_reserve;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4987
4988
  void __init free_area_init(unsigned long *zones_size)
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4989
  	free_area_init_node(0, zones_size,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4990
4991
  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4992

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4993
4994
4995
4996
  static int page_alloc_cpu_notify(struct notifier_block *self,
  				 unsigned long action, void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4997

8bb784428   Rafael J. Wysocki   Add suspend-relat...
4998
  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
f0cb3c76a   Konstantin Khlebnikov   mm: drain percpu ...
4999
  		lru_add_drain_cpu(cpu);
9f8f21725   Christoph Lameter   Page allocator: c...
5000
5001
5002
5003
5004
5005
5006
5007
  		drain_pages(cpu);
  
  		/*
  		 * Spill the event counters of the dead processor
  		 * into the current processor's event counters.
  		 * This artificially elevates the count of the current
  		 * processor.
  		 */
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
5008
  		vm_events_fold_cpu(cpu);
9f8f21725   Christoph Lameter   Page allocator: c...
5009
5010
5011
5012
5013
5014
5015
5016
  
  		/*
  		 * Zero the differential counters of the dead processor
  		 * so that the vm statistics are consistent.
  		 *
  		 * This is only okay since the processor is dead and cannot
  		 * race with what we are doing.
  		 */
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
5017
  		refresh_cpu_vm_stats(cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5018
5019
5020
  	}
  	return NOTIFY_OK;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5021
5022
5023
5024
5025
5026
5027
  
  void __init page_alloc_init(void)
  {
  	hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
  /*
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5028
5029
5030
5031
5032
5033
5034
   * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
   *	or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  {
  	struct pglist_data *pgdat;
  	unsigned long reserve_pages = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
5035
  	enum zone_type i, j;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
  
  	for_each_online_pgdat(pgdat) {
  		for (i = 0; i < MAX_NR_ZONES; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  			unsigned long max = 0;
  
  			/* Find valid and maximum lowmem_reserve in the zone */
  			for (j = i; j < MAX_NR_ZONES; j++) {
  				if (zone->lowmem_reserve[j] > max)
  					max = zone->lowmem_reserve[j];
  			}
418589663   Mel Gorman   page allocator: u...
5047
5048
  			/* we treat the high watermark as reserved pages. */
  			max += high_wmark_pages(zone);
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5049

b40da0494   Jiang Liu   mm: use zone->pre...
5050
5051
  			if (max > zone->managed_pages)
  				max = zone->managed_pages;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5052
  			reserve_pages += max;
ab8fabd46   Johannes Weiner   mm: exclude reser...
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
  			/*
  			 * Lowmem reserves are not available to
  			 * GFP_HIGHUSER page cache allocations and
  			 * kswapd tries to balance zones to their high
  			 * watermark.  As a result, neither should be
  			 * regarded as dirtyable memory, to prevent a
  			 * situation where reclaim has to clean pages
  			 * in order to balance the zones.
  			 */
  			zone->dirty_balance_reserve = max;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5063
5064
  		}
  	}
ab8fabd46   Johannes Weiner   mm: exclude reser...
5065
  	dirty_balance_reserve = reserve_pages;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5066
5067
5068
5069
  	totalreserve_pages = reserve_pages;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5070
5071
5072
5073
5074
5075
5076
5077
   * setup_per_zone_lowmem_reserve - called whenever
   *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
   *	has a correct pages reserved value, so an adequate number of
   *	pages are left in the zone after a successful __alloc_pages().
   */
  static void setup_per_zone_lowmem_reserve(void)
  {
  	struct pglist_data *pgdat;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
5078
  	enum zone_type j, idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5079

ec936fc56   KAMEZAWA Hiroyuki   [PATCH] for_each_...
5080
  	for_each_online_pgdat(pgdat) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5081
5082
  		for (j = 0; j < MAX_NR_ZONES; j++) {
  			struct zone *zone = pgdat->node_zones + j;
b40da0494   Jiang Liu   mm: use zone->pre...
5083
  			unsigned long managed_pages = zone->managed_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5084
5085
  
  			zone->lowmem_reserve[j] = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
5086
5087
  			idx = j;
  			while (idx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5088
  				struct zone *lower_zone;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
5089
  				idx--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5090
5091
5092
5093
  				if (sysctl_lowmem_reserve_ratio[idx] < 1)
  					sysctl_lowmem_reserve_ratio[idx] = 1;
  
  				lower_zone = pgdat->node_zones + idx;
b40da0494   Jiang Liu   mm: use zone->pre...
5094
  				lower_zone->lowmem_reserve[j] = managed_pages /
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5095
  					sysctl_lowmem_reserve_ratio[idx];
b40da0494   Jiang Liu   mm: use zone->pre...
5096
  				managed_pages += lower_zone->managed_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5097
5098
5099
  			}
  		}
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5100
5101
5102
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5103
  }
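
  /*
   * Worked example of lowmem_reserve (made-up sizes, default ratio of 256 for
   * ZONE_DMA): with a 16MB ZONE_DMA below a 4GB ZONE_NORMAL, ZONE_DMA ends up
   * reserving about 4GB/256 ~= 16MB worth of pages against ZONE_NORMAL-capable
   * allocations, which effectively keeps ordinary GFP_KERNEL traffic out of
   * the tiny DMA zone.
   */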
cfd3da1e4   Mel Gorman   mm: Serialize acc...
5104
  static void __setup_per_zone_wmarks(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5105
5106
5107
5108
5109
5110
5111
5112
5113
  {
  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
  	unsigned long lowmem_pages = 0;
  	struct zone *zone;
  	unsigned long flags;
  
  	/* Calculate total number of !ZONE_HIGHMEM pages */
  	for_each_zone(zone) {
  		if (!is_highmem(zone))
b40da0494   Jiang Liu   mm: use zone->pre...
5114
  			lowmem_pages += zone->managed_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5115
5116
5117
  	}
  
  	for_each_zone(zone) {
ac924c603   Andrew Morton   [PATCH] setup_per...
5118
  		u64 tmp;
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
5119
  		spin_lock_irqsave(&zone->lock, flags);
b40da0494   Jiang Liu   mm: use zone->pre...
5120
  		tmp = (u64)pages_min * zone->managed_pages;
ac924c603   Andrew Morton   [PATCH] setup_per...
5121
  		do_div(tmp, lowmem_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5122
5123
  		if (is_highmem(zone)) {
  			/*
669ed1752   Nick Piggin   [PATCH] mm: highm...
5124
5125
5126
5127
  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
  			 * need highmem pages, so cap pages_min to a small
  			 * value here.
  			 *
418589663   Mel Gorman   page allocator: u...
5128
  			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
669ed1752   Nick Piggin   [PATCH] mm: highm...
5129
5130
  			 * deltas control asynchronous page reclaim, and so should
  			 * not be capped for highmem.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5131
  			 */
90ae8d670   Andrew Morton   mm/page_alloc.c:_...
5132
  			unsigned long min_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5133

b40da0494   Jiang Liu   mm: use zone->pre...
5134
  			min_pages = zone->managed_pages / 1024;
90ae8d670   Andrew Morton   mm/page_alloc.c:_...
5135
  			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
418589663   Mel Gorman   page allocator: u...
5136
  			zone->watermark[WMARK_MIN] = min_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5137
  		} else {
669ed1752   Nick Piggin   [PATCH] mm: highm...
5138
5139
  			/*
  			 * If it's a lowmem zone, reserve a number of pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5140
5141
  			 * proportionate to the zone's size.
  			 */
418589663   Mel Gorman   page allocator: u...
5142
  			zone->watermark[WMARK_MIN] = tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5143
  		}
418589663   Mel Gorman   page allocator: u...
5144
5145
  		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
  		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
49f223a9c   Marek Szyprowski   mm: trigger page ...
5146

56fd56b86   Mel Gorman   Bias the location...
5147
  		setup_zone_migrate_reserve(zone);
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
5148
  		spin_unlock_irqrestore(&zone->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5149
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
5150
5151
5152
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5153
  }
cfd3da1e4   Mel Gorman   mm: Serialize acc...
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
  /**
   * setup_per_zone_wmarks - called when min_free_kbytes changes
   * or when memory is hot-{added|removed}
   *
   * Ensures that the watermark[min,low,high] values for each zone are set
   * correctly with respect to min_free_kbytes.
   */
  void setup_per_zone_wmarks(void)
  {
  	mutex_lock(&zonelists_mutex);
  	__setup_per_zone_wmarks();
  	mutex_unlock(&zonelists_mutex);
  }
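
  /*
   * Worked example (illustrative, assuming 4 KiB pages): with
   * min_free_kbytes = 4096, pages_min = 4096 >> (12 - 10) = 1024 pages.
   * A lowmem zone holding half of all lowmem then ends up with
   *
   *   WMARK_MIN  = 1024 / 2          = 512 pages
   *   WMARK_LOW  = 512 + (512 >> 2)  = 640 pages
   *   WMARK_HIGH = 512 + (512 >> 1)  = 768 pages
   *
   * kswapd is woken once free pages drop below WMARK_LOW and goes back to
   * sleep once the zone is balanced above WMARK_HIGH again.
   */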
55a4462af   Randy Dunlap   page_alloc: fix k...
5167
  /*
556adecba   Rik van Riel   vmscan: second ch...
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
   *
   * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
   * INACTIVE_ANON pages on this zone's LRU, maintained by the
   * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
   * the anonymous pages are kept on the inactive list.
   *
   * total     target    max
   * memory    ratio     inactive anon
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
5188
  static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
556adecba   Rik van Riel   vmscan: second ch...
5189
  {
96cb4df5d   Minchan Kim   page-allocator: a...
5190
  	unsigned int gb, ratio;
556adecba   Rik van Riel   vmscan: second ch...
5191

96cb4df5d   Minchan Kim   page-allocator: a...
5192
  	/* Zone size in gigabytes */
b40da0494   Jiang Liu   mm: use zone->pre...
5193
  	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
96cb4df5d   Minchan Kim   page-allocator: a...
5194
  	if (gb)
556adecba   Rik van Riel   vmscan: second ch...
5195
  		ratio = int_sqrt(10 * gb);
96cb4df5d   Minchan Kim   page-allocator: a...
5196
5197
  	else
  		ratio = 1;
556adecba   Rik van Riel   vmscan: second ch...
5198

96cb4df5d   Minchan Kim   page-allocator: a...
5199
5200
  	zone->inactive_ratio = ratio;
  }
556adecba   Rik van Riel   vmscan: second ch...
5201

839a4fcc8   KOSAKI Motohiro   mm, mem-hotplug: ...
5202
  static void __meminit setup_per_zone_inactive_ratio(void)
96cb4df5d   Minchan Kim   page-allocator: a...
5203
5204
5205
5206
5207
  {
  	struct zone *zone;
  
  	for_each_zone(zone)
  		calculate_zone_inactive_ratio(zone);
556adecba   Rik van Riel   vmscan: second ch...
5208
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
  /*
   * Initialise min_free_kbytes.
   *
   * For small machines we want it small (128k min).  For large machines
   * we want it large (64MB max).  But it is not linear, because network
   * bandwidth does not increase linearly with machine size.  We use
   *
   * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
   *
   * which yields
   *
   * 16MB:	512k
   * 32MB:	724k
   * 64MB:	1024k
   * 128MB:	1448k
   * 256MB:	2048k
   * 512MB:	2896k
   * 1024MB:	4096k
   * 2048MB:	5792k
   * 4096MB:	8192k
   * 8192MB:	11584k
   * 16384MB:	16384k
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
5233
  int __meminit init_per_zone_wmark_min(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
  {
  	unsigned long lowmem_kbytes;
  
  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
  
  	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
  	if (min_free_kbytes < 128)
  		min_free_kbytes = 128;
  	if (min_free_kbytes > 65536)
  		min_free_kbytes = 65536;
bc75d33f0   Minchan Kim   page-allocator: c...
5244
  	setup_per_zone_wmarks();
a6cccdc36   KOSAKI Motohiro   mm, mem-hotplug: ...
5245
  	refresh_zone_stat_thresholds();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5246
  	setup_per_zone_lowmem_reserve();
556adecba   Rik van Riel   vmscan: second ch...
5247
  	setup_per_zone_inactive_ratio();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5248
5249
  	return 0;
  }
bc75d33f0   Minchan Kim   page-allocator: c...
5250
  module_init(init_per_zone_wmark_min)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5251
5252
5253
5254
5255
5256
5257
  
  /*
   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
   *	that we can call setup_per_zone_wmarks() whenever min_free_kbytes
   *	changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5258
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5259
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5260
  	proc_dointvec(table, write, buffer, length, ppos);
3b1d92c56   Mel Gorman   Do not disable in...
5261
  	if (write)
bc75d33f0   Minchan Kim   page-allocator: c...
5262
  		setup_per_zone_wmarks();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5263
5264
  	return 0;
  }
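
  /*
   * Illustrative userspace sketch (not part of this file): the handler above
   * runs whenever vm.min_free_kbytes is written, for example through
   * /proc/sys/vm/min_free_kbytes, and rebuilds the per-zone watermarks.
   */
  #if 0
  #include <stdio.h>

  int main(void)
  {
  	FILE *f = fopen("/proc/sys/vm/min_free_kbytes", "w");

  	if (!f)
  		return 1;
  	/* Writing a new floor re-runs setup_per_zone_wmarks() in the kernel. */
  	fprintf(f, "%d\n", 65536);
  	return fclose(f) ? 1 : 0;
  }
  #endif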
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5265
5266
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5267
  	void __user *buffer, size_t *length, loff_t *ppos)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5268
5269
5270
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5271
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5272
5273
5274
5275
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
b40da0494   Jiang Liu   mm: use zone->pre...
5276
  		zone->min_unmapped_pages = (zone->managed_pages *
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5277
5278
5279
  				sysctl_min_unmapped_ratio) / 100;
  	return 0;
  }
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5280
5281
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5282
  	void __user *buffer, size_t *length, loff_t *ppos)
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5283
5284
5285
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5286
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5287
5288
5289
5290
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
b40da0494   Jiang Liu   mm: use zone->pre...
5291
  		zone->min_slab_pages = (zone->managed_pages *
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5292
5293
5294
  				sysctl_min_slab_ratio) / 100;
  	return 0;
  }
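
  /*
   * Worked example (illustrative): with sysctl_min_unmapped_ratio = 1 and
   * sysctl_min_slab_ratio = 5 (their usual defaults), a zone with 1,000,000
   * managed pages gets min_unmapped_pages = 10,000 and min_slab_pages =
   * 50,000, the thresholds zone reclaim compares against before it bothers
   * reclaiming page cache or slab from that zone.
   */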
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5295
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5296
5297
5298
5299
5300
5301
  /*
   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
   *	whenever sysctl_lowmem_reserve_ratio changes.
   *
   * The reserve ratio obviously has absolutely no relation with the
418589663   Mel Gorman   page allocator: u...
5302
   * minimum watermarks. The lowmem reserve ratio can only make sense
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5303
5304
5305
   * as a function of the boot-time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5306
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5307
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5308
  	proc_dointvec_minmax(table, write, buffer, length, ppos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5309
5310
5311
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5312
5313
5314
5315
5316
5317
5318
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
   * can have before it gets flushed back to the buddy allocator.
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5319
  	void __user *buffer, size_t *length, loff_t *ppos)
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5320
5321
5322
5323
  {
  	struct zone *zone;
  	unsigned int cpu;
  	int ret;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5324
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
93278814d   Sasha Levin   mm: fix division ...
5325
  	if (!write || (ret < 0))
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5326
  		return ret;
364df0ebf   Dimitri Sivanich   mm: fix handling ...
5327
  	for_each_populated_zone(zone) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
5328
  		for_each_possible_cpu(cpu) {
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5329
  			unsigned long  high;
b40da0494   Jiang Liu   mm: use zone->pre...
5330
  			high = zone->managed_pages / percpu_pagelist_fraction;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
5331
5332
  			setup_pagelist_highmark(
  				per_cpu_ptr(zone->pageset, cpu), high);
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5333
5334
5335
5336
  		}
  	}
  	return 0;
  }
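
  /*
   * Worked example (illustrative): writing 100 to vm.percpu_pagelist_fraction
   * on a zone with 1,000,000 managed pages sets each CPU's pcp->high to
   * 1,000,000 / 100 = 10,000 pages, so a per-cpu pagelist is drained back to
   * the buddy lists once it grows past 1% of the zone.
   */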
f034b5d4e   David S. Miller   [XFRM]: Dynamic x...
5337
  int hashdist = HASHDIST_DEFAULT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
  
  #ifdef CONFIG_NUMA
  static int __init set_hashdist(char *str)
  {
  	if (!str)
  		return 0;
  	hashdist = simple_strtoul(str, &str, 0);
  	return 1;
  }
  __setup("hashdist=", set_hashdist);
  #endif
  
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
   *   quantity of entries
   * - limit is the number of hash buckets, not the total allocation size
   */
  void *__init alloc_large_system_hash(const char *tablename,
  				     unsigned long bucketsize,
  				     unsigned long numentries,
  				     int scale,
  				     int flags,
  				     unsigned int *_hash_shift,
  				     unsigned int *_hash_mask,
31fe62b95   Tim Bird   mm: add a low lim...
5363
5364
  				     unsigned long low_limit,
  				     unsigned long high_limit)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5365
  {
31fe62b95   Tim Bird   mm: add a low lim...
5366
  	unsigned long long max = high_limit;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5367
5368
5369
5370
5371
5372
  	unsigned long log2qty, size;
  	void *table = NULL;
  
  	/* allow the kernel cmdline to have a say */
  	if (!numentries) {
  		/* round applicable memory size up to nearest megabyte */
049036643   Andrew Morton   [PATCH] remove HA...
5373
  		numentries = nr_kernel_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5374
5375
5376
5377
5378
5379
5380
5381
5382
  		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
  		numentries >>= 20 - PAGE_SHIFT;
  		numentries <<= 20 - PAGE_SHIFT;
  
  		/* limit to 1 bucket per 2^scale bytes of low memory */
  		if (scale > PAGE_SHIFT)
  			numentries >>= (scale - PAGE_SHIFT);
  		else
  			numentries <<= (PAGE_SHIFT - scale);
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5383
5384
  
  		/* Make sure we've got at least a 0-order allocation.. */
2c85f51d2   Jan Beulich   mm: also use allo...
5385
5386
5387
5388
5389
5390
5391
5392
  		if (unlikely(flags & HASH_SMALL)) {
  			/* Makes no sense without HASH_EARLY */
  			WARN_ON(!(flags & HASH_EARLY));
  			if (!(numentries >> *_hash_shift)) {
  				numentries = 1UL << *_hash_shift;
  				BUG_ON(!numentries);
  			}
  		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5393
  			numentries = PAGE_SIZE / bucketsize;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5394
  	}
6e692ed37   John Hawkes   [PATCH] fix alloc...
5395
  	numentries = roundup_pow_of_two(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5396
5397
5398
5399
5400
5401
  
  	/* limit allocation size to 1/16 total memory by default */
  	if (max == 0) {
  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
  		do_div(max, bucketsize);
  	}
074b85175   Dimitri Sivanich   vfs: fix panic in...
5402
  	max = min(max, 0x80000000ULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5403

31fe62b95   Tim Bird   mm: add a low lim...
5404
5405
  	if (numentries < low_limit)
  		numentries = low_limit;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5406
5407
  	if (numentries > max)
  		numentries = max;
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5408
  	log2qty = ilog2(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5409
5410
5411
5412
  
  	do {
  		size = bucketsize << log2qty;
  		if (flags & HASH_EARLY)
74768ed83   Jan Beulich   page allocator: u...
5413
  			table = alloc_bootmem_nopanic(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5414
5415
5416
  		else if (hashdist)
  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
  		else {
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5417
5418
  			/*
  			 * If bucketsize is not a power-of-two, we may free
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5419
5420
  			 * some pages at the end of the hash table, which
  			 * alloc_pages_exact() does automatically.
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5421
  			 */
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5422
  			if (get_order(size) < MAX_ORDER) {
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5423
  				table = alloc_pages_exact(size, GFP_ATOMIC);
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5424
5425
  				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5426
5427
5428
5429
5430
5431
  		}
  	} while (!table && size > PAGE_SIZE && --log2qty);
  
  	if (!table)
  		panic("Failed to allocate %s hash table
  ", tablename);
f241e6607   Robin Holt   mm: alloc_large_s...
5432
5433
  	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)
  ",
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5434
  	       tablename,
f241e6607   Robin Holt   mm: alloc_large_s...
5435
  	       (1UL << log2qty),
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5436
  	       ilog2(size) - PAGE_SHIFT,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5437
5438
5439
5440
5441
5442
5443
5444
5445
  	       size);
  
  	if (_hash_shift)
  		*_hash_shift = log2qty;
  	if (_hash_mask)
  		*_hash_mask = (1 << log2qty) - 1;
  
  	return table;
  }
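
  /*
   * Illustrative sketch of a typical boot-time caller (hypothetical names,
   * not taken from this file): a subsystem lets the table be sized from the
   * amount of memory by passing numentries == 0, with one bucket per 16 KB
   * of low memory (scale == 14).
   */
  #if 0
  static struct hlist_head *example_table __read_mostly;
  static unsigned int example_hash_shift;
  static unsigned int example_hash_mask;

  static void __init example_hash_init(void)
  {
  	example_table = alloc_large_system_hash("Example cache",
  						sizeof(struct hlist_head),
  						0,	/* size from memory */
  						14,	/* 1 bucket per 16 KB */
  						HASH_EARLY,
  						&example_hash_shift,
  						&example_hash_mask,
  						0, 0);
  }
  #endif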
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
5446

835c134ec   Mel Gorman   Add a bitmap that...
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
  /* Return a pointer to the bitmap storing bits affecting a block of pages */
  static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
  							unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	return __pfn_to_section(pfn)->pageblock_flags;
  #else
  	return zone->pageblock_flags;
  #endif /* CONFIG_SPARSEMEM */
  }
  
  static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	pfn &= (PAGES_PER_SECTION-1);
d9c234005   Mel Gorman   Do not depend on ...
5462
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5463
  #else
c060f943d   Laura Abbott   mm: use aligned z...
5464
  	pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
d9c234005   Mel Gorman   Do not depend on ...
5465
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5466
5467
5468
5469
  #endif /* CONFIG_SPARSEMEM */
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5470
   * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest to retrieve
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
  unsigned long get_pageblock_flags_group(struct page *page,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long flags = 0;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (test_bit(bitidx + start_bitidx, bitmap))
  			flags |= value;
6220ec784   Andrew Morton   [PATCH] highest_p...
5493

835c134ec   Mel Gorman   Add a bitmap that...
5494
5495
5496
5497
  	return flags;
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5498
   * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
  void set_pageblock_flags_group(struct page *page, unsigned long flags,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
108bcc96e   Cody P Schafer   mm: add & use zon...
5516
  	VM_BUG_ON(!zone_spans_pfn(zone, pfn));
835c134ec   Mel Gorman   Add a bitmap that...
5517
5518
5519
5520
5521
5522
5523
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (flags & value)
  			__set_bit(bitidx + start_bitidx, bitmap);
  		else
  			__clear_bit(bitidx + start_bitidx, bitmap);
  }
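
  /*
   * Hedged aside: the migratetype accessors used throughout this file are
   * thin wrappers around the two helpers above, operating on the
   * PB_migrate..PB_migrate_end bit group of a pageblock.  Roughly (sketch
   * only; see include/linux/pageblock-flags.h for the real definitions):
   */
  #if 0
  #define get_pageblock_migratetype(page)					\
  	get_pageblock_flags_group(page, PB_migrate, PB_migrate_end)

  static void example_set_migratetype(struct page *page, int migratetype)
  {
  	set_pageblock_flags_group(page, (unsigned long)migratetype,
  					PB_migrate, PB_migrate_end);
  }
  #endif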
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5524
5525
  
  /*
80934513b   Minchan Kim   mm: clean up __co...
5526
5527
5528
5529
5530
5531
   * This function checks whether the pageblock includes unmovable pages or not.
   * If @count is not zero, it is okay to include up to @count unmovable pages.
   *
   * The PageLRU check without isolation or lru_lock could race so that a
   * MIGRATE_MOVABLE block might include unmovable pages.  This means the
   * result cannot be expected to be exact.
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5532
   */
b023f4681   Wen Congyang   memory-hotplug: s...
5533
5534
  bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
  			 bool skip_hwpoisoned_pages)
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5535
5536
  {
  	unsigned long pfn, iter, found;
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
5537
  	int mt;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5538
5539
  	/*
  	 * To avoid noisy data, lru_add_drain_all() should be called beforehand.
80934513b   Minchan Kim   mm: clean up __co...
5540
  	 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5541
5542
  	 */
  	if (zone_idx(zone) == ZONE_MOVABLE)
80934513b   Minchan Kim   mm: clean up __co...
5543
  		return false;
47118af07   Michal Nazarewicz   mm: mmzone: MIGRA...
5544
5545
  	mt = get_pageblock_migratetype(page);
  	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
80934513b   Minchan Kim   mm: clean up __co...
5546
  		return false;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5547
5548
5549
5550
  
  	pfn = page_to_pfn(page);
  	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
  		unsigned long check = pfn + iter;
29723fccc   Namhyung Kim   mm: fix dubious c...
5551
  		if (!pfn_valid_within(check))
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5552
  			continue;
29723fccc   Namhyung Kim   mm: fix dubious c...
5553

49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5554
  		page = pfn_to_page(check);
97d255c81   Minchan Kim   mm: do not use pa...
5555
5556
5557
5558
5559
5560
5561
  		/*
  		 * We can't use page_count() without pinning the page
  		 * because another CPU can free the compound page.
  		 * This check already skips compound tails of THP
  		 * because their page->_count is zero at all times.
  		 */
  		if (!atomic_read(&page->_count)) {
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5562
5563
5564
5565
  			if (PageBuddy(page))
  				iter += (1 << page_order(page)) - 1;
  			continue;
  		}
97d255c81   Minchan Kim   mm: do not use pa...
5566

b023f4681   Wen Congyang   memory-hotplug: s...
5567
5568
5569
5570
5571
5572
  		/*
  		 * The HWPoisoned page may not be in the buddy system, and
  		 * its page_count() is not 0.
  		 */
  		if (skip_hwpoisoned_pages && PageHWPoison(page))
  			continue;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
  		if (!PageLRU(page))
  			found++;
  		/*
  		 * If there are RECLAIMABLE pages, we need to check them.
  		 * But for now memory offlining itself doesn't call shrink_slab()
  		 * and that still needs to be fixed.
  		 */
  		/*
  		 * If the page is not RAM, page_count() should be 0 and we
  		 * don't need any further check.  This is a _used_, non-movable page.
  		 *
  		 * The problematic thing here is PG_reserved pages.  PG_reserved
  		 * is set for both memory hole pages and _used_ kernel pages
  		 * at boot.
  		 */
  		if (found > count)
80934513b   Minchan Kim   mm: clean up __co...
5589
  			return true;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5590
  	}
80934513b   Minchan Kim   mm: clean up __co...
5591
  	return false;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5592
5593
5594
5595
  }
  
  bool is_pageblock_removable_nolock(struct page *page)
  {
656a07062   Michal Hocko   mm: __count_immob...
5596
5597
  	struct zone *zone;
  	unsigned long pfn;
687875fb7   Michal Hocko   mm: fix NULL ptr ...
5598
5599
5600
5601
5602
  
  	/*
  	 * We have to be careful here because we are iterating over memory
  	 * sections which are not zone aware so we might end up outside of
  	 * the zone but still within the section.
656a07062   Michal Hocko   mm: __count_immob...
5603
5604
  	 * We also have to take care of the node: if the node is offline,
  	 * its NODE_DATA will be NULL - see page_zone().
687875fb7   Michal Hocko   mm: fix NULL ptr ...
5605
  	 */
656a07062   Michal Hocko   mm: __count_immob...
5606
5607
5608
5609
5610
  	if (!node_online(page_to_nid(page)))
  		return false;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
108bcc96e   Cody P Schafer   mm: add & use zon...
5611
  	if (!zone_spans_pfn(zone, pfn))
687875fb7   Michal Hocko   mm: fix NULL ptr ...
5612
  		return false;
b023f4681   Wen Congyang   memory-hotplug: s...
5613
  	return !has_unmovable_pages(zone, page, 0, true);
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5614
  }
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5615

041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
  #ifdef CONFIG_CMA
  
  static unsigned long pfn_max_align_down(unsigned long pfn)
  {
  	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
  			     pageblock_nr_pages) - 1);
  }
  
  static unsigned long pfn_max_align_up(unsigned long pfn)
  {
  	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
  				pageblock_nr_pages));
  }
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5629
  /* [start, end) must belong to a single zone. */
bb13ffeb9   Mel Gorman   mm: compaction: c...
5630
5631
  static int __alloc_contig_migrate_range(struct compact_control *cc,
  					unsigned long start, unsigned long end)
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5632
5633
  {
  	/* This function is based on compact_zone() from compaction.c. */
beb51eaa8   Minchan Kim   cma: decrease cc....
5634
  	unsigned long nr_reclaimed;
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5635
5636
5637
  	unsigned long pfn = start;
  	unsigned int tries = 0;
  	int ret = 0;
be49a6e13   Marek Szyprowski   mm: use migrate_p...
5638
  	migrate_prep();
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5639

bb13ffeb9   Mel Gorman   mm: compaction: c...
5640
  	while (pfn < end || !list_empty(&cc->migratepages)) {
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5641
5642
5643
5644
  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
bb13ffeb9   Mel Gorman   mm: compaction: c...
5645
5646
5647
  		if (list_empty(&cc->migratepages)) {
  			cc->nr_migratepages = 0;
  			pfn = isolate_migratepages_range(cc->zone, cc,
e46a28790   Minchan Kim   CMA: migrate mloc...
5648
  							 pfn, end, true);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5649
5650
5651
5652
5653
5654
5655
5656
5657
  			if (!pfn) {
  				ret = -EINTR;
  				break;
  			}
  			tries = 0;
  		} else if (++tries == 5) {
  			ret = ret < 0 ? ret : -EBUSY;
  			break;
  		}
beb51eaa8   Minchan Kim   cma: decrease cc....
5658
5659
5660
  		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
  							&cc->migratepages);
  		cc->nr_migratepages -= nr_reclaimed;
02c6de8d7   Minchan Kim   mm: cma: discard ...
5661

9c620e2bc   Hugh Dickins   mm: remove offlin...
5662
5663
  		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
  				    0, MIGRATE_SYNC, MR_CMA);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5664
  	}
2a6f51241   Srinivas Pandruvada   CMA: make putback...
5665
5666
5667
5668
5669
  	if (ret < 0) {
  		putback_movable_pages(&cc->migratepages);
  		return ret;
  	}
  	return 0;
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5670
5671
5672
5673
5674
5675
  }
  
  /**
   * alloc_contig_range() -- tries to allocate given range of pages
   * @start:	start PFN to allocate
   * @end:	one-past-the-last PFN to allocate
0815f3d81   Michal Nazarewicz   mm: page_isolatio...
5676
5677
5678
5679
   * @migratetype:	migratetype of the underlying pageblocks (either
   *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
   *			in range must have the same migratetype and it must
   *			be either of the two.
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
   *
   * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
   * aligned, however it's the caller's responsibility to guarantee that
   * we are the only thread that changes the migrate type of the pageblocks the
   * pages fall in.
   *
   * The PFN range must belong to a single zone.
   *
   * Returns zero on success or negative error code.  On success all
   * pages whose PFN is in [start, end) are allocated for the caller and
   * need to be freed with free_contig_range().
   */
0815f3d81   Michal Nazarewicz   mm: page_isolatio...
5692
5693
  int alloc_contig_range(unsigned long start, unsigned long end,
  		       unsigned migratetype)
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5694
  {
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5695
5696
  	unsigned long outer_start, outer_end;
  	int ret = 0, order;
bb13ffeb9   Mel Gorman   mm: compaction: c...
5697
5698
5699
5700
5701
5702
5703
5704
  	struct compact_control cc = {
  		.nr_migratepages = 0,
  		.order = -1,
  		.zone = page_zone(pfn_to_page(start)),
  		.sync = true,
  		.ignore_skip_hint = true,
  	};
  	INIT_LIST_HEAD(&cc.migratepages);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
  	/*
  	 * What we do here is mark all pageblocks in the range as
  	 * MIGRATE_ISOLATE.  Because pageblocks and max order pages may
  	 * have different sizes, and due to the way the page allocator
  	 * works, we align the range to the biggest of the two sizes so
  	 * that the page allocator won't try to merge buddies from
  	 * different pageblocks and change MIGRATE_ISOLATE to some
  	 * other migration type.
  	 *
  	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
  	 * migrate the pages from the unaligned range (i.e. the pages that
  	 * we are interested in).  This will put all the pages in the
  	 * range back to the page allocator as MIGRATE_ISOLATE.
  	 *
  	 * When this is done, we take the pages in the range from the page
  	 * allocator, removing them from the buddy system.  This way the
  	 * page allocator will never consider using them.
  	 *
  	 * This lets us mark the pageblocks back as
  	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
  	 * aligned range but not in the unaligned, original range are
  	 * put back to the page allocator so that buddy can use them.
  	 */
  
  	ret = start_isolate_page_range(pfn_max_align_down(start),
b023f4681   Wen Congyang   memory-hotplug: s...
5730
5731
  				       pfn_max_align_up(end), migratetype,
  				       false);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5732
  	if (ret)
86a595f96   Bob Liu   mm/page_alloc.c:a...
5733
  		return ret;
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5734

bb13ffeb9   Mel Gorman   mm: compaction: c...
5735
  	ret = __alloc_contig_migrate_range(&cc, start, end);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
  	if (ret)
  		goto done;
  
  	/*
  	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
  	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
  	 * more, all pages in [start, end) are free in page allocator.
  	 * What we are going to do is to allocate all pages from
  	 * [start, end) (that is remove them from page allocator).
  	 *
  	 * The only problem is that pages at the beginning and at the
  	 * end of interesting range may be not aligned with pages that
  	 * page allocator holds, ie. they can be part of higher order
  	 * pages.  Because of this, we reserve the bigger range and
  	 * once this is done free the pages we are not interested in.
  	 *
  	 * We don't have to hold zone->lock here because the pages are
  	 * isolated thus they won't get removed from buddy.
  	 */
  
  	lru_add_drain_all();
  	drain_all_pages();
  
  	order = 0;
  	outer_start = start;
  	while (!PageBuddy(pfn_to_page(outer_start))) {
  		if (++order >= MAX_ORDER) {
  			ret = -EBUSY;
  			goto done;
  		}
  		outer_start &= ~0UL << order;
  	}
  
  	/* Make sure the range is really isolated. */
b023f4681   Wen Congyang   memory-hotplug: s...
5770
  	if (test_pages_isolated(outer_start, end, false)) {
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5771
5772
5773
5774
5775
5776
  		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed
  ",
  		       outer_start, end);
  		ret = -EBUSY;
  		goto done;
  	}
49f223a9c   Marek Szyprowski   mm: trigger page ...
5777
5778
  
  	/* Grab isolated pages from freelists. */
bb13ffeb9   Mel Gorman   mm: compaction: c...
5779
  	outer_end = isolate_freepages_range(&cc, outer_start, end);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
  	if (!outer_end) {
  		ret = -EBUSY;
  		goto done;
  	}
  
  	/* Free head and tail (if any) */
  	if (start != outer_start)
  		free_contig_range(outer_start, start - outer_start);
  	if (end != outer_end)
  		free_contig_range(end, outer_end - end);
  
  done:
  	undo_isolate_page_range(pfn_max_align_down(start),
0815f3d81   Michal Nazarewicz   mm: page_isolatio...
5793
  				pfn_max_align_up(end), migratetype);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5794
5795
5796
5797
5798
  	return ret;
  }
  
  void free_contig_range(unsigned long pfn, unsigned nr_pages)
  {
bcc2b02f4   Marek Szyprowski   mm: cma: WARN if ...
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
  	unsigned int count = 0;
  
  	for (; nr_pages--; pfn++) {
  		struct page *page = pfn_to_page(pfn);
  
  		count += page_count(page) != 1;
  		__free_page(page);
  	}
  	WARN(count != 0, "%d pages are still in use!\n", count);
041d3a8cd   Michal Nazarewicz   mm: page_alloc: i...
5809
5810
  }
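
  /*
   * Illustrative sketch of a CMA-style user (hypothetical helper names, not
   * taken from this file): grab a physically contiguous range of pages and
   * hand it back later with free_contig_range().
   */
  #if 0
  static struct page *example_alloc_contig(unsigned long base_pfn,
  					 unsigned long nr_pages)
  {
  	if (alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA))
  		return NULL;
  	return pfn_to_page(base_pfn);
  }

  static void example_release_contig(struct page *page, unsigned long nr_pages)
  {
  	free_contig_range(page_to_pfn(page), nr_pages);
  }
  #endif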
  #endif
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
  #ifdef CONFIG_MEMORY_HOTPLUG
  static int __meminit __zone_pcp_update(void *data)
  {
  	struct zone *zone = data;
  	int cpu;
  	unsigned long batch = zone_batchsize(zone), flags;
  
  	for_each_possible_cpu(cpu) {
  		struct per_cpu_pageset *pset;
  		struct per_cpu_pages *pcp;
  
  		pset = per_cpu_ptr(zone->pageset, cpu);
  		pcp = &pset->pcp;
  
  		local_irq_save(flags);
  		if (pcp->count > 0)
  			free_pcppages_bulk(zone, pcp->count, pcp);
5a8838138   Minchan Kim   memory-hotplug: f...
5828
  		drain_zonestat(zone, pset);
4ed7e0222   Jiang Liu   mm/hotplug: mark ...
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
  		setup_pageset(pset, batch);
  		local_irq_restore(flags);
  	}
  	return 0;
  }
  
  void __meminit zone_pcp_update(struct zone *zone)
  {
  	stop_machine(__zone_pcp_update, zone, NULL);
  }
  #endif
340175b7d   Jiang Liu   mm/hotplug: free ...
5840
5841
5842
  void zone_pcp_reset(struct zone *zone)
  {
  	unsigned long flags;
5a8838138   Minchan Kim   memory-hotplug: f...
5843
5844
  	int cpu;
  	struct per_cpu_pageset *pset;
340175b7d   Jiang Liu   mm/hotplug: free ...
5845
5846
5847
5848
  
  	/* avoid races with drain_pages()  */
  	local_irq_save(flags);
  	if (zone->pageset != &boot_pageset) {
5a8838138   Minchan Kim   memory-hotplug: f...
5849
5850
5851
5852
  		for_each_online_cpu(cpu) {
  			pset = per_cpu_ptr(zone->pageset, cpu);
  			drain_zonestat(zone, pset);
  		}
340175b7d   Jiang Liu   mm/hotplug: free ...
5853
5854
5855
5856
5857
  		free_percpu(zone->pageset);
  		zone->pageset = &boot_pageset;
  	}
  	local_irq_restore(flags);
  }
6dcd73d70   Wen Congyang   memory-hotplug: a...
5858
  #ifdef CONFIG_MEMORY_HOTREMOVE
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
  /*
   * All pages in the range must be isolated before calling this.
   */
  void
  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  {
  	struct page *page;
  	struct zone *zone;
  	int order, i;
  	unsigned long pfn;
  	unsigned long flags;
  	/* find the first valid pfn */
  	for (pfn = start_pfn; pfn < end_pfn; pfn++)
  		if (pfn_valid(pfn))
  			break;
  	if (pfn == end_pfn)
  		return;
  	zone = page_zone(pfn_to_page(pfn));
  	spin_lock_irqsave(&zone->lock, flags);
  	pfn = start_pfn;
  	while (pfn < end_pfn) {
  		if (!pfn_valid(pfn)) {
  			pfn++;
  			continue;
  		}
  		page = pfn_to_page(pfn);
b023f4681   Wen Congyang   memory-hotplug: s...
5885
5886
5887
5888
5889
5890
5891
5892
5893
  		/*
  		 * The HWPoisoned page may not be in the buddy system, and
  		 * its page_count() is not 0.
  		 */
  		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
  			pfn++;
  			SetPageReserved(page);
  			continue;
  		}
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
  		BUG_ON(page_count(page));
  		BUG_ON(!PageBuddy(page));
  		order = page_order(page);
  #ifdef CONFIG_DEBUG_VM
  		printk(KERN_INFO "remove from free list %lx %d %lx
  ",
  		       pfn, 1 << order, end_pfn);
  #endif
  		list_del(&page->lru);
  		rmv_page_order(page);
  		zone->free_area[order].nr_free--;
764225780   Wanpeng Li   mm/memory-hotplug...
5905
5906
5907
5908
  #ifdef CONFIG_HIGHMEM
  		if (PageHighMem(page))
  			totalhigh_pages -= 1 << order;
  #endif
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5909
5910
5911
5912
5913
5914
5915
  		for (i = 0; i < (1 << order); i++)
  			SetPageReserved((page+i));
  		pfn += (1 << order);
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  #endif
8d22ba1b7   Wu Fengguang   HWPOISON: detect ...
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
  
  #ifdef CONFIG_MEMORY_FAILURE
  bool is_free_buddy_page(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	unsigned long pfn = page_to_pfn(page);
  	unsigned long flags;
  	int order;
  
  	spin_lock_irqsave(&zone->lock, flags);
  	for (order = 0; order < MAX_ORDER; order++) {
  		struct page *page_head = page - (pfn & ((1 << order) - 1));
  
  		if (PageBuddy(page_head) && page_order(page_head) >= order)
  			break;
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  
  	return order < MAX_ORDER;
  }
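
  /*
   * Worked example (illustrative): for pfn 0x12345 the loop above probes the
   * candidate buddy heads 0x12345 (order 0), 0x12344 (orders 1-2), 0x12340
   * (orders 3-6), 0x12300 (order 7), and so on, since
   * page - (pfn & ((1 << order) - 1)) masks off the low "order" bits of the
   * pfn; the page is free exactly when one of those heads is PageBuddy with
   * an order large enough to cover it.
   */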
  #endif
718a38211   Wu Fengguang   mm: introduce dum...
5937

51300cef4   Andrew Morton   mm/page_alloc.c: ...
5938
  static const struct trace_print_flags pageflag_names[] = {
718a38211   Wu Fengguang   mm: introduce dum...
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
  	{1UL << PG_locked,		"locked"	},
  	{1UL << PG_error,		"error"		},
  	{1UL << PG_referenced,		"referenced"	},
  	{1UL << PG_uptodate,		"uptodate"	},
  	{1UL << PG_dirty,		"dirty"		},
  	{1UL << PG_lru,			"lru"		},
  	{1UL << PG_active,		"active"	},
  	{1UL << PG_slab,		"slab"		},
  	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
  	{1UL << PG_arch_1,		"arch_1"	},
  	{1UL << PG_reserved,		"reserved"	},
  	{1UL << PG_private,		"private"	},
  	{1UL << PG_private_2,		"private_2"	},
  	{1UL << PG_writeback,		"writeback"	},
  #ifdef CONFIG_PAGEFLAGS_EXTENDED
  	{1UL << PG_head,		"head"		},
  	{1UL << PG_tail,		"tail"		},
  #else
  	{1UL << PG_compound,		"compound"	},
  #endif
  	{1UL << PG_swapcache,		"swapcache"	},
  	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
  	{1UL << PG_reclaim,		"reclaim"	},
718a38211   Wu Fengguang   mm: introduce dum...
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
  	{1UL << PG_swapbacked,		"swapbacked"	},
  	{1UL << PG_unevictable,		"unevictable"	},
  #ifdef CONFIG_MMU
  	{1UL << PG_mlocked,		"mlocked"	},
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
  	{1UL << PG_uncached,		"uncached"	},
  #endif
  #ifdef CONFIG_MEMORY_FAILURE
  	{1UL << PG_hwpoison,		"hwpoison"	},
  #endif
be9cd873e   Gavin Shan   mm/buddy: dump PG...
5973
5974
5975
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	{1UL << PG_compound_lock,	"compound_lock"	},
  #endif
718a38211   Wu Fengguang   mm: introduce dum...
5976
5977
5978
5979
5980
5981
5982
  };
  
  static void dump_page_flags(unsigned long flags)
  {
  	const char *delim = "";
  	unsigned long mask;
  	int i;
51300cef4   Andrew Morton   mm/page_alloc.c: ...
5983
  	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
acc50c110   Johannes Weiner   mm: page_alloc: c...
5984

718a38211   Wu Fengguang   mm: introduce dum...
5985
5986
5987
5988
  	printk(KERN_ALERT "page flags: %#lx(", flags);
  
  	/* remove zone id */
  	flags &= (1UL << NR_PAGEFLAGS) - 1;
51300cef4   Andrew Morton   mm/page_alloc.c: ...
5989
  	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
718a38211   Wu Fengguang   mm: introduce dum...
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
  
  		mask = pageflag_names[i].mask;
  		if ((flags & mask) != mask)
  			continue;
  
  		flags &= ~mask;
  		printk("%s%s", delim, pageflag_names[i].name);
  		delim = "|";
  	}
  
  	/* check for left over flags */
  	if (flags)
  		printk("%s%#lx", delim, flags);
  
  	printk(")
  ");
  }
  
  void dump_page(struct page *page)
  {
  	printk(KERN_ALERT
  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx
  ",
4e9f64c42   Andrea Arcangeli   thp: fix bad_page...
6013
  		page, atomic_read(&page->_count), page_mapcount(page),
718a38211   Wu Fengguang   mm: introduce dum...
6014
6015
  		page->mapping, page->index);
  	dump_page_flags(page->flags);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
6016
  	mem_cgroup_print_bad_page(page);
718a38211   Wu Fengguang   mm: introduce dum...
6017
  }
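
  /*
   * Illustrative sketch (hypothetical helper, not taken from this file):
   * dump_page() is meant for debug paths that already hold a struct page and
   * want its state in the kernel log, e.g.:
   */
  #if 0
  static void example_report_bad_page(struct page *page)
  {
  	if (WARN_ON(page_count(page) < 0))
  		dump_page(page);
  }
  #endif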