Blame view

mm/page_alloc.c 155 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
  /*
   *  linux/mm/page_alloc.c
   *
   *  Manages the free list, the system allocates free pages here.
   *  Note that kmalloc() lives in slab.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
16
17
18
19
20
  #include <linux/stddef.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/interrupt.h>
  #include <linux/pagemap.h>
10ed273f5   KOSAKI Motohiro   zlc_setup(): hand...
21
  #include <linux/jiffies.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
  #include <linux/bootmem.h>
edbe7d23b   Yinghai Lu   memblock: Add fin...
23
  #include <linux/memblock.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
  #include <linux/compiler.h>
9f1583339   Randy Dunlap   [PATCH] use add_t...
25
  #include <linux/kernel.h>
b1eeab676   Vegard Nossum   kmemcheck: add ho...
26
  #include <linux/kmemcheck.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
27
28
29
30
31
  #include <linux/module.h>
  #include <linux/suspend.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
a238ab5b0   Dave Hansen   mm: break out pag...
32
  #include <linux/ratelimit.h>
5a3135c2e   David Rientjes   oom: move prototy...
33
  #include <linux/oom.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
34
35
36
37
38
  #include <linux/notifier.h>
  #include <linux/topology.h>
  #include <linux/sysctl.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
bdc8cb984   Dave Hansen   [PATCH] memory ho...
39
  #include <linux/memory_hotplug.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40
41
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
a6cccdc36   KOSAKI Motohiro   mm, mem-hotplug: ...
42
  #include <linux/vmstat.h>
4be38e351   Christoph Lameter   [PATCH] mm: move ...
43
  #include <linux/mempolicy.h>
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
44
  #include <linux/stop_machine.h>
c713216de   Mel Gorman   [PATCH] Introduce...
45
46
  #include <linux/sort.h>
  #include <linux/pfn.h>
3fcfab16c   Andrew Morton   [PATCH] separate ...
47
  #include <linux/backing-dev.h>
933e312e7   Akinobu Mita   [PATCH] fault-inj...
48
  #include <linux/fault-inject.h>
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
49
  #include <linux/page-isolation.h>
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
50
  #include <linux/page_cgroup.h>
3ac7fe5a4   Thomas Gleixner   infrastructure to...
51
  #include <linux/debugobjects.h>
dbb1f81ca   Catalin Marinas   kmemleak: Add kme...
52
  #include <linux/kmemleak.h>
925cc71e5   Robert Jennings   mm: Add notifier ...
53
  #include <linux/memory.h>
56de7263f   Mel Gorman   mm: compaction: d...
54
  #include <linux/compaction.h>
0d3d062a6   Mel Gorman   tracing, page-all...
55
  #include <trace/events/kmem.h>
718a38211   Wu Fengguang   mm: introduce dum...
56
  #include <linux/ftrace_event.h>
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
57
  #include <linux/memcontrol.h>
268bb0ce3   Linus Torvalds   sanitize <linux/p...
58
  #include <linux/prefetch.h>
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
59
  #include <linux/page-debug-flags.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
60
61
  
  #include <asm/tlbflush.h>
ac924c603   Andrew Morton   [PATCH] setup_per...
62
  #include <asm/div64.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
63
  #include "internal.h"
728120192   Lee Schermerhorn   numa: add generic...
64
65
66
67
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
  EXPORT_PER_CPU_SYMBOL(numa_node);
  #endif
7aac78988   Lee Schermerhorn   numa: introduce n...
68
69
70
71
72
73
74
75
76
77
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
   * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
   * defined in <linux/topology.h>.
   */
  DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
  EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
78
  /*
138089107   Christoph Lameter   Memoryless nodes:...
79
   * Array of node states.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
80
   */
138089107   Christoph Lameter   Memoryless nodes:...
81
82
83
84
85
86
87
88
89
90
91
92
  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
  	[N_POSSIBLE] = NODE_MASK_ALL,
  	[N_ONLINE] = { { [0] = 1UL } },
  #ifndef CONFIG_NUMA
  	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
  #ifdef CONFIG_HIGHMEM
  	[N_HIGH_MEMORY] = { { [0] = 1UL } },
  #endif
  	[N_CPU] = { { [0] = 1UL } },
  #endif	/* NUMA */
  };
  EXPORT_SYMBOL(node_states);
6c231b7ba   Ravikiran G Thirumalai   [PATCH] Additions...
93
  unsigned long totalram_pages __read_mostly;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
94
  unsigned long totalreserve_pages __read_mostly;
ab8fabd46   Johannes Weiner   mm: exclude reser...
95
96
97
98
99
100
101
  /*
   * When calculating the number of globally allowed dirty pages, there
   * is a certain number of per-zone reserves that should not be
   * considered dirtyable memory.  This is the sum of those reserves
   * over all existing zones that contribute dirtyable memory.
   */
  unsigned long dirty_balance_reserve __read_mostly;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
102
  int percpu_pagelist_fraction;
dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
103
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
104

452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
105
106
107
108
109
110
111
112
113
  #ifdef CONFIG_PM_SLEEP
  /*
   * The following functions are used by the suspend/hibernate code to temporarily
   * change gfp_allowed_mask in order to avoid using I/O during memory allocations
   * while devices are suspended.  To avoid races with the suspend/hibernate code,
   * they should always be called with pm_mutex held (gfp_allowed_mask also should
   * only be modified with pm_mutex held, unless the suspend/hibernate code is
   * guaranteed not to run in parallel with that modification).
   */
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
114
115
116
117
  
  static gfp_t saved_gfp_mask;
  
  void pm_restore_gfp_mask(void)
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
118
119
  {
  	WARN_ON(!mutex_is_locked(&pm_mutex));
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
120
121
122
123
  	if (saved_gfp_mask) {
  		gfp_allowed_mask = saved_gfp_mask;
  		saved_gfp_mask = 0;
  	}
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
124
  }
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
125
  void pm_restrict_gfp_mask(void)
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
126
  {
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
127
  	WARN_ON(!mutex_is_locked(&pm_mutex));
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
128
129
130
  	WARN_ON(saved_gfp_mask);
  	saved_gfp_mask = gfp_allowed_mask;
  	gfp_allowed_mask &= ~GFP_IOFS;
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
131
  }
f90ac3982   Mel Gorman   mm: avoid liveloc...
132
133
134
135
136
137
138
  
  bool pm_suspended_storage(void)
  {
  	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
  		return false;
  	return true;
  }
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
139
  #endif /* CONFIG_PM_SLEEP */
d9c234005   Mel Gorman   Do not depend on ...
140
141
142
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  int pageblock_order __read_mostly;
  #endif
d98c7a098   Hugh Dickins   [PATCH] compound ...
143
  static void __free_pages_ok(struct page *page, unsigned int order);
a226f6c89   David Howells   [PATCH] FRV: Clea...
144

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
145
146
147
148
149
150
151
  /*
   * results with 256, 32 in the lowmem_reserve sysctl:
   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
   *	1G machine -> (16M dma, 784M normal, 224M high)
   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
   *	HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
a2f1b4249   Andi Kleen   [PATCH] x86_64: A...
152
153
154
   *
   * TBD: should special case ZONE_DMA32 machines here - in those we normally
   * don't need any ZONE_NORMAL reservation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155
   */
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
156
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
4b51d6698   Christoph Lameter   [PATCH] optional ...
157
  #ifdef CONFIG_ZONE_DMA
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
158
  	 256,
4b51d6698   Christoph Lameter   [PATCH] optional ...
159
  #endif
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
160
  #ifdef CONFIG_ZONE_DMA32
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
161
  	 256,
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
162
  #endif
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
163
  #ifdef CONFIG_HIGHMEM
2a1e274ac   Mel Gorman   Create the ZONE_M...
164
  	 32,
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
165
  #endif
2a1e274ac   Mel Gorman   Create the ZONE_M...
166
  	 32,
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
167
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
168
169
  
  EXPORT_SYMBOL(totalram_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
170

15ad7cdcf   Helge Deller   [PATCH] struct se...
171
  static char * const zone_names[MAX_NR_ZONES] = {
4b51d6698   Christoph Lameter   [PATCH] optional ...
172
  #ifdef CONFIG_ZONE_DMA
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
173
  	 "DMA",
4b51d6698   Christoph Lameter   [PATCH] optional ...
174
  #endif
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
175
  #ifdef CONFIG_ZONE_DMA32
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
176
  	 "DMA32",
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
177
  #endif
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
178
  	 "Normal",
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
179
  #ifdef CONFIG_HIGHMEM
2a1e274ac   Mel Gorman   Create the ZONE_M...
180
  	 "HighMem",
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
181
  #endif
2a1e274ac   Mel Gorman   Create the ZONE_M...
182
  	 "Movable",
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
183
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
184
  int min_free_kbytes = 1024;
2c85f51d2   Jan Beulich   mm: also use allo...
185
186
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
a3142c8e1   Yasunori Goto   Fix section misma...
187
  static unsigned long __meminitdata dma_reserve;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
188

0ee332c14   Tejun Heo   memblock: Kill ea...
189
190
191
192
193
194
195
196
197
198
199
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __initdata required_kernelcore;
  static unsigned long __initdata required_movablecore;
  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
  
  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
  int movable_zone;
  EXPORT_SYMBOL(movable_zone);
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
200

418508c13   Miklos Szeredi   fix unused setup_...
201
202
  #if MAX_NUMNODES > 1
  int nr_node_ids __read_mostly = MAX_NUMNODES;
62bc62a87   Christoph Lameter   page allocator: u...
203
  int nr_online_nodes __read_mostly = 1;
418508c13   Miklos Szeredi   fix unused setup_...
204
  EXPORT_SYMBOL(nr_node_ids);
62bc62a87   Christoph Lameter   page allocator: u...
205
  EXPORT_SYMBOL(nr_online_nodes);
418508c13   Miklos Szeredi   fix unused setup_...
206
  #endif
9ef9acb05   Mel Gorman   Do not group page...
207
  int page_group_by_mobility_disabled __read_mostly;
b2a0ac887   Mel Gorman   Split the free li...
208
209
  static void set_pageblock_migratetype(struct page *page, int migratetype)
  {
49255c619   Mel Gorman   page allocator: m...
210
211
212
  
  	if (unlikely(page_group_by_mobility_disabled))
  		migratetype = MIGRATE_UNMOVABLE;
b2a0ac887   Mel Gorman   Split the free li...
213
214
215
  	set_pageblock_flags_group(page, (unsigned long)migratetype,
  					PB_migrate, PB_migrate_end);
  }
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
216
  bool oom_killer_disabled __read_mostly;
13e7444b0   Nick Piggin   [PATCH] mm: remov...
217
  #ifdef CONFIG_DEBUG_VM
c6a57e19e   Dave Hansen   [PATCH] memory ho...
218
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
219
  {
bdc8cb984   Dave Hansen   [PATCH] memory ho...
220
221
222
  	int ret = 0;
  	unsigned seq;
  	unsigned long pfn = page_to_pfn(page);
c6a57e19e   Dave Hansen   [PATCH] memory ho...
223

bdc8cb984   Dave Hansen   [PATCH] memory ho...
224
225
226
227
228
229
230
231
232
  	do {
  		seq = zone_span_seqbegin(zone);
  		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
  			ret = 1;
  		else if (pfn < zone->zone_start_pfn)
  			ret = 1;
  	} while (zone_span_seqretry(zone, seq));
  
  	return ret;
c6a57e19e   Dave Hansen   [PATCH] memory ho...
233
234
235
236
  }
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
14e072984   Andy Whitcroft   add pfn_valid_wit...
237
  	if (!pfn_valid_within(page_to_pfn(page)))
c6a57e19e   Dave Hansen   [PATCH] memory ho...
238
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
239
  	if (zone != page_zone(page))
c6a57e19e   Dave Hansen   [PATCH] memory ho...
240
241
242
243
244
245
246
247
248
249
  		return 0;
  
  	return 1;
  }
  /*
   * Temporary debugging check for pages not lying within a given zone.
   */
  static int bad_range(struct zone *zone, struct page *page)
  {
  	if (page_outside_zone_boundaries(zone, page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
250
  		return 1;
c6a57e19e   Dave Hansen   [PATCH] memory ho...
251
252
  	if (!page_is_consistent(zone, page))
  		return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
253
254
  	return 0;
  }
13e7444b0   Nick Piggin   [PATCH] mm: remov...
255
256
257
258
259
260
  #else
  static inline int bad_range(struct zone *zone, struct page *page)
  {
  	return 0;
  }
  #endif
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
261
  static void bad_page(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
262
  {
d936cf9b3   Hugh Dickins   badpage: ratelimi...
263
264
265
  	static unsigned long resume;
  	static unsigned long nr_shown;
  	static unsigned long nr_unshown;
2a7684a23   Wu Fengguang   HWPOISON: check a...
266
267
  	/* Don't complain about poisoned pages */
  	if (PageHWPoison(page)) {
ef2b4b95a   Andrea Arcangeli   mm: PageBuddy and...
268
  		reset_page_mapcount(page); /* remove PageBuddy */
2a7684a23   Wu Fengguang   HWPOISON: check a...
269
270
  		return;
  	}
d936cf9b3   Hugh Dickins   badpage: ratelimi...
271
272
273
274
275
276
277
278
279
280
  	/*
  	 * Allow a burst of 60 reports, then keep quiet for that minute;
  	 * or allow a steady drip of one report per second.
  	 */
  	if (nr_shown == 60) {
  		if (time_before(jiffies, resume)) {
  			nr_unshown++;
  			goto out;
  		}
  		if (nr_unshown) {
1e9e63650   Hugh Dickins   badpage: KERN_ALE...
281
282
283
  			printk(KERN_ALERT
  			      "BUG: Bad page state: %lu messages suppressed
  ",
d936cf9b3   Hugh Dickins   badpage: ratelimi...
284
285
286
287
288
289
290
  				nr_unshown);
  			nr_unshown = 0;
  		}
  		nr_shown = 0;
  	}
  	if (nr_shown++ == 0)
  		resume = jiffies + 60 * HZ;
1e9e63650   Hugh Dickins   badpage: KERN_ALE...
291
292
  	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx
  ",
3dc147414   Hugh Dickins   badpage: replace ...
293
  		current->comm, page_to_pfn(page));
718a38211   Wu Fengguang   mm: introduce dum...
294
  	dump_page(page);
3dc147414   Hugh Dickins   badpage: replace ...
295

4f31888c1   Dave Jones   mm: output a list...
296
  	print_modules();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
297
  	dump_stack();
d936cf9b3   Hugh Dickins   badpage: ratelimi...
298
  out:
8cc3b3922   Hugh Dickins   badpage: keep any...
299
  	/* Leave bad fields for debug, except PageBuddy could make trouble */
ef2b4b95a   Andrea Arcangeli   mm: PageBuddy and...
300
  	reset_page_mapcount(page); /* remove PageBuddy */
9f1583339   Randy Dunlap   [PATCH] use add_t...
301
  	add_taint(TAINT_BAD_PAGE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
302
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
303
304
305
306
307
308
309
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
   * The first PAGE_SIZE page is called the "head page".
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
6416b9fa4   Wang Sheng-Hui   mm: cleanup the c...
310
311
   * All pages have PG_compound set.  All tail pages have their ->first_page
   * pointing at the head page.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
312
   *
41d78ba55   Hugh Dickins   [PATCH] compound ...
313
314
315
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
316
   */
d98c7a098   Hugh Dickins   [PATCH] compound ...
317
318
319
  
  static void free_compound_page(struct page *page)
  {
d85f33855   Christoph Lameter   Make page->privat...
320
  	__free_pages_ok(page, compound_order(page));
d98c7a098   Hugh Dickins   [PATCH] compound ...
321
  }
01ad1c082   Andi Kleen   mm: export prep_c...
322
  void prep_compound_page(struct page *page, unsigned long order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
323
324
325
  {
  	int i;
  	int nr_pages = 1 << order;
18229df5b   Andy Whitcroft   hugetlb: pull gig...
326
327
328
329
330
331
  
  	set_compound_page_dtor(page, free_compound_page);
  	set_compound_order(page, order);
  	__SetPageHead(page);
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
18229df5b   Andy Whitcroft   hugetlb: pull gig...
332
  		__SetPageTail(p);
58a84aa92   Youquan Song   thp: set compound...
333
  		set_page_count(p, 0);
18229df5b   Andy Whitcroft   hugetlb: pull gig...
334
335
336
  		p->first_page = page;
  	}
  }
59ff42163   Andrea Arcangeli   thp: comment remi...
337
  /* update __split_huge_page_refcount if you change this function */
8cc3b3922   Hugh Dickins   badpage: keep any...
338
  static int destroy_compound_page(struct page *page, unsigned long order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
339
340
341
  {
  	int i;
  	int nr_pages = 1 << order;
8cc3b3922   Hugh Dickins   badpage: keep any...
342
  	int bad = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
343

8cc3b3922   Hugh Dickins   badpage: keep any...
344
345
  	if (unlikely(compound_order(page) != order) ||
  	    unlikely(!PageHead(page))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
346
  		bad_page(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
347
348
  		bad++;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
349

6d7779538   Christoph Lameter   mm: optimize comp...
350
  	__ClearPageHead(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
351

18229df5b   Andy Whitcroft   hugetlb: pull gig...
352
353
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
354

e713a21d8   Alexey Zaytsev   trivial: Fix dubi...
355
  		if (unlikely(!PageTail(p) || (p->first_page != page))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
356
  			bad_page(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
357
358
  			bad++;
  		}
d85f33855   Christoph Lameter   Make page->privat...
359
  		__ClearPageTail(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
360
  	}
8cc3b3922   Hugh Dickins   badpage: keep any...
361
362
  
  	return bad;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
363
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
364

17cf44064   Nick Piggin   [PATCH] mm: clean...
365
366
367
  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
6626c5d53   Andrew Morton   [PATCH] mm: prep_...
368
369
370
371
  	/*
  	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
  	 * and __GFP_HIGHMEM from hard or soft interrupt context.
  	 */
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
372
  	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
17cf44064   Nick Piggin   [PATCH] mm: clean...
373
374
375
  	for (i = 0; i < (1 << order); i++)
  		clear_highpage(page + i);
  }
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
  #ifdef CONFIG_DEBUG_PAGEALLOC
  unsigned int _debug_guardpage_minorder;
  
  static int __init debug_guardpage_minorder_setup(char *buf)
  {
  	unsigned long res;
  
  	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
  		printk(KERN_ERR "Bad debug_guardpage_minorder value
  ");
  		return 0;
  	}
  	_debug_guardpage_minorder = res;
  	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu
  ", res);
  	return 0;
  }
  __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
  
  static inline void set_page_guard_flag(struct page *page)
  {
  	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
  }
  
  static inline void clear_page_guard_flag(struct page *page)
  {
  	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
  }
  #else
  static inline void set_page_guard_flag(struct page *page) { }
  static inline void clear_page_guard_flag(struct page *page) { }
  #endif
6aa3001b2   Andrew Morton   [PATCH] page_allo...
408
409
  static inline void set_page_order(struct page *page, int order)
  {
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
410
  	set_page_private(page, order);
676165a8a   Nick Piggin   [PATCH] Fix buddy...
411
  	__SetPageBuddy(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
412
413
414
415
  }
  
  static inline void rmv_page_order(struct page *page)
  {
676165a8a   Nick Piggin   [PATCH] Fix buddy...
416
  	__ClearPageBuddy(page);
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
417
  	set_page_private(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
  }
  
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
   *
   * 1) Any buddy B1 will have an order O twin B2 which satisfies
   * the following equation:
   *     B2 = B1 ^ (1 << O)
   * For example, if the starting buddy (buddy2) is #8 its order
   * 1 buddy is #10:
   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
   *
   * 2) Any buddy B will have an order O+1 parent P which
   * satisfies the following equation:
   *     P = B & ~(1 << O)
   *
d6e05edc5   Andreas Mohr   spelling fixes
435
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
436
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
437
  static inline unsigned long
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
438
  __find_buddy_index(unsigned long page_idx, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
439
  {
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
440
  	return page_idx ^ (1 << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
441
442
443
444
445
  }
  
  /*
   * This function checks whether a page is free && is the buddy
   * we can do coalesce a page and its buddy if
13e7444b0   Nick Piggin   [PATCH] mm: remov...
446
   * (a) the buddy is not in a hole &&
676165a8a   Nick Piggin   [PATCH] Fix buddy...
447
   * (b) the buddy is in the buddy system &&
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
448
449
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
676165a8a   Nick Piggin   [PATCH] Fix buddy...
450
   *
5f24ce5fd   Andrea Arcangeli   thp: remove PG_buddy
451
452
   * For recording whether a page is in the buddy system, we set ->_mapcount -2.
   * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
453
   *
676165a8a   Nick Piggin   [PATCH] Fix buddy...
454
   * For recording page's order, we use page_private(page).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
455
   */
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
456
457
  static inline int page_is_buddy(struct page *page, struct page *buddy,
  								int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
458
  {
14e072984   Andy Whitcroft   add pfn_valid_wit...
459
  	if (!pfn_valid_within(page_to_pfn(buddy)))
13e7444b0   Nick Piggin   [PATCH] mm: remov...
460
  		return 0;
13e7444b0   Nick Piggin   [PATCH] mm: remov...
461

cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
462
463
  	if (page_zone_id(page) != page_zone_id(buddy))
  		return 0;
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
464
465
466
467
  	if (page_is_guard(buddy) && page_order(buddy) == order) {
  		VM_BUG_ON(page_count(buddy) != 0);
  		return 1;
  	}
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
468
  	if (PageBuddy(buddy) && page_order(buddy) == order) {
a3af9c389   Nick Piggin   page allocator: d...
469
  		VM_BUG_ON(page_count(buddy) != 0);
6aa3001b2   Andrew Morton   [PATCH] page_allo...
470
  		return 1;
676165a8a   Nick Piggin   [PATCH] Fix buddy...
471
  	}
6aa3001b2   Andrew Morton   [PATCH] page_allo...
472
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
  }
  
  /*
   * Freeing function for a buddy system allocator.
   *
   * The concept of a buddy system is to maintain direct-mapped table
   * (containing bit values) for memory blocks of various "orders".
   * The bottom level table contains the map for the smallest allocatable
   * units of memory (here, pages), and each level above it describes
   * pairs of units from the levels below, hence, "buddies".
   * At a high level, all that happens here is marking the table entry
   * at the bottom level available, and propagating the changes upward
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
5f24ce5fd   Andrea Arcangeli   thp: remove PG_buddy
488
   * free pages of length of (1 << order) and marked with _mapcount -2. Page's
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
489
   * order is recorded in page_private(page) field.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
490
491
492
493
494
495
496
497
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
   * free, the remainder of the region must be split into blocks.   
   * If a block is freed, and its buddy is also free, then this
   * triggers coalescing into a block of larger size.            
   *
   * -- wli
   */
48db57f8f   Nick Piggin   [PATCH] mm: free_...
498
  static inline void __free_one_page(struct page *page,
ed0ae21dc   Mel Gorman   page allocator: d...
499
500
  		struct zone *zone, unsigned int order,
  		int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
501
502
  {
  	unsigned long page_idx;
6dda9d55b   Corrado Zoccolo   page allocator: r...
503
  	unsigned long combined_idx;
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
504
  	unsigned long uninitialized_var(buddy_idx);
6dda9d55b   Corrado Zoccolo   page allocator: r...
505
  	struct page *buddy;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
506

224abf92b   Nick Piggin   [PATCH] mm: bad_p...
507
  	if (unlikely(PageCompound(page)))
8cc3b3922   Hugh Dickins   badpage: keep any...
508
509
  		if (unlikely(destroy_compound_page(page, order)))
  			return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
510

ed0ae21dc   Mel Gorman   page allocator: d...
511
  	VM_BUG_ON(migratetype == -1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
512
  	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
f2260e6b1   Mel Gorman   page allocator: u...
513
  	VM_BUG_ON(page_idx & ((1 << order) - 1));
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
514
  	VM_BUG_ON(bad_range(zone, page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
515

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
516
  	while (order < MAX_ORDER-1) {
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
517
518
  		buddy_idx = __find_buddy_index(page_idx, order);
  		buddy = page + (buddy_idx - page_idx);
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
519
  		if (!page_is_buddy(page, buddy, order))
3c82d0ce2   Andy Whitcroft   buddy: clarify co...
520
  			break;
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
521
522
523
524
525
526
527
528
529
530
531
532
533
  		/*
  		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
  		 * merge with it and move up one order.
  		 */
  		if (page_is_guard(buddy)) {
  			clear_page_guard_flag(buddy);
  			set_page_private(page, 0);
  			__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
  		} else {
  			list_del(&buddy->lru);
  			zone->free_area[order].nr_free--;
  			rmv_page_order(buddy);
  		}
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
534
  		combined_idx = buddy_idx & page_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
535
536
537
538
539
  		page = page + (combined_idx - page_idx);
  		page_idx = combined_idx;
  		order++;
  	}
  	set_page_order(page, order);
6dda9d55b   Corrado Zoccolo   page allocator: r...
540
541
542
543
544
545
546
547
548
  
  	/*
  	 * If this is not the largest possible page, check if the buddy
  	 * of the next-highest order is free. If it is, it's possible
  	 * that pages are being freed that will coalesce soon. In case,
  	 * that is happening, add the free page to the tail of the list
  	 * so it's less likely to be used soon and more likely to be merged
  	 * as a higher order page
  	 */
b7f50cfa3   Mel Gorman   mm, page-allocato...
549
  	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
6dda9d55b   Corrado Zoccolo   page allocator: r...
550
  		struct page *higher_page, *higher_buddy;
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
551
552
553
554
  		combined_idx = buddy_idx & page_idx;
  		higher_page = page + (combined_idx - page_idx);
  		buddy_idx = __find_buddy_index(combined_idx, order + 1);
  		higher_buddy = page + (buddy_idx - combined_idx);
6dda9d55b   Corrado Zoccolo   page allocator: r...
555
556
557
558
559
560
561
562
563
  		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
  			list_add_tail(&page->lru,
  				&zone->free_area[order].free_list[migratetype]);
  			goto out;
  		}
  	}
  
  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
564
565
  	zone->free_area[order].nr_free++;
  }
092cead61   KOSAKI Motohiro   page allocator: m...
566
567
568
569
570
571
572
  /*
   * free_page_mlock() -- clean up attempts to free and mlocked() page.
   * Page should not be on lru, so no need to fix that up.
   * free_pages_check() will verify...
   */
  static inline void free_page_mlock(struct page *page)
  {
092cead61   KOSAKI Motohiro   page allocator: m...
573
574
575
  	__dec_zone_page_state(page, NR_MLOCK);
  	__count_vm_event(UNEVICTABLE_MLOCKFREED);
  }
092cead61   KOSAKI Motohiro   page allocator: m...
576

224abf92b   Nick Piggin   [PATCH] mm: bad_p...
577
  static inline int free_pages_check(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
578
  {
92be2e33b   Nick Piggin   [PATCH] mm: micro...
579
580
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
a3af9c389   Nick Piggin   page allocator: d...
581
  		(atomic_read(&page->_count) != 0) |
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
582
583
  		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
  		(mem_cgroup_bad_page_check(page)))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
584
  		bad_page(page);
79f4b7bf3   Hugh Dickins   badpage: simplify...
585
  		return 1;
8cc3b3922   Hugh Dickins   badpage: keep any...
586
  	}
79f4b7bf3   Hugh Dickins   badpage: simplify...
587
588
589
  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
590
591
592
  }
  
  /*
5f8dcc212   Mel Gorman   page-allocator: s...
593
   * Frees a number of pages from the PCP lists
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
594
   * Assumes all pages on list are in same zone, and of same order.
207f36eec   Renaud Lienhart   [PATCH] remove in...
595
   * count is the number of pages to free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
596
597
598
599
600
601
602
   *
   * If the zone was previously in an "all pages pinned" state then look to
   * see if this freeing clears that state.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
5f8dcc212   Mel Gorman   page-allocator: s...
603
604
  static void free_pcppages_bulk(struct zone *zone, int count,
  					struct per_cpu_pages *pcp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
605
  {
5f8dcc212   Mel Gorman   page-allocator: s...
606
  	int migratetype = 0;
a6f9edd65   Mel Gorman   page-allocator: m...
607
  	int batch_free = 0;
72853e299   Mel Gorman   mm: page allocato...
608
  	int to_free = count;
5f8dcc212   Mel Gorman   page-allocator: s...
609

c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
610
  	spin_lock(&zone->lock);
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
611
  	zone->all_unreclaimable = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
612
  	zone->pages_scanned = 0;
f2260e6b1   Mel Gorman   page allocator: u...
613

72853e299   Mel Gorman   mm: page allocato...
614
  	while (to_free) {
48db57f8f   Nick Piggin   [PATCH] mm: free_...
615
  		struct page *page;
5f8dcc212   Mel Gorman   page-allocator: s...
616
617
618
  		struct list_head *list;
  
  		/*
a6f9edd65   Mel Gorman   page-allocator: m...
619
620
621
622
623
  		 * Remove pages from lists in a round-robin fashion. A
  		 * batch_free count is maintained that is incremented when an
  		 * empty list is encountered.  This is so more pages are freed
  		 * off fuller lists instead of spinning excessively around empty
  		 * lists
5f8dcc212   Mel Gorman   page-allocator: s...
624
625
  		 */
  		do {
a6f9edd65   Mel Gorman   page-allocator: m...
626
  			batch_free++;
5f8dcc212   Mel Gorman   page-allocator: s...
627
628
629
630
  			if (++migratetype == MIGRATE_PCPTYPES)
  				migratetype = 0;
  			list = &pcp->lists[migratetype];
  		} while (list_empty(list));
48db57f8f   Nick Piggin   [PATCH] mm: free_...
631

1d16871d8   Namhyung Kim   mm: batch-free pc...
632
633
634
  		/* This is the only non-empty list. Free them all. */
  		if (batch_free == MIGRATE_PCPTYPES)
  			batch_free = to_free;
a6f9edd65   Mel Gorman   page-allocator: m...
635
636
637
638
  		do {
  			page = list_entry(list->prev, struct page, lru);
  			/* must delete as __free_one_page list manipulates */
  			list_del(&page->lru);
a7016235a   Hugh Dickins   mm: fix migratety...
639
640
641
  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  			__free_one_page(page, zone, 0, page_private(page));
  			trace_mm_page_pcpu_drain(page, 0, page_private(page));
72853e299   Mel Gorman   mm: page allocato...
642
  		} while (--to_free && --batch_free && !list_empty(list));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
643
  	}
72853e299   Mel Gorman   mm: page allocato...
644
  	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
645
  	spin_unlock(&zone->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
646
  }
ed0ae21dc   Mel Gorman   page allocator: d...
647
648
  static void free_one_page(struct zone *zone, struct page *page, int order,
  				int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
649
  {
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
650
  	spin_lock(&zone->lock);
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
651
  	zone->all_unreclaimable = 0;
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
652
  	zone->pages_scanned = 0;
f2260e6b1   Mel Gorman   page allocator: u...
653

ed0ae21dc   Mel Gorman   page allocator: d...
654
  	__free_one_page(page, zone, order, migratetype);
72853e299   Mel Gorman   mm: page allocato...
655
  	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
656
  	spin_unlock(&zone->lock);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
657
  }
ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
658
  static bool free_pages_prepare(struct page *page, unsigned int order)
48db57f8f   Nick Piggin   [PATCH] mm: free_...
659
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
660
  	int i;
8cc3b3922   Hugh Dickins   badpage: keep any...
661
  	int bad = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
662

b413d48aa   Konstantin Khlebnikov   mm-tracepoint: re...
663
  	trace_mm_page_free(page, order);
b1eeab676   Vegard Nossum   kmemcheck: add ho...
664
  	kmemcheck_free_shadow(page, order);
8dd60a3a6   Andrea Arcangeli   thp: clear compou...
665
666
667
668
  	if (PageAnon(page))
  		page->mapping = NULL;
  	for (i = 0; i < (1 << order); i++)
  		bad += free_pages_check(page + i);
8cc3b3922   Hugh Dickins   badpage: keep any...
669
  	if (bad)
ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
670
  		return false;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
671

3ac7fe5a4   Thomas Gleixner   infrastructure to...
672
  	if (!PageHighMem(page)) {
9858db504   Nick Piggin   [PATCH] mm: locks...
673
  		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
3ac7fe5a4   Thomas Gleixner   infrastructure to...
674
675
676
  		debug_check_no_obj_freed(page_address(page),
  					   PAGE_SIZE << order);
  	}
dafb13673   Nick Piggin   [PATCH] mm: arch_...
677
  	arch_free_page(page, order);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
678
  	kernel_map_pages(page, 1 << order, 0);
dafb13673   Nick Piggin   [PATCH] mm: arch_...
679

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
680
681
682
683
684
685
686
687
688
689
  	return true;
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
  	unsigned long flags;
  	int wasMlocked = __TestClearPageMlocked(page);
  
  	if (!free_pages_prepare(page, order))
  		return;
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
690
  	local_irq_save(flags);
c277331d5   Johannes Weiner   mm: page_alloc: c...
691
  	if (unlikely(wasMlocked))
da456f14d   Mel Gorman   page allocator: d...
692
  		free_page_mlock(page);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
693
  	__count_vm_events(PGFREE, 1 << order);
ed0ae21dc   Mel Gorman   page allocator: d...
694
695
  	free_one_page(page_zone(page), page, order,
  					get_pageblock_migratetype(page));
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
696
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
697
  }
af370fb8c   Yasunori Goto   memory hotplug: s...
698
  void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
a226f6c89   David Howells   [PATCH] FRV: Clea...
699
  {
c3993076f   Johannes Weiner   mm: page_alloc: g...
700
701
  	unsigned int nr_pages = 1 << order;
  	unsigned int loop;
a226f6c89   David Howells   [PATCH] FRV: Clea...
702

c3993076f   Johannes Weiner   mm: page_alloc: g...
703
704
705
706
707
708
709
710
  	prefetchw(page);
  	for (loop = 0; loop < nr_pages; loop++) {
  		struct page *p = &page[loop];
  
  		if (loop + 1 < nr_pages)
  			prefetchw(p + 1);
  		__ClearPageReserved(p);
  		set_page_count(p, 0);
a226f6c89   David Howells   [PATCH] FRV: Clea...
711
  	}
c3993076f   Johannes Weiner   mm: page_alloc: g...
712
713
714
  
  	set_page_refcounted(page);
  	__free_pages(page, order);
a226f6c89   David Howells   [PATCH] FRV: Clea...
715
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
  
  /*
   * The order of subdivision here is critical for the IO subsystem.
   * Please do not alter this order without good reasons and regression
   * testing. Specifically, as large blocks of memory are subdivided,
   * the order in which smaller blocks are delivered depends on the order
   * they're subdivided in this function. This is the primary factor
   * influencing the order in which pages are delivered to the IO
   * subsystem according to empirical testing, and this is also justified
   * by considering the behavior of a buddy system containing a single
   * large block of memory acted on by a series of small allocations.
   * This behavior is a critical factor in sglist merging's success.
   *
   * -- wli
   */
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
731
  static inline void expand(struct zone *zone, struct page *page,
b2a0ac887   Mel Gorman   Split the free li...
732
733
  	int low, int high, struct free_area *area,
  	int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
734
735
736
737
738
739
740
  {
  	unsigned long size = 1 << high;
  
  	while (high > low) {
  		area--;
  		high--;
  		size >>= 1;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
741
  		VM_BUG_ON(bad_range(zone, &page[size]));
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
  		if (high < debug_guardpage_minorder()) {
  			/*
  			 * Mark as guard pages (or page), that will allow to
  			 * merge back to allocator when buddy will be freed.
  			 * Corresponding page table entries will not be touched,
  			 * pages will stay not present in virtual address space
  			 */
  			INIT_LIST_HEAD(&page[size].lru);
  			set_page_guard_flag(&page[size]);
  			set_page_private(&page[size], high);
  			/* Guard pages are not available for any usage */
  			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
  			continue;
  		}
  #endif
b2a0ac887   Mel Gorman   Split the free li...
759
  		list_add(&page[size].lru, &area->free_list[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
760
761
762
  		area->nr_free++;
  		set_page_order(&page[size], high);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
763
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
764
765
766
  /*
   * This page is about to be returned from the page allocator
   */
2a7684a23   Wu Fengguang   HWPOISON: check a...
767
  static inline int check_new_page(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
768
  {
92be2e33b   Nick Piggin   [PATCH] mm: micro...
769
770
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
a3af9c389   Nick Piggin   page allocator: d...
771
  		(atomic_read(&page->_count) != 0)  |
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
772
773
  		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
  		(mem_cgroup_bad_page_check(page)))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
774
  		bad_page(page);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
775
  		return 1;
8cc3b3922   Hugh Dickins   badpage: keep any...
776
  	}
2a7684a23   Wu Fengguang   HWPOISON: check a...
777
778
779
780
781
782
783
784
785
786
787
788
  	return 0;
  }
  
  static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  
  	for (i = 0; i < (1 << order); i++) {
  		struct page *p = page + i;
  		if (unlikely(check_new_page(p)))
  			return 1;
  	}
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
789

4c21e2f24   Hugh Dickins   [PATCH] mm: split...
790
  	set_page_private(page, 0);
7835e98b2   Nick Piggin   [PATCH] remove se...
791
  	set_page_refcounted(page);
cc1025090   Nick Piggin   [PATCH] mm: add a...
792
793
  
  	arch_alloc_page(page, order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
794
  	kernel_map_pages(page, 1 << order, 1);
17cf44064   Nick Piggin   [PATCH] mm: clean...
795
796
797
798
799
800
  
  	if (gfp_flags & __GFP_ZERO)
  		prep_zero_page(page, order, gfp_flags);
  
  	if (order && (gfp_flags & __GFP_COMP))
  		prep_compound_page(page, order);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
801
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
802
  }
56fd56b86   Mel Gorman   Bias the location...
803
804
805
806
  /*
   * Go through the free lists for the given migratetype and remove
   * the smallest available page from the freelists
   */
728ec980f   Mel Gorman   page allocator: i...
807
808
  static inline
  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
56fd56b86   Mel Gorman   Bias the location...
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
  						int migratetype)
  {
  	unsigned int current_order;
  	struct free_area * area;
  	struct page *page;
  
  	/* Find a page of the appropriate size in the preferred list */
  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  		area = &(zone->free_area[current_order]);
  		if (list_empty(&area->free_list[migratetype]))
  			continue;
  
  		page = list_entry(area->free_list[migratetype].next,
  							struct page, lru);
  		list_del(&page->lru);
  		rmv_page_order(page);
  		area->nr_free--;
56fd56b86   Mel Gorman   Bias the location...
826
827
828
829
830
831
  		expand(zone, page, order, current_order, area, migratetype);
  		return page;
  	}
  
  	return NULL;
  }
b2a0ac887   Mel Gorman   Split the free li...
832
833
834
835
836
  /*
   * This array describes the order lists are fallen back to when
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
64c5e135b   Mel Gorman   don't group high ...
837
838
839
840
  	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
  	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
b2a0ac887   Mel Gorman   Split the free li...
841
  };
c361be55b   Mel Gorman   Move free pages b...
842
843
  /*
   * Move the free pages in a range to the free lists of the requested type.
d9c234005   Mel Gorman   Do not depend on ...
844
   * Note that start_page and end_pages are not aligned on a pageblock
c361be55b   Mel Gorman   Move free pages b...
845
846
   * boundary. If alignment is required, use move_freepages_block()
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
847
848
849
  static int move_freepages(struct zone *zone,
  			  struct page *start_page, struct page *end_page,
  			  int migratetype)
c361be55b   Mel Gorman   Move free pages b...
850
851
852
  {
  	struct page *page;
  	unsigned long order;
d100313fd   Mel Gorman   Fix calculation i...
853
  	int pages_moved = 0;
c361be55b   Mel Gorman   Move free pages b...
854
855
856
857
858
859
860
  
  #ifndef CONFIG_HOLES_IN_ZONE
  	/*
  	 * page_zone is not safe to call in this context when
  	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
  	 * anyway as we check zone boundaries in move_freepages_block().
  	 * Remove at a later date when no bug reports exist related to
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
861
  	 * grouping pages by mobility
c361be55b   Mel Gorman   Move free pages b...
862
863
864
865
866
  	 */
  	BUG_ON(page_zone(start_page) != page_zone(end_page));
  #endif
  
  	for (page = start_page; page <= end_page;) {
344c790e3   Adam Litke   mm: make setup_zo...
867
868
  		/* Make sure we are not inadvertently changing nodes */
  		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
c361be55b   Mel Gorman   Move free pages b...
869
870
871
872
873
874
875
876
877
878
879
  		if (!pfn_valid_within(page_to_pfn(page))) {
  			page++;
  			continue;
  		}
  
  		if (!PageBuddy(page)) {
  			page++;
  			continue;
  		}
  
  		order = page_order(page);
84be48d84   Kirill A. Shutemov   mm/page_alloc.c: ...
880
881
  		list_move(&page->lru,
  			  &zone->free_area[order].free_list[migratetype]);
c361be55b   Mel Gorman   Move free pages b...
882
  		page += 1 << order;
d100313fd   Mel Gorman   Fix calculation i...
883
  		pages_moved += 1 << order;
c361be55b   Mel Gorman   Move free pages b...
884
  	}
d100313fd   Mel Gorman   Fix calculation i...
885
  	return pages_moved;
c361be55b   Mel Gorman   Move free pages b...
886
  }
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
887
888
  static int move_freepages_block(struct zone *zone, struct page *page,
  				int migratetype)
c361be55b   Mel Gorman   Move free pages b...
889
890
891
892
893
  {
  	unsigned long start_pfn, end_pfn;
  	struct page *start_page, *end_page;
  
  	start_pfn = page_to_pfn(page);
d9c234005   Mel Gorman   Do not depend on ...
894
  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
c361be55b   Mel Gorman   Move free pages b...
895
  	start_page = pfn_to_page(start_pfn);
d9c234005   Mel Gorman   Do not depend on ...
896
897
  	end_page = start_page + pageblock_nr_pages - 1;
  	end_pfn = start_pfn + pageblock_nr_pages - 1;
c361be55b   Mel Gorman   Move free pages b...
898
899
900
901
902
903
904
905
906
  
  	/* Do not cross zone boundaries */
  	if (start_pfn < zone->zone_start_pfn)
  		start_page = page;
  	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
  		return 0;
  
  	return move_freepages(zone, start_page, end_page, migratetype);
  }
2f66a68f3   Mel Gorman   page-allocator: c...
907
908
909
910
911
912
913
914
915
916
  static void change_pageblock_range(struct page *pageblock_page,
  					int start_order, int migratetype)
  {
  	int nr_pageblocks = 1 << (start_order - pageblock_order);
  
  	while (nr_pageblocks--) {
  		set_pageblock_migratetype(pageblock_page, migratetype);
  		pageblock_page += pageblock_nr_pages;
  	}
  }
b2a0ac887   Mel Gorman   Split the free li...
917
  /* Remove an element from the buddy allocator from the fallback list */
0ac3a4099   Mel Gorman   page allocator: i...
918
919
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
b2a0ac887   Mel Gorman   Split the free li...
920
921
922
923
924
925
926
927
928
929
930
  {
  	struct free_area * area;
  	int current_order;
  	struct page *page;
  	int migratetype, i;
  
  	/* Find the largest possible block of pages in the other list */
  	for (current_order = MAX_ORDER-1; current_order >= order;
  						--current_order) {
  		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
  			migratetype = fallbacks[start_migratetype][i];
56fd56b86   Mel Gorman   Bias the location...
931
932
933
  			/* MIGRATE_RESERVE handled later if necessary */
  			if (migratetype == MIGRATE_RESERVE)
  				continue;
e010487db   Mel Gorman   Group high-order ...
934

b2a0ac887   Mel Gorman   Split the free li...
935
936
937
938
939
940
941
942
943
  			area = &(zone->free_area[current_order]);
  			if (list_empty(&area->free_list[migratetype]))
  				continue;
  
  			page = list_entry(area->free_list[migratetype].next,
  					struct page, lru);
  			area->nr_free--;
  
  			/*
c361be55b   Mel Gorman   Move free pages b...
944
  			 * If breaking a large block of pages, move all free
46dafbca2   Mel Gorman   Be more agressive...
945
946
  			 * pages to the preferred allocation list. If falling
  			 * back for a reclaimable kernel allocation, be more
25985edce   Lucas De Marchi   Fix common misspe...
947
  			 * aggressive about taking ownership of free pages
b2a0ac887   Mel Gorman   Split the free li...
948
  			 */
d9c234005   Mel Gorman   Do not depend on ...
949
  			if (unlikely(current_order >= (pageblock_order >> 1)) ||
dd5d241ea   Mel Gorman   page-allocator: a...
950
951
  					start_migratetype == MIGRATE_RECLAIMABLE ||
  					page_group_by_mobility_disabled) {
46dafbca2   Mel Gorman   Be more agressive...
952
953
954
955
956
  				unsigned long pages;
  				pages = move_freepages_block(zone, page,
  								start_migratetype);
  
  				/* Claim the whole block if over half of it is free */
dd5d241ea   Mel Gorman   page-allocator: a...
957
958
  				if (pages >= (1 << (pageblock_order-1)) ||
  						page_group_by_mobility_disabled)
46dafbca2   Mel Gorman   Be more agressive...
959
960
  					set_pageblock_migratetype(page,
  								start_migratetype);
b2a0ac887   Mel Gorman   Split the free li...
961
  				migratetype = start_migratetype;
c361be55b   Mel Gorman   Move free pages b...
962
  			}
b2a0ac887   Mel Gorman   Split the free li...
963
964
965
966
  
  			/* Remove the page from the freelists */
  			list_del(&page->lru);
  			rmv_page_order(page);
b2a0ac887   Mel Gorman   Split the free li...
967

2f66a68f3   Mel Gorman   page-allocator: c...
968
969
970
  			/* Take ownership for orders >= pageblock_order */
  			if (current_order >= pageblock_order)
  				change_pageblock_range(page, current_order,
b2a0ac887   Mel Gorman   Split the free li...
971
972
973
  							start_migratetype);
  
  			expand(zone, page, order, current_order, area, migratetype);
e0fff1bd1   Mel Gorman   tracing, page-all...
974
975
976
  
  			trace_mm_page_alloc_extfrag(page, order, current_order,
  				start_migratetype, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
977
978
979
  			return page;
  		}
  	}
728ec980f   Mel Gorman   page allocator: i...
980
  	return NULL;
b2a0ac887   Mel Gorman   Split the free li...
981
  }
56fd56b86   Mel Gorman   Bias the location...
982
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
983
984
985
   * Do the hard work of removing an element from the buddy allocator.
   * Call me with the zone->lock already held.
   */
b2a0ac887   Mel Gorman   Split the free li...
986
987
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
  						int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
988
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
989
  	struct page *page;
728ec980f   Mel Gorman   page allocator: i...
990
  retry_reserve:
56fd56b86   Mel Gorman   Bias the location...
991
  	page = __rmqueue_smallest(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
992

728ec980f   Mel Gorman   page allocator: i...
993
  	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
56fd56b86   Mel Gorman   Bias the location...
994
  		page = __rmqueue_fallback(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
995

728ec980f   Mel Gorman   page allocator: i...
996
997
998
999
1000
1001
1002
1003
1004
1005
  		/*
  		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
  		 * is used because __rmqueue_smallest is an inline function
  		 * and we want just one call site
  		 */
  		if (!page) {
  			migratetype = MIGRATE_RESERVE;
  			goto retry_reserve;
  		}
  	}
0d3d062a6   Mel Gorman   tracing, page-all...
1006
  	trace_mm_page_alloc_zone_locked(page, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
1007
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1008
1009
1010
1011
1012
1013
1014
1015
  }
  
  /* 
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
   * Returns the number of new pages which were placed at *list.
   */
  static int rmqueue_bulk(struct zone *zone, unsigned int order, 
b2a0ac887   Mel Gorman   Split the free li...
1016
  			unsigned long count, struct list_head *list,
e084b2d95   Mel Gorman   page-allocator: p...
1017
  			int migratetype, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1018
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1019
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1020
  	
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1021
  	spin_lock(&zone->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1022
  	for (i = 0; i < count; ++i) {
b2a0ac887   Mel Gorman   Split the free li...
1023
  		struct page *page = __rmqueue(zone, order, migratetype);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
1024
  		if (unlikely(page == NULL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1025
  			break;
81eabcbe0   Mel Gorman   mm: fix page allo...
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
  
  		/*
  		 * Split buddy pages returned by expand() are received here
  		 * in physical page order. The page is added to the callers and
  		 * list and the list head then moves forward. From the callers
  		 * perspective, the linked list is ordered by page number in
  		 * some conditions. This is useful for IO devices that can
  		 * merge IO requests if the physical pages are ordered
  		 * properly.
  		 */
e084b2d95   Mel Gorman   page-allocator: p...
1036
1037
1038
1039
  		if (likely(cold == 0))
  			list_add(&page->lru, list);
  		else
  			list_add_tail(&page->lru, list);
535131e69   Mel Gorman   Choose pages from...
1040
  		set_page_private(page, migratetype);
81eabcbe0   Mel Gorman   mm: fix page allo...
1041
  		list = &page->lru;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1042
  	}
f2260e6b1   Mel Gorman   page allocator: u...
1043
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1044
  	spin_unlock(&zone->lock);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
1045
  	return i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1046
  }
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1047
  #ifdef CONFIG_NUMA
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
1048
  /*
4037d4522   Christoph Lameter   Move remote node ...
1049
1050
1051
1052
   * Called from the vmstat counter updater to drain pagesets of this
   * currently executing processor on remote nodes after they have
   * expired.
   *
879336c39   Christoph Lameter   [PATCH] drain_nod...
1053
1054
   * Note that this function must be called with the thread pinned to
   * a single processor.
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
1055
   */
4037d4522   Christoph Lameter   Move remote node ...
1056
  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1057
  {
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1058
  	unsigned long flags;
4037d4522   Christoph Lameter   Move remote node ...
1059
  	int to_drain;
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1060

4037d4522   Christoph Lameter   Move remote node ...
1061
1062
1063
1064
1065
  	local_irq_save(flags);
  	if (pcp->count >= pcp->batch)
  		to_drain = pcp->batch;
  	else
  		to_drain = pcp->count;
5f8dcc212   Mel Gorman   page-allocator: s...
1066
  	free_pcppages_bulk(zone, to_drain, pcp);
4037d4522   Christoph Lameter   Move remote node ...
1067
1068
  	pcp->count -= to_drain;
  	local_irq_restore(flags);
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1069
1070
  }
  #endif
9f8f21725   Christoph Lameter   Page allocator: c...
1071
1072
1073
1074
1075
1076
1077
1078
  /*
   * Drain pages of the indicated processor.
   *
   * The processor must either be the current processor and the
   * thread pinned to the current processor or a processor that
   * is not online.
   */
  static void drain_pages(unsigned int cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1079
  {
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1080
  	unsigned long flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1081
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1082

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
1083
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1084
  		struct per_cpu_pageset *pset;
3dfa5721f   Christoph Lameter   Page allocator: g...
1085
  		struct per_cpu_pages *pcp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1086

99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1087
1088
  		local_irq_save(flags);
  		pset = per_cpu_ptr(zone->pageset, cpu);
3dfa5721f   Christoph Lameter   Page allocator: g...
1089
1090
  
  		pcp = &pset->pcp;
2ff754fa8   David Rientjes   mm: clear pages_s...
1091
1092
1093
1094
  		if (pcp->count) {
  			free_pcppages_bulk(zone, pcp->count, pcp);
  			pcp->count = 0;
  		}
3dfa5721f   Christoph Lameter   Page allocator: g...
1095
  		local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1096
1097
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1098

9f8f21725   Christoph Lameter   Page allocator: c...
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
  /*
   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
   */
  void drain_local_pages(void *arg)
  {
  	drain_pages(smp_processor_id());
  }
  
  /*
   * Spill all the per-cpu pages from all CPUs back into the buddy allocator
   */
  void drain_all_pages(void)
  {
15c8b6c1a   Jens Axboe   on_each_cpu(): ki...
1112
  	on_each_cpu(drain_local_pages, NULL, 1);
9f8f21725   Christoph Lameter   Page allocator: c...
1113
  }
296699de6   Rafael J. Wysocki   Introduce CONFIG_...
1114
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1115
1116
1117
  
  void mark_free_pages(struct zone *zone)
  {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1118
1119
  	unsigned long pfn, max_zone_pfn;
  	unsigned long flags;
b2a0ac887   Mel Gorman   Split the free li...
1120
  	int order, t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1121
1122
1123
1124
1125
1126
  	struct list_head *curr;
  
  	if (!zone->spanned_pages)
  		return;
  
  	spin_lock_irqsave(&zone->lock, flags);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1127
1128
1129
1130
1131
  
  	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
  		if (pfn_valid(pfn)) {
  			struct page *page = pfn_to_page(pfn);
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1132
1133
  			if (!swsusp_page_is_forbidden(page))
  				swsusp_unset_page_free(page);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1134
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1135

b2a0ac887   Mel Gorman   Split the free li...
1136
1137
  	for_each_migratetype_order(order, t) {
  		list_for_each(curr, &zone->free_area[order].free_list[t]) {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1138
  			unsigned long i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1139

f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1140
1141
  			pfn = page_to_pfn(list_entry(curr, struct page, lru));
  			for (i = 0; i < (1UL << order); i++)
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1142
  				swsusp_set_page_free(pfn_to_page(pfn + i));
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1143
  		}
b2a0ac887   Mel Gorman   Split the free li...
1144
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1145
1146
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
e2c55dc87   Mel Gorman   Drain per-cpu lis...
1147
  #endif /* CONFIG_PM */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1148
1149
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1150
   * Free a 0-order page
fc91668ea   Li Hong   mm: remove free_h...
1151
   * cold == 1 ? free a cold page : free a hot page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1152
   */
fc91668ea   Li Hong   mm: remove free_h...
1153
  void free_hot_cold_page(struct page *page, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1154
1155
1156
1157
  {
  	struct zone *zone = page_zone(page);
  	struct per_cpu_pages *pcp;
  	unsigned long flags;
5f8dcc212   Mel Gorman   page-allocator: s...
1158
  	int migratetype;
451ea25da   Johannes Weiner   mm: perform non-a...
1159
  	int wasMlocked = __TestClearPageMlocked(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1160

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
1161
  	if (!free_pages_prepare(page, 0))
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1162
  		return;
5f8dcc212   Mel Gorman   page-allocator: s...
1163
1164
  	migratetype = get_pageblock_migratetype(page);
  	set_page_private(page, migratetype);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1165
  	local_irq_save(flags);
c277331d5   Johannes Weiner   mm: page_alloc: c...
1166
  	if (unlikely(wasMlocked))
da456f14d   Mel Gorman   page allocator: d...
1167
  		free_page_mlock(page);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1168
  	__count_vm_event(PGFREE);
da456f14d   Mel Gorman   page allocator: d...
1169

5f8dcc212   Mel Gorman   page-allocator: s...
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
  	/*
  	 * We only track unmovable, reclaimable and movable on pcp lists.
  	 * Free ISOLATE pages back to the allocator because they are being
  	 * offlined but treat RESERVE as movable pages so we can get those
  	 * areas back if necessary. Otherwise, we may have to free
  	 * excessively into the page allocator
  	 */
  	if (migratetype >= MIGRATE_PCPTYPES) {
  		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
  			free_one_page(zone, page, 0, migratetype);
  			goto out;
  		}
  		migratetype = MIGRATE_MOVABLE;
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1184
  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
1185
  	if (cold)
5f8dcc212   Mel Gorman   page-allocator: s...
1186
  		list_add_tail(&page->lru, &pcp->lists[migratetype]);
3dfa5721f   Christoph Lameter   Page allocator: g...
1187
  	else
5f8dcc212   Mel Gorman   page-allocator: s...
1188
  		list_add(&page->lru, &pcp->lists[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1189
  	pcp->count++;
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1190
  	if (pcp->count >= pcp->high) {
5f8dcc212   Mel Gorman   page-allocator: s...
1191
  		free_pcppages_bulk(zone, pcp->batch, pcp);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1192
1193
  		pcp->count -= pcp->batch;
  	}
5f8dcc212   Mel Gorman   page-allocator: s...
1194
1195
  
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1196
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1197
  }
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1198
  /*
cc59850ef   Konstantin Khlebnikov   mm: add free_hot_...
1199
1200
1201
1202
1203
1204
1205
   * Free a list of 0-order pages
   */
  void free_hot_cold_page_list(struct list_head *list, int cold)
  {
  	struct page *page, *next;
  
  	list_for_each_entry_safe(page, next, list, lru) {
b413d48aa   Konstantin Khlebnikov   mm-tracepoint: re...
1206
  		trace_mm_page_free_batched(page, cold);
cc59850ef   Konstantin Khlebnikov   mm: add free_hot_...
1207
1208
1209
1210
1211
  		free_hot_cold_page(page, cold);
  	}
  }
  
  /*
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
   * Each sub-page must be freed individually.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  void split_page(struct page *page, unsigned int order)
  {
  	int i;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1222
1223
  	VM_BUG_ON(PageCompound(page));
  	VM_BUG_ON(!page_count(page));
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1224
1225
1226
1227
1228
1229
1230
1231
1232
  
  #ifdef CONFIG_KMEMCHECK
  	/*
  	 * Split shadow pages too, because free(page[0]) would
  	 * otherwise free the whole shadow.
  	 */
  	if (kmemcheck_page_is_tracked(page))
  		split_page(virt_to_page(page[0].shadow), order);
  #endif
7835e98b2   Nick Piggin   [PATCH] remove se...
1233
1234
  	for (i = 1; i < (1 << order); i++)
  		set_page_refcounted(page + i);
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1235
  }
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1236

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1237
  /*
748446bb6   Mel Gorman   mm: compaction: m...
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
   * Similar to split_page except the page is already free. As this is only
   * being used for migration, the migratetype of the block also changes.
   * As this is called with interrupts disabled, the caller is responsible
   * for calling arch_alloc_page() and kernel_map_page() after interrupts
   * are enabled.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  int split_free_page(struct page *page)
  {
  	unsigned int order;
  	unsigned long watermark;
  	struct zone *zone;
  
  	BUG_ON(!PageBuddy(page));
  
  	zone = page_zone(page);
  	order = page_order(page);
  
  	/* Obey watermarks as if the page was being allocated */
  	watermark = low_wmark_pages(zone) + (1 << order);
  	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
  		return 0;
  
  	/* Remove page from free list */
  	list_del(&page->lru);
  	zone->free_area[order].nr_free--;
  	rmv_page_order(page);
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
  
  	/* Split into individual pages */
  	set_page_refcounted(page);
  	split_page(page, order);
  
  	if (order >= pageblock_order - 1) {
  		struct page *endpage = page + (1 << order) - 1;
  		for (; page < endpage; page += pageblock_nr_pages)
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	}
  
  	return 1 << order;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1283
1284
1285
1286
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
   * or two.
   */
0a15c3e9f   Mel Gorman   page allocator: i...
1287
1288
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1289
1290
  			struct zone *zone, int order, gfp_t gfp_flags,
  			int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1291
1292
  {
  	unsigned long flags;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1293
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1294
  	int cold = !!(gfp_flags & __GFP_COLD);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1295
  again:
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1296
  	if (likely(order == 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1297
  		struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
1298
  		struct list_head *list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1299

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1300
  		local_irq_save(flags);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1301
1302
  		pcp = &this_cpu_ptr(zone->pageset)->pcp;
  		list = &pcp->lists[migratetype];
5f8dcc212   Mel Gorman   page-allocator: s...
1303
  		if (list_empty(list)) {
535131e69   Mel Gorman   Choose pages from...
1304
  			pcp->count += rmqueue_bulk(zone, 0,
5f8dcc212   Mel Gorman   page-allocator: s...
1305
  					pcp->batch, list,
e084b2d95   Mel Gorman   page-allocator: p...
1306
  					migratetype, cold);
5f8dcc212   Mel Gorman   page-allocator: s...
1307
  			if (unlikely(list_empty(list)))
6fb332fab   Shaohua Li   memory hotplug: e...
1308
  				goto failed;
535131e69   Mel Gorman   Choose pages from...
1309
  		}
b92a6edd4   Mel Gorman   Add a configure o...
1310

5f8dcc212   Mel Gorman   page-allocator: s...
1311
1312
1313
1314
  		if (cold)
  			page = list_entry(list->prev, struct page, lru);
  		else
  			page = list_entry(list->next, struct page, lru);
b92a6edd4   Mel Gorman   Add a configure o...
1315
1316
  		list_del(&page->lru);
  		pcp->count--;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1317
  	} else {
dab48dab3   Andrew Morton   page-allocator: w...
1318
1319
1320
1321
1322
1323
1324
1325
  		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
  			/*
  			 * __GFP_NOFAIL is not to be used in new code.
  			 *
  			 * All __GFP_NOFAIL callers should be fixed so that they
  			 * properly detect and handle allocation failures.
  			 *
  			 * We most definitely don't want callers attempting to
4923abf9f   Linus Torvalds   Don't warn about ...
1326
  			 * allocate greater than order-1 page units with
dab48dab3   Andrew Morton   page-allocator: w...
1327
1328
  			 * __GFP_NOFAIL.
  			 */
4923abf9f   Linus Torvalds   Don't warn about ...
1329
  			WARN_ON_ONCE(order > 1);
dab48dab3   Andrew Morton   page-allocator: w...
1330
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1331
  		spin_lock_irqsave(&zone->lock, flags);
b2a0ac887   Mel Gorman   Split the free li...
1332
  		page = __rmqueue(zone, order, migratetype);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1333
1334
1335
  		spin_unlock(&zone->lock);
  		if (!page)
  			goto failed;
6ccf80eb1   KOSAKI Motohiro   page allocator: u...
1336
  		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1337
  	}
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1338
  	__count_zone_vm_events(PGALLOC, zone, 1 << order);
78afd5612   Andi Kleen   mm: add __GFP_OTH...
1339
  	zone_statistics(preferred_zone, zone, gfp_flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1340
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1341

725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1342
  	VM_BUG_ON(bad_range(zone, page));
17cf44064   Nick Piggin   [PATCH] mm: clean...
1343
  	if (prep_new_page(page, order, gfp_flags))
a74609faf   Nick Piggin   [PATCH] mm: page_...
1344
  		goto again;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
  	return page;
a74609faf   Nick Piggin   [PATCH] mm: page_...
1346
1347
1348
  
  failed:
  	local_irq_restore(flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1349
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1350
  }
418589663   Mel Gorman   page allocator: u...
1351
1352
1353
1354
1355
1356
1357
1358
  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
  #define ALLOC_WMARK_MIN		WMARK_MIN
  #define ALLOC_WMARK_LOW		WMARK_LOW
  #define ALLOC_WMARK_HIGH	WMARK_HIGH
  #define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
  
  /* Mask to get the watermark bits */
  #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
3148890bf   Nick Piggin   [PATCH] mm: __all...
1359
1360
1361
  #define ALLOC_HARDER		0x10 /* try to alloc harder */
  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1362

933e312e7   Akinobu Mita   [PATCH] fault-inj...
1363
  #ifdef CONFIG_FAIL_PAGE_ALLOC
b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1364
  static struct {
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1365
1366
1367
1368
  	struct fault_attr attr;
  
  	u32 ignore_gfp_highmem;
  	u32 ignore_gfp_wait;
54114994f   Akinobu Mita   fault-injection: ...
1369
  	u32 min_order;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1370
1371
  } fail_page_alloc = {
  	.attr = FAULT_ATTR_INITIALIZER,
6b1b60f41   Don Mullis   [PATCH] fault-inj...
1372
1373
  	.ignore_gfp_wait = 1,
  	.ignore_gfp_highmem = 1,
54114994f   Akinobu Mita   fault-injection: ...
1374
  	.min_order = 1,
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
  };
  
  static int __init setup_fail_page_alloc(char *str)
  {
  	return setup_fault_attr(&fail_page_alloc.attr, str);
  }
  __setup("fail_page_alloc=", setup_fail_page_alloc);
  
  static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
54114994f   Akinobu Mita   fault-injection: ...
1385
1386
  	if (order < fail_page_alloc.min_order)
  		return 0;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
  	if (gfp_mask & __GFP_NOFAIL)
  		return 0;
  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
  		return 0;
  	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
  		return 0;
  
  	return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  
  static int __init fail_page_alloc_debugfs(void)
  {
f4ae40a6a   Al Viro   switch debugfs to...
1401
  	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1402
  	struct dentry *dir;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1403

dd48c085c   Akinobu Mita   fault-injection: ...
1404
1405
1406
1407
  	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
  					&fail_page_alloc.attr);
  	if (IS_ERR(dir))
  		return PTR_ERR(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1408

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
  	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
  				&fail_page_alloc.ignore_gfp_wait))
  		goto fail;
  	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
  				&fail_page_alloc.ignore_gfp_highmem))
  		goto fail;
  	if (!debugfs_create_u32("min-order", mode, dir,
  				&fail_page_alloc.min_order))
  		goto fail;
  
  	return 0;
  fail:
dd48c085c   Akinobu Mita   fault-injection: ...
1421
  	debugfs_remove_recursive(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1422

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1423
  	return -ENOMEM;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
  }
  
  late_initcall(fail_page_alloc_debugfs);
  
  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  
  #else /* CONFIG_FAIL_PAGE_ALLOC */
  
  static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
  	return 0;
  }
  
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1438
  /*
88f5acf88   Mel Gorman   mm: page allocato...
1439
   * Return true if free pages are above 'mark'. This takes into account the order
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1440
1441
   * of the allocation.
   */
88f5acf88   Mel Gorman   mm: page allocato...
1442
1443
  static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags, long free_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1444
1445
  {
  	/* free_pages my go negative - that's OK */
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
1446
  	long min = mark;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1447
  	int o;
df0a6daa0   Michal Hocko   mm: fix off-by-tw...
1448
  	free_pages -= (1 << order) - 1;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1449
  	if (alloc_flags & ALLOC_HIGH)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1450
  		min -= min / 2;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1451
  	if (alloc_flags & ALLOC_HARDER)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1452
1453
1454
  		min -= min / 4;
  
  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
88f5acf88   Mel Gorman   mm: page allocato...
1455
  		return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1456
1457
1458
1459
1460
1461
1462
1463
  	for (o = 0; o < order; o++) {
  		/* At the next order, this order's pages become unavailable */
  		free_pages -= z->free_area[o].nr_free << o;
  
  		/* Require fewer higher order pages to be free */
  		min >>= 1;
  
  		if (free_pages <= min)
88f5acf88   Mel Gorman   mm: page allocato...
1464
  			return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1465
  	}
88f5acf88   Mel Gorman   mm: page allocato...
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
  	return true;
  }
  
  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  					zone_page_state(z, NR_FREE_PAGES));
  }
  
  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  								free_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1486
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1487
1488
1489
1490
1491
1492
  #ifdef CONFIG_NUMA
  /*
   * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
   * skip over zones that are not allowed by the cpuset, or that have
   * been recently (in last second) found to be nearly full.  See further
   * comments in mmzone.h.  Reduces cache footprint of zonelist scans
183ff22bb   Simon Arlott   spelling fixes: mm/
1493
   * that have to skip over a lot of full or unallowed zones.
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1494
1495
1496
   *
   * If the zonelist cache is present in the passed in zonelist, then
   * returns a pointer to the allowed node mask (either the current
37b07e416   Lee Schermerhorn   memoryless nodes:...
1497
   * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
   *
   * If the zonelist cache is not available for this zonelist, does
   * nothing and returns NULL.
   *
   * If the fullzones BITMAP in the zonelist cache is stale (more than
   * a second since last zap'd) then we zap it out (clear its bits.)
   *
   * We hold off even calling zlc_setup, until after we've checked the
   * first zone in the zonelist, on the theory that most allocations will
   * be satisfied from that first zone, so best to examine that zone as
   * quickly as we can.
   */
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	nodemask_t *allowednodes;	/* zonelist_cache approximation */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return NULL;
f05111f50   S.ÇaÄŸlar Onur   mm/page_alloc.c: ...
1518
  	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1519
1520
1521
1522
1523
1524
  		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  		zlc->last_full_zap = jiffies;
  	}
  
  	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
  					&cpuset_current_mems_allowed :
37b07e416   Lee Schermerhorn   memoryless nodes:...
1525
  					&node_states[N_HIGH_MEMORY];
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
  	return allowednodes;
  }
  
  /*
   * Given 'z' scanning a zonelist, run a couple of quick checks to see
   * if it is worth looking at further for free memory:
   *  1) Check that the zone isn't thought to be full (doesn't have its
   *     bit set in the zonelist_cache fullzones BITMAP).
   *  2) Check that the zones node (obtained from the zonelist_cache
   *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
   * Return true (non-zero) if zone is worth looking at further, or
   * else return false (zero) if it is not.
   *
   * This check -ignores- the distinction between various watermarks,
   * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
   * found to be full for any variation of these watermarks, it will
   * be considered full for up to one second by all requests, unless
   * we are so low on memory on all allowed nodes that we are forced
   * into the second scan of the zonelist.
   *
   * In the second scan we ignore this zonelist cache and exactly
   * apply the watermarks to all zones, even it is slower to do so.
   * We are low on memory in the second scan, and should leave no stone
   * unturned looking for a free page.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1551
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1552
1553
1554
1555
1556
1557
1558
1559
1560
  						nodemask_t *allowednodes)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  	int n;				/* node that zone *z is on */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return 1;
dd1a239f6   Mel Gorman   mm: have zonelist...
1561
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
  	n = zlc->z_to_n[i];
  
  	/* This zone is worth trying if it is allowed but not full */
  	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
  }
  
  /*
   * Given 'z' scanning a zonelist, set the corresponding bit in
   * zlc->fullzones, so that subsequent attempts to allocate a page
   * from that zone don't waste time re-examining it.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1573
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1574
1575
1576
1577
1578
1579
1580
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
dd1a239f6   Mel Gorman   mm: have zonelist...
1581
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1582
1583
1584
  
  	set_bit(i, zlc->fullzones);
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
  /*
   * clear all zones full, called after direct reclaim makes progress so that
   * a zone that was recently full is not skipped over for up to a second
   */
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
  
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1599
1600
1601
1602
1603
1604
  #else	/* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	return NULL;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1605
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1606
1607
1608
1609
  				nodemask_t *allowednodes)
  {
  	return 1;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1610
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1611
1612
  {
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1613
1614
1615
1616
  
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1617
  #endif	/* CONFIG_NUMA */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1618
  /*
0798e5193   Paul Jackson   [PATCH] memory pa...
1619
   * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1620
1621
1622
   * a page.
   */
  static struct page *
19770b326   Mel Gorman   mm: filter based ...
1623
  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
5117f45d1   Mel Gorman   page allocator: c...
1624
  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
3dd282669   Mel Gorman   page allocator: c...
1625
  		struct zone *preferred_zone, int migratetype)
753ee7289   Martin Hicks   [PATCH] VM: early...
1626
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1627
  	struct zoneref *z;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1628
  	struct page *page = NULL;
54a6eb5c4   Mel Gorman   mm: use two zonel...
1629
  	int classzone_idx;
5117f45d1   Mel Gorman   page allocator: c...
1630
  	struct zone *zone;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1631
1632
1633
  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
  	int zlc_active = 0;		/* set if using zonelist_cache */
  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
54a6eb5c4   Mel Gorman   mm: use two zonel...
1634

19770b326   Mel Gorman   mm: filter based ...
1635
  	classzone_idx = zone_idx(preferred_zone);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1636
  zonelist_scan:
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1637
  	/*
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1638
  	 * Scan zonelist, looking for a zone with enough free.
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1639
1640
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
19770b326   Mel Gorman   mm: filter based ...
1641
1642
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  						high_zoneidx, nodemask) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1643
1644
1645
  		if (NUMA_BUILD && zlc_active &&
  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1646
  		if ((alloc_flags & ALLOC_CPUSET) &&
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
1647
  			!cpuset_zone_allowed_softwall(zone, gfp_mask))
cd38b115d   Mel Gorman   mm: page allocato...
1648
  				continue;
a756cf590   Johannes Weiner   mm: try to distri...
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
  		/*
  		 * When allocating a page cache page for writing, we
  		 * want to get it from a zone that is within its dirty
  		 * limit, such that no single zone holds more than its
  		 * proportional share of globally allowed dirty pages.
  		 * The dirty limits take into account the zone's
  		 * lowmem reserves and high watermark so that kswapd
  		 * should be able to balance it without having to
  		 * write pages from its LRU list.
  		 *
  		 * This may look like it could increase pressure on
  		 * lower zones by failing allocations in higher zones
  		 * before they are full.  But the pages that do spill
  		 * over are limited as the lower zones are protected
  		 * by this very same mechanism.  It should not become
  		 * a practical burden to them.
  		 *
  		 * XXX: For now, allow allocations to potentially
  		 * exceed the per-zone dirty limit in the slowpath
  		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
  		 * which is important when on a NUMA setup the allowed
  		 * zones are together not big enough to reach the
  		 * global limit.  The proper fix for these situations
  		 * will require awareness of zones in the
  		 * dirty-throttling and the flusher threads.
  		 */
  		if ((alloc_flags & ALLOC_WMARK_LOW) &&
  		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
  			goto this_zone_full;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1678

418589663   Mel Gorman   page allocator: u...
1679
  		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1680
  		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
3148890bf   Nick Piggin   [PATCH] mm: __all...
1681
  			unsigned long mark;
fa5e084e4   Mel Gorman   vmscan: do not un...
1682
  			int ret;
418589663   Mel Gorman   page allocator: u...
1683
  			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
fa5e084e4   Mel Gorman   vmscan: do not un...
1684
1685
1686
  			if (zone_watermark_ok(zone, order, mark,
  				    classzone_idx, alloc_flags))
  				goto try_this_zone;
cd38b115d   Mel Gorman   mm: page allocato...
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
  			if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
  				/*
  				 * we do zlc_setup if there are multiple nodes
  				 * and before considering the first zone allowed
  				 * by the cpuset.
  				 */
  				allowednodes = zlc_setup(zonelist, alloc_flags);
  				zlc_active = 1;
  				did_zlc_setup = 1;
  			}
fa5e084e4   Mel Gorman   vmscan: do not un...
1697
1698
  			if (zone_reclaim_mode == 0)
  				goto this_zone_full;
cd38b115d   Mel Gorman   mm: page allocato...
1699
1700
1701
1702
1703
1704
1705
  			/*
  			 * As we may have just activated ZLC, check if the first
  			 * eligible zone has failed zone_reclaim recently.
  			 */
  			if (NUMA_BUILD && zlc_active &&
  				!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1706
1707
1708
1709
  			ret = zone_reclaim(zone, gfp_mask, order);
  			switch (ret) {
  			case ZONE_RECLAIM_NOSCAN:
  				/* did not scan */
cd38b115d   Mel Gorman   mm: page allocato...
1710
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1711
1712
  			case ZONE_RECLAIM_FULL:
  				/* scanned but unreclaimable */
cd38b115d   Mel Gorman   mm: page allocato...
1713
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1714
1715
1716
1717
  			default:
  				/* did we reclaim enough */
  				if (!zone_watermark_ok(zone, order, mark,
  						classzone_idx, alloc_flags))
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1718
  					goto this_zone_full;
0798e5193   Paul Jackson   [PATCH] memory pa...
1719
  			}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1720
  		}
fa5e084e4   Mel Gorman   vmscan: do not un...
1721
  try_this_zone:
3dd282669   Mel Gorman   page allocator: c...
1722
1723
  		page = buffered_rmqueue(preferred_zone, zone, order,
  						gfp_mask, migratetype);
0798e5193   Paul Jackson   [PATCH] memory pa...
1724
  		if (page)
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1725
  			break;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1726
1727
1728
  this_zone_full:
  		if (NUMA_BUILD)
  			zlc_mark_zone_full(zonelist, z);
54a6eb5c4   Mel Gorman   mm: use two zonel...
1729
  	}
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1730
1731
1732
1733
1734
1735
  
  	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
  		/* Disable zlc cache for second zonelist scan */
  		zlc_active = 0;
  		goto zonelist_scan;
  	}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1736
  	return page;
753ee7289   Martin Hicks   [PATCH] VM: early...
1737
  }
29423e77c   David Rientjes   oom: suppress sho...
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
  /*
   * Large machines with many possible nodes should not always dump per-node
   * meminfo in irq context.
   */
  static inline bool should_suppress_show_mem(void)
  {
  	bool ret = false;
  
  #if NODES_SHIFT > 8
  	ret = in_interrupt();
  #endif
  	return ret;
  }
a238ab5b0   Dave Hansen   mm: break out pag...
1751
1752
1753
1754
1755
1756
  static DEFINE_RATELIMIT_STATE(nopage_rs,
  		DEFAULT_RATELIMIT_INTERVAL,
  		DEFAULT_RATELIMIT_BURST);
  
  void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
  {
a238ab5b0   Dave Hansen   mm: break out pag...
1757
  	unsigned int filter = SHOW_MEM_FILTER_NODES;
c0a32fc5a   Stanislaw Gruszka   mm: more intensiv...
1758
1759
  	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
  	    debug_guardpage_minorder() > 0)
a238ab5b0   Dave Hansen   mm: break out pag...
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
  		return;
  
  	/*
  	 * This documents exceptions given to allocations in certain
  	 * contexts that are allowed to allocate outside current's set
  	 * of allowed nodes.
  	 */
  	if (!(gfp_mask & __GFP_NOMEMALLOC))
  		if (test_thread_flag(TIF_MEMDIE) ||
  		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
  			filter &= ~SHOW_MEM_FILTER_NODES;
  	if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
  		filter &= ~SHOW_MEM_FILTER_NODES;
  
  	if (fmt) {
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1775
1776
  		struct va_format vaf;
  		va_list args;
a238ab5b0   Dave Hansen   mm: break out pag...
1777
  		va_start(args, fmt);
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1778
1779
1780
1781
1782
  
  		vaf.fmt = fmt;
  		vaf.va = &args;
  
  		pr_warn("%pV", &vaf);
a238ab5b0   Dave Hansen   mm: break out pag...
1783
1784
  		va_end(args);
  	}
3ee9a4f08   Joe Perches   mm: neaten warn_a...
1785
1786
1787
  	pr_warn("%s: page allocation failure: order:%d, mode:0x%x
  ",
  		current->comm, order, gfp_mask);
a238ab5b0   Dave Hansen   mm: break out pag...
1788
1789
1790
1791
1792
  
  	dump_stack();
  	if (!should_suppress_show_mem())
  		show_mem(filter);
  }
11e33f6a5   Mel Gorman   page allocator: b...
1793
1794
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
f90ac3982   Mel Gorman   mm: avoid liveloc...
1795
  				unsigned long did_some_progress,
11e33f6a5   Mel Gorman   page allocator: b...
1796
  				unsigned long pages_reclaimed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1797
  {
11e33f6a5   Mel Gorman   page allocator: b...
1798
1799
1800
  	/* Do not loop if specifically requested */
  	if (gfp_mask & __GFP_NORETRY)
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1801

f90ac3982   Mel Gorman   mm: avoid liveloc...
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
  	/* Always retry if specifically requested */
  	if (gfp_mask & __GFP_NOFAIL)
  		return 1;
  
  	/*
  	 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
  	 * making forward progress without invoking OOM. Suspend also disables
  	 * storage devices so kswapd will not help. Bail if we are suspending.
  	 */
  	if (!did_some_progress && pm_suspended_storage())
  		return 0;
11e33f6a5   Mel Gorman   page allocator: b...
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
  	/*
  	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
  	 * means __GFP_NOFAIL, but that may not be true in other
  	 * implementations.
  	 */
  	if (order <= PAGE_ALLOC_COSTLY_ORDER)
  		return 1;
  
  	/*
  	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
  	 * specified, then we retry until we no longer reclaim any pages
  	 * (above), or we've reclaimed an order of pages at least as
  	 * large as the allocation's order. In both cases, if the
  	 * allocation still fails, we stop retrying.
  	 */
  	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
  		return 1;
cf40bd16f   Nick Piggin   lockdep: annotate...
1830

11e33f6a5   Mel Gorman   page allocator: b...
1831
1832
  	return 0;
  }
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1833

11e33f6a5   Mel Gorman   page allocator: b...
1834
1835
1836
  static inline struct page *
  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1837
1838
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1839
1840
1841
1842
  {
  	struct page *page;
  
  	/* Acquire the OOM killer lock for the zones in zonelist */
ff321feac   Minchan Kim   mm: rename try_se...
1843
  	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
11e33f6a5   Mel Gorman   page allocator: b...
1844
  		schedule_timeout_uninterruptible(1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1845
1846
  		return NULL;
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
1847

11e33f6a5   Mel Gorman   page allocator: b...
1848
1849
1850
1851
1852
1853
1854
  	/*
  	 * Go through the zonelist yet one more time, keep very high watermark
  	 * here, this is only to catch a parallel oom killing, we must fail if
  	 * we're still under heavy pressure.
  	 */
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  		order, zonelist, high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1855
  		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
1856
  		preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1857
  	if (page)
11e33f6a5   Mel Gorman   page allocator: b...
1858
  		goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1859
1860
1861
1862
  	if (!(gfp_mask & __GFP_NOFAIL)) {
  		/* The OOM killer will not help higher order allocs */
  		if (order > PAGE_ALLOC_COSTLY_ORDER)
  			goto out;
03668b3ce   David Rientjes   oom: avoid oom ki...
1863
1864
1865
  		/* The OOM killer does not needlessly kill tasks for lowmem */
  		if (high_zoneidx < ZONE_NORMAL)
  			goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
  		/*
  		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
  		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
  		 * The caller should handle page allocation failure by itself if
  		 * it specifies __GFP_THISNODE.
  		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
  		 */
  		if (gfp_mask & __GFP_THISNODE)
  			goto out;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1876
  	/* Exhausted what can be done so it's blamo time */
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1877
  	out_of_memory(zonelist, gfp_mask, order, nodemask);
11e33f6a5   Mel Gorman   page allocator: b...
1878
1879
1880
1881
1882
  
  out:
  	clear_zonelist_oom(zonelist, gfp_mask);
  	return page;
  }
56de7263f   Mel Gorman   mm: compaction: d...
1883
1884
1885
1886
1887
1888
  #ifdef CONFIG_COMPACTION
  /* Try memory compaction for high-order allocations before reclaim */
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
1889
1890
1891
  	int migratetype, bool sync_migration,
  	bool *deferred_compaction,
  	unsigned long *did_some_progress)
56de7263f   Mel Gorman   mm: compaction: d...
1892
1893
  {
  	struct page *page;
66199712e   Mel Gorman   mm: page allocato...
1894
  	if (!order)
56de7263f   Mel Gorman   mm: compaction: d...
1895
  		return NULL;
66199712e   Mel Gorman   mm: page allocato...
1896
1897
1898
1899
  	if (compaction_deferred(preferred_zone)) {
  		*deferred_compaction = true;
  		return NULL;
  	}
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1900
  	current->flags |= PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
1901
  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
77f1fe6b0   Mel Gorman   mm: migration: al...
1902
  						nodemask, sync_migration);
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1903
  	current->flags &= ~PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
  	if (*did_some_progress != COMPACT_SKIPPED) {
  
  		/* Page migration frees to the PCP lists but we want merging */
  		drain_pages(get_cpu());
  		put_cpu();
  
  		page = get_page_from_freelist(gfp_mask, nodemask,
  				order, zonelist, high_zoneidx,
  				alloc_flags, preferred_zone,
  				migratetype);
  		if (page) {
4f92e2586   Mel Gorman   mm: compaction: d...
1915
1916
  			preferred_zone->compact_considered = 0;
  			preferred_zone->compact_defer_shift = 0;
56de7263f   Mel Gorman   mm: compaction: d...
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
  			count_vm_event(COMPACTSUCCESS);
  			return page;
  		}
  
  		/*
  		 * It's bad if compaction run occurs and fails.
  		 * The most likely reason is that pages exist,
  		 * but not enough to satisfy watermarks.
  		 */
  		count_vm_event(COMPACTFAIL);
66199712e   Mel Gorman   mm: page allocato...
1927
1928
1929
1930
1931
1932
1933
  
  		/*
  		 * As async compaction considers a subset of pageblocks, only
  		 * defer if the failure was a sync compaction failure.
  		 */
  		if (sync_migration)
  			defer_compaction(preferred_zone);
56de7263f   Mel Gorman   mm: compaction: d...
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
  
  		cond_resched();
  	}
  
  	return NULL;
  }
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
1945
1946
1947
  	int migratetype, bool sync_migration,
  	bool *deferred_compaction,
  	unsigned long *did_some_progress)
56de7263f   Mel Gorman   mm: compaction: d...
1948
1949
1950
1951
  {
  	return NULL;
  }
  #endif /* CONFIG_COMPACTION */
11e33f6a5   Mel Gorman   page allocator: b...
1952
1953
1954
1955
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1956
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1957
  	int migratetype, unsigned long *did_some_progress)
11e33f6a5   Mel Gorman   page allocator: b...
1958
1959
1960
  {
  	struct page *page = NULL;
  	struct reclaim_state reclaim_state;
9ee493ce0   Mel Gorman   mm: page allocato...
1961
  	bool drained = false;
11e33f6a5   Mel Gorman   page allocator: b...
1962
1963
1964
1965
1966
  
  	cond_resched();
  
  	/* We now go into synchronous reclaim */
  	cpuset_memory_pressure_bump();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1967
  	current->flags |= PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
1968
1969
  	lockdep_set_current_reclaim_state(gfp_mask);
  	reclaim_state.reclaimed_slab = 0;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1970
  	current->reclaim_state = &reclaim_state;
11e33f6a5   Mel Gorman   page allocator: b...
1971
1972
  
  	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1973
  	current->reclaim_state = NULL;
11e33f6a5   Mel Gorman   page allocator: b...
1974
  	lockdep_clear_current_reclaim_state();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1975
  	current->flags &= ~PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
1976
1977
  
  	cond_resched();
9ee493ce0   Mel Gorman   mm: page allocato...
1978
1979
  	if (unlikely(!(*did_some_progress)))
  		return NULL;
11e33f6a5   Mel Gorman   page allocator: b...
1980

76d3fbf8f   Mel Gorman   mm: page allocato...
1981
1982
1983
  	/* After successful reclaim, reconsider all zones for allocation */
  	if (NUMA_BUILD)
  		zlc_clear_zones_full(zonelist);
9ee493ce0   Mel Gorman   mm: page allocato...
1984
1985
  retry:
  	page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
1986
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1987
1988
  					alloc_flags, preferred_zone,
  					migratetype);
9ee493ce0   Mel Gorman   mm: page allocato...
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
  
  	/*
  	 * If an allocation failed after direct reclaim, it could be because
  	 * pages are pinned on the per-cpu lists. Drain them and try again
  	 */
  	if (!page && !drained) {
  		drain_all_pages();
  		drained = true;
  		goto retry;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1999
2000
  	return page;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2001
  /*
11e33f6a5   Mel Gorman   page allocator: b...
2002
2003
   * This is called in the allocator slow-path if the allocation request is of
   * sufficient urgency to ignore watermarks and take other desperate measures
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2004
   */
11e33f6a5   Mel Gorman   page allocator: b...
2005
2006
2007
  static inline struct page *
  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2008
2009
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
2010
2011
2012
2013
2014
  {
  	struct page *page;
  
  	do {
  		page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2015
  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
3dd282669   Mel Gorman   page allocator: c...
2016
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2017
2018
  
  		if (!page && gfp_mask & __GFP_NOFAIL)
0e093d997   Mel Gorman   writeback: do not...
2019
  			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
11e33f6a5   Mel Gorman   page allocator: b...
2020
2021
2022
2023
2024
2025
2026
  	} while (!page && (gfp_mask & __GFP_NOFAIL));
  
  	return page;
  }
  
  static inline
  void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
995047488   Mel Gorman   mm: kswapd: stop ...
2027
2028
  						enum zone_type high_zoneidx,
  						enum zone_type classzone_idx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2029
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2030
2031
  	struct zoneref *z;
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2032

11e33f6a5   Mel Gorman   page allocator: b...
2033
  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
995047488   Mel Gorman   mm: kswapd: stop ...
2034
  		wakeup_kswapd(zone, order, classzone_idx);
11e33f6a5   Mel Gorman   page allocator: b...
2035
  }
cf40bd16f   Nick Piggin   lockdep: annotate...
2036

341ce06f6   Peter Zijlstra   page allocator: c...
2037
2038
2039
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
341ce06f6   Peter Zijlstra   page allocator: c...
2040
2041
  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2042

a56f57ff9   Mel Gorman   page allocator: r...
2043
  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
e6223a3b1   Namhyung Kim   mm: add casts to/...
2044
  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
2045

341ce06f6   Peter Zijlstra   page allocator: c...
2046
2047
2048
2049
2050
2051
  	/*
  	 * The caller may dip into page reserves a bit more if the caller
  	 * cannot run direct reclaim, or if the caller has realtime scheduling
  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
  	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
  	 */
e6223a3b1   Namhyung Kim   mm: add casts to/...
2052
  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2053

341ce06f6   Peter Zijlstra   page allocator: c...
2054
  	if (!wait) {
5c3240d92   Andrea Arcangeli   thp: don't alloc ...
2055
2056
2057
2058
2059
2060
  		/*
  		 * Not worth trying to allocate harder for
  		 * __GFP_NOMEMALLOC even if it can't schedule.
  		 */
  		if  (!(gfp_mask & __GFP_NOMEMALLOC))
  			alloc_flags |= ALLOC_HARDER;
523b94585   Christoph Lameter   Memoryless nodes:...
2061
  		/*
341ce06f6   Peter Zijlstra   page allocator: c...
2062
2063
  		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
523b94585   Christoph Lameter   Memoryless nodes:...
2064
  		 */
341ce06f6   Peter Zijlstra   page allocator: c...
2065
  		alloc_flags &= ~ALLOC_CPUSET;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2066
  	} else if (unlikely(rt_task(current)) && !in_interrupt())
341ce06f6   Peter Zijlstra   page allocator: c...
2067
2068
2069
2070
  		alloc_flags |= ALLOC_HARDER;
  
  	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
  		if (!in_interrupt() &&
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2071
  		    ((current->flags & PF_MEMALLOC) ||
341ce06f6   Peter Zijlstra   page allocator: c...
2072
2073
  		     unlikely(test_thread_flag(TIF_MEMDIE))))
  			alloc_flags |= ALLOC_NO_WATERMARKS;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2074
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
2075

341ce06f6   Peter Zijlstra   page allocator: c...
2076
2077
  	return alloc_flags;
  }
11e33f6a5   Mel Gorman   page allocator: b...
2078
2079
2080
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2081
2082
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
2083
2084
2085
2086
2087
2088
  {
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
  	struct page *page = NULL;
  	int alloc_flags;
  	unsigned long pages_reclaimed = 0;
  	unsigned long did_some_progress;
77f1fe6b0   Mel Gorman   mm: migration: al...
2089
  	bool sync_migration = false;
66199712e   Mel Gorman   mm: page allocato...
2090
  	bool deferred_compaction = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2091

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
2092
  	/*
72807a74c   Mel Gorman   page allocator: s...
2093
2094
2095
2096
2097
  	 * In the slowpath, we sanity check order to avoid ever trying to
  	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
  	 * be using allocators in order of preference for an area that is
  	 * too large.
  	 */
1fc28b70f   Mel Gorman   page-allocator: a...
2098
2099
  	if (order >= MAX_ORDER) {
  		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
72807a74c   Mel Gorman   page allocator: s...
2100
  		return NULL;
1fc28b70f   Mel Gorman   page-allocator: a...
2101
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2102

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
  	/*
  	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
  	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
  	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
  	 * using a larger set of nodes after it has established that the
  	 * allowed per node queues are empty and that nodes are
  	 * over allocated.
  	 */
  	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
  		goto nopage;
cc4a68514   Mel Gorman   page allocator: a...
2113
  restart:
32dba98e0   Andrea Arcangeli   thp: _GFP_NO_KSWAPD
2114
2115
  	if (!(gfp_mask & __GFP_NO_KSWAPD))
  		wake_all_kswapd(order, zonelist, high_zoneidx,
995047488   Mel Gorman   mm: kswapd: stop ...
2116
  						zone_idx(preferred_zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2117

9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2118
  	/*
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2119
2120
2121
  	 * OK, we're below the kswapd watermark and have kicked background
  	 * reclaim. Now things get more complex, so set up alloc_flags according
  	 * to how we want to proceed.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2122
  	 */
341ce06f6   Peter Zijlstra   page allocator: c...
2123
  	alloc_flags = gfp_to_alloc_flags(gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124

f33261d75   David Rientjes   mm: fix deferred ...
2125
2126
2127
2128
2129
2130
2131
  	/*
  	 * Find the true preferred zone if the allocation is unconstrained by
  	 * cpusets.
  	 */
  	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
  		first_zones_zonelist(zonelist, high_zoneidx, NULL,
  					&preferred_zone);
cfa54a0fc   Andrew Barry   mm/page_alloc.c: ...
2132
  rebalance:
341ce06f6   Peter Zijlstra   page allocator: c...
2133
  	/* This is the last chance, in general, before the goto nopage. */
19770b326   Mel Gorman   mm: filter based ...
2134
  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
341ce06f6   Peter Zijlstra   page allocator: c...
2135
2136
  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
  			preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2137
2138
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2139

11e33f6a5   Mel Gorman   page allocator: b...
2140
  	/* Allocate without watermarks if the context allows */
341ce06f6   Peter Zijlstra   page allocator: c...
2141
2142
2143
2144
2145
2146
  	if (alloc_flags & ALLOC_NO_WATERMARKS) {
  		page = __alloc_pages_high_priority(gfp_mask, order,
  				zonelist, high_zoneidx, nodemask,
  				preferred_zone, migratetype);
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2147
2148
2149
2150
2151
  	}
  
  	/* Atomic allocations - we can't balance anything */
  	if (!wait)
  		goto nopage;
341ce06f6   Peter Zijlstra   page allocator: c...
2152
  	/* Avoid recursion of direct reclaim */
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2153
  	if (current->flags & PF_MEMALLOC)
341ce06f6   Peter Zijlstra   page allocator: c...
2154
  		goto nopage;
6583bb64f   David Rientjes   mm: avoid endless...
2155
2156
2157
  	/* Avoid allocations with no watermarks from looping endlessly */
  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  		goto nopage;
77f1fe6b0   Mel Gorman   mm: migration: al...
2158
2159
2160
2161
  	/*
  	 * Try direct compaction. The first pass is asynchronous. Subsequent
  	 * attempts after direct reclaim are synchronous
  	 */
56de7263f   Mel Gorman   mm: compaction: d...
2162
2163
2164
2165
  	page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2166
2167
2168
  					migratetype, sync_migration,
  					&deferred_compaction,
  					&did_some_progress);
56de7263f   Mel Gorman   mm: compaction: d...
2169
2170
  	if (page)
  		goto got_pg;
c6a140bf1   Andrea Arcangeli   mm/compaction: re...
2171
  	sync_migration = true;
56de7263f   Mel Gorman   mm: compaction: d...
2172

66199712e   Mel Gorman   mm: page allocato...
2173
2174
2175
2176
2177
2178
2179
2180
  	/*
  	 * If compaction is deferred for high-order allocations, it is because
  	 * sync compaction recently failed. In this is the case and the caller
  	 * has requested the system not be heavily disrupted, fail the
  	 * allocation now instead of entering direct reclaim
  	 */
  	if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
  		goto nopage;
11e33f6a5   Mel Gorman   page allocator: b...
2181
2182
2183
2184
  	/* Try direct reclaim and then allocating */
  	page = __alloc_pages_direct_reclaim(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
5117f45d1   Mel Gorman   page allocator: c...
2185
  					alloc_flags, preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
2186
  					migratetype, &did_some_progress);
11e33f6a5   Mel Gorman   page allocator: b...
2187
2188
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2189

e33c3b5e1   David Rientjes   cpusets: update m...
2190
  	/*
11e33f6a5   Mel Gorman   page allocator: b...
2191
2192
  	 * If we failed to make any progress reclaiming, then we are
  	 * running out of options and have to consider going OOM
e33c3b5e1   David Rientjes   cpusets: update m...
2193
  	 */
11e33f6a5   Mel Gorman   page allocator: b...
2194
2195
  	if (!did_some_progress) {
  		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
2196
2197
  			if (oom_killer_disabled)
  				goto nopage;
11e33f6a5   Mel Gorman   page allocator: b...
2198
2199
  			page = __alloc_pages_may_oom(gfp_mask, order,
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2200
2201
  					nodemask, preferred_zone,
  					migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2202
2203
  			if (page)
  				goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2204

03668b3ce   David Rientjes   oom: avoid oom ki...
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
  			if (!(gfp_mask & __GFP_NOFAIL)) {
  				/*
  				 * The oom killer is not called for high-order
  				 * allocations that may fail, so if no progress
  				 * is being made, there are no other options and
  				 * retrying is unlikely to help.
  				 */
  				if (order > PAGE_ALLOC_COSTLY_ORDER)
  					goto nopage;
  				/*
  				 * The oom killer is not called for lowmem
  				 * allocations to prevent needlessly killing
  				 * innocent tasks.
  				 */
  				if (high_zoneidx < ZONE_NORMAL)
  					goto nopage;
  			}
e2c55dc87   Mel Gorman   Drain per-cpu lis...
2222

ff0ceb9de   David Rientjes   oom: serialize ou...
2223
2224
  			goto restart;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2225
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2226
  	/* Check if we should retry the allocation */
a41f24ea9   Nishanth Aravamudan   page allocator: s...
2227
  	pages_reclaimed += did_some_progress;
f90ac3982   Mel Gorman   mm: avoid liveloc...
2228
2229
  	if (should_alloc_retry(gfp_mask, order, did_some_progress,
  						pages_reclaimed)) {
11e33f6a5   Mel Gorman   page allocator: b...
2230
  		/* Wait for some write requests to complete then retry */
0e093d997   Mel Gorman   writeback: do not...
2231
  		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2232
  		goto rebalance;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
  	} else {
  		/*
  		 * High-order allocations do not necessarily loop after
  		 * direct reclaim and reclaim/compaction depends on compaction
  		 * being called after reclaim so call directly if necessary
  		 */
  		page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
66199712e   Mel Gorman   mm: page allocato...
2243
2244
2245
  					migratetype, sync_migration,
  					&deferred_compaction,
  					&did_some_progress);
3e7d34497   Mel Gorman   mm: vmscan: recla...
2246
2247
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2248
2249
2250
  	}
  
  nopage:
a238ab5b0   Dave Hansen   mm: break out pag...
2251
  	warn_alloc_failed(gfp_mask, order, NULL);
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2252
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2253
  got_pg:
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2254
2255
  	if (kmemcheck_enabled)
  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2256
  	return page;
11e33f6a5   Mel Gorman   page allocator: b...
2257

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2258
  }
11e33f6a5   Mel Gorman   page allocator: b...
2259
2260
2261
2262
2263
2264
2265
2266
2267
  
  /*
   * This is the 'heart' of the zoned buddy allocator.
   */
  struct page *
  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  			struct zonelist *zonelist, nodemask_t *nodemask)
  {
  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5117f45d1   Mel Gorman   page allocator: c...
2268
  	struct zone *preferred_zone;
11e33f6a5   Mel Gorman   page allocator: b...
2269
  	struct page *page;
3dd282669   Mel Gorman   page allocator: c...
2270
  	int migratetype = allocflags_to_migratetype(gfp_mask);
11e33f6a5   Mel Gorman   page allocator: b...
2271

dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
2272
  	gfp_mask &= gfp_allowed_mask;
11e33f6a5   Mel Gorman   page allocator: b...
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
  	lockdep_trace_alloc(gfp_mask);
  
  	might_sleep_if(gfp_mask & __GFP_WAIT);
  
  	if (should_fail_alloc_page(gfp_mask, order))
  		return NULL;
  
  	/*
  	 * Check the zones suitable for the gfp_mask contain at least one
  	 * valid zone. It's possible to have an empty zonelist as a result
  	 * of GFP_THISNODE and a memoryless node
  	 */
  	if (unlikely(!zonelist->_zonerefs->zone))
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2287
  	get_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2288
  	/* The preferred zone is used for statistics later */
f33261d75   David Rientjes   mm: fix deferred ...
2289
2290
2291
  	first_zones_zonelist(zonelist, high_zoneidx,
  				nodemask ? : &cpuset_current_mems_allowed,
  				&preferred_zone);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2292
2293
  	if (!preferred_zone) {
  		put_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2294
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2295
  	}
5117f45d1   Mel Gorman   page allocator: c...
2296
2297
  
  	/* First allocation attempt */
11e33f6a5   Mel Gorman   page allocator: b...
2298
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2299
  			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
2300
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2301
2302
  	if (unlikely(!page))
  		page = __alloc_pages_slowpath(gfp_mask, order,
5117f45d1   Mel Gorman   page allocator: c...
2303
  				zonelist, high_zoneidx, nodemask,
3dd282669   Mel Gorman   page allocator: c...
2304
  				preferred_zone, migratetype);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2305
  	put_mems_allowed();
11e33f6a5   Mel Gorman   page allocator: b...
2306

4b4f278c0   Mel Gorman   tracing, page-all...
2307
  	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2308
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2309
  }
d239171e4   Mel Gorman   page allocator: r...
2310
  EXPORT_SYMBOL(__alloc_pages_nodemask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2311
2312
2313
2314
  
  /*
   * Common helper functions.
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
2315
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2316
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2317
2318
2319
2320
2321
2322
2323
  	struct page *page;
  
  	/*
  	 * __get_free_pages() returns a 32-bit address, which cannot represent
  	 * a highmem page
  	 */
  	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2324
2325
2326
2327
2328
  	page = alloc_pages(gfp_mask, order);
  	if (!page)
  		return 0;
  	return (unsigned long) page_address(page);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2329
  EXPORT_SYMBOL(__get_free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2330
  unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2331
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2332
  	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2333
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2334
  EXPORT_SYMBOL(get_zeroed_page);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2335
  void __free_pages(struct page *page, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2336
  {
b5810039a   Nick Piggin   [PATCH] core remo...
2337
  	if (put_page_testzero(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2338
  		if (order == 0)
fc91668ea   Li Hong   mm: remove free_h...
2339
  			free_hot_cold_page(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2340
2341
2342
2343
2344
2345
  		else
  			__free_pages_ok(page, order);
  	}
  }
  
  EXPORT_SYMBOL(__free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2346
  void free_pages(unsigned long addr, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2347
2348
  {
  	if (addr != 0) {
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
2349
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2350
2351
2352
2353
2354
  		__free_pages(virt_to_page((void *)addr), order);
  	}
  }
  
  EXPORT_SYMBOL(free_pages);
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
  static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
  {
  	if (addr) {
  		unsigned long alloc_end = addr + (PAGE_SIZE << order);
  		unsigned long used = addr + PAGE_ALIGN(size);
  
  		split_page(virt_to_page((void *)addr), order);
  		while (used < alloc_end) {
  			free_page(used);
  			used += PAGE_SIZE;
  		}
  	}
  	return (void *)addr;
  }
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * This function is similar to alloc_pages(), except that it allocates the
   * minimum number of pages to satisfy the request.  alloc_pages() can only
   * allocate memory in power-of-two pages.
   *
   * This function is also limited by MAX_ORDER.
   *
   * Memory allocated by this function must be released by free_pages_exact().
   */
  void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
  {
  	unsigned int order = get_order(size);
  	unsigned long addr;
  
  	addr = __get_free_pages(gfp_mask, order);
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2388
  	return make_alloc_exact(addr, order, size);
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2389
2390
2391
2392
  }
  EXPORT_SYMBOL(alloc_pages_exact);
  
  /**
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2393
2394
   * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
   *			   pages on a node.
b5e6ab589   Randy Dunlap   mm: fix kernel-do...
2395
   * @nid: the preferred node ID where memory should be allocated
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * Like alloc_pages_exact(), but try to allocate on node nid first before falling
   * back.
   * Note this is not alloc_pages_exact_node() which allocates on a specific node,
   * but is not exact.
   */
  void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
  	unsigned order = get_order(size);
  	struct page *p = alloc_pages_node(nid, gfp_mask, order);
  	if (!p)
  		return NULL;
  	return make_alloc_exact((unsigned long)page_address(p), order, size);
  }
  EXPORT_SYMBOL(alloc_pages_exact_nid);
  
  /**
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
   * @size: size of allocation, same value as passed to alloc_pages_exact().
   *
   * Release the memory allocated by a previous call to alloc_pages_exact.
   */
  void free_pages_exact(void *virt, size_t size)
  {
  	unsigned long addr = (unsigned long)virt;
  	unsigned long end = addr + PAGE_ALIGN(size);
  
  	while (addr < end) {
  		free_page(addr);
  		addr += PAGE_SIZE;
  	}
  }
  EXPORT_SYMBOL(free_pages_exact);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2432
2433
  static unsigned int nr_free_zone_pages(int offset)
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2434
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2435
  	struct zone *zone;
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2436
  	/* Just pick one node, since fallback list is circular */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2437
  	unsigned int sum = 0;
0e88460da   Mel Gorman   mm: introduce nod...
2438
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2439

54a6eb5c4   Mel Gorman   mm: use two zonel...
2440
  	for_each_zone_zonelist(zone, z, zonelist, offset) {
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2441
  		unsigned long size = zone->present_pages;
418589663   Mel Gorman   page allocator: u...
2442
  		unsigned long high = high_wmark_pages(zone);
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2443
2444
  		if (size > high)
  			sum += size - high;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
  	}
  
  	return sum;
  }
  
  /*
   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
   */
  unsigned int nr_free_buffer_pages(void)
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
2455
  	return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2456
  }
c2f1a551d   Meelap Shah   knfsd: nfsd4: var...
2457
  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2458
2459
2460
2461
2462
2463
  
  /*
   * Amount of free RAM allocatable within all zones
   */
  unsigned int nr_free_pagecache_pages(void)
  {
2a1e274ac   Mel Gorman   Create the ZONE_M...
2464
  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2465
  }
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2466
2467
  
  static inline void show_node(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2468
  {
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2469
  	if (NUMA_BUILD)
25ba77c14   Andy Whitcroft   [PATCH] numa node...
2470
  		printk("Node %d ", zone_to_nid(zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2471
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2472

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2473
2474
2475
2476
  void si_meminfo(struct sysinfo *val)
  {
  	val->totalram = totalram_pages;
  	val->sharedram = 0;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2477
  	val->freeram = global_page_state(NR_FREE_PAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2478
  	val->bufferram = nr_blockdev_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2479
2480
  	val->totalhigh = totalhigh_pages;
  	val->freehigh = nr_free_highpages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
  	val->mem_unit = PAGE_SIZE;
  }
  
  EXPORT_SYMBOL(si_meminfo);
  
  #ifdef CONFIG_NUMA
  void si_meminfo_node(struct sysinfo *val, int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  
  	val->totalram = pgdat->node_present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2492
  	val->freeram = node_page_state(nid, NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2493
  #ifdef CONFIG_HIGHMEM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2494
  	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2495
2496
  	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
  			NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2497
2498
2499
2500
  #else
  	val->totalhigh = 0;
  	val->freehigh = 0;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2501
2502
2503
  	val->mem_unit = PAGE_SIZE;
  }
  #endif
ddd588b5d   David Rientjes   oom: suppress nod...
2504
  /*
7bf02ea22   David Rientjes   arch, mm: filter ...
2505
2506
   * Determine whether the node should be displayed or not, depending on whether
   * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
ddd588b5d   David Rientjes   oom: suppress nod...
2507
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2508
  bool skip_free_areas_node(unsigned int flags, int nid)
ddd588b5d   David Rientjes   oom: suppress nod...
2509
2510
2511
2512
2513
2514
2515
  {
  	bool ret = false;
  
  	if (!(flags & SHOW_MEM_FILTER_NODES))
  		goto out;
  
  	get_mems_allowed();
7bf02ea22   David Rientjes   arch, mm: filter ...
2516
  	ret = !node_isset(nid, cpuset_current_mems_allowed);
ddd588b5d   David Rientjes   oom: suppress nod...
2517
2518
2519
2520
  	put_mems_allowed();
  out:
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2521
2522
2523
2524
2525
2526
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
ddd588b5d   David Rientjes   oom: suppress nod...
2527
2528
   * Suppresses nodes that are not allowed by current's cpuset if
   * SHOW_MEM_FILTER_NODES is passed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2529
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2530
  void show_free_areas(unsigned int filter)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2531
  {
c72419138   Jes Sorensen   [PATCH] Condense ...
2532
  	int cpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2533
  	struct zone *zone;
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2534
  	for_each_populated_zone(zone) {
7bf02ea22   David Rientjes   arch, mm: filter ...
2535
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2536
  			continue;
c72419138   Jes Sorensen   [PATCH] Condense ...
2537
2538
2539
  		show_node(zone);
  		printk("%s per-cpu:
  ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2540

6b482c677   Dave Jones   [PATCH] Don't pri...
2541
  		for_each_online_cpu(cpu) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2542
  			struct per_cpu_pageset *pageset;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2543
  			pageset = per_cpu_ptr(zone->pageset, cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2544

3dfa5721f   Christoph Lameter   Page allocator: g...
2545
2546
2547
2548
  			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d
  ",
  			       cpu, pageset->pcp.high,
  			       pageset->pcp.batch, pageset->pcp.count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2549
2550
  		}
  	}
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2551
2552
2553
2554
  	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu
  "
  		" active_file:%lu inactive_file:%lu isolated_file:%lu
  "
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2555
  		" unevictable:%lu"
b76146ed1   Andrew Morton   revert "mm: oom a...
2556
2557
  		" dirty:%lu writeback:%lu unstable:%lu
  "
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2558
2559
  		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu
  "
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2560
2561
  		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu
  ",
4f98a2fee   Rik van Riel   vmscan: split LRU...
2562
  		global_page_state(NR_ACTIVE_ANON),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2563
  		global_page_state(NR_INACTIVE_ANON),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2564
2565
  		global_page_state(NR_ISOLATED_ANON),
  		global_page_state(NR_ACTIVE_FILE),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2566
  		global_page_state(NR_INACTIVE_FILE),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2567
  		global_page_state(NR_ISOLATED_FILE),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2568
  		global_page_state(NR_UNEVICTABLE),
b1e7a8fd8   Christoph Lameter   [PATCH] zoned vm ...
2569
  		global_page_state(NR_FILE_DIRTY),
ce866b34a   Christoph Lameter   [PATCH] zoned vm ...
2570
  		global_page_state(NR_WRITEBACK),
fd39fc856   Christoph Lameter   [PATCH] zoned vm ...
2571
  		global_page_state(NR_UNSTABLE_NFS),
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2572
  		global_page_state(NR_FREE_PAGES),
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2573
2574
  		global_page_state(NR_SLAB_RECLAIMABLE),
  		global_page_state(NR_SLAB_UNRECLAIMABLE),
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
2575
  		global_page_state(NR_FILE_MAPPED),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2576
  		global_page_state(NR_SHMEM),
a25700a53   Andrew Morton   [PATCH] mm: show ...
2577
2578
  		global_page_state(NR_PAGETABLE),
  		global_page_state(NR_BOUNCE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2579

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2580
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2581
  		int i;
7bf02ea22   David Rientjes   arch, mm: filter ...
2582
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2583
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2584
2585
2586
2587
2588
2589
  		show_node(zone);
  		printk("%s"
  			" free:%lukB"
  			" min:%lukB"
  			" low:%lukB"
  			" high:%lukB"
4f98a2fee   Rik van Riel   vmscan: split LRU...
2590
2591
2592
2593
  			" active_anon:%lukB"
  			" inactive_anon:%lukB"
  			" active_file:%lukB"
  			" inactive_file:%lukB"
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2594
  			" unevictable:%lukB"
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2595
2596
  			" isolated(anon):%lukB"
  			" isolated(file):%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2597
  			" present:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2598
2599
2600
2601
  			" mlocked:%lukB"
  			" dirty:%lukB"
  			" writeback:%lukB"
  			" mapped:%lukB"
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2602
  			" shmem:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2603
2604
  			" slab_reclaimable:%lukB"
  			" slab_unreclaimable:%lukB"
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2605
  			" kernel_stack:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2606
2607
2608
2609
  			" pagetables:%lukB"
  			" unstable:%lukB"
  			" bounce:%lukB"
  			" writeback_tmp:%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2610
2611
2612
2613
2614
  			" pages_scanned:%lu"
  			" all_unreclaimable? %s"
  			"
  ",
  			zone->name,
88f5acf88   Mel Gorman   mm: page allocato...
2615
  			K(zone_page_state(zone, NR_FREE_PAGES)),
418589663   Mel Gorman   page allocator: u...
2616
2617
2618
  			K(min_wmark_pages(zone)),
  			K(low_wmark_pages(zone)),
  			K(high_wmark_pages(zone)),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2619
2620
2621
2622
  			K(zone_page_state(zone, NR_ACTIVE_ANON)),
  			K(zone_page_state(zone, NR_INACTIVE_ANON)),
  			K(zone_page_state(zone, NR_ACTIVE_FILE)),
  			K(zone_page_state(zone, NR_INACTIVE_FILE)),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2623
  			K(zone_page_state(zone, NR_UNEVICTABLE)),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2624
2625
  			K(zone_page_state(zone, NR_ISOLATED_ANON)),
  			K(zone_page_state(zone, NR_ISOLATED_FILE)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2626
  			K(zone->present_pages),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2627
2628
2629
2630
  			K(zone_page_state(zone, NR_MLOCK)),
  			K(zone_page_state(zone, NR_FILE_DIRTY)),
  			K(zone_page_state(zone, NR_WRITEBACK)),
  			K(zone_page_state(zone, NR_FILE_MAPPED)),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2631
  			K(zone_page_state(zone, NR_SHMEM)),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2632
2633
  			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
  			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2634
2635
  			zone_page_state(zone, NR_KERNEL_STACK) *
  				THREAD_SIZE / 1024,
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2636
2637
2638
2639
  			K(zone_page_state(zone, NR_PAGETABLE)),
  			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
  			K(zone_page_state(zone, NR_BOUNCE)),
  			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2640
  			zone->pages_scanned,
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
2641
  			(zone->all_unreclaimable ? "yes" : "no")
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2642
2643
2644
2645
2646
2647
2648
  			);
  		printk("lowmem_reserve[]:");
  		for (i = 0; i < MAX_NR_ZONES; i++)
  			printk(" %lu", zone->lowmem_reserve[i]);
  		printk("
  ");
  	}
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2649
  	for_each_populated_zone(zone) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2650
   		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2651

7bf02ea22   David Rientjes   arch, mm: filter ...
2652
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2653
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2654
2655
  		show_node(zone);
  		printk("%s: ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2656
2657
2658
  
  		spin_lock_irqsave(&zone->lock, flags);
  		for (order = 0; order < MAX_ORDER; order++) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2659
2660
  			nr[order] = zone->free_area[order].nr_free;
  			total += nr[order] << order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2661
2662
  		}
  		spin_unlock_irqrestore(&zone->lock, flags);
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2663
2664
  		for (order = 0; order < MAX_ORDER; order++)
  			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2665
2666
2667
  		printk("= %lukB
  ", K(total));
  	}
e6f3602d2   Larry Woodman   Include count of ...
2668
2669
  	printk("%ld total pagecache pages
  ", global_page_state(NR_FILE_PAGES));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2670
2671
  	show_swap_cache_info();
  }
19770b326   Mel Gorman   mm: filter based ...
2672
2673
2674
2675
2676
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
  	zoneref->zone = zone;
  	zoneref->zone_idx = zone_idx(zone);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2677
2678
  /*
   * Builds allocation fallback zone lists.
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2679
2680
   *
   * Add all populated zones of a node to the zonelist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2681
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2682
2683
  static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
  				int nr_zones, enum zone_type zone_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2684
  {
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2685
  	struct zone *zone;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2686
  	BUG_ON(zone_type >= MAX_NR_ZONES);
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2687
  	zone_type++;
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2688
2689
  
  	do {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2690
  		zone_type--;
070f80326   Christoph Lameter   [PATCH] build_zon...
2691
  		zone = pgdat->node_zones + zone_type;
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2692
  		if (populated_zone(zone)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2693
2694
  			zoneref_set_zone(zone,
  				&zonelist->_zonerefs[nr_zones++]);
070f80326   Christoph Lameter   [PATCH] build_zon...
2695
  			check_highest_zone(zone_type);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2696
  		}
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2697

2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2698
  	} while (zone_type);
070f80326   Christoph Lameter   [PATCH] build_zon...
2699
  	return nr_zones;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2700
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
  
  /*
   *  zonelist_order:
   *  0 = automatic detection of better ordering.
   *  1 = order by ([node] distance, -zonetype)
   *  2 = order by (-zonetype, [node] distance)
   *
   *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
   *  the same zonelist. So only NUMA can configure this param.
   */
  #define ZONELIST_ORDER_DEFAULT  0
  #define ZONELIST_ORDER_NODE     1
  #define ZONELIST_ORDER_ZONE     2
  
  /* zonelist order in the kernel.
   * set_zonelist_order() will set this to NODE or ZONE.
   */
  static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
  static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2720
  #ifdef CONFIG_NUMA
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
  /* The value user specified ....changed by config */
  static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  /* string for sysctl */
  #define NUMA_ZONELIST_ORDER_LEN	16
  char numa_zonelist_order[16] = "default";
  
  /*
   * interface for configure zonelist ordering.
   * command line option "numa_zonelist_order"
   *	= "[dD]efault	- default, automatic configuration.
   *	= "[nN]ode 	- order by node locality, then by zone within node
   *	= "[zZ]one      - order by zone, then by locality within zone
   */
  
  static int __parse_numa_zonelist_order(char *s)
  {
  	if (*s == 'd' || *s == 'D') {
  		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  	} else if (*s == 'n' || *s == 'N') {
  		user_zonelist_order = ZONELIST_ORDER_NODE;
  	} else if (*s == 'z' || *s == 'Z') {
  		user_zonelist_order = ZONELIST_ORDER_ZONE;
  	} else {
  		printk(KERN_WARNING
  			"Ignoring invalid numa_zonelist_order value:  "
  			"%s
  ", s);
  		return -EINVAL;
  	}
  	return 0;
  }
  
  static __init int setup_numa_zonelist_order(char *s)
  {
ecb256f81   Volodymyr G. Lukiianyk   mm: set correct n...
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
  	int ret;
  
  	if (!s)
  		return 0;
  
  	ret = __parse_numa_zonelist_order(s);
  	if (ret == 0)
  		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
  
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2765
2766
2767
2768
2769
2770
2771
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
  /*
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2772
  		void __user *buffer, size_t *length,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2773
2774
2775
2776
  		loff_t *ppos)
  {
  	char saved_string[NUMA_ZONELIST_ORDER_LEN];
  	int ret;
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2777
  	static DEFINE_MUTEX(zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2778

443c6f145   Andi Kleen   SYSCTL: Add a mut...
2779
  	mutex_lock(&zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2780
  	if (write)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2781
  		strcpy(saved_string, (char*)table->data);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2782
  	ret = proc_dostring(table, write, buffer, length, ppos);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2783
  	if (ret)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2784
  		goto out;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2785
2786
2787
2788
2789
2790
2791
2792
2793
  	if (write) {
  		int oldval = user_zonelist_order;
  		if (__parse_numa_zonelist_order((char*)table->data)) {
  			/*
  			 * bogus value.  restore saved string
  			 */
  			strncpy((char*)table->data, saved_string,
  				NUMA_ZONELIST_ORDER_LEN);
  			user_zonelist_order = oldval;
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2794
2795
  		} else if (oldval != user_zonelist_order) {
  			mutex_lock(&zonelists_mutex);
1f522509c   Haicheng Li   mem-hotplug: avoi...
2796
  			build_all_zonelists(NULL);
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2797
2798
  			mutex_unlock(&zonelists_mutex);
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2799
  	}
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2800
2801
2802
  out:
  	mutex_unlock(&zl_order_mutex);
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2803
  }
62bc62a87   Christoph Lameter   page allocator: u...
2804
  #define MAX_NODE_LOAD (nr_online_nodes)
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2805
  static int node_load[MAX_NUMNODES];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2806
  /**
4dc3b16ba   Pavel Pisa   [PATCH] DocBook: ...
2807
   * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
   * @node: node whose fallback list we're appending
   * @used_node_mask: nodemask_t of already used nodes
   *
   * We use a number of factors to determine which is the next node that should
   * appear on a given node's fallback list.  The node should not have appeared
   * already in @node's fallback list, and it should be the next closest node
   * according to the distance array (which contains arbitrary distance values
   * from each node to each node in the system), and should also prefer nodes
   * with no CPUs, since presumably they'll have very little allocation pressure
   * on them otherwise.
   * It returns -1 if no node is found.
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2820
  static int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2821
  {
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2822
  	int n, val;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2823
2824
  	int min_val = INT_MAX;
  	int best_node = -1;
a70f73028   Rusty Russell   cpumask: replace ...
2825
  	const struct cpumask *tmp = cpumask_of_node(0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2826

4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2827
2828
2829
2830
2831
  	/* Use the local node if we haven't already */
  	if (!node_isset(node, *used_node_mask)) {
  		node_set(node, *used_node_mask);
  		return node;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2832

37b07e416   Lee Schermerhorn   memoryless nodes:...
2833
  	for_each_node_state(n, N_HIGH_MEMORY) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2834
2835
2836
2837
  
  		/* Don't want a node to appear more than once */
  		if (node_isset(n, *used_node_mask))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2838
2839
  		/* Use the distance array to find the distance */
  		val = node_distance(node, n);
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2840
2841
  		/* Penalize nodes under us ("prefer the next node") */
  		val += (n < node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2842
  		/* Give preference to headless and unused nodes */
a70f73028   Rusty Russell   cpumask: replace ...
2843
2844
  		tmp = cpumask_of_node(n);
  		if (!cpumask_empty(tmp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
  		val += node_load[n];
  
  		if (val < min_val) {
  			min_val = val;
  			best_node = n;
  		}
  	}
  
  	if (best_node >= 0)
  		node_set(best_node, *used_node_mask);
  
  	return best_node;
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2862
2863
2864
2865
2866
2867
2868
  
  /*
   * Build zonelists ordered by node and zones within node.
   * This results in maximum locality--normal zone overflows into local
   * DMA zone, if any--but risks exhausting DMA zone.
   */
  static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2869
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2870
  	int j;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2871
  	struct zonelist *zonelist;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2872

54a6eb5c4   Mel Gorman   mm: use two zonel...
2873
  	zonelist = &pgdat->node_zonelists[0];
dd1a239f6   Mel Gorman   mm: have zonelist...
2874
  	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
54a6eb5c4   Mel Gorman   mm: use two zonel...
2875
2876
2877
  		;
  	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2878
2879
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2880
2881
2882
  }
  
  /*
523b94585   Christoph Lameter   Memoryless nodes:...
2883
2884
2885
2886
   * Build gfp_thisnode zonelists
   */
  static void build_thisnode_zonelists(pg_data_t *pgdat)
  {
523b94585   Christoph Lameter   Memoryless nodes:...
2887
2888
  	int j;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2889
2890
  	zonelist = &pgdat->node_zonelists[1];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2891
2892
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
523b94585   Christoph Lameter   Memoryless nodes:...
2893
2894
2895
  }
  
  /*
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2896
2897
2898
2899
2900
2901
2902
2903
2904
   * Build zonelists ordered by zone and nodes within zones.
   * This results in conserving DMA zone[s] until all Normal memory is
   * exhausted, but results in overflowing to remote node while memory
   * may still exist in local DMA zone.
   */
  static int node_order[MAX_NUMNODES];
  
  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2905
2906
2907
2908
  	int pos, j, node;
  	int zone_type;		/* needs to be signed */
  	struct zone *z;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2909
2910
2911
2912
2913
2914
2915
  	zonelist = &pgdat->node_zonelists[0];
  	pos = 0;
  	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
  		for (j = 0; j < nr_nodes; j++) {
  			node = node_order[j];
  			z = &NODE_DATA(node)->node_zones[zone_type];
  			if (populated_zone(z)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2916
2917
  				zoneref_set_zone(z,
  					&zonelist->_zonerefs[pos++]);
54a6eb5c4   Mel Gorman   mm: use two zonel...
2918
  				check_highest_zone(zone_type);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2919
2920
  			}
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2921
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
2922
2923
  	zonelist->_zonerefs[pos].zone = NULL;
  	zonelist->_zonerefs[pos].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2924
2925
2926
2927
2928
2929
2930
2931
2932
  }
  
  static int default_zonelist_order(void)
  {
  	int nid, zone_type;
  	unsigned long low_kmem_size,total_size;
  	struct zone *z;
  	int average_size;
  	/*
883931612   Thomas Weber   Fix typos in comm...
2933
           * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2934
2935
  	 * If they are really small and used heavily, the system can fall
  	 * into OOM very easily.
e325c90ff   David Rientjes   mm: default to no...
2936
  	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
  	 */
  	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
  	low_kmem_size = 0;
  	total_size = 0;
  	for_each_online_node(nid) {
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
e325c90ff   David Rientjes   mm: default to no...
2948
2949
2950
2951
2952
2953
2954
2955
2956
  			} else if (zone_type == ZONE_NORMAL) {
  				/*
  				 * If any node has only lowmem, then node order
  				 * is preferred to allow kernel allocations
  				 * locally; otherwise, they can easily infringe
  				 * on other nodes when there is an abundance of
  				 * lowmem available to allocate from.
  				 */
  				return ZONELIST_ORDER_NODE;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
  			}
  		}
  	}
  	if (!low_kmem_size ||  /* there are no DMA area. */
  	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
  		return ZONELIST_ORDER_NODE;
  	/*
  	 * look into each node's config.
    	 * If there is a node whose DMA/DMA32 memory is very big area on
   	 * local memory, NODE_ORDER may be suitable.
           */
37b07e416   Lee Schermerhorn   memoryless nodes:...
2968
2969
  	average_size = total_size /
  				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
  	for_each_online_node(nid) {
  		low_kmem_size = 0;
  		total_size = 0;
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
  			}
  		}
  		if (low_kmem_size &&
  		    total_size > average_size && /* ignore small node */
  		    low_kmem_size > total_size * 70/100)
  			return ZONELIST_ORDER_NODE;
  	}
  	return ZONELIST_ORDER_ZONE;
  }
  
  static void set_zonelist_order(void)
  {
  	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
  		current_zonelist_order = default_zonelist_order();
  	else
  		current_zonelist_order = user_zonelist_order;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
  {
  	int j, node, load;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3001
  	nodemask_t used_mask;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3002
3003
3004
  	int local_node, prev_node;
  	struct zonelist *zonelist;
  	int order = current_zonelist_order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3005
3006
  
  	/* initialize zonelists */
523b94585   Christoph Lameter   Memoryless nodes:...
3007
  	for (i = 0; i < MAX_ZONELISTS; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3008
  		zonelist = pgdat->node_zonelists + i;
dd1a239f6   Mel Gorman   mm: have zonelist...
3009
3010
  		zonelist->_zonerefs[0].zone = NULL;
  		zonelist->_zonerefs[0].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3011
3012
3013
3014
  	}
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
62bc62a87   Christoph Lameter   page allocator: u...
3015
  	load = nr_online_nodes;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3016
3017
  	prev_node = local_node;
  	nodes_clear(used_mask);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3018

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3019
3020
  	memset(node_order, 0, sizeof(node_order));
  	j = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3021
  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3022
3023
3024
3025
3026
3027
3028
3029
  		int distance = node_distance(local_node, node);
  
  		/*
  		 * If another node is sufficiently far away then it is better
  		 * to reclaim pages in a zone before going off node.
  		 */
  		if (distance > RECLAIM_DISTANCE)
  			zone_reclaim_mode = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3030
3031
3032
3033
3034
  		/*
  		 * We don't want to pressure a particular node.
  		 * So adding penalty to the first node in same
  		 * distance group to make it round-robin.
  		 */
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
3035
  		if (distance != node_distance(local_node, prev_node))
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3036
  			node_load[node] = load;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3037
3038
  		prev_node = node;
  		load--;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3039
3040
3041
3042
3043
  		if (order == ZONELIST_ORDER_NODE)
  			build_zonelists_in_node_order(pgdat, node);
  		else
  			node_order[j++] = node;	/* remember order */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3044

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3045
3046
3047
  	if (order == ZONELIST_ORDER_ZONE) {
  		/* calculate node order -- i.e., DMA last! */
  		build_zonelists_in_zone_order(pgdat, j);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3048
  	}
523b94585   Christoph Lameter   Memoryless nodes:...
3049
3050
  
  	build_thisnode_zonelists(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3051
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3052
  /* Construct the zonelist performance cache - see further mmzone.h */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3053
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3054
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
3055
3056
  	struct zonelist *zonelist;
  	struct zonelist_cache *zlc;
dd1a239f6   Mel Gorman   mm: have zonelist...
3057
  	struct zoneref *z;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3058

54a6eb5c4   Mel Gorman   mm: use two zonel...
3059
3060
3061
  	zonelist = &pgdat->node_zonelists[0];
  	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
dd1a239f6   Mel Gorman   mm: have zonelist...
3062
3063
  	for (z = zonelist->_zonerefs; z->zone; z++)
  		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3064
  }
7aac78988   Lee Schermerhorn   numa: introduce n...
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
   * I.e., first node id of first zone in arg node's generic zonelist.
   * Used for initializing percpu 'numa_mem', which is used primarily
   * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
   */
  int local_memory_node(int node)
  {
  	struct zone *zone;
  
  	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
  				   gfp_zone(GFP_KERNEL),
  				   NULL,
  				   &zone);
  	return zone->node;
  }
  #endif
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3083

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3084
  #else	/* CONFIG_NUMA */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3085
3086
3087
3088
3089
3090
  static void set_zonelist_order(void)
  {
  	current_zonelist_order = ZONELIST_ORDER_ZONE;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3091
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
3092
  	int node, local_node;
54a6eb5c4   Mel Gorman   mm: use two zonel...
3093
3094
  	enum zone_type j;
  	struct zonelist *zonelist;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3095
3096
  
  	local_node = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3097

54a6eb5c4   Mel Gorman   mm: use two zonel...
3098
3099
  	zonelist = &pgdat->node_zonelists[0];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3100

54a6eb5c4   Mel Gorman   mm: use two zonel...
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
  	/*
  	 * Now we build the zonelist so that it contains the zones
  	 * of all the other nodes.
  	 * We don't want to pressure a particular node, so when
  	 * building the zones for node N, we make sure that the
  	 * zones coming right after the local ones are those from
  	 * node N+1 (modulo N)
  	 */
  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3114
  	}
54a6eb5c4   Mel Gorman   mm: use two zonel...
3115
3116
3117
3118
3119
3120
  	for (node = 0; node < local_node; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
3121
3122
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3123
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3124
  /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3125
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3126
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
3127
  	pgdat->node_zonelists[0].zlcache_ptr = NULL;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3128
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3129
  #endif	/* CONFIG_NUMA */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
  /*
   * Boot pageset table. One per cpu which is going to be used for all
   * zones and all nodes. The parameters will be set in such a way
   * that an item put on a list will immediately be handed over to
   * the buddy list. This is safe since pageset manipulation is done
   * with interrupts disabled.
   *
   * The boot_pagesets must be kept even after bootup is complete for
   * unused processors and/or zones. They do play a role for bootstrapping
   * hotplugged processors.
   *
   * zoneinfo_show() and maybe other functions do
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
   */
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
1f522509c   Haicheng Li   mem-hotplug: avoi...
3147
  static void setup_zone_pageset(struct zone *zone);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3148

4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3149
3150
3151
3152
3153
  /*
   * Global mutex to protect against size modification of zonelists
   * as well as to serialize pageset setup for the new populated zone.
   */
  DEFINE_MUTEX(zonelists_mutex);
9b1a4d383   Rusty Russell   stop_machine: Wea...
3154
  /* return values int ....just for stop_machine() */
1f522509c   Haicheng Li   mem-hotplug: avoi...
3155
  static __init_refok int __build_all_zonelists(void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3156
  {
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3157
  	int nid;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3158
  	int cpu;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3159

7f9cfb310   Bo Liu   mm: build_zonelis...
3160
3161
3162
  #ifdef CONFIG_NUMA
  	memset(node_load, 0, sizeof(node_load));
  #endif
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3163
  	for_each_online_node(nid) {
7ea1530ab   Christoph Lameter   Memoryless nodes:...
3164
3165
3166
3167
  		pg_data_t *pgdat = NODE_DATA(nid);
  
  		build_zonelists(pgdat);
  		build_zonelist_cache(pgdat);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3168
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
  
  	/*
  	 * Initialize the boot_pagesets that are going to be used
  	 * for bootstrapping processors. The real pagesets for
  	 * each zone will be allocated later when the per cpu
  	 * allocator is available.
  	 *
  	 * boot_pagesets are used also for bootstrapping offline
  	 * cpus if the system is already booted because the pagesets
  	 * are needed to initialize allocators on a specific cpu too.
  	 * F.e. the percpu allocator needs the page allocator which
  	 * needs the percpu allocator in order to allocate its pagesets
  	 * (a chicken-egg dilemma).
  	 */
7aac78988   Lee Schermerhorn   numa: introduce n...
3183
  	for_each_possible_cpu(cpu) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3184
  		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
7aac78988   Lee Schermerhorn   numa: introduce n...
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  		/*
  		 * We now know the "local memory node" for each node--
  		 * i.e., the node of the first zone in the generic zonelist.
  		 * Set up numa_mem percpu variable for on-line cpus.  During
  		 * boot, only the boot cpu should be on-line;  we'll init the
  		 * secondary cpus' numa_mem as they come on-line.  During
  		 * node/memory hotplug, we'll fixup all on-line cpus.
  		 */
  		if (cpu_online(cpu))
  			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
  #endif
  	}
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3198
3199
  	return 0;
  }
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3200
3201
3202
3203
  /*
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
9f6ae448b   Paul Mundt   mm/page_alloc.c: ...
3204
  void __ref build_all_zonelists(void *data)
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3205
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3206
  	set_zonelist_order();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3207
  	if (system_state == SYSTEM_BOOTING) {
423b41d77   Randy Dunlap   [PATCH] mm/page_a...
3208
  		__build_all_zonelists(NULL);
68ad8df42   Mel Gorman   mm: print out the...
3209
  		mminit_verify_zonelist();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3210
3211
  		cpuset_init_current_mems_allowed();
  	} else {
183ff22bb   Simon Arlott   spelling fixes: mm/
3212
  		/* we have to stop all cpus to guarantee there is no user
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3213
  		   of zonelist */
e9959f0f3   KAMEZAWA Hiroyuki   mm/page_alloc.c: ...
3214
3215
3216
3217
3218
  #ifdef CONFIG_MEMORY_HOTPLUG
  		if (data)
  			setup_zone_pageset((struct zone *)data);
  #endif
  		stop_machine(__build_all_zonelists, NULL, NULL);
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3219
3220
  		/* cpuset refresh routine should be here */
  	}
bd1e22b8e   Andrew Morton   [PATCH] initialis...
3221
  	vm_total_pages = nr_free_pagecache_pages();
9ef9acb05   Mel Gorman   Do not group page...
3222
3223
3224
3225
3226
3227
3228
  	/*
  	 * Disable grouping by mobility if the number of pages in the
  	 * system is too low to allow the mechanism to work. It would be
  	 * more accurate, but expensive to check per-zone. This check is
  	 * made on memory-hotadd so a system can start with mobility
  	 * disabled and enable it later
  	 */
d9c234005   Mel Gorman   Do not depend on ...
3229
  	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb05   Mel Gorman   Do not group page...
3230
3231
3232
3233
3234
3235
3236
  		page_group_by_mobility_disabled = 1;
  	else
  		page_group_by_mobility_disabled = 0;
  
  	printk("Built %i zonelists in %s order, mobility grouping %s.  "
  		"Total pages: %ld
  ",
62bc62a87   Christoph Lameter   page allocator: u...
3237
  			nr_online_nodes,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3238
  			zonelist_order_name[current_zonelist_order],
9ef9acb05   Mel Gorman   Do not group page...
3239
  			page_group_by_mobility_disabled ? "off" : "on",
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3240
3241
3242
3243
3244
  			vm_total_pages);
  #ifdef CONFIG_NUMA
  	printk("Policy zone: %s
  ", zone_names[policy_zone]);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
  }
  
  /*
   * Helper functions to size the waitqueue hash table.
   * Essentially these want to choose hash table sizes sufficiently
   * large so that collisions trying to wait on pages are rare.
   * But in fact, the number of active page waitqueues on typical
   * systems is ridiculously low, less than 200. So this is even
   * conservative, even though it seems large.
   *
   * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
   * waitqueues, i.e. the size of the waitq table given the number of pages.
   */
  #define PAGES_PER_WAITQUEUE	256
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3259
  #ifndef CONFIG_MEMORY_HOTPLUG
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3260
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
  {
  	unsigned long size = 1;
  
  	pages /= PAGES_PER_WAITQUEUE;
  
  	while (size < pages)
  		size <<= 1;
  
  	/*
  	 * Once we have dozens or even hundreds of threads sleeping
  	 * on IO we've got bigger problems than wait queue collision.
  	 * Limit the size of the wait table to a reasonable size.
  	 */
  	size = min(size, 4096UL);
  
  	return max(size, 4UL);
  }
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
  #else
  /*
   * A zone's size might be changed by hot-add, so it is not possible to determine
   * a suitable size for its wait_table.  So we use the maximum size now.
   *
   * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
   *
   *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
   *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
   *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
   *
   * The maximum entries are prepared when a zone's memory is (512K + 256) pages
   * or more by the traditional way. (See above).  It equals:
   *
   *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
   *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
   *    powerpc (64K page size)             : =  (32G +16M)byte.
   */
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
  	return 4096UL;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
  
  /*
   * This is an integer logarithm so that shifts can be used later
   * to extract the more random high bits from the multiplicative
   * hash function before the remainder is taken.
   */
  static inline unsigned long wait_table_bits(unsigned long size)
  {
  	return ffz(~size);
  }
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
56fd56b86   Mel Gorman   Bias the location...
3313
  /*
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
   * Check if a pageblock contains reserved pages
   */
  static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
  {
  	unsigned long pfn;
  
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
  		if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
  			return 1;
  	}
  	return 0;
  }
  
  /*
d9c234005   Mel Gorman   Do not depend on ...
3328
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
418589663   Mel Gorman   page allocator: u...
3329
3330
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
   * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
56fd56b86   Mel Gorman   Bias the location...
3331
3332
3333
3334
3335
   * higher will lead to a bigger reserve which will get freed as contiguous
   * blocks as reclaim kicks in
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3336
  	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
56fd56b86   Mel Gorman   Bias the location...
3337
  	struct page *page;
78986a678   Mel Gorman   page-allocator: l...
3338
3339
  	unsigned long block_migratetype;
  	int reserve;
56fd56b86   Mel Gorman   Bias the location...
3340

d02156388   Michal Hocko   mm: Ensure that p...
3341
3342
3343
3344
3345
3346
  	/*
  	 * Get the start pfn, end pfn and the number of blocks to reserve
  	 * We have to be careful to be aligned to pageblock_nr_pages to
  	 * make sure that we always check pfn_valid for the first page in
  	 * the block.
  	 */
56fd56b86   Mel Gorman   Bias the location...
3347
3348
  	start_pfn = zone->zone_start_pfn;
  	end_pfn = start_pfn + zone->spanned_pages;
d02156388   Michal Hocko   mm: Ensure that p...
3349
  	start_pfn = roundup(start_pfn, pageblock_nr_pages);
418589663   Mel Gorman   page allocator: u...
3350
  	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
d9c234005   Mel Gorman   Do not depend on ...
3351
  							pageblock_order;
56fd56b86   Mel Gorman   Bias the location...
3352

78986a678   Mel Gorman   page-allocator: l...
3353
3354
3355
3356
3357
3358
3359
3360
  	/*
  	 * Reserve blocks are generally in place to help high-order atomic
  	 * allocations that are short-lived. A min_free_kbytes value that
  	 * would result in more than 2 reserve blocks for atomic allocations
  	 * is assumed to be in place to help anti-fragmentation for the
  	 * future allocation of hugepages at runtime.
  	 */
  	reserve = min(2, reserve);
d9c234005   Mel Gorman   Do not depend on ...
3361
  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
56fd56b86   Mel Gorman   Bias the location...
3362
3363
3364
  		if (!pfn_valid(pfn))
  			continue;
  		page = pfn_to_page(pfn);
344c790e3   Adam Litke   mm: make setup_zo...
3365
3366
3367
  		/* Watch out for overlapping nodes */
  		if (page_to_nid(page) != zone_to_nid(zone))
  			continue;
56fd56b86   Mel Gorman   Bias the location...
3368
  		block_migratetype = get_pageblock_migratetype(page);
938929f14   Mel Gorman   mm: reduce the am...
3369
3370
3371
3372
3373
3374
3375
3376
3377
  		/* Only test what is necessary when the reserves are not met */
  		if (reserve > 0) {
  			/*
  			 * Blocks with reserved pages will never free, skip
  			 * them.
  			 */
  			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
  			if (pageblock_is_reserved(pfn, block_end_pfn))
  				continue;
56fd56b86   Mel Gorman   Bias the location...
3378

938929f14   Mel Gorman   mm: reduce the am...
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
  			/* If this block is reserved, account for it */
  			if (block_migratetype == MIGRATE_RESERVE) {
  				reserve--;
  				continue;
  			}
  
  			/* Suitable for reserving if this block is movable */
  			if (block_migratetype == MIGRATE_MOVABLE) {
  				set_pageblock_migratetype(page,
  							MIGRATE_RESERVE);
  				move_freepages_block(zone, page,
  							MIGRATE_RESERVE);
  				reserve--;
  				continue;
  			}
56fd56b86   Mel Gorman   Bias the location...
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
  		}
  
  		/*
  		 * If the reserve is met and this is a previous reserved block,
  		 * take it back
  		 */
  		if (block_migratetype == MIGRATE_RESERVE) {
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  			move_freepages_block(zone, page, MIGRATE_MOVABLE);
  		}
  	}
  }
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
3406

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3407
3408
3409
3410
3411
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
   * done. Non-atomic initialization, single-pass.
   */
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3412
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3413
  		unsigned long start_pfn, enum memmap_context context)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3414
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3415
  	struct page *page;
29751f699   Andy Whitcroft   [PATCH] sparsemem...
3416
3417
  	unsigned long end_pfn = start_pfn + size;
  	unsigned long pfn;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3418
  	struct zone *z;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3419

22b31eec6   Hugh Dickins   badpage: vm_norma...
3420
3421
  	if (highest_memmap_pfn < end_pfn - 1)
  		highest_memmap_pfn = end_pfn - 1;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3422
  	z = &NODE_DATA(nid)->node_zones[zone];
cbe8dd4af   Greg Ungerer   [PATCH] memmap_in...
3423
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
  		/*
  		 * There can be holes in boot-time mem_map[]s
  		 * handed to this function.  They do not
  		 * exist on hotplugged memory.
  		 */
  		if (context == MEMMAP_EARLY) {
  			if (!early_pfn_valid(pfn))
  				continue;
  			if (!early_pfn_in_nid(pfn, nid))
  				continue;
  		}
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3435
3436
  		page = pfn_to_page(pfn);
  		set_page_links(page, zone, nid, pfn);
708614e61   Mel Gorman   mm: verify the pa...
3437
  		mminit_verify_page_links(page, zone, nid, pfn);
7835e98b2   Nick Piggin   [PATCH] remove se...
3438
  		init_page_count(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3439
3440
  		reset_page_mapcount(page);
  		SetPageReserved(page);
b2a0ac887   Mel Gorman   Split the free li...
3441
3442
3443
3444
3445
  		/*
  		 * Mark the block movable so that blocks are reserved for
  		 * movable at startup. This will force kernel allocations
  		 * to reserve their blocks rather than leaking throughout
  		 * the address space during boot when many long-lived
56fd56b86   Mel Gorman   Bias the location...
3446
3447
3448
  		 * kernel allocations are made. Later some blocks near
  		 * the start are marked MIGRATE_RESERVE by
  		 * setup_zone_migrate_reserve()
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3449
3450
3451
3452
3453
  		 *
  		 * bitmap is created for zone's valid pfn range. but memmap
  		 * can be created for invalid pages (for alignment)
  		 * check here not to call set_pageblock_migratetype() against
  		 * pfn out of zone.
b2a0ac887   Mel Gorman   Split the free li...
3454
  		 */
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3455
3456
3457
  		if ((z->zone_start_pfn <= pfn)
  		    && (pfn < z->zone_start_pfn + z->spanned_pages)
  		    && !(pfn & (pageblock_nr_pages - 1)))
56fd56b86   Mel Gorman   Bias the location...
3458
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
b2a0ac887   Mel Gorman   Split the free li...
3459

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3460
3461
3462
3463
  		INIT_LIST_HEAD(&page->lru);
  #ifdef WANT_PAGE_VIRTUAL
  		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
  		if (!is_highmem_idx(zone))
3212c6be2   Bob Picco   [PATCH] fix WANT_...
3464
  			set_page_address(page, __va(pfn << PAGE_SHIFT));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3465
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3466
3467
  	}
  }
1e548deb5   Andi Kleen   page allocator: r...
3468
  static void __meminit zone_init_free_lists(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3469
  {
b2a0ac887   Mel Gorman   Split the free li...
3470
3471
3472
  	int order, t;
  	for_each_migratetype_order(order, t) {
  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3473
3474
3475
3476
3477
3478
  		zone->free_area[order].nr_free = 0;
  	}
  }
  
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3479
  	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3480
  #endif
1d6f4e60e   Sam Ravnborg   mm: fix section m...
3481
  static int zone_batchsize(struct zone *zone)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3482
  {
3a6be87fd   David Howells   nommu: clamp zone...
3483
  #ifdef CONFIG_MMU
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3484
3485
3486
3487
  	int batch;
  
  	/*
  	 * The per-cpu-pages pools are set to around 1000th of the
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3488
  	 * size of the zone.  But no more than 1/2 of a meg.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3489
3490
3491
3492
  	 *
  	 * OK, so we don't know how big the cache is.  So guess.
  	 */
  	batch = zone->present_pages / 1024;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3493
3494
  	if (batch * PAGE_SIZE > 512 * 1024)
  		batch = (512 * 1024) / PAGE_SIZE;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3495
3496
3497
3498
3499
  	batch /= 4;		/* We effectively *= 4 below */
  	if (batch < 1)
  		batch = 1;
  
  	/*
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3500
3501
3502
  	 * Clamp the batch to a 2^n - 1 value. Having a power
  	 * of 2 value was found to be more likely to have
  	 * suboptimal cache aliasing properties in some cases.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3503
  	 *
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3504
3505
3506
3507
  	 * For example if 2 tasks are alternately allocating
  	 * batches of pages, one task can end up with a lot
  	 * of pages of one half of the possible page colors
  	 * and the other with pages of the other colors.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3508
  	 */
9155203a5   David Howells   mm: use roundown_...
3509
  	batch = rounddown_pow_of_two(batch + batch/2) - 1;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3510

e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3511
  	return batch;
3a6be87fd   David Howells   nommu: clamp zone...
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
  
  #else
  	/* The deferral and batching of frees should be suppressed under NOMMU
  	 * conditions.
  	 *
  	 * The problem is that NOMMU needs to be able to allocate large chunks
  	 * of contiguous memory as there's no hardware page translation to
  	 * assemble apparent contiguous memory from discontiguous pages.
  	 *
  	 * Queueing large contiguous runs of pages for batching, however,
  	 * causes the pages to actually be freed in smaller chunks.  As there
  	 * can be a significant delay between the individual batches being
  	 * recycled, this leads to the once large chunks of space being
  	 * fragmented and becoming unavailable for high-order allocations.
  	 */
  	return 0;
  #endif
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3529
  }
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3530
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3531
3532
  {
  	struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
3533
  	int migratetype;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3534

1c6fe9465   Magnus Damm   [PATCH] NUMA: bro...
3535
  	memset(p, 0, sizeof(*p));
3dfa5721f   Christoph Lameter   Page allocator: g...
3536
  	pcp = &p->pcp;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3537
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3538
3539
  	pcp->high = 6 * batch;
  	pcp->batch = max(1UL, 1 * batch);
5f8dcc212   Mel Gorman   page-allocator: s...
3540
3541
  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3542
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3543
3544
3545
3546
3547
3548
3549
3550
3551
  /*
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
  
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  				unsigned long high)
  {
  	struct per_cpu_pages *pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
3552
  	pcp = &p->pcp;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3553
3554
3555
3556
3557
  	pcp->high = high;
  	pcp->batch = max(1UL, high/4);
  	if ((high/4) > (PAGE_SHIFT * 8))
  		pcp->batch = PAGE_SHIFT * 8;
  }
58c2ee400   Nikanth Karthikesan   mm: Fix section m...
3558
  static void setup_zone_pageset(struct zone *zone)
319774e25   Wu Fengguang   mem-hotplug: sepa...
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
  {
  	int cpu;
  
  	zone->pageset = alloc_percpu(struct per_cpu_pageset);
  
  	for_each_possible_cpu(cpu) {
  		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
  
  		setup_pageset(pcp, zone_batchsize(zone));
  
  		if (percpu_pagelist_fraction)
  			setup_pagelist_highmark(pcp,
  				(zone->present_pages /
  					percpu_pagelist_fraction));
  	}
  }
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3575
  /*
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3576
3577
   * Allocate per cpu pagesets and initialize them.
   * Before this call only boot pagesets were available.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3578
   */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3579
  void __init setup_per_cpu_pageset(void)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3580
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3581
  	struct zone *zone;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3582

319774e25   Wu Fengguang   mem-hotplug: sepa...
3583
3584
  	for_each_populated_zone(zone)
  		setup_zone_pageset(zone);
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3585
  }
577a32f62   Sam Ravnborg   mm: fix section m...
3586
  static noinline __init_refok
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3587
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3588
3589
3590
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3591
  	size_t alloc_size;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3592
3593
3594
3595
3596
  
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3597
3598
3599
3600
  	zone->wait_table_hash_nr_entries =
  		 wait_table_hash_nr_entries(zone_size_pages);
  	zone->wait_table_bits =
  		wait_table_bits(zone->wait_table_hash_nr_entries);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3601
3602
  	alloc_size = zone->wait_table_hash_nr_entries
  					* sizeof(wait_queue_head_t);
cd94b9dbf   Heiko Carstens   memory hotplug: f...
3603
  	if (!slab_is_available()) {
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3604
  		zone->wait_table = (wait_queue_head_t *)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
3605
  			alloc_bootmem_node_nopanic(pgdat, alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
  	} else {
  		/*
  		 * This case means that a zone whose size was 0 gets new memory
  		 * via memory hot-add.
  		 * But it may be the case that a new node was hot-added.  In
  		 * this case vmalloc() will not be able to use this new node's
  		 * memory - this wait_table must be initialized to use this new
  		 * node itself as well.
  		 * To use this new node's memory, further consideration will be
  		 * necessary.
  		 */
8691f3a72   Jesper Juhl   mm: no need to ca...
3617
  		zone->wait_table = vmalloc(alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3618
3619
3620
  	}
  	if (!zone->wait_table)
  		return -ENOMEM;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3621

02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3622
  	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3623
  		init_waitqueue_head(zone->wait_table + i);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3624
3625
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3626
  }
112067f09   Shaohua Li   memory hotplug: u...
3627
3628
3629
3630
3631
  static int __zone_pcp_update(void *data)
  {
  	struct zone *zone = data;
  	int cpu;
  	unsigned long batch = zone_batchsize(zone), flags;
2d30a1f63   Thomas Gleixner   mm: do not iterat...
3632
  	for_each_possible_cpu(cpu) {
112067f09   Shaohua Li   memory hotplug: u...
3633
3634
  		struct per_cpu_pageset *pset;
  		struct per_cpu_pages *pcp;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3635
  		pset = per_cpu_ptr(zone->pageset, cpu);
112067f09   Shaohua Li   memory hotplug: u...
3636
3637
3638
  		pcp = &pset->pcp;
  
  		local_irq_save(flags);
5f8dcc212   Mel Gorman   page-allocator: s...
3639
  		free_pcppages_bulk(zone, pcp->count, pcp);
112067f09   Shaohua Li   memory hotplug: u...
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
  		setup_pageset(pset, batch);
  		local_irq_restore(flags);
  	}
  	return 0;
  }
  
  void zone_pcp_update(struct zone *zone)
  {
  	stop_machine(__zone_pcp_update, zone, NULL);
  }
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3650
  static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3651
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3652
3653
3654
3655
3656
3657
  	/*
  	 * per cpu subsystem is not up at this point. The following code
  	 * relies on the ability of the linker to provide the
  	 * offset of a (static) per cpu variable into the per cpu area.
  	 */
  	zone->pageset = &boot_pageset;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3658

f5335c0f1   Anton Blanchard   [PATCH] quieten z...
3659
  	if (zone->present_pages)
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3660
3661
3662
3663
  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u
  ",
  			zone->name, zone->present_pages,
  					 zone_batchsize(zone));
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3664
  }
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3665
3666
  __meminit int init_currently_empty_zone(struct zone *zone,
  					unsigned long zone_start_pfn,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3667
3668
  					unsigned long size,
  					enum memmap_context context)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3669
3670
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3671
3672
3673
3674
  	int ret;
  	ret = zone_wait_table_init(zone, size);
  	if (ret)
  		return ret;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3675
  	pgdat->nr_zones = zone_idx(zone) + 1;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3676
  	zone->zone_start_pfn = zone_start_pfn;
708614e61   Mel Gorman   mm: verify the pa...
3677
3678
3679
3680
3681
3682
  	mminit_dprintk(MMINIT_TRACE, "memmap_init",
  			"Initialising map node %d zone %lu pfns %lu -> %lu
  ",
  			pgdat->node_id,
  			(unsigned long)zone_idx(zone),
  			zone_start_pfn, (zone_start_pfn + size));
1e548deb5   Andi Kleen   page allocator: r...
3683
  	zone_init_free_lists(zone);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3684
3685
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3686
  }
0ee332c14   Tejun Heo   memblock: Kill ea...
3687
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
c713216de   Mel Gorman   [PATCH] Introduce...
3688
3689
3690
3691
3692
3693
3694
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   * Architectures may implement their own version but if add_active_range()
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3695
  int __meminit __early_pfn_to_nid(unsigned long pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
3696
  {
c13291a53   Tejun Heo   bootmem: Use for_...
3697
3698
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
c713216de   Mel Gorman   [PATCH] Introduce...
3699

c13291a53   Tejun Heo   bootmem: Use for_...
3700
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
c713216de   Mel Gorman   [PATCH] Introduce...
3701
  		if (start_pfn <= pfn && pfn < end_pfn)
c13291a53   Tejun Heo   bootmem: Use for_...
3702
  			return nid;
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3703
3704
  	/* This is a memory hole */
  	return -1;
c713216de   Mel Gorman   [PATCH] Introduce...
3705
3706
  }
  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3707
3708
  int __meminit early_pfn_to_nid(unsigned long pfn)
  {
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3709
3710
3711
3712
3713
3714
3715
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0)
  		return nid;
  	/* just returns 0 */
  	return 0;
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3716
  }
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
  #ifdef CONFIG_NODES_SPAN_OTHER_NODES
  bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  {
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0 && nid != node)
  		return false;
  	return true;
  }
  #endif
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3728

c713216de   Mel Gorman   [PATCH] Introduce...
3729
3730
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3731
3732
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
c713216de   Mel Gorman   [PATCH] Introduce...
3733
3734
3735
3736
3737
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
   * this function may be used instead of calling free_bootmem() manually.
   */
c13291a53   Tejun Heo   bootmem: Use for_...
3738
  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
cc2898943   Yinghai Lu   mm: Move early_no...
3739
  {
c13291a53   Tejun Heo   bootmem: Use for_...
3740
3741
  	unsigned long start_pfn, end_pfn;
  	int i, this_nid;
edbe7d23b   Yinghai Lu   memblock: Add fin...
3742

c13291a53   Tejun Heo   bootmem: Use for_...
3743
3744
3745
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
  		start_pfn = min(start_pfn, max_low_pfn);
  		end_pfn = min(end_pfn, max_low_pfn);
edbe7d23b   Yinghai Lu   memblock: Add fin...
3746

c13291a53   Tejun Heo   bootmem: Use for_...
3747
3748
3749
3750
  		if (start_pfn < end_pfn)
  			free_bootmem_node(NODE_DATA(this_nid),
  					  PFN_PHYS(start_pfn),
  					  (end_pfn - start_pfn) << PAGE_SHIFT);
edbe7d23b   Yinghai Lu   memblock: Add fin...
3751
  	}
edbe7d23b   Yinghai Lu   memblock: Add fin...
3752
  }
edbe7d23b   Yinghai Lu   memblock: Add fin...
3753

08677214e   Yinghai Lu   x86: Make 64 bit ...
3754
3755
3756
  int __init add_from_early_node_map(struct range *range, int az,
  				   int nr_range, int nid)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
3757
  	unsigned long start_pfn, end_pfn;
08677214e   Yinghai Lu   x86: Make 64 bit ...
3758
  	int i;
08677214e   Yinghai Lu   x86: Make 64 bit ...
3759
3760
  
  	/* need to go over early_node_map to find out good range for node */
c13291a53   Tejun Heo   bootmem: Use for_...
3761
3762
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
  		nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
08677214e   Yinghai Lu   x86: Make 64 bit ...
3763
3764
  	return nr_range;
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3765
3766
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3767
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
c713216de   Mel Gorman   [PATCH] Introduce...
3768
3769
3770
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3771
   * function may be used instead of calling memory_present() manually.
c713216de   Mel Gorman   [PATCH] Introduce...
3772
3773
3774
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
3775
3776
  	unsigned long start_pfn, end_pfn;
  	int i, this_nid;
c713216de   Mel Gorman   [PATCH] Introduce...
3777

c13291a53   Tejun Heo   bootmem: Use for_...
3778
3779
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
  		memory_present(this_nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3780
3781
3782
3783
  }
  
  /**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3784
3785
3786
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216de   Mel Gorman   [PATCH] Introduce...
3787
3788
3789
3790
   *
   * It returns the start and end page frame of a node based on information
   * provided by an arch calling add_active_range(). If called for a node
   * with no available memory, a warning is printed and the start and end
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3791
   * PFNs will be 0.
c713216de   Mel Gorman   [PATCH] Introduce...
3792
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3793
  void __meminit get_pfn_range_for_nid(unsigned int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3794
3795
  			unsigned long *start_pfn, unsigned long *end_pfn)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
3796
  	unsigned long this_start_pfn, this_end_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
3797
  	int i;
c13291a53   Tejun Heo   bootmem: Use for_...
3798

c713216de   Mel Gorman   [PATCH] Introduce...
3799
3800
  	*start_pfn = -1UL;
  	*end_pfn = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
3801
3802
3803
  	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
  		*start_pfn = min(*start_pfn, this_start_pfn);
  		*end_pfn = max(*end_pfn, this_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3804
  	}
633c0666b   Christoph Lameter   Memoryless nodes:...
3805
  	if (*start_pfn == -1UL)
c713216de   Mel Gorman   [PATCH] Introduce...
3806
  		*start_pfn = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
3807
3808
3809
  }
  
  /*
2a1e274ac   Mel Gorman   Create the ZONE_M...
3810
3811
3812
3813
   * This finds a zone that can be used for ZONE_MOVABLE pages. The
   * assumption is made that zones within a node are ordered in monotonic
   * increasing memory addresses so that the "highest" populated zone is used
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3814
  static void __init find_usable_zone_for_movable(void)
2a1e274ac   Mel Gorman   Create the ZONE_M...
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
  {
  	int zone_index;
  	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
  		if (zone_index == ZONE_MOVABLE)
  			continue;
  
  		if (arch_zone_highest_possible_pfn[zone_index] >
  				arch_zone_lowest_possible_pfn[zone_index])
  			break;
  	}
  
  	VM_BUG_ON(zone_index == -1);
  	movable_zone = zone_index;
  }
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
25985edce   Lucas De Marchi   Fix common misspe...
3832
   * because it is sized independent of architecture. Unlike the other zones,
2a1e274ac   Mel Gorman   Create the ZONE_M...
3833
3834
3835
3836
3837
3838
3839
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
   * provided by the architecture for a given node by using the end of the
   * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
   * zones within a node are in order of monotonic increases memory addresses
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3840
  static void __meminit adjust_zone_range_for_zone_movable(int nid,
2a1e274ac   Mel Gorman   Create the ZONE_M...
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
  					unsigned long zone_type,
  					unsigned long node_start_pfn,
  					unsigned long node_end_pfn,
  					unsigned long *zone_start_pfn,
  					unsigned long *zone_end_pfn)
  {
  	/* Only adjust if ZONE_MOVABLE is on this node */
  	if (zone_movable_pfn[nid]) {
  		/* Size ZONE_MOVABLE */
  		if (zone_type == ZONE_MOVABLE) {
  			*zone_start_pfn = zone_movable_pfn[nid];
  			*zone_end_pfn = min(node_end_pfn,
  				arch_zone_highest_possible_pfn[movable_zone]);
  
  		/* Adjust for ZONE_MOVABLE starting within this range */
  		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
  				*zone_end_pfn > zone_movable_pfn[nid]) {
  			*zone_end_pfn = zone_movable_pfn[nid];
  
  		/* Check if this whole range is within ZONE_MOVABLE */
  		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
  			*zone_start_pfn = *zone_end_pfn;
  	}
  }
  
  /*
c713216de   Mel Gorman   [PATCH] Introduce...
3867
3868
3869
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
6ea6e6887   Paul Mundt   mm: more __memini...
3870
  static unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	/* Get the start and end of the node and zone */
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2a1e274ac   Mel Gorman   Create the ZONE_M...
3881
3882
3883
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  				node_start_pfn, node_end_pfn,
  				&zone_start_pfn, &zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
  
  	/* Check that this node has pages within the zone's required range */
  	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
  		return 0;
  
  	/* Move the zone boundaries inside the node if necessary */
  	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
  	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
  
  	/* Return the spanned pages */
  	return zone_end_pfn - zone_start_pfn;
  }
  
  /*
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3899
   * then all holes in the requested range will be accounted for.
c713216de   Mel Gorman   [PATCH] Introduce...
3900
   */
329962503   Yinghai Lu   x86: Fix checking...
3901
  unsigned long __meminit __absent_pages_in_range(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3902
3903
3904
  				unsigned long range_start_pfn,
  				unsigned long range_end_pfn)
  {
96e907d13   Tejun Heo   bootmem: Reimplem...
3905
3906
3907
  	unsigned long nr_absent = range_end_pfn - range_start_pfn;
  	unsigned long start_pfn, end_pfn;
  	int i;
c713216de   Mel Gorman   [PATCH] Introduce...
3908

96e907d13   Tejun Heo   bootmem: Reimplem...
3909
3910
3911
3912
  	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
  		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
  		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
  		nr_absent -= end_pfn - start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
3913
  	}
96e907d13   Tejun Heo   bootmem: Reimplem...
3914
  	return nr_absent;
c713216de   Mel Gorman   [PATCH] Introduce...
3915
3916
3917
3918
3919
3920
3921
  }
  
  /**
   * absent_pages_in_range - Return number of page frames in holes within a range
   * @start_pfn: The start PFN to start searching for holes
   * @end_pfn: The end PFN to stop searching for holes
   *
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3922
   * It returns the number of pages frames in memory holes within a range.
c713216de   Mel Gorman   [PATCH] Introduce...
3923
3924
3925
3926
3927
3928
3929
3930
   */
  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  							unsigned long end_pfn)
  {
  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
  }
  
  /* Return the number of page frames in holes in a zone on a node */
6ea6e6887   Paul Mundt   mm: more __memini...
3931
  static unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3932
3933
3934
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
96e907d13   Tejun Heo   bootmem: Reimplem...
3935
3936
  	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
  	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
9c7cd6877   Mel Gorman   [PATCH] Account f...
3937
3938
3939
3940
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
96e907d13   Tejun Heo   bootmem: Reimplem...
3941
3942
  	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
  	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3943

2a1e274ac   Mel Gorman   Create the ZONE_M...
3944
3945
3946
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  			node_start_pfn, node_end_pfn,
  			&zone_start_pfn, &zone_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3947
  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3948
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3949

0ee332c14   Tejun Heo   memblock: Kill ea...
3950
  #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6ea6e6887   Paul Mundt   mm: more __memini...
3951
  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3952
3953
3954
3955
3956
  					unsigned long zone_type,
  					unsigned long *zones_size)
  {
  	return zones_size[zone_type];
  }
6ea6e6887   Paul Mundt   mm: more __memini...
3957
  static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3958
3959
3960
3961
3962
3963
3964
3965
  						unsigned long zone_type,
  						unsigned long *zholes_size)
  {
  	if (!zholes_size)
  		return 0;
  
  	return zholes_size[zone_type];
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3966

0ee332c14   Tejun Heo   memblock: Kill ea...
3967
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
3968

a3142c8e1   Yasunori Goto   Fix section misma...
3969
  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
c713216de   Mel Gorman   [PATCH] Introduce...
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
  	unsigned long realtotalpages, totalpages = 0;
  	enum zone_type i;
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
  								zones_size);
  	pgdat->node_spanned_pages = totalpages;
  
  	realtotalpages = totalpages;
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		realtotalpages -=
  			zone_absent_pages_in_node(pgdat->node_id, i,
  								zholes_size);
  	pgdat->node_present_pages = realtotalpages;
  	printk(KERN_DEBUG "On node %d totalpages: %lu
  ", pgdat->node_id,
  							realtotalpages);
  }
835c134ec   Mel Gorman   Add a bitmap that...
3990
3991
3992
  #ifndef CONFIG_SPARSEMEM
  /*
   * Calculate the size of the zone->blockflags rounded to an unsigned long
d9c234005   Mel Gorman   Do not depend on ...
3993
3994
   * Start by making sure zonesize is a multiple of pageblock_order by rounding
   * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
835c134ec   Mel Gorman   Add a bitmap that...
3995
3996
3997
3998
3999
4000
   * round what is now in bits to nearest long in bits, then return it in
   * bytes.
   */
  static unsigned long __init usemap_size(unsigned long zonesize)
  {
  	unsigned long usemapsize;
d9c234005   Mel Gorman   Do not depend on ...
4001
4002
  	usemapsize = roundup(zonesize, pageblock_nr_pages);
  	usemapsize = usemapsize >> pageblock_order;
835c134ec   Mel Gorman   Add a bitmap that...
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
  	usemapsize *= NR_PAGEBLOCK_BITS;
  	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
  
  	return usemapsize / 8;
  }
  
  static void __init setup_usemap(struct pglist_data *pgdat,
  				struct zone *zone, unsigned long zonesize)
  {
  	unsigned long usemapsize = usemap_size(zonesize);
  	zone->pageblock_flags = NULL;
58a01a457   Julia Lawall   mm/page_alloc.c: ...
4014
  	if (usemapsize)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4015
4016
  		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
  								   usemapsize);
835c134ec   Mel Gorman   Add a bitmap that...
4017
4018
  }
  #else
fa9f90be7   Jesper Juhl   Kill off a bunch ...
4019
  static inline void setup_usemap(struct pglist_data *pgdat,
835c134ec   Mel Gorman   Add a bitmap that...
4020
4021
  				struct zone *zone, unsigned long zonesize) {}
  #endif /* CONFIG_SPARSEMEM */
d9c234005   Mel Gorman   Do not depend on ...
4022
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
ba72cb8cb   Mel Gorman   Fix boot problem ...
4023
4024
4025
4026
4027
4028
4029
4030
4031
  
  /* Return a sensible default order for the pageblock size. */
  static inline int pageblock_default_order(void)
  {
  	if (HPAGE_SHIFT > PAGE_SHIFT)
  		return HUGETLB_PAGE_ORDER;
  
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
  static inline void __init set_pageblock_order(unsigned int order)
  {
  	/* Check that pageblock_nr_pages has not already been setup */
  	if (pageblock_order)
  		return;
  
  	/*
  	 * Assume the largest contiguous order of interest is a huge page.
  	 * This value may be variable depending on boot parameters on IA64
  	 */
  	pageblock_order = order;
  }
  #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
ba72cb8cb   Mel Gorman   Fix boot problem ...
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
  /*
   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
   * and pageblock_default_order() are unused as pageblock_order is set
   * at compile-time. See include/linux/pageblock-flags.h for the values of
   * pageblock_order based on the kernel config
   */
  static inline int pageblock_default_order(unsigned int order)
  {
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
4056
4057
4058
  #define set_pageblock_order(x)	do {} while (0)
  
  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4059
4060
4061
4062
4063
4064
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
   */
b5a0e0113   Alexander van Heukelum   Solve section mis...
4065
  static void __paginginit free_area_init_core(struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4066
4067
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
4068
  	enum zone_type j;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4069
  	int nid = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4070
  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4071
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4072

208d54e55   Dave Hansen   [PATCH] memory ho...
4073
  	pgdat_resize_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4074
4075
4076
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
4077
  	pgdat_page_cgroup_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4078
4079
4080
  	
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
0e0b864e0   Mel Gorman   [PATCH] Account f...
4081
  		unsigned long size, realsize, memmap_pages;
4111304da   Hugh Dickins   mm: enum lru_list...
4082
  		enum lru_list lru;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4083

c713216de   Mel Gorman   [PATCH] Introduce...
4084
4085
4086
  		size = zone_spanned_pages_in_node(nid, j, zones_size);
  		realsize = size - zone_absent_pages_in_node(nid, j,
  								zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4087

0e0b864e0   Mel Gorman   [PATCH] Account f...
4088
4089
4090
4091
4092
  		/*
  		 * Adjust realsize so that it accounts for how much memory
  		 * is used by this zone for memmap. This affects the watermark
  		 * and per-cpu initialisations
  		 */
f72321541   Johannes Weiner   mm: don't drop a ...
4093
4094
  		memmap_pages =
  			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
0e0b864e0   Mel Gorman   [PATCH] Account f...
4095
4096
  		if (realsize >= memmap_pages) {
  			realsize -= memmap_pages;
5594c8c81   Yinghai Lu   mm: print out mem...
4097
4098
4099
4100
4101
  			if (memmap_pages)
  				printk(KERN_DEBUG
  				       "  %s zone: %lu pages used for memmap
  ",
  				       zone_names[j], memmap_pages);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4102
4103
4104
4105
4106
  		} else
  			printk(KERN_WARNING
  				"  %s zone: %lu pages exceeds realsize %lu
  ",
  				zone_names[j], memmap_pages, realsize);
6267276f3   Christoph Lameter   [PATCH] optional ...
4107
4108
  		/* Account for reserved pages */
  		if (j == 0 && realsize > dma_reserve) {
0e0b864e0   Mel Gorman   [PATCH] Account f...
4109
  			realsize -= dma_reserve;
d903ef9f3   Yinghai Lu   mm: print out mem...
4110
4111
  			printk(KERN_DEBUG "  %s zone: %lu pages reserved
  ",
6267276f3   Christoph Lameter   [PATCH] optional ...
4112
  					zone_names[0], dma_reserve);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4113
  		}
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
4114
  		if (!is_highmem_idx(j))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4115
4116
4117
4118
4119
  			nr_kernel_pages += realsize;
  		nr_all_pages += realsize;
  
  		zone->spanned_pages = size;
  		zone->present_pages = realsize;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4120
  #ifdef CONFIG_NUMA
d5f541ed6   Christoph Lameter   [PATCH] Add node ...
4121
  		zone->node = nid;
8417bba4b   Christoph Lameter   [PATCH] Replace m...
4122
  		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4123
  						/ 100;
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
4124
  		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4125
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4126
4127
4128
  		zone->name = zone_names[j];
  		spin_lock_init(&zone->lock);
  		spin_lock_init(&zone->lru_lock);
bdc8cb984   Dave Hansen   [PATCH] memory ho...
4129
  		zone_seqlock_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4130
  		zone->zone_pgdat = pgdat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4131

ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4132
  		zone_pcp_init(zone);
4111304da   Hugh Dickins   mm: enum lru_list...
4133
4134
  		for_each_lru(lru)
  			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
6e9015716   KOSAKI Motohiro   mm: introduce zon...
4135
4136
4137
4138
  		zone->reclaim_stat.recent_rotated[0] = 0;
  		zone->reclaim_stat.recent_rotated[1] = 0;
  		zone->reclaim_stat.recent_scanned[0] = 0;
  		zone->reclaim_stat.recent_scanned[1] = 0;
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
4139
  		zap_zone_vm_stats(zone);
e815af95f   David Rientjes   oom: change all_u...
4140
  		zone->flags = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4141
4142
  		if (!size)
  			continue;
ba72cb8cb   Mel Gorman   Fix boot problem ...
4143
  		set_pageblock_order(pageblock_default_order());
835c134ec   Mel Gorman   Add a bitmap that...
4144
  		setup_usemap(pgdat, zone, size);
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
4145
4146
  		ret = init_currently_empty_zone(zone, zone_start_pfn,
  						size, MEMMAP_EARLY);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4147
  		BUG_ON(ret);
76cdd58e5   Heiko Carstens   memory_hotplug: a...
4148
  		memmap_init(size, nid, j, zone_start_pfn);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4149
  		zone_start_pfn += size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4150
4151
  	}
  }
577a32f62   Sam Ravnborg   mm: fix section m...
4152
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4153
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4154
4155
4156
  	/* Skip empty nodes */
  	if (!pgdat->node_spanned_pages)
  		return;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4157
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4158
4159
  	/* ia64 gets its own node_mem_map, before this, without bootmem */
  	if (!pgdat->node_mem_map) {
e984bb43f   Bob Picco   [PATCH] Align the...
4160
  		unsigned long size, start, end;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4161
  		struct page *map;
e984bb43f   Bob Picco   [PATCH] Align the...
4162
4163
4164
4165
4166
4167
4168
4169
4170
  		/*
  		 * The zone's endpoints aren't required to be MAX_ORDER
  		 * aligned but the node_mem_map endpoints must be in order
  		 * for the buddy allocator to function correctly.
  		 */
  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
  		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
  		size =  (end - start) * sizeof(struct page);
6f167ec72   Dave Hansen   [PATCH] sparsemem...
4171
4172
  		map = alloc_remap(pgdat->node_id, size);
  		if (!map)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4173
  			map = alloc_bootmem_node_nopanic(pgdat, size);
e984bb43f   Bob Picco   [PATCH] Align the...
4174
  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4175
  	}
12d810c1b   Roman Zippel   m68k: discontinuo...
4176
  #ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4177
4178
4179
  	/*
  	 * With no DISCONTIG, the global mem_map is just set as node 0's
  	 */
c713216de   Mel Gorman   [PATCH] Introduce...
4180
  	if (pgdat == NODE_DATA(0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4181
  		mem_map = NODE_DATA(0)->node_mem_map;
0ee332c14   Tejun Heo   memblock: Kill ea...
4182
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
c713216de   Mel Gorman   [PATCH] Introduce...
4183
  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
467bc461d   Thomas Bogendoerfer   Fix crash with FL...
4184
  			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
0ee332c14   Tejun Heo   memblock: Kill ea...
4185
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
4186
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4187
  #endif
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4188
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4189
  }
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4190
4191
  void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  		unsigned long node_start_pfn, unsigned long *zholes_size)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4192
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4193
  	pg_data_t *pgdat = NODE_DATA(nid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4194
4195
  	pgdat->node_id = nid;
  	pgdat->node_start_pfn = node_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4196
  	calculate_node_totalpages(pgdat, zones_size, zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4197
4198
  
  	alloc_node_mem_map(pgdat);
e8c27ac91   Yinghai Lu   x86, numa, 32-bit...
4199
4200
4201
4202
4203
4204
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
  	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx
  ",
  		nid, (unsigned long)pgdat,
  		(unsigned long)pgdat->node_mem_map);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4205
4206
4207
  
  	free_area_init_core(pgdat, zones_size, zholes_size);
  }
0ee332c14   Tejun Heo   memblock: Kill ea...
4208
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
418508c13   Miklos Szeredi   fix unused setup_...
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
  
  #if MAX_NUMNODES > 1
  /*
   * Figure out the number of possible node ids.
   */
  static void __init setup_nr_node_ids(void)
  {
  	unsigned int node;
  	unsigned int highest = 0;
  
  	for_each_node_mask(node, node_possible_map)
  		highest = node;
  	nr_node_ids = highest + 1;
  }
  #else
  static inline void setup_nr_node_ids(void)
  {
  }
  #endif
c713216de   Mel Gorman   [PATCH] Introduce...
4228
  /**
1e01979c8   Tejun Heo   x86, numa: Implem...
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
   * node_map_pfn_alignment - determine the maximum internode alignment
   *
   * This function should be called after node map is populated and sorted.
   * It calculates the maximum power of two alignment which can distinguish
   * all the nodes.
   *
   * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
   * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
   * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
   * shifted, 1GiB is enough and this function will indicate so.
   *
   * This is used to test whether pfn -> nid mapping of the chosen memory
   * model has fine enough granularity to avoid incorrect mapping for the
   * populated node map.
   *
   * Returns the determined alignment in pfn's.  0 if there is no alignment
   * requirement (single node).
   */
  unsigned long __init node_map_pfn_alignment(void)
  {
  	unsigned long accl_mask = 0, last_end = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
4250
  	unsigned long start, end, mask;
1e01979c8   Tejun Heo   x86, numa: Implem...
4251
  	int last_nid = -1;
c13291a53   Tejun Heo   bootmem: Use for_...
4252
  	int i, nid;
1e01979c8   Tejun Heo   x86, numa: Implem...
4253

c13291a53   Tejun Heo   bootmem: Use for_...
4254
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1e01979c8   Tejun Heo   x86, numa: Implem...
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
  		if (!start || last_nid < 0 || last_nid == nid) {
  			last_nid = nid;
  			last_end = end;
  			continue;
  		}
  
  		/*
  		 * Start with a mask granular enough to pin-point to the
  		 * start pfn and tick off bits one-by-one until it becomes
  		 * too coarse to separate the current node from the last.
  		 */
  		mask = ~((1 << __ffs(start)) - 1);
  		while (mask && last_end <= (start & (mask << 1)))
  			mask <<= 1;
  
  		/* accumulate all internode masks */
  		accl_mask |= mask;
  	}
  
  	/* convert mask to number of pages */
  	return ~accl_mask + 1;
  }
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4277
  /* Find the lowest pfn for a node */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4278
  static unsigned long __init find_min_pfn_for_node(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
4279
  {
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4280
  	unsigned long min_pfn = ULONG_MAX;
c13291a53   Tejun Heo   bootmem: Use for_...
4281
4282
  	unsigned long start_pfn;
  	int i;
1abbfb412   Mel Gorman   [PATCH] x86_64: f...
4283

c13291a53   Tejun Heo   bootmem: Use for_...
4284
4285
  	for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
  		min_pfn = min(min_pfn, start_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4286

a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4287
4288
  	if (min_pfn == ULONG_MAX) {
  		printk(KERN_WARNING
2bc0d2615   Paul Jackson   x86 boot: more co...
4289
4290
  			"Could not find start_pfn for node %d
  ", nid);
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4291
4292
4293
4294
  		return 0;
  	}
  
  	return min_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4295
4296
4297
4298
4299
4300
  }
  
  /**
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4301
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
4302
4303
4304
4305
4306
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
  	return find_min_pfn_for_node(MAX_NUMNODES);
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4307
4308
4309
4310
4311
  /*
   * early_calculate_totalpages()
   * Sum pages in active regions for movable zone.
   * Populate N_HIGH_MEMORY for calculating usable_nodes.
   */
484f51f82   Adrian Bunk   mm/page_alloc.c: ...
4312
  static unsigned long __init early_calculate_totalpages(void)
7e63efef8   Mel Gorman   Add a movablecore...
4313
  {
7e63efef8   Mel Gorman   Add a movablecore...
4314
  	unsigned long totalpages = 0;
c13291a53   Tejun Heo   bootmem: Use for_...
4315
4316
4317
4318
4319
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
  
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
  		unsigned long pages = end_pfn - start_pfn;
7e63efef8   Mel Gorman   Add a movablecore...
4320

37b07e416   Lee Schermerhorn   memoryless nodes:...
4321
4322
  		totalpages += pages;
  		if (pages)
c13291a53   Tejun Heo   bootmem: Use for_...
4323
  			node_set_state(nid, N_HIGH_MEMORY);
37b07e416   Lee Schermerhorn   memoryless nodes:...
4324
4325
  	}
    	return totalpages;
7e63efef8   Mel Gorman   Add a movablecore...
4326
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4327
4328
4329
4330
4331
4332
  /*
   * Find the PFN the Movable zone begins in each node. Kernel memory
   * is spread evenly between nodes as long as the nodes have enough
   * memory. When they don't, some nodes will have more kernelcore than
   * others
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4333
  static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4334
4335
4336
4337
  {
  	int i, nid;
  	unsigned long usable_startpfn;
  	unsigned long kernelcore_node, kernelcore_remaining;
66918dcdf   Yinghai Lu   x86: only clear n...
4338
4339
  	/* save the state before borrow the nodemask */
  	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
37b07e416   Lee Schermerhorn   memoryless nodes:...
4340
4341
  	unsigned long totalpages = early_calculate_totalpages();
  	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4342

7e63efef8   Mel Gorman   Add a movablecore...
4343
4344
4345
4346
4347
4348
4349
4350
4351
  	/*
  	 * If movablecore was specified, calculate what size of
  	 * kernelcore that corresponds so that memory usable for
  	 * any allocation type is evenly spread. If both kernelcore
  	 * and movablecore are specified, then the value of kernelcore
  	 * will be used for required_kernelcore if it's greater than
  	 * what movablecore would have allowed.
  	 */
  	if (required_movablecore) {
7e63efef8   Mel Gorman   Add a movablecore...
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
  		unsigned long corepages;
  
  		/*
  		 * Round-up so that ZONE_MOVABLE is at least as large as what
  		 * was requested by the user
  		 */
  		required_movablecore =
  			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
  		corepages = totalpages - required_movablecore;
  
  		required_kernelcore = max(required_kernelcore, corepages);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4364
4365
  	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
  	if (!required_kernelcore)
66918dcdf   Yinghai Lu   x86: only clear n...
4366
  		goto out;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4367
4368
4369
4370
4371
4372
4373
4374
  
  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
  	find_usable_zone_for_movable();
  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  
  restart:
  	/* Spread kernelcore memory as evenly as possible throughout nodes */
  	kernelcore_node = required_kernelcore / usable_nodes;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4375
  	for_each_node_state(nid, N_HIGH_MEMORY) {
c13291a53   Tejun Heo   bootmem: Use for_...
4376
  		unsigned long start_pfn, end_pfn;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
  		/*
  		 * Recalculate kernelcore_node if the division per node
  		 * now exceeds what is necessary to satisfy the requested
  		 * amount of memory for the kernel
  		 */
  		if (required_kernelcore < kernelcore_node)
  			kernelcore_node = required_kernelcore / usable_nodes;
  
  		/*
  		 * As the map is walked, we track how much memory is usable
  		 * by the kernel using kernelcore_remaining. When it is
  		 * 0, the rest of the node is usable by ZONE_MOVABLE
  		 */
  		kernelcore_remaining = kernelcore_node;
  
  		/* Go through each range of PFNs within this node */
c13291a53   Tejun Heo   bootmem: Use for_...
4393
  		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4394
  			unsigned long size_pages;
c13291a53   Tejun Heo   bootmem: Use for_...
4395
  			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
  			if (start_pfn >= end_pfn)
  				continue;
  
  			/* Account for what is only usable for kernelcore */
  			if (start_pfn < usable_startpfn) {
  				unsigned long kernel_pages;
  				kernel_pages = min(end_pfn, usable_startpfn)
  								- start_pfn;
  
  				kernelcore_remaining -= min(kernel_pages,
  							kernelcore_remaining);
  				required_kernelcore -= min(kernel_pages,
  							required_kernelcore);
  
  				/* Continue if range is now fully accounted */
  				if (end_pfn <= usable_startpfn) {
  
  					/*
  					 * Push zone_movable_pfn to the end so
  					 * that if we have to rebalance
  					 * kernelcore across nodes, we will
  					 * not double account here
  					 */
  					zone_movable_pfn[nid] = end_pfn;
  					continue;
  				}
  				start_pfn = usable_startpfn;
  			}
  
  			/*
  			 * The usable PFN range for ZONE_MOVABLE is from
  			 * start_pfn->end_pfn. Calculate size_pages as the
  			 * number of pages used as kernelcore
  			 */
  			size_pages = end_pfn - start_pfn;
  			if (size_pages > kernelcore_remaining)
  				size_pages = kernelcore_remaining;
  			zone_movable_pfn[nid] = start_pfn + size_pages;
  
  			/*
  			 * Some kernelcore has been met, update counts and
  			 * break if the kernelcore for this node has been
  			 * satisified
  			 */
  			required_kernelcore -= min(required_kernelcore,
  								size_pages);
  			kernelcore_remaining -= size_pages;
  			if (!kernelcore_remaining)
  				break;
  		}
  	}
  
  	/*
  	 * If there is still required_kernelcore, we do another pass with one
  	 * less node in the count. This will push zone_movable_pfn[nid] further
  	 * along on the nodes that still have memory until kernelcore is
  	 * satisified
  	 */
  	usable_nodes--;
  	if (usable_nodes && required_kernelcore > usable_nodes)
  		goto restart;
  
  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
  	for (nid = 0; nid < MAX_NUMNODES; nid++)
  		zone_movable_pfn[nid] =
  			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
66918dcdf   Yinghai Lu   x86: only clear n...
4462
4463
4464
4465
  
  out:
  	/* restore the node_state */
  	node_states[N_HIGH_MEMORY] = saved_node_state;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4466
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4467
4468
4469
4470
4471
4472
4473
4474
  /* Any regular memory on that node ? */
  static void check_for_regular_memory(pg_data_t *pgdat)
  {
  #ifdef CONFIG_HIGHMEM
  	enum zone_type zone_type;
  
  	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
  		struct zone *zone = &pgdat->node_zones[zone_type];
d0048b0e5   Bob Liu   page_alloc: break...
4475
  		if (zone->present_pages) {
37b07e416   Lee Schermerhorn   memoryless nodes:...
4476
  			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
d0048b0e5   Bob Liu   page_alloc: break...
4477
4478
  			break;
  		}
37b07e416   Lee Schermerhorn   memoryless nodes:...
4479
4480
4481
  	}
  #endif
  }
c713216de   Mel Gorman   [PATCH] Introduce...
4482
4483
  /**
   * free_area_init_nodes - Initialise all pg_data_t and zone data
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4484
   * @max_zone_pfn: an array of max PFNs for each zone
c713216de   Mel Gorman   [PATCH] Introduce...
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
   *
   * This will call free_area_init_node() for each active node in the system.
   * Using the page ranges provided by add_active_range(), the size of each
   * zone in each node and their holes is calculated. If the maximum PFN
   * between two adjacent zones match, it is assumed that the zone is empty.
   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
   * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
   * starts where the previous one ended. For example, ZONE_DMA32 starts
   * at arch_max_dma_pfn.
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
c13291a53   Tejun Heo   bootmem: Use for_...
4497
4498
  	unsigned long start_pfn, end_pfn;
  	int i, nid;
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4499

c713216de   Mel Gorman   [PATCH] Introduce...
4500
4501
4502
4503
4504
4505
4506
4507
  	/* Record where the zone boundaries are */
  	memset(arch_zone_lowest_possible_pfn, 0,
  				sizeof(arch_zone_lowest_possible_pfn));
  	memset(arch_zone_highest_possible_pfn, 0,
  				sizeof(arch_zone_highest_possible_pfn));
  	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
  	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
  	for (i = 1; i < MAX_NR_ZONES; i++) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4508
4509
  		if (i == ZONE_MOVABLE)
  			continue;
c713216de   Mel Gorman   [PATCH] Introduce...
4510
4511
4512
4513
4514
  		arch_zone_lowest_possible_pfn[i] =
  			arch_zone_highest_possible_pfn[i-1];
  		arch_zone_highest_possible_pfn[i] =
  			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4515
4516
4517
4518
4519
4520
  	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
  
  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
  	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4521

c713216de   Mel Gorman   [PATCH] Introduce...
4522
4523
4524
  	/* Print out the zone ranges */
  	printk("Zone PFN ranges:
  ");
2a1e274ac   Mel Gorman   Create the ZONE_M...
4525
4526
4527
  	for (i = 0; i < MAX_NR_ZONES; i++) {
  		if (i == ZONE_MOVABLE)
  			continue;
72f0ba025   David Rientjes   mm: suppress pfn ...
4528
4529
4530
4531
4532
4533
4534
4535
  		printk("  %-8s ", zone_names[i]);
  		if (arch_zone_lowest_possible_pfn[i] ==
  				arch_zone_highest_possible_pfn[i])
  			printk("empty
  ");
  		else
  			printk("%0#10lx -> %0#10lx
  ",
c713216de   Mel Gorman   [PATCH] Introduce...
4536
4537
  				arch_zone_lowest_possible_pfn[i],
  				arch_zone_highest_possible_pfn[i]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
  	}
  
  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
  	printk("Movable zone start PFN for each node
  ");
  	for (i = 0; i < MAX_NUMNODES; i++) {
  		if (zone_movable_pfn[i])
  			printk("  Node %d: %lu
  ", i, zone_movable_pfn[i]);
  	}
c713216de   Mel Gorman   [PATCH] Introduce...
4548
4549
  
  	/* Print out the early_node_map[] */
c13291a53   Tejun Heo   bootmem: Use for_...
4550
4551
4552
4553
4554
  	printk("Early memory PFN ranges
  ");
  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
  		printk("  %3d: %0#10lx -> %0#10lx
  ", nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4555
4556
  
  	/* Initialise every node */
708614e61   Mel Gorman   mm: verify the pa...
4557
  	mminit_verify_pageflags_layout();
8ef828668   Christoph Lameter   [PATCH] slab: red...
4558
  	setup_nr_node_ids();
c713216de   Mel Gorman   [PATCH] Introduce...
4559
4560
  	for_each_online_node(nid) {
  		pg_data_t *pgdat = NODE_DATA(nid);
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4561
  		free_area_init_node(nid, NULL,
c713216de   Mel Gorman   [PATCH] Introduce...
4562
  				find_min_pfn_for_node(nid), NULL);
37b07e416   Lee Schermerhorn   memoryless nodes:...
4563
4564
4565
4566
4567
  
  		/* Any memory on that node */
  		if (pgdat->node_present_pages)
  			node_set_state(nid, N_HIGH_MEMORY);
  		check_for_regular_memory(pgdat);
c713216de   Mel Gorman   [PATCH] Introduce...
4568
4569
  	}
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4570

7e63efef8   Mel Gorman   Add a movablecore...
4571
  static int __init cmdline_parse_core(char *p, unsigned long *core)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4572
4573
4574
4575
4576
4577
  {
  	unsigned long long coremem;
  	if (!p)
  		return -EINVAL;
  
  	coremem = memparse(p, &p);
7e63efef8   Mel Gorman   Add a movablecore...
4578
  	*core = coremem >> PAGE_SHIFT;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4579

7e63efef8   Mel Gorman   Add a movablecore...
4580
  	/* Paranoid check that UL is enough for the coremem value */
2a1e274ac   Mel Gorman   Create the ZONE_M...
4581
4582
4583
4584
  	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
  
  	return 0;
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4585

7e63efef8   Mel Gorman   Add a movablecore...
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
  /*
   * kernelcore=size sets the amount of memory for use for allocations that
   * cannot be reclaimed or migrated.
   */
  static int __init cmdline_parse_kernelcore(char *p)
  {
  	return cmdline_parse_core(p, &required_kernelcore);
  }
  
  /*
   * movablecore=size sets the amount of memory for use for allocations that
   * can be reclaimed or migrated.
   */
  static int __init cmdline_parse_movablecore(char *p)
  {
  	return cmdline_parse_core(p, &required_movablecore);
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4603
  early_param("kernelcore", cmdline_parse_kernelcore);
7e63efef8   Mel Gorman   Add a movablecore...
4604
  early_param("movablecore", cmdline_parse_movablecore);
ed7ed3651   Mel Gorman   handle kernelcore...
4605

0ee332c14   Tejun Heo   memblock: Kill ea...
4606
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
c713216de   Mel Gorman   [PATCH] Introduce...
4607

0e0b864e0   Mel Gorman   [PATCH] Account f...
4608
  /**
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4609
4610
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
0e0b864e0   Mel Gorman   [PATCH] Account f...
4611
4612
4613
4614
   *
   * The per-cpu batchsize and zone watermarks are determined by present_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4615
4616
4617
   * function may optionally be used to account for unfreeable pages in the
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
0e0b864e0   Mel Gorman   [PATCH] Account f...
4618
4619
4620
4621
4622
   */
  void __init set_dma_reserve(unsigned long new_dma_reserve)
  {
  	dma_reserve = new_dma_reserve;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4623
4624
  void __init free_area_init(unsigned long *zones_size)
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4625
  	free_area_init_node(0, zones_size,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4626
4627
  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4628

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4629
4630
4631
4632
  static int page_alloc_cpu_notify(struct notifier_block *self,
  				 unsigned long action, void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4633

8bb784428   Rafael J. Wysocki   Add suspend-relat...
4634
  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
9f8f21725   Christoph Lameter   Page allocator: c...
4635
4636
4637
4638
4639
4640
4641
4642
  		drain_pages(cpu);
  
  		/*
  		 * Spill the event counters of the dead processor
  		 * into the current processors event counters.
  		 * This artificially elevates the count of the current
  		 * processor.
  		 */
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
4643
  		vm_events_fold_cpu(cpu);
9f8f21725   Christoph Lameter   Page allocator: c...
4644
4645
4646
4647
4648
4649
4650
4651
  
  		/*
  		 * Zero the differential counters of the dead processor
  		 * so that the vm statistics are consistent.
  		 *
  		 * This is only okay since the processor is dead and cannot
  		 * race with what we are doing.
  		 */
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
4652
  		refresh_cpu_vm_stats(cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4653
4654
4655
  	}
  	return NOTIFY_OK;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4656
4657
4658
4659
4660
4661
4662
  
  void __init page_alloc_init(void)
  {
  	hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
  /*
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4663
4664
4665
4666
4667
4668
4669
   * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
   *	or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  {
  	struct pglist_data *pgdat;
  	unsigned long reserve_pages = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4670
  	enum zone_type i, j;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
  
  	for_each_online_pgdat(pgdat) {
  		for (i = 0; i < MAX_NR_ZONES; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  			unsigned long max = 0;
  
  			/* Find valid and maximum lowmem_reserve in the zone */
  			for (j = i; j < MAX_NR_ZONES; j++) {
  				if (zone->lowmem_reserve[j] > max)
  					max = zone->lowmem_reserve[j];
  			}
418589663   Mel Gorman   page allocator: u...
4682
4683
  			/* we treat the high watermark as reserved pages. */
  			max += high_wmark_pages(zone);
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4684
4685
4686
4687
  
  			if (max > zone->present_pages)
  				max = zone->present_pages;
  			reserve_pages += max;
ab8fabd46   Johannes Weiner   mm: exclude reser...
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
  			/*
  			 * Lowmem reserves are not available to
  			 * GFP_HIGHUSER page cache allocations and
  			 * kswapd tries to balance zones to their high
  			 * watermark.  As a result, neither should be
  			 * regarded as dirtyable memory, to prevent a
  			 * situation where reclaim has to clean pages
  			 * in order to balance the zones.
  			 */
  			zone->dirty_balance_reserve = max;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4698
4699
  		}
  	}
ab8fabd46   Johannes Weiner   mm: exclude reser...
4700
  	dirty_balance_reserve = reserve_pages;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4701
4702
4703
4704
  	totalreserve_pages = reserve_pages;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4705
4706
4707
4708
4709
4710
4711
4712
   * setup_per_zone_lowmem_reserve - called whenever
   *	sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
   *	has a correct pages reserved value, so an adequate number of
   *	pages are left in the zone after a successful __alloc_pages().
   */
  static void setup_per_zone_lowmem_reserve(void)
  {
  	struct pglist_data *pgdat;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4713
  	enum zone_type j, idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4714

ec936fc56   KAMEZAWA Hiroyuki   [PATCH] for_each_...
4715
  	for_each_online_pgdat(pgdat) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4716
4717
4718
4719
4720
  		for (j = 0; j < MAX_NR_ZONES; j++) {
  			struct zone *zone = pgdat->node_zones + j;
  			unsigned long present_pages = zone->present_pages;
  
  			zone->lowmem_reserve[j] = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4721
4722
  			idx = j;
  			while (idx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4723
  				struct zone *lower_zone;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4724
  				idx--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
  				if (sysctl_lowmem_reserve_ratio[idx] < 1)
  					sysctl_lowmem_reserve_ratio[idx] = 1;
  
  				lower_zone = pgdat->node_zones + idx;
  				lower_zone->lowmem_reserve[j] = present_pages /
  					sysctl_lowmem_reserve_ratio[idx];
  				present_pages += lower_zone->present_pages;
  			}
  		}
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4735
4736
4737
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4738
  }
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4739
  /**
bc75d33f0   Minchan Kim   page-allocator: c...
4740
   * setup_per_zone_wmarks - called when min_free_kbytes changes
bce7394a3   Minchan Kim   page-allocator: r...
4741
   * or when memory is hot-{added|removed}
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4742
   *
bc75d33f0   Minchan Kim   page-allocator: c...
4743
4744
   * Ensures that the watermark[min,low,high] values for each zone are set
   * correctly with respect to min_free_kbytes.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4745
   */
bc75d33f0   Minchan Kim   page-allocator: c...
4746
  void setup_per_zone_wmarks(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
  {
  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
  	unsigned long lowmem_pages = 0;
  	struct zone *zone;
  	unsigned long flags;
  
  	/* Calculate total number of !ZONE_HIGHMEM pages */
  	for_each_zone(zone) {
  		if (!is_highmem(zone))
  			lowmem_pages += zone->present_pages;
  	}
  
  	for_each_zone(zone) {
ac924c603   Andrew Morton   [PATCH] setup_per...
4760
  		u64 tmp;
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
4761
  		spin_lock_irqsave(&zone->lock, flags);
ac924c603   Andrew Morton   [PATCH] setup_per...
4762
4763
  		tmp = (u64)pages_min * zone->present_pages;
  		do_div(tmp, lowmem_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4764
4765
  		if (is_highmem(zone)) {
  			/*
669ed1752   Nick Piggin   [PATCH] mm: highm...
4766
4767
4768
4769
  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
  			 * need highmem pages, so cap pages_min to a small
  			 * value here.
  			 *
418589663   Mel Gorman   page allocator: u...
4770
  			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
669ed1752   Nick Piggin   [PATCH] mm: highm...
4771
4772
  			 * deltas controls asynch page reclaim, and so should
  			 * not be capped for highmem.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4773
4774
4775
4776
4777
4778
4779
4780
  			 */
  			int min_pages;
  
  			min_pages = zone->present_pages / 1024;
  			if (min_pages < SWAP_CLUSTER_MAX)
  				min_pages = SWAP_CLUSTER_MAX;
  			if (min_pages > 128)
  				min_pages = 128;
418589663   Mel Gorman   page allocator: u...
4781
  			zone->watermark[WMARK_MIN] = min_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4782
  		} else {
669ed1752   Nick Piggin   [PATCH] mm: highm...
4783
4784
  			/*
  			 * If it's a lowmem zone, reserve a number of pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4785
4786
  			 * proportionate to the zone's size.
  			 */
418589663   Mel Gorman   page allocator: u...
4787
  			zone->watermark[WMARK_MIN] = tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4788
  		}
418589663   Mel Gorman   page allocator: u...
4789
4790
  		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
  		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
56fd56b86   Mel Gorman   Bias the location...
4791
  		setup_zone_migrate_reserve(zone);
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
4792
  		spin_unlock_irqrestore(&zone->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4793
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4794
4795
4796
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4797
  }
55a4462af   Randy Dunlap   page_alloc: fix k...
4798
  /*
556adecba   Rik van Riel   vmscan: second ch...
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
   *
   * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
   * INACTIVE_ANON pages on this zone's LRU, maintained by the
   * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
   * the anonymous pages are kept on the inactive list.
   *
   * total     target    max
   * memory    ratio     inactive anon
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
4819
  static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
556adecba   Rik van Riel   vmscan: second ch...
4820
  {
96cb4df5d   Minchan Kim   page-allocator: a...
4821
  	unsigned int gb, ratio;
556adecba   Rik van Riel   vmscan: second ch...
4822

96cb4df5d   Minchan Kim   page-allocator: a...
4823
4824
4825
  	/* Zone size in gigabytes */
  	gb = zone->present_pages >> (30 - PAGE_SHIFT);
  	if (gb)
556adecba   Rik van Riel   vmscan: second ch...
4826
  		ratio = int_sqrt(10 * gb);
96cb4df5d   Minchan Kim   page-allocator: a...
4827
4828
  	else
  		ratio = 1;
556adecba   Rik van Riel   vmscan: second ch...
4829

96cb4df5d   Minchan Kim   page-allocator: a...
4830
4831
  	zone->inactive_ratio = ratio;
  }
556adecba   Rik van Riel   vmscan: second ch...
4832

839a4fcc8   KOSAKI Motohiro   mm, mem-hotplug: ...
4833
  static void __meminit setup_per_zone_inactive_ratio(void)
96cb4df5d   Minchan Kim   page-allocator: a...
4834
4835
4836
4837
4838
  {
  	struct zone *zone;
  
  	for_each_zone(zone)
  		calculate_zone_inactive_ratio(zone);
556adecba   Rik van Riel   vmscan: second ch...
4839
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
  /*
   * Initialise min_free_kbytes.
   *
   * For small machines we want it small (128k min).  For large machines
   * we want it large (64MB max).  But it is not linear, because network
   * bandwidth does not increase linearly with machine size.  We use
   *
   * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
   *
   * which yields
   *
   * 16MB:	512k
   * 32MB:	724k
   * 64MB:	1024k
   * 128MB:	1448k
   * 256MB:	2048k
   * 512MB:	2896k
   * 1024MB:	4096k
   * 2048MB:	5792k
   * 4096MB:	8192k
   * 8192MB:	11584k
   * 16384MB:	16384k
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
4864
  int __meminit init_per_zone_wmark_min(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
  {
  	unsigned long lowmem_kbytes;
  
  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
  
  	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
  	if (min_free_kbytes < 128)
  		min_free_kbytes = 128;
  	if (min_free_kbytes > 65536)
  		min_free_kbytes = 65536;
bc75d33f0   Minchan Kim   page-allocator: c...
4875
  	setup_per_zone_wmarks();
a6cccdc36   KOSAKI Motohiro   mm, mem-hotplug: ...
4876
  	refresh_zone_stat_thresholds();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4877
  	setup_per_zone_lowmem_reserve();
556adecba   Rik van Riel   vmscan: second ch...
4878
  	setup_per_zone_inactive_ratio();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4879
4880
  	return 0;
  }
bc75d33f0   Minchan Kim   page-allocator: c...
4881
  module_init(init_per_zone_wmark_min)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4882
4883
4884
4885
4886
4887
4888
  
  /*
   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
   *	that we can call two helper functions whenever min_free_kbytes
   *	changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4889
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4890
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4891
  	proc_dointvec(table, write, buffer, length, ppos);
3b1d92c56   Mel Gorman   Do not disable in...
4892
  	if (write)
bc75d33f0   Minchan Kim   page-allocator: c...
4893
  		setup_per_zone_wmarks();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4894
4895
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4896
4897
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4898
  	void __user *buffer, size_t *length, loff_t *ppos)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4899
4900
4901
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4902
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4903
4904
4905
4906
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
8417bba4b   Christoph Lameter   [PATCH] Replace m...
4907
  		zone->min_unmapped_pages = (zone->present_pages *
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4908
4909
4910
  				sysctl_min_unmapped_ratio) / 100;
  	return 0;
  }
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
4911
4912
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4913
  	void __user *buffer, size_t *length, loff_t *ppos)
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
4914
4915
4916
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4917
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
4918
4919
4920
4921
4922
4923
4924
4925
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
  		zone->min_slab_pages = (zone->present_pages *
  				sysctl_min_slab_ratio) / 100;
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4926
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4927
4928
4929
4930
4931
4932
  /*
   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
   *	whenever sysctl_lowmem_reserve_ratio changes.
   *
   * The reserve ratio obviously has absolutely no relation with the
418589663   Mel Gorman   page allocator: u...
4933
   * minimum watermarks. The lowmem reserve ratio can only make sense
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4934
4935
4936
   * if in function of the boot time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4937
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4938
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4939
  	proc_dointvec_minmax(table, write, buffer, length, ppos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4940
4941
4942
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
4943
4944
4945
4946
4947
4948
4949
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
   * can have before it gets flushed back to buddy allocator.
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4950
  	void __user *buffer, size_t *length, loff_t *ppos)
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
4951
4952
4953
4954
  {
  	struct zone *zone;
  	unsigned int cpu;
  	int ret;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
4955
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
4956
4957
  	if (!write || (ret == -EINVAL))
  		return ret;
364df0ebf   Dimitri Sivanich   mm: fix handling ...
4958
  	for_each_populated_zone(zone) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
4959
  		for_each_possible_cpu(cpu) {
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
4960
4961
  			unsigned long  high;
  			high = zone->present_pages / percpu_pagelist_fraction;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
4962
4963
  			setup_pagelist_highmark(
  				per_cpu_ptr(zone->pageset, cpu), high);
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
4964
4965
4966
4967
  		}
  	}
  	return 0;
  }
f034b5d4e   David S. Miller   [XFRM]: Dynamic x...
4968
  int hashdist = HASHDIST_DEFAULT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
  
  #ifdef CONFIG_NUMA
  static int __init set_hashdist(char *str)
  {
  	if (!str)
  		return 0;
  	hashdist = simple_strtoul(str, &str, 0);
  	return 1;
  }
  __setup("hashdist=", set_hashdist);
  #endif
  
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
   *   quantity of entries
   * - limit is the number of hash buckets, not the total allocation size
   */
  void *__init alloc_large_system_hash(const char *tablename,
  				     unsigned long bucketsize,
  				     unsigned long numentries,
  				     int scale,
  				     int flags,
  				     unsigned int *_hash_shift,
  				     unsigned int *_hash_mask,
  				     unsigned long limit)
  {
  	unsigned long long max = limit;
  	unsigned long log2qty, size;
  	void *table = NULL;
  
  	/* allow the kernel cmdline to have a say */
  	if (!numentries) {
  		/* round applicable memory size up to nearest megabyte */
049036643   Andrew Morton   [PATCH] remove HA...
5003
  		numentries = nr_kernel_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5004
5005
5006
5007
5008
5009
5010
5011
5012
  		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
  		numentries >>= 20 - PAGE_SHIFT;
  		numentries <<= 20 - PAGE_SHIFT;
  
  		/* limit to 1 bucket per 2^scale bytes of low memory */
  		if (scale > PAGE_SHIFT)
  			numentries >>= (scale - PAGE_SHIFT);
  		else
  			numentries <<= (PAGE_SHIFT - scale);
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5013
5014
  
  		/* Make sure we've got at least a 0-order allocation.. */
2c85f51d2   Jan Beulich   mm: also use allo...
5015
5016
5017
5018
5019
5020
5021
5022
  		if (unlikely(flags & HASH_SMALL)) {
  			/* Makes no sense without HASH_EARLY */
  			WARN_ON(!(flags & HASH_EARLY));
  			if (!(numentries >> *_hash_shift)) {
  				numentries = 1UL << *_hash_shift;
  				BUG_ON(!numentries);
  			}
  		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5023
  			numentries = PAGE_SIZE / bucketsize;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5024
  	}
6e692ed37   John Hawkes   [PATCH] fix alloc...
5025
  	numentries = roundup_pow_of_two(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5026
5027
5028
5029
5030
5031
5032
5033
5034
  
  	/* limit allocation size to 1/16 total memory by default */
  	if (max == 0) {
  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
  		do_div(max, bucketsize);
  	}
  
  	if (numentries > max)
  		numentries = max;
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5035
  	log2qty = ilog2(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5036
5037
5038
5039
  
  	do {
  		size = bucketsize << log2qty;
  		if (flags & HASH_EARLY)
74768ed83   Jan Beulich   page allocator: u...
5040
  			table = alloc_bootmem_nopanic(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5041
5042
5043
  		else if (hashdist)
  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
  		else {
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5044
5045
  			/*
  			 * If bucketsize is not a power-of-two, we may free
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5046
5047
  			 * some pages at the end of hash table which
  			 * alloc_pages_exact() automatically does
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5048
  			 */
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5049
  			if (get_order(size) < MAX_ORDER) {
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5050
  				table = alloc_pages_exact(size, GFP_ATOMIC);
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5051
5052
  				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5053
5054
5055
5056
5057
5058
  		}
  	} while (!table && size > PAGE_SIZE && --log2qty);
  
  	if (!table)
  		panic("Failed to allocate %s hash table
  ", tablename);
f241e6607   Robin Holt   mm: alloc_large_s...
5059
5060
  	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)
  ",
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5061
  	       tablename,
f241e6607   Robin Holt   mm: alloc_large_s...
5062
  	       (1UL << log2qty),
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5063
  	       ilog2(size) - PAGE_SHIFT,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5064
5065
5066
5067
5068
5069
5070
5071
5072
  	       size);
  
  	if (_hash_shift)
  		*_hash_shift = log2qty;
  	if (_hash_mask)
  		*_hash_mask = (1 << log2qty) - 1;
  
  	return table;
  }
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
5073

835c134ec   Mel Gorman   Add a bitmap that...
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
  /* Return a pointer to the bitmap storing bits affecting a block of pages */
  static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
  							unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	return __pfn_to_section(pfn)->pageblock_flags;
  #else
  	return zone->pageblock_flags;
  #endif /* CONFIG_SPARSEMEM */
  }
  
  static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	pfn &= (PAGES_PER_SECTION-1);
d9c234005   Mel Gorman   Do not depend on ...
5089
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5090
5091
  #else
  	pfn = pfn - zone->zone_start_pfn;
d9c234005   Mel Gorman   Do not depend on ...
5092
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5093
5094
5095
5096
  #endif /* CONFIG_SPARSEMEM */
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5097
   * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest to retrieve
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
  unsigned long get_pageblock_flags_group(struct page *page,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long flags = 0;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (test_bit(bitidx + start_bitidx, bitmap))
  			flags |= value;
6220ec784   Andrew Morton   [PATCH] highest_p...
5120

835c134ec   Mel Gorman   Add a bitmap that...
5121
5122
5123
5124
  	return flags;
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5125
   * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
  void set_pageblock_flags_group(struct page *page, unsigned long flags,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
5143
5144
  	VM_BUG_ON(pfn < zone->zone_start_pfn);
  	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
835c134ec   Mel Gorman   Add a bitmap that...
5145
5146
5147
5148
5149
5150
5151
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (flags & value)
  			__set_bit(bitidx + start_bitidx, bitmap);
  		else
  			__clear_bit(bitidx + start_bitidx, bitmap);
  }
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5152
5153
5154
5155
5156
5157
  
  /*
   * This is designed as sub function...plz see page_isolation.c also.
   * set/clear page block's type to be ISOLATE.
   * page allocater never alloc memory from ISOLATE block.
   */
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
  static int
  __count_immobile_pages(struct zone *zone, struct page *page, int count)
  {
  	unsigned long pfn, iter, found;
  	/*
  	 * For avoiding noise data, lru_add_drain_all() should be called
  	 * If ZONE_MOVABLE, the zone never contains immobile pages
  	 */
  	if (zone_idx(zone) == ZONE_MOVABLE)
  		return true;
  
  	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
  		return true;
  
  	pfn = page_to_pfn(page);
  	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
  		unsigned long check = pfn + iter;
29723fccc   Namhyung Kim   mm: fix dubious c...
5175
  		if (!pfn_valid_within(check))
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5176
  			continue;
29723fccc   Namhyung Kim   mm: fix dubious c...
5177

49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
  		page = pfn_to_page(check);
  		if (!page_count(page)) {
  			if (PageBuddy(page))
  				iter += (1 << page_order(page)) - 1;
  			continue;
  		}
  		if (!PageLRU(page))
  			found++;
  		/*
  		 * If there are RECLAIMABLE pages, we need to check it.
  		 * But now, memory offline itself doesn't call shrink_slab()
  		 * and it still to be fixed.
  		 */
  		/*
  		 * If the page is not RAM, page_count()should be 0.
  		 * we don't need more check. This is an _used_ not-movable page.
  		 *
  		 * The problematic thing here is PG_reserved pages. PG_reserved
  		 * is set to both of a memory hole page and a _used_ kernel
  		 * page at boot.
  		 */
  		if (found > count)
  			return false;
  	}
  	return true;
  }
  
  bool is_pageblock_removable_nolock(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	return __count_immobile_pages(zone, page, 0);
  }
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5210
5211
5212
  int set_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5213
  	unsigned long flags, pfn;
925cc71e5   Robert Jennings   mm: Add notifier ...
5214
5215
  	struct memory_isolate_notify arg;
  	int notifier_ret;
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5216
5217
5218
  	int ret = -EBUSY;
  
  	zone = page_zone(page);
925cc71e5   Robert Jennings   mm: Add notifier ...
5219

a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5220
  	spin_lock_irqsave(&zone->lock, flags);
925cc71e5   Robert Jennings   mm: Add notifier ...
5221
5222
5223
5224
5225
  
  	pfn = page_to_pfn(page);
  	arg.start_pfn = pfn;
  	arg.nr_pages = pageblock_nr_pages;
  	arg.pages_found = 0;
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5226
  	/*
925cc71e5   Robert Jennings   mm: Add notifier ...
5227
5228
5229
5230
5231
5232
5233
5234
5235
  	 * It may be possible to isolate a pageblock even if the
  	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
  	 * notifier chain is used by balloon drivers to return the
  	 * number of pages in a range that are held by the balloon
  	 * driver to shrink memory. If all the pages are accounted for
  	 * by balloons, are free, or on the LRU, isolation can continue.
  	 * Later, for example, when memory hotplug notifier runs, these
  	 * pages reported as "can be isolated" should be isolated(freed)
  	 * by the balloon driver through the memory notifier chain.
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5236
  	 */
925cc71e5   Robert Jennings   mm: Add notifier ...
5237
5238
  	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
  	notifier_ret = notifier_to_errno(notifier_ret);
4b20477f5   KAMEZAWA Hiroyuki   memory hotplug: f...
5239
  	if (notifier_ret)
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5240
  		goto out;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5241
5242
5243
5244
5245
  	/*
  	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
  	 * We just check MOVABLE pages.
  	 */
  	if (__count_immobile_pages(zone, page, arg.pages_found))
925cc71e5   Robert Jennings   mm: Add notifier ...
5246
  		ret = 0;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5247
5248
5249
5250
  	/*
  	 * immobile means "not-on-lru" paes. If immobile is larger than
  	 * removable-by-driver pages reported by notifier, we'll fail.
  	 */
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5251
  out:
925cc71e5   Robert Jennings   mm: Add notifier ...
5252
5253
5254
5255
  	if (!ret) {
  		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
  		move_freepages_block(zone, page, MIGRATE_ISOLATE);
  	}
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5256
5257
  	spin_unlock_irqrestore(&zone->lock, flags);
  	if (!ret)
9f8f21725   Christoph Lameter   Page allocator: c...
5258
  		drain_all_pages();
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
  	return ret;
  }
  
  void unset_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
  	unsigned long flags;
  	zone = page_zone(page);
  	spin_lock_irqsave(&zone->lock, flags);
  	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
  		goto out;
  	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	move_freepages_block(zone, page, MIGRATE_MOVABLE);
  out:
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
  /*
   * All pages in the range must be isolated before calling this.
   */
  void
  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  {
  	struct page *page;
  	struct zone *zone;
  	int order, i;
  	unsigned long pfn;
  	unsigned long flags;
  	/* find the first valid pfn */
  	for (pfn = start_pfn; pfn < end_pfn; pfn++)
  		if (pfn_valid(pfn))
  			break;
  	if (pfn == end_pfn)
  		return;
  	zone = page_zone(pfn_to_page(pfn));
  	spin_lock_irqsave(&zone->lock, flags);
  	pfn = start_pfn;
  	while (pfn < end_pfn) {
  		if (!pfn_valid(pfn)) {
  			pfn++;
  			continue;
  		}
  		page = pfn_to_page(pfn);
  		BUG_ON(page_count(page));
  		BUG_ON(!PageBuddy(page));
  		order = page_order(page);
  #ifdef CONFIG_DEBUG_VM
  		printk(KERN_INFO "remove from free list %lx %d %lx
  ",
  		       pfn, 1 << order, end_pfn);
  #endif
  		list_del(&page->lru);
  		rmv_page_order(page);
  		zone->free_area[order].nr_free--;
  		__mod_zone_page_state(zone, NR_FREE_PAGES,
  				      - (1UL << order));
  		for (i = 0; i < (1 << order); i++)
  			SetPageReserved((page+i));
  		pfn += (1 << order);
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  #endif
8d22ba1b7   Wu Fengguang   HWPOISON: detect ...
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
  
  #ifdef CONFIG_MEMORY_FAILURE
  bool is_free_buddy_page(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	unsigned long pfn = page_to_pfn(page);
  	unsigned long flags;
  	int order;
  
  	spin_lock_irqsave(&zone->lock, flags);
  	for (order = 0; order < MAX_ORDER; order++) {
  		struct page *page_head = page - (pfn & ((1 << order) - 1));
  
  		if (PageBuddy(page_head) && page_order(page_head) >= order)
  			break;
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  
  	return order < MAX_ORDER;
  }
  #endif
718a38211   Wu Fengguang   mm: introduce dum...
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
  
  static struct trace_print_flags pageflag_names[] = {
  	{1UL << PG_locked,		"locked"	},
  	{1UL << PG_error,		"error"		},
  	{1UL << PG_referenced,		"referenced"	},
  	{1UL << PG_uptodate,		"uptodate"	},
  	{1UL << PG_dirty,		"dirty"		},
  	{1UL << PG_lru,			"lru"		},
  	{1UL << PG_active,		"active"	},
  	{1UL << PG_slab,		"slab"		},
  	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
  	{1UL << PG_arch_1,		"arch_1"	},
  	{1UL << PG_reserved,		"reserved"	},
  	{1UL << PG_private,		"private"	},
  	{1UL << PG_private_2,		"private_2"	},
  	{1UL << PG_writeback,		"writeback"	},
  #ifdef CONFIG_PAGEFLAGS_EXTENDED
  	{1UL << PG_head,		"head"		},
  	{1UL << PG_tail,		"tail"		},
  #else
  	{1UL << PG_compound,		"compound"	},
  #endif
  	{1UL << PG_swapcache,		"swapcache"	},
  	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
  	{1UL << PG_reclaim,		"reclaim"	},
718a38211   Wu Fengguang   mm: introduce dum...
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
  	{1UL << PG_swapbacked,		"swapbacked"	},
  	{1UL << PG_unevictable,		"unevictable"	},
  #ifdef CONFIG_MMU
  	{1UL << PG_mlocked,		"mlocked"	},
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
  	{1UL << PG_uncached,		"uncached"	},
  #endif
  #ifdef CONFIG_MEMORY_FAILURE
  	{1UL << PG_hwpoison,		"hwpoison"	},
  #endif
  	{-1UL,				NULL		},
  };
  
  static void dump_page_flags(unsigned long flags)
  {
  	const char *delim = "";
  	unsigned long mask;
  	int i;
  
  	printk(KERN_ALERT "page flags: %#lx(", flags);
  
  	/* remove zone id */
  	flags &= (1UL << NR_PAGEFLAGS) - 1;
  
  	for (i = 0; pageflag_names[i].name && flags; i++) {
  
  		mask = pageflag_names[i].mask;
  		if ((flags & mask) != mask)
  			continue;
  
  		flags &= ~mask;
  		printk("%s%s", delim, pageflag_names[i].name);
  		delim = "|";
  	}
  
  	/* check for left over flags */
  	if (flags)
  		printk("%s%#lx", delim, flags);
  
  	printk(")
  ");
  }
  
  void dump_page(struct page *page)
  {
  	printk(KERN_ALERT
  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx
  ",
4e9f64c42   Andrea Arcangeli   thp: fix bad_page...
5418
  		page, atomic_read(&page->_count), page_mapcount(page),
718a38211   Wu Fengguang   mm: introduce dum...
5419
5420
  		page->mapping, page->index);
  	dump_page_flags(page->flags);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
5421
  	mem_cgroup_print_bad_page(page);
718a38211   Wu Fengguang   mm: introduce dum...
5422
  }