mm/page_alloc.c

  /*
   *  linux/mm/page_alloc.c
   *
   *  Manages the free list, the system allocates free pages here.
   *  Note that kmalloc() lives in slab.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
  #include <linux/stddef.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/interrupt.h>
  #include <linux/pagemap.h>
  #include <linux/jiffies.h>
  #include <linux/bootmem.h>
  #include <linux/compiler.h>
  #include <linux/kernel.h>
  #include <linux/kmemcheck.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
  #include <linux/oom.h>
  #include <linux/notifier.h>
  #include <linux/topology.h>
  #include <linux/sysctl.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
  #include <linux/mempolicy.h>
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
  #include <linux/pfn.h>
  #include <linux/backing-dev.h>
  #include <linux/fault-inject.h>
  #include <linux/page-isolation.h>
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
  #include <linux/memory.h>
  #include <linux/compaction.h>
  #include <trace/events/kmem.h>
  #include <linux/ftrace_event.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  #include "internal.h"
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
  EXPORT_PER_CPU_SYMBOL(numa_node);
  #endif
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
   * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
   * defined in <linux/topology.h>.
   */
  DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
  EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  #endif
  /*
   * Array of node states.
   */
  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
  	[N_POSSIBLE] = NODE_MASK_ALL,
  	[N_ONLINE] = { { [0] = 1UL } },
  #ifndef CONFIG_NUMA
  	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
  #ifdef CONFIG_HIGHMEM
  	[N_HIGH_MEMORY] = { { [0] = 1UL } },
  #endif
  	[N_CPU] = { { [0] = 1UL } },
  #endif	/* NUMA */
  };
  EXPORT_SYMBOL(node_states);
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

  #ifdef CONFIG_PM_SLEEP
  /*
   * The following functions are used by the suspend/hibernate code to temporarily
   * change gfp_allowed_mask in order to avoid using I/O during memory allocations
   * while devices are suspended.  To avoid races with the suspend/hibernate code,
   * they should always be called with pm_mutex held (gfp_allowed_mask also should
   * only be modified with pm_mutex held, unless the suspend/hibernate code is
   * guaranteed not to run in parallel with that modification).
   */
  void set_gfp_allowed_mask(gfp_t mask)
  {
  	WARN_ON(!mutex_is_locked(&pm_mutex));
  	gfp_allowed_mask = mask;
  }
  
  gfp_t clear_gfp_allowed_mask(gfp_t mask)
  {
  	gfp_t ret = gfp_allowed_mask;
  
  	WARN_ON(!mutex_is_locked(&pm_mutex));
  	gfp_allowed_mask &= ~mask;
  	return ret;
  }
  #endif /* CONFIG_PM_SLEEP */
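
  /*
   * Typical usage sketch (illustrative; assumes the GFP_IOFS mask, i.e.
   * __GFP_IO | __GFP_FS, from gfp.h): a suspend path is expected to bracket
   * device freeze roughly like
   *
   *	gfp_t saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
   *	... suspend devices; allocations cannot use I/O or the filesystem ...
   *	set_gfp_allowed_mask(saved_mask);
   *
   * with pm_mutex held, so no page allocation can issue I/O while devices
   * are quiesced.
   */
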
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  int pageblock_order __read_mostly;
  #endif
  static void __free_pages_ok(struct page *page, unsigned int order);

  /*
   * results with 256, 32 in the lowmem_reserve sysctl:
   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
   *	1G machine -> (16M dma, 784M normal, 224M high)
   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
   *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
   *
   * TBD: should special case ZONE_DMA32 machines here - in those we normally
   * don't need any ZONE_NORMAL reservation
   */
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  #ifdef CONFIG_ZONE_DMA
  	 256,
  #endif
  #ifdef CONFIG_ZONE_DMA32
  	 256,
  #endif
  #ifdef CONFIG_HIGHMEM
  	 32,
  #endif
  	 32,
  };
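
  /*
   * Worked example of the ratios above (illustrative only): on the 1G machine
   * from the comment, a ZONE_NORMAL allocation leaves 784M/256 ~= 3M of
   * ZONE_DMA untouched, and a ZONE_HIGHMEM allocation leaves 224M/32 = 7M of
   * ZONE_NORMAL plus (224M+784M)/256 ~= 4M of ZONE_DMA in reserve; see
   * setup_per_zone_lowmem_reserve() for where the ratios are applied.
   */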
  
  EXPORT_SYMBOL(totalram_pages);

  static char * const zone_names[MAX_NR_ZONES] = {
  #ifdef CONFIG_ZONE_DMA
  	 "DMA",
  #endif
  #ifdef CONFIG_ZONE_DMA32
  	 "DMA32",
  #endif
  	 "Normal",
  #ifdef CONFIG_HIGHMEM
  	 "HighMem",
  #endif
  	 "Movable",
  };
  int min_free_kbytes = 1024;
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
  static unsigned long __meminitdata dma_reserve;

  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
    /*
     * MAX_ACTIVE_REGIONS determines the maximum number of distinct
     * ranges of memory (RAM) that may be registered with add_active_range().
     * Ranges passed to add_active_range() will be merged if possible
     * so the number of times add_active_range() can be called is
     * related to the number of nodes and the number of holes
     */
    #ifdef CONFIG_MAX_ACTIVE_REGIONS
      /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
      #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
    #else
      #if MAX_NUMNODES >= 32
        /* If there can be many nodes, allow up to 50 holes per node */
        #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
      #else
        /* By default, allow up to 256 distinct regions */
        #define MAX_ACTIVE_REGIONS 256
      #endif
    #endif
    static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
    static int __meminitdata nr_nodemap_entries;
    static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __initdata required_kernelcore;
    static unsigned long __initdata required_movablecore;
    static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
  
    /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
    int movable_zone;
    EXPORT_SYMBOL(movable_zone);
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  #if MAX_NUMNODES > 1
  int nr_node_ids __read_mostly = MAX_NUMNODES;
  int nr_online_nodes __read_mostly = 1;
  EXPORT_SYMBOL(nr_node_ids);
  EXPORT_SYMBOL(nr_online_nodes);
  #endif
  int page_group_by_mobility_disabled __read_mostly;
  static void set_pageblock_migratetype(struct page *page, int migratetype)
  {
  
  	if (unlikely(page_group_by_mobility_disabled))
  		migratetype = MIGRATE_UNMOVABLE;
  	set_pageblock_flags_group(page, (unsigned long)migratetype,
  					PB_migrate, PB_migrate_end);
  }
  bool oom_killer_disabled __read_mostly;
  #ifdef CONFIG_DEBUG_VM
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
  {
  	int ret = 0;
  	unsigned seq;
  	unsigned long pfn = page_to_pfn(page);

  	do {
  		seq = zone_span_seqbegin(zone);
  		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
  			ret = 1;
  		else if (pfn < zone->zone_start_pfn)
  			ret = 1;
  	} while (zone_span_seqretry(zone, seq));
  
  	return ret;
  }
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
  	if (!pfn_valid_within(page_to_pfn(page)))
  		return 0;
  	if (zone != page_zone(page))
  		return 0;
  
  	return 1;
  }
  /*
   * Temporary debugging check for pages not lying within a given zone.
   */
  static int bad_range(struct zone *zone, struct page *page)
  {
  	if (page_outside_zone_boundaries(zone, page))
  		return 1;
  	if (!page_is_consistent(zone, page))
  		return 1;
  	return 0;
  }
  #else
  static inline int bad_range(struct zone *zone, struct page *page)
  {
  	return 0;
  }
  #endif
  static void bad_page(struct page *page)
  {
  	static unsigned long resume;
  	static unsigned long nr_shown;
  	static unsigned long nr_unshown;
  	/* Don't complain about poisoned pages */
  	if (PageHWPoison(page)) {
  		__ClearPageBuddy(page);
  		return;
  	}
  	/*
  	 * Allow a burst of 60 reports, then keep quiet for that minute;
  	 * or allow a steady drip of one report per second.
  	 */
  	if (nr_shown == 60) {
  		if (time_before(jiffies, resume)) {
  			nr_unshown++;
  			goto out;
  		}
  		if (nr_unshown) {
  			printk(KERN_ALERT
  			      "BUG: Bad page state: %lu messages suppressed\n",
  				nr_unshown);
  			nr_unshown = 0;
  		}
  		nr_shown = 0;
  	}
  	if (nr_shown++ == 0)
  		resume = jiffies + 60 * HZ;
  	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
  		current->comm, page_to_pfn(page));
  	dump_page(page);

  	dump_stack();
  out:
  	/* Leave bad fields for debug, except PageBuddy could make trouble */
  	__ClearPageBuddy(page);
  	add_taint(TAINT_BAD_PAGE);
  }
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
   * The first PAGE_SIZE page is called the "head page".
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
   * All pages have PG_compound set.  All pages have their ->private pointing at
   * the head page (even the head page has this).
   *
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
   */
  
  static void free_compound_page(struct page *page)
  {
  	__free_pages_ok(page, compound_order(page));
  }
  void prep_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  
  	set_compound_page_dtor(page, free_compound_page);
  	set_compound_order(page, order);
  	__SetPageHead(page);
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
  
  		__SetPageTail(p);
  		p->first_page = page;
  	}
  }
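
  /*
   * Illustrative layout derived from the code above: an order-2 compound
   * page is the head page (marked with __SetPageHead(), compound_order()
   * == 2) followed by three tail pages, each marked with __SetPageTail()
   * and with ->first_page pointing back at the head.
   */
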
  static int destroy_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  	int bad = 0;

  	if (unlikely(compound_order(page) != order) ||
  	    unlikely(!PageHead(page))) {
  		bad_page(page);
  		bad++;
  	}

  	__ClearPageHead(page);

  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;

  		if (unlikely(!PageTail(p) || (p->first_page != page))) {
  			bad_page(page);
  			bad++;
  		}
  		__ClearPageTail(p);
  	}
  
  	return bad;
  }

  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  	/*
  	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
  	 * and __GFP_HIGHMEM from hard or soft interrupt context.
  	 */
  	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
  	for (i = 0; i < (1 << order); i++)
  		clear_highpage(page + i);
  }
  static inline void set_page_order(struct page *page, int order)
  {
  	set_page_private(page, order);
  	__SetPageBuddy(page);
  }
  
  static inline void rmv_page_order(struct page *page)
  {
  	__ClearPageBuddy(page);
  	set_page_private(page, 0);
  }
  
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
   *
   * 1) Any buddy B1 will have an order O twin B2 which satisfies
   * the following equation:
   *     B2 = B1 ^ (1 << O)
   * For example, if the starting buddy (buddy2) is #8 its order
   * 1 buddy is #10:
   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
   *
   * 2) Any buddy B will have an order O+1 parent P which
   * satisfies the following equation:
   *     P = B & ~(1 << O)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
  static inline struct page *
  __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
  {
  	unsigned long buddy_idx = page_idx ^ (1 << order);
  
  	return page + (buddy_idx - page_idx);
  }
  
  static inline unsigned long
  __find_combined_index(unsigned long page_idx, unsigned int order)
  {
  	return (page_idx & ~(1 << order));
  }
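
  /*
   * Worked example for the two helpers above (illustrative): with page_idx 8
   * and order 1, buddy_idx = 8 ^ (1 << 1) = 10, so __page_find_buddy()
   * returns page + 2, and __find_combined_index() gives 8 & ~(1 << 1) = 8,
   * the start of the merged order-2 block covering indexes 8-11.
   */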
  
  /*
   * This function checks whether a page is free && is the buddy
   * we can coalesce a page and its buddy if
   * (a) the buddy is not in a hole &&
   * (b) the buddy is in the buddy system &&
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
   * For recording whether a page is in the buddy system, we use PG_buddy.
   * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
  static inline int page_is_buddy(struct page *page, struct page *buddy,
  								int order)
  {
  	if (!pfn_valid_within(page_to_pfn(buddy)))
  		return 0;

  	if (page_zone_id(page) != page_zone_id(buddy))
  		return 0;
  
  	if (PageBuddy(buddy) && page_order(buddy) == order) {
  		VM_BUG_ON(page_count(buddy) != 0);
  		return 1;
  	}
  	return 0;
  }
  
  /*
   * Freeing function for a buddy system allocator.
   *
   * The concept of a buddy system is to maintain direct-mapped table
   * (containing bit values) for memory blocks of various "orders".
   * The bottom level table contains the map for the smallest allocatable
   * units of memory (here, pages), and each level above it describes
   * pairs of units from the levels below, hence, "buddies".
   * At a high level, all that happens here is marking the table entry
   * at the bottom level available, and propagating the changes upward
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
   * free pages of length of (1 << order) and marked with PG_buddy. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
   * free, the remainder of the region must be split into blocks.   
   * If a block is freed, and its buddy is also free, then this
   * triggers coalescing into a block of larger size.            
   *
   * -- wli
   */
  static inline void __free_one_page(struct page *page,
  		struct zone *zone, unsigned int order,
  		int migratetype)
  {
  	unsigned long page_idx;
  	unsigned long combined_idx;
  	struct page *buddy;

  	if (unlikely(PageCompound(page)))
  		if (unlikely(destroy_compound_page(page, order)))
  			return;

  	VM_BUG_ON(migratetype == -1);
  	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
  	VM_BUG_ON(page_idx & ((1 << order) - 1));
  	VM_BUG_ON(bad_range(zone, page));

  	while (order < MAX_ORDER-1) {
  		buddy = __page_find_buddy(page, page_idx, order);
  		if (!page_is_buddy(page, buddy, order))
  			break;

  		/* Our buddy is free, merge with it and move up one order. */
  		list_del(&buddy->lru);
  		zone->free_area[order].nr_free--;
  		rmv_page_order(buddy);
  		combined_idx = __find_combined_index(page_idx, order);
  		page = page + (combined_idx - page_idx);
  		page_idx = combined_idx;
  		order++;
  	}
  	set_page_order(page, order);
  
  	/*
  	 * If this is not the largest possible page, check if the buddy
  	 * of the next-highest order is free. If it is, it's possible
  	 * that pages are being freed that will coalesce soon. In case
  	 * that is happening, add the free page to the tail of the list
  	 * so it's less likely to be used soon and more likely to be merged
  	 * as a higher order page
  	 */
  	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
  		struct page *higher_page, *higher_buddy;
  		combined_idx = __find_combined_index(page_idx, order);
  		higher_page = page + combined_idx - page_idx;
  		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
  		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
  			list_add_tail(&page->lru,
  				&zone->free_area[order].free_list[migratetype]);
  			goto out;
  		}
  	}
  
  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
  out:
  	zone->free_area[order].nr_free++;
  }
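
  /*
   * Illustrative walk-through of the merge loop above: freeing an order-0
   * page at index 8 while its buddy 9 is free merges them into an order-1
   * block at 8; if 10-11 already form a free order-1 block, the loop merges
   * again into an order-2 block at 8 before set_page_order() records the
   * final order.
   */
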
  /*
   * free_page_mlock() -- clean up attempts to free and mlocked() page.
   * Page should not be on lru, so no need to fix that up.
   * free_pages_check() will verify...
   */
  static inline void free_page_mlock(struct page *page)
  {
  	__dec_zone_page_state(page, NR_MLOCK);
  	__count_vm_event(UNEVICTABLE_MLOCKFREED);
  }

  static inline int free_pages_check(struct page *page)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(atomic_read(&page->_count) != 0) |
  		(page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
  		bad_page(page);
  		return 1;
  	}
  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	return 0;
  }
  
  /*
   * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free.
   *
   * If the zone was previously in an "all pages pinned" state then look to
   * see if this freeing clears that state.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
  static void free_pcppages_bulk(struct zone *zone, int count,
  					struct per_cpu_pages *pcp)
  {
  	int migratetype = 0;
  	int batch_free = 0;

  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;

  	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
  	while (count) {
  		struct page *page;
  		struct list_head *list;
  
  		/*
  		 * Remove pages from lists in a round-robin fashion. A
  		 * batch_free count is maintained that is incremented when an
  		 * empty list is encountered.  This is so more pages are freed
  		 * off fuller lists instead of spinning excessively around empty
  		 * lists
  		 */
  		do {
  			batch_free++;
  			if (++migratetype == MIGRATE_PCPTYPES)
  				migratetype = 0;
  			list = &pcp->lists[migratetype];
  		} while (list_empty(list));

  		do {
  			page = list_entry(list->prev, struct page, lru);
  			/* must delete as __free_one_page list manipulates */
  			list_del(&page->lru);
  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  			__free_one_page(page, zone, 0, page_private(page));
  			trace_mm_page_pcpu_drain(page, 0, page_private(page));
  		} while (--count && --batch_free && !list_empty(list));
  	}
  	spin_unlock(&zone->lock);
  }
  static void free_one_page(struct zone *zone, struct page *page, int order,
  				int migratetype)
  {
  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;
  
  	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
  	__free_one_page(page, zone, order, migratetype);
  	spin_unlock(&zone->lock);
  }
  static bool free_pages_prepare(struct page *page, unsigned int order)
  {
  	int i;
  	int bad = 0;

  	trace_mm_page_free_direct(page, order);
  	kmemcheck_free_shadow(page, order);
  	for (i = 0; i < (1 << order); i++) {
  		struct page *pg = page + i;
  
  		if (PageAnon(pg))
  			pg->mapping = NULL;
  		bad += free_pages_check(pg);
  	}
  	if (bad)
  		return false;

  	if (!PageHighMem(page)) {
  		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
  		debug_check_no_obj_freed(page_address(page),
  					   PAGE_SIZE << order);
  	}
  	arch_free_page(page, order);
  	kernel_map_pages(page, 1 << order, 0);

  	return true;
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
  	unsigned long flags;
  	int wasMlocked = __TestClearPageMlocked(page);
  
  	if (!free_pages_prepare(page, order))
  		return;
  	local_irq_save(flags);
  	if (unlikely(wasMlocked))
  		free_page_mlock(page);
  	__count_vm_events(PGFREE, 1 << order);
  	free_one_page(page_zone(page), page, order,
  					get_pageblock_migratetype(page));
  	local_irq_restore(flags);
  }
  /*
   * permit the bootmem allocator to evade page validation on high-order frees
   */
  void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
  {
  	if (order == 0) {
  		__ClearPageReserved(page);
  		set_page_count(page, 0);
  		set_page_refcounted(page);
  		__free_page(page);
  	} else {
  		int loop;
  		prefetchw(page);
  		for (loop = 0; loop < BITS_PER_LONG; loop++) {
  			struct page *p = &page[loop];
  			if (loop + 1 < BITS_PER_LONG)
  				prefetchw(p + 1);
  			__ClearPageReserved(p);
  			set_page_count(p, 0);
  		}
  		set_page_refcounted(page);
  		__free_pages(page, order);
  	}
  }
  
  /*
   * The order of subdivision here is critical for the IO subsystem.
   * Please do not alter this order without good reasons and regression
   * testing. Specifically, as large blocks of memory are subdivided,
   * the order in which smaller blocks are delivered depends on the order
   * they're subdivided in this function. This is the primary factor
   * influencing the order in which pages are delivered to the IO
   * subsystem according to empirical testing, and this is also justified
   * by considering the behavior of a buddy system containing a single
   * large block of memory acted on by a series of small allocations.
   * This behavior is a critical factor in sglist merging's success.
   *
   * -- wli
   */
  static inline void expand(struct zone *zone, struct page *page,
  	int low, int high, struct free_area *area,
  	int migratetype)
  {
  	unsigned long size = 1 << high;
  
  	while (high > low) {
  		area--;
  		high--;
  		size >>= 1;
  		VM_BUG_ON(bad_range(zone, &page[size]));
  		list_add(&page[size].lru, &area->free_list[migratetype]);
  		area->nr_free++;
  		set_page_order(&page[size], high);
  	}
  }
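
  /*
   * Illustrative example of expand() above: satisfying an order-0 request
   * from an order-3 free block puts the upper halves back on the free lists
   * at offsets 4 (order 2), 2 (order 1) and 1 (order 0), leaving page[0]
   * for the caller.
   */
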
  /*
   * This page is about to be returned from the page allocator
   */
  static inline int check_new_page(struct page *page)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(atomic_read(&page->_count) != 0)  |
  		(page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
  		bad_page(page);
  		return 1;
  	}
  	return 0;
  }
  
  static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  
  	for (i = 0; i < (1 << order); i++) {
  		struct page *p = page + i;
  		if (unlikely(check_new_page(p)))
  			return 1;
  	}

  	set_page_private(page, 0);
  	set_page_refcounted(page);
  
  	arch_alloc_page(page, order);
  	kernel_map_pages(page, 1 << order, 1);
  
  	if (gfp_flags & __GFP_ZERO)
  		prep_zero_page(page, order, gfp_flags);
  
  	if (order && (gfp_flags & __GFP_COMP))
  		prep_compound_page(page, order);
  	return 0;
  }
  /*
   * Go through the free lists for the given migratetype and remove
   * the smallest available page from the freelists
   */
  static inline
  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  						int migratetype)
  {
  	unsigned int current_order;
  	struct free_area * area;
  	struct page *page;
  
  	/* Find a page of the appropriate size in the preferred list */
  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  		area = &(zone->free_area[current_order]);
  		if (list_empty(&area->free_list[migratetype]))
  			continue;
  
  		page = list_entry(area->free_list[migratetype].next,
  							struct page, lru);
  		list_del(&page->lru);
  		rmv_page_order(page);
  		area->nr_free--;
  		expand(zone, page, order, current_order, area, migratetype);
  		return page;
  	}
  
  	return NULL;
  }
  /*
   * This array describes the order lists are fallen back to when
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
  	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
  	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
  };
  /*
   * Move the free pages in a range to the free lists of the requested type.
   * Note that start_page and end_page are not aligned on a pageblock
   * boundary. If alignment is required, use move_freepages_block()
   */
  static int move_freepages(struct zone *zone,
  			  struct page *start_page, struct page *end_page,
  			  int migratetype)
  {
  	struct page *page;
  	unsigned long order;
  	int pages_moved = 0;
  
  #ifndef CONFIG_HOLES_IN_ZONE
  	/*
  	 * page_zone is not safe to call in this context when
  	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
  	 * anyway as we check zone boundaries in move_freepages_block().
  	 * Remove at a later date when no bug reports exist related to
  	 * grouping pages by mobility
  	 */
  	BUG_ON(page_zone(start_page) != page_zone(end_page));
  #endif
  
  	for (page = start_page; page <= end_page;) {
  		/* Make sure we are not inadvertently changing nodes */
  		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
  		if (!pfn_valid_within(page_to_pfn(page))) {
  			page++;
  			continue;
  		}
  
  		if (!PageBuddy(page)) {
  			page++;
  			continue;
  		}
  
  		order = page_order(page);
  		list_del(&page->lru);
  		list_add(&page->lru,
  			&zone->free_area[order].free_list[migratetype]);
  		page += 1 << order;
  		pages_moved += 1 << order;
  	}
  	return pages_moved;
  }
  static int move_freepages_block(struct zone *zone, struct page *page,
  				int migratetype)
  {
  	unsigned long start_pfn, end_pfn;
  	struct page *start_page, *end_page;
  
  	start_pfn = page_to_pfn(page);
  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
  	start_page = pfn_to_page(start_pfn);
  	end_page = start_page + pageblock_nr_pages - 1;
  	end_pfn = start_pfn + pageblock_nr_pages - 1;
  
  	/* Do not cross zone boundaries */
  	if (start_pfn < zone->zone_start_pfn)
  		start_page = page;
  	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
  		return 0;
  
  	return move_freepages(zone, start_page, end_page, migratetype);
  }
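
  /*
   * Illustrative example (assuming pageblock_order == 9, i.e. 512-page, 2MB
   * pageblocks): a page at pfn 1000 is rounded down to start_pfn 512, so
   * move_freepages() walks pfns 512-1023 and requeues every free buddy it
   * finds onto the requested migratetype's free lists.
   */
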
  static void change_pageblock_range(struct page *pageblock_page,
  					int start_order, int migratetype)
  {
  	int nr_pageblocks = 1 << (start_order - pageblock_order);
  
  	while (nr_pageblocks--) {
  		set_pageblock_migratetype(pageblock_page, migratetype);
  		pageblock_page += pageblock_nr_pages;
  	}
  }
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
  {
  	struct free_area * area;
  	int current_order;
  	struct page *page;
  	int migratetype, i;
  
  	/* Find the largest possible block of pages in the other list */
  	for (current_order = MAX_ORDER-1; current_order >= order;
  						--current_order) {
  		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
  			migratetype = fallbacks[start_migratetype][i];
  			/* MIGRATE_RESERVE handled later if necessary */
  			if (migratetype == MIGRATE_RESERVE)
  				continue;

  			area = &(zone->free_area[current_order]);
  			if (list_empty(&area->free_list[migratetype]))
  				continue;
  
  			page = list_entry(area->free_list[migratetype].next,
  					struct page, lru);
  			area->nr_free--;
  
  			/*
  			 * If breaking a large block of pages, move all free
  			 * pages to the preferred allocation list. If falling
  			 * back for a reclaimable kernel allocation, be more
  			 * aggressive about taking ownership of free pages
  			 */
  			if (unlikely(current_order >= (pageblock_order >> 1)) ||
  					start_migratetype == MIGRATE_RECLAIMABLE ||
  					page_group_by_mobility_disabled) {
  				unsigned long pages;
  				pages = move_freepages_block(zone, page,
  								start_migratetype);
  
  				/* Claim the whole block if over half of it is free */
  				if (pages >= (1 << (pageblock_order-1)) ||
  						page_group_by_mobility_disabled)
  					set_pageblock_migratetype(page,
  								start_migratetype);
  				migratetype = start_migratetype;
  			}
  
  			/* Remove the page from the freelists */
  			list_del(&page->lru);
  			rmv_page_order(page);

  			/* Take ownership for orders >= pageblock_order */
  			if (current_order >= pageblock_order)
  				change_pageblock_range(page, current_order,
  							start_migratetype);
  
  			expand(zone, page, order, current_order, area, migratetype);
  
  			trace_mm_page_alloc_extfrag(page, order, current_order,
  				start_migratetype, migratetype);
  			return page;
  		}
  	}
  	return NULL;
  }
  /*
   * Do the hard work of removing an element from the buddy allocator.
   * Call me with the zone->lock already held.
   */
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
  						int migratetype)
  {
  	struct page *page;
  retry_reserve:
  	page = __rmqueue_smallest(zone, order, migratetype);

  	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
  		page = __rmqueue_fallback(zone, order, migratetype);

  		/*
  		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
  		 * is used because __rmqueue_smallest is an inline function
  		 * and we want just one call site
  		 */
  		if (!page) {
  			migratetype = MIGRATE_RESERVE;
  			goto retry_reserve;
  		}
  	}
  	trace_mm_page_alloc_zone_locked(page, order, migratetype);
  	return page;
  }
  
  /* 
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
   * Returns the number of new pages which were placed at *list.
   */
  static int rmqueue_bulk(struct zone *zone, unsigned int order, 
  			unsigned long count, struct list_head *list,
  			int migratetype, int cold)
  {
  	int i;
  	
  	spin_lock(&zone->lock);
  	for (i = 0; i < count; ++i) {
  		struct page *page = __rmqueue(zone, order, migratetype);
  		if (unlikely(page == NULL))
  			break;
  
  		/*
  		 * Split buddy pages returned by expand() are received here
  		 * in physical page order. The page is added to the caller's
  		 * list and the list head then moves forward. From the caller's
  		 * perspective, the linked list is ordered by page number in
  		 * some conditions. This is useful for IO devices that can
  		 * merge IO requests if the physical pages are ordered
  		 * properly.
  		 */
  		if (likely(cold == 0))
  			list_add(&page->lru, list);
  		else
  			list_add_tail(&page->lru, list);
  		set_page_private(page, migratetype);
  		list = &page->lru;
  	}
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
  	spin_unlock(&zone->lock);
  	return i;
  }
  #ifdef CONFIG_NUMA
  /*
   * Called from the vmstat counter updater to drain pagesets of this
   * currently executing processor on remote nodes after they have
   * expired.
   *
   * Note that this function must be called with the thread pinned to
   * a single processor.
   */
  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  {
  	unsigned long flags;
  	int to_drain;

  	local_irq_save(flags);
  	if (pcp->count >= pcp->batch)
  		to_drain = pcp->batch;
  	else
  		to_drain = pcp->count;
  	free_pcppages_bulk(zone, to_drain, pcp);
  	pcp->count -= to_drain;
  	local_irq_restore(flags);
  }
  #endif
  /*
   * Drain pages of the indicated processor.
   *
   * The processor must either be the current processor and the
   * thread pinned to the current processor or a processor that
   * is not online.
   */
  static void drain_pages(unsigned int cpu)
  {
  	unsigned long flags;
  	struct zone *zone;

  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1027
  		struct per_cpu_pageset *pset;
3dfa5721f   Christoph Lameter   Page allocator: g...
1028
  		struct per_cpu_pages *pcp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1029

99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1030
1031
  		local_irq_save(flags);
  		pset = per_cpu_ptr(zone->pageset, cpu);
3dfa5721f   Christoph Lameter   Page allocator: g...
1032
1033
  
  		pcp = &pset->pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
1034
  		free_pcppages_bulk(zone, pcp->count, pcp);
3dfa5721f   Christoph Lameter   Page allocator: g...
1035
1036
  		pcp->count = 0;
  		local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1037
1038
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1039

9f8f21725   Christoph Lameter   Page allocator: c...
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
  /*
   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
   */
  void drain_local_pages(void *arg)
  {
  	drain_pages(smp_processor_id());
  }
  
  /*
   * Spill all the per-cpu pages from all CPUs back into the buddy allocator
   */
  void drain_all_pages(void)
  {
15c8b6c1a   Jens Axboe   on_each_cpu(): ki...
1053
  	on_each_cpu(drain_local_pages, NULL, 1);
9f8f21725   Christoph Lameter   Page allocator: c...
1054
  }
296699de6   Rafael J. Wysocki   Introduce CONFIG_...
1055
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1056
1057
1058
  
  void mark_free_pages(struct zone *zone)
  {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1059
1060
  	unsigned long pfn, max_zone_pfn;
  	unsigned long flags;
b2a0ac887   Mel Gorman   Split the free li...
1061
  	int order, t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1062
1063
1064
1065
1066
1067
  	struct list_head *curr;
  
  	if (!zone->spanned_pages)
  		return;
  
  	spin_lock_irqsave(&zone->lock, flags);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1068
1069
1070
1071
1072
  
  	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
  		if (pfn_valid(pfn)) {
  			struct page *page = pfn_to_page(pfn);
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1073
1074
  			if (!swsusp_page_is_forbidden(page))
  				swsusp_unset_page_free(page);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1075
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1076

b2a0ac887   Mel Gorman   Split the free li...
1077
1078
  	for_each_migratetype_order(order, t) {
  		list_for_each(curr, &zone->free_area[order].free_list[t]) {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1079
  			unsigned long i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1080

f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1081
1082
  			pfn = page_to_pfn(list_entry(curr, struct page, lru));
  			for (i = 0; i < (1UL << order); i++)
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1083
  				swsusp_set_page_free(pfn_to_page(pfn + i));
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1084
  		}
b2a0ac887   Mel Gorman   Split the free li...
1085
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1086
1087
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
e2c55dc87   Mel Gorman   Drain per-cpu lis...
1088
  #endif /* CONFIG_HIBERNATION */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1089
1090
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1091
   * Free a 0-order page
fc91668ea   Li Hong   mm: remove free_h...
1092
   * cold == 1 ? free a cold page : free a hot page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1093
   */
fc91668ea   Li Hong   mm: remove free_h...
1094
  void free_hot_cold_page(struct page *page, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1095
1096
1097
1098
  {
  	struct zone *zone = page_zone(page);
  	struct per_cpu_pages *pcp;
  	unsigned long flags;
5f8dcc212   Mel Gorman   page-allocator: s...
1099
  	int migratetype;
451ea25da   Johannes Weiner   mm: perform non-a...
1100
  	int wasMlocked = __TestClearPageMlocked(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1101

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
1102
  	if (!free_pages_prepare(page, 0))
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1103
  		return;
5f8dcc212   Mel Gorman   page-allocator: s...
1104
1105
  	migratetype = get_pageblock_migratetype(page);
  	set_page_private(page, migratetype);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1106
  	local_irq_save(flags);
c277331d5   Johannes Weiner   mm: page_alloc: c...
1107
  	if (unlikely(wasMlocked))
da456f14d   Mel Gorman   page allocator: d...
1108
  		free_page_mlock(page);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1109
  	__count_vm_event(PGFREE);
da456f14d   Mel Gorman   page allocator: d...
1110

5f8dcc212   Mel Gorman   page-allocator: s...
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
  	/*
  	 * We only track unmovable, reclaimable and movable on pcp lists.
  	 * Free ISOLATE pages back to the allocator because they are being
  	 * offlined but treat RESERVE as movable pages so we can get those
  	 * areas back if necessary. Otherwise, we may have to free
  	 * excessively into the page allocator
  	 */
  	if (migratetype >= MIGRATE_PCPTYPES) {
  		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
  			free_one_page(zone, page, 0, migratetype);
  			goto out;
  		}
  		migratetype = MIGRATE_MOVABLE;
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1125
  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
1126
  	if (cold)
5f8dcc212   Mel Gorman   page-allocator: s...
1127
  		list_add_tail(&page->lru, &pcp->lists[migratetype]);
3dfa5721f   Christoph Lameter   Page allocator: g...
1128
  	else
5f8dcc212   Mel Gorman   page-allocator: s...
1129
  		list_add(&page->lru, &pcp->lists[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1130
  	pcp->count++;
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1131
  	if (pcp->count >= pcp->high) {
5f8dcc212   Mel Gorman   page-allocator: s...
1132
  		free_pcppages_bulk(zone, pcp->batch, pcp);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1133
1134
  		pcp->count -= pcp->batch;
  	}
5f8dcc212   Mel Gorman   page-allocator: s...
1135
1136
  
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1137
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1138
  }
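  
  /*
   * Illustrative sketch, not from the kernel tree: how a caller of
   * free_hot_cold_page() above chooses the 'cold' argument. The helper
   * names are made up; struct page and the prototype come from
   * <linux/mm.h> and <linux/gfp.h>, which this file already includes.
   */
  static inline void example_free_cache_hot(struct page *page)
  {
  	/* Head of the pcp list: handed out again while still cache-hot. */
  	free_hot_cold_page(page, 0);
  }
  
  static inline void example_free_cache_cold(struct page *page)
  {
  	/* Tail of the pcp list: reused last, e.g. data the CPU never touched. */
  	free_hot_cold_page(page, 1);
  }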
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
   * Each sub-page must be freed individually.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  void split_page(struct page *page, unsigned int order)
  {
  	int i;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1150
1151
  	VM_BUG_ON(PageCompound(page));
  	VM_BUG_ON(!page_count(page));
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1152
1153
1154
1155
1156
1157
1158
1159
1160
  
  #ifdef CONFIG_KMEMCHECK
  	/*
  	 * Split shadow pages too, because free(page[0]) would
  	 * otherwise free the whole shadow.
  	 */
  	if (kmemcheck_page_is_tracked(page))
  		split_page(virt_to_page(page[0].shadow), order);
  #endif
7835e98b2   Nick Piggin   [PATCH] remove se...
1161
1162
  	for (i = 1; i < (1 << order); i++)
  		set_page_refcounted(page + i);
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1163
  }
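  
  /*
   * Illustrative sketch, not from the kernel tree: the intended calling
   * pattern for split_page(), mirroring what alloc_pages_exact() does
   * further down in this file. The helper name is hypothetical.
   */
  static struct page *example_alloc_three_pages(gfp_t gfp_mask)
  {
  	/*
  	 * A non-compound order-2 block (gfp_mask without __GFP_COMP):
  	 * four physically contiguous pages.
  	 */
  	struct page *page = alloc_pages(gfp_mask, 2);
  
  	if (!page)
  		return NULL;
  
  	/* Give every sub-page its own reference count... */
  	split_page(page, 2);
  
  	/* ...so the unneeded fourth page can be freed on its own. */
  	__free_page(page + 3);
  
  	/* The caller later frees page, page + 1 and page + 2 individually. */
  	return page;
  }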
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1164

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1165
  /*
748446bb6   Mel Gorman   mm: compaction: m...
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
   * Similar to split_page except the page is already free. As this is only
   * being used for migration, the migratetype of the block also changes.
   * As this is called with interrupts disabled, the caller is responsible
   * for calling arch_alloc_page() and kernel_map_pages() after interrupts
   * are enabled.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  int split_free_page(struct page *page)
  {
  	unsigned int order;
  	unsigned long watermark;
  	struct zone *zone;
  
  	BUG_ON(!PageBuddy(page));
  
  	zone = page_zone(page);
  	order = page_order(page);
  
  	/* Obey watermarks as if the page was being allocated */
  	watermark = low_wmark_pages(zone) + (1 << order);
  	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
  		return 0;
  
  	/* Remove page from free list */
  	list_del(&page->lru);
  	zone->free_area[order].nr_free--;
  	rmv_page_order(page);
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
  
  	/* Split into individual pages */
  	set_page_refcounted(page);
  	split_page(page, order);
  
  	if (order >= pageblock_order - 1) {
  		struct page *endpage = page + (1 << order) - 1;
  		for (; page < endpage; page += pageblock_nr_pages)
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	}
  
  	return 1 << order;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1211
1212
1213
1214
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
   * or two.
   */
0a15c3e9f   Mel Gorman   page allocator: i...
1215
1216
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1217
1218
  			struct zone *zone, int order, gfp_t gfp_flags,
  			int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1219
1220
  {
  	unsigned long flags;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1221
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222
  	int cold = !!(gfp_flags & __GFP_COLD);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1223
  again:
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1224
  	if (likely(order == 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1225
  		struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
1226
  		struct list_head *list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1227

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1228
  		local_irq_save(flags);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1229
1230
  		pcp = &this_cpu_ptr(zone->pageset)->pcp;
  		list = &pcp->lists[migratetype];
5f8dcc212   Mel Gorman   page-allocator: s...
1231
  		if (list_empty(list)) {
535131e69   Mel Gorman   Choose pages from...
1232
  			pcp->count += rmqueue_bulk(zone, 0,
5f8dcc212   Mel Gorman   page-allocator: s...
1233
  					pcp->batch, list,
e084b2d95   Mel Gorman   page-allocator: p...
1234
  					migratetype, cold);
5f8dcc212   Mel Gorman   page-allocator: s...
1235
  			if (unlikely(list_empty(list)))
6fb332fab   Shaohua Li   memory hotplug: e...
1236
  				goto failed;
535131e69   Mel Gorman   Choose pages from...
1237
  		}
b92a6edd4   Mel Gorman   Add a configure o...
1238

5f8dcc212   Mel Gorman   page-allocator: s...
1239
1240
1241
1242
  		if (cold)
  			page = list_entry(list->prev, struct page, lru);
  		else
  			page = list_entry(list->next, struct page, lru);
b92a6edd4   Mel Gorman   Add a configure o...
1243
1244
  		list_del(&page->lru);
  		pcp->count--;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1245
  	} else {
dab48dab3   Andrew Morton   page-allocator: w...
1246
1247
1248
1249
1250
1251
1252
1253
  		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
  			/*
  			 * __GFP_NOFAIL is not to be used in new code.
  			 *
  			 * All __GFP_NOFAIL callers should be fixed so that they
  			 * properly detect and handle allocation failures.
  			 *
  			 * We most definitely don't want callers attempting to
4923abf9f   Linus Torvalds   Don't warn about ...
1254
  			 * allocate greater than order-1 page units with
dab48dab3   Andrew Morton   page-allocator: w...
1255
1256
  			 * __GFP_NOFAIL.
  			 */
4923abf9f   Linus Torvalds   Don't warn about ...
1257
  			WARN_ON_ONCE(order > 1);
dab48dab3   Andrew Morton   page-allocator: w...
1258
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1259
  		spin_lock_irqsave(&zone->lock, flags);
b2a0ac887   Mel Gorman   Split the free li...
1260
  		page = __rmqueue(zone, order, migratetype);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1261
1262
1263
  		spin_unlock(&zone->lock);
  		if (!page)
  			goto failed;
6ccf80eb1   KOSAKI Motohiro   page allocator: u...
1264
  		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1265
  	}
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1266
  	__count_zone_vm_events(PGALLOC, zone, 1 << order);
18ea7e710   Mel Gorman   mm: remember what...
1267
  	zone_statistics(preferred_zone, zone);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1268
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1269

725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1270
  	VM_BUG_ON(bad_range(zone, page));
17cf44064   Nick Piggin   [PATCH] mm: clean...
1271
  	if (prep_new_page(page, order, gfp_flags))
a74609faf   Nick Piggin   [PATCH] mm: page_...
1272
  		goto again;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1273
  	return page;
a74609faf   Nick Piggin   [PATCH] mm: page_...
1274
1275
1276
  
  failed:
  	local_irq_restore(flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1277
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1278
  }
418589663   Mel Gorman   page allocator: u...
1279
1280
1281
1282
1283
1284
1285
1286
  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
  #define ALLOC_WMARK_MIN		WMARK_MIN
  #define ALLOC_WMARK_LOW		WMARK_LOW
  #define ALLOC_WMARK_HIGH	WMARK_HIGH
  #define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
  
  /* Mask to get the watermark bits */
  #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
3148890bf   Nick Piggin   [PATCH] mm: __all...
1287
1288
1289
  #define ALLOC_HARDER		0x10 /* try to alloc harder */
  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
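  
  /*
   * Illustrative note, not from the kernel tree: the low bits of
   * alloc_flags double as an index into zone->watermark[], which is why
   * ALLOC_WMARK_MASK is derived from ALLOC_NO_WATERMARKS. In
   * get_page_from_freelist() below this shows up as
   *
   *	mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
   *
   * so ALLOC_WMARK_LOW picks low_wmark_pages(zone) and ALLOC_WMARK_HIGH
   * picks high_wmark_pages(zone).
   */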
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1290

933e312e7   Akinobu Mita   [PATCH] fault-inj...
1291
1292
1293
1294
1295
1296
1297
  #ifdef CONFIG_FAIL_PAGE_ALLOC
  
  static struct fail_page_alloc_attr {
  	struct fault_attr attr;
  
  	u32 ignore_gfp_highmem;
  	u32 ignore_gfp_wait;
54114994f   Akinobu Mita   fault-injection: ...
1298
  	u32 min_order;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1299
1300
1301
1302
1303
  
  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  
  	struct dentry *ignore_gfp_highmem_file;
  	struct dentry *ignore_gfp_wait_file;
54114994f   Akinobu Mita   fault-injection: ...
1304
  	struct dentry *min_order_file;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1305
1306
1307
1308
1309
  
  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  
  } fail_page_alloc = {
  	.attr = FAULT_ATTR_INITIALIZER,
6b1b60f41   Don Mullis   [PATCH] fault-inj...
1310
1311
  	.ignore_gfp_wait = 1,
  	.ignore_gfp_highmem = 1,
54114994f   Akinobu Mita   fault-injection: ...
1312
  	.min_order = 1,
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
  };
  
  static int __init setup_fail_page_alloc(char *str)
  {
  	return setup_fault_attr(&fail_page_alloc.attr, str);
  }
  __setup("fail_page_alloc=", setup_fail_page_alloc);
  
  static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
54114994f   Akinobu Mita   fault-injection: ...
1323
1324
  	if (order < fail_page_alloc.min_order)
  		return 0;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
  	if (gfp_mask & __GFP_NOFAIL)
  		return 0;
  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
  		return 0;
  	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
  		return 0;
  
  	return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  
  static int __init fail_page_alloc_debugfs(void)
  {
  	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
  	struct dentry *dir;
  	int err;
  
  	err = init_fault_attr_dentries(&fail_page_alloc.attr,
  				       "fail_page_alloc");
  	if (err)
  		return err;
  	dir = fail_page_alloc.attr.dentries.dir;
  
  	fail_page_alloc.ignore_gfp_wait_file =
  		debugfs_create_bool("ignore-gfp-wait", mode, dir,
  				      &fail_page_alloc.ignore_gfp_wait);
  
  	fail_page_alloc.ignore_gfp_highmem_file =
  		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
  				      &fail_page_alloc.ignore_gfp_highmem);
54114994f   Akinobu Mita   fault-injection: ...
1356
1357
1358
  	fail_page_alloc.min_order_file =
  		debugfs_create_u32("min-order", mode, dir,
  				   &fail_page_alloc.min_order);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1359
1360
  
  	if (!fail_page_alloc.ignore_gfp_wait_file ||
54114994f   Akinobu Mita   fault-injection: ...
1361
1362
              !fail_page_alloc.ignore_gfp_highmem_file ||
              !fail_page_alloc.min_order_file) {
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1363
1364
1365
  		err = -ENOMEM;
  		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
  		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
54114994f   Akinobu Mita   fault-injection: ...
1366
  		debugfs_remove(fail_page_alloc.min_order_file);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
  		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
  	}
  
  	return err;
  }
  
  late_initcall(fail_page_alloc_debugfs);
  
  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  
  #else /* CONFIG_FAIL_PAGE_ALLOC */
  
  static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
  	return 0;
  }
  
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
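  
  /*
   * Illustrative note, not from the kernel tree: the knobs created above
   * are normally driven from user space, assuming debugfs is mounted at
   * the conventional /sys/kernel/debug:
   *
   *	echo 0 > /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait
   *	echo 2 > /sys/kernel/debug/fail_page_alloc/min-order
   *
   * on top of the generic fault_attr settings registered by
   * init_fault_attr_dentries(), or via the "fail_page_alloc=" boot
   * parameter handled by setup_fail_page_alloc() above.
   */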
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1385
1386
1387
1388
1389
  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
   * of the allocation.
   */
  int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1390
  		      int classzone_idx, int alloc_flags)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1391
1392
  {
  	/* free_pages may go negative - that's OK */
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
1393
1394
  	long min = mark;
  	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1395
  	int o;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1396
  	if (alloc_flags & ALLOC_HIGH)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1397
  		min -= min / 2;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1398
  	if (alloc_flags & ALLOC_HARDER)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
  		min -= min / 4;
  
  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
  		return 0;
  	for (o = 0; o < order; o++) {
  		/* At the next order, this order's pages become unavailable */
  		free_pages -= z->free_area[o].nr_free << o;
  
  		/* Require fewer higher order pages to be free */
  		min >>= 1;
  
  		if (free_pages <= min)
  			return 0;
  	}
  	return 1;
  }
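  
  /*
   * Worked example, not from the kernel tree: the checks above for an
   * order-2 request with mark = 128, no ALLOC_HIGH/ALLOC_HARDER and a
   * zero lowmem_reserve:
   *
   *	free_pages = NR_FREE_PAGES - (1 << 2) + 1
   *	initial check:  free_pages must exceed 128
   *	o = 0:  free_pages -= nr_free[0] << 0, min becomes 64,
   *	        and free_pages must still exceed 64
   *	o = 1:  free_pages -= nr_free[1] << 1, min becomes 32,
   *	        and free_pages must still exceed 32
   *
   * Order-0 and order-1 blocks cannot satisfy an order-2 request, so
   * they are discounted while the required reserve halves at each step.
   */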
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1415
1416
1417
1418
1419
1420
  #ifdef CONFIG_NUMA
  /*
   * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
   * skip over zones that are not allowed by the cpuset, or that have
   * been recently (in last second) found to be nearly full.  See further
   * comments in mmzone.h.  Reduces cache footprint of zonelist scans
183ff22bb   Simon Arlott   spelling fixes: mm/
1421
   * that have to skip over a lot of full or unallowed zones.
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1422
1423
1424
   *
   * If the zonelist cache is present in the passed in zonelist, then
   * returns a pointer to the allowed node mask (either the current
37b07e416   Lee Schermerhorn   memoryless nodes:...
1425
   * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
   *
   * If the zonelist cache is not available for this zonelist, does
   * nothing and returns NULL.
   *
   * If the fullzones BITMAP in the zonelist cache is stale (more than
   * a second since last zap'd) then we zap it out (clear its bits.)
   *
   * We hold off even calling zlc_setup, until after we've checked the
   * first zone in the zonelist, on the theory that most allocations will
   * be satisfied from that first zone, so best to examine that zone as
   * quickly as we can.
   */
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	nodemask_t *allowednodes;	/* zonelist_cache approximation */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return NULL;
f05111f50   S.Çağlar Onur   mm/page_alloc.c: ...
1446
  	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1447
1448
1449
1450
1451
1452
  		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  		zlc->last_full_zap = jiffies;
  	}
  
  	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
  					&cpuset_current_mems_allowed :
37b07e416   Lee Schermerhorn   memoryless nodes:...
1453
  					&node_states[N_HIGH_MEMORY];
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
  	return allowednodes;
  }
  
  /*
   * Given 'z' scanning a zonelist, run a couple of quick checks to see
   * if it is worth looking at further for free memory:
   *  1) Check that the zone isn't thought to be full (doesn't have its
   *     bit set in the zonelist_cache fullzones BITMAP).
   *  2) Check that the zone's node (obtained from the zonelist_cache
   *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
   * Return true (non-zero) if zone is worth looking at further, or
   * else return false (zero) if it is not.
   *
   * This check -ignores- the distinction between various watermarks,
   * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
   * found to be full for any variation of these watermarks, it will
   * be considered full for up to one second by all requests, unless
   * we are so low on memory on all allowed nodes that we are forced
   * into the second scan of the zonelist.
   *
   * In the second scan we ignore this zonelist cache and exactly
   * apply the watermarks to all zones, even if it is slower to do so.
   * We are low on memory in the second scan, and should leave no stone
   * unturned looking for a free page.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1479
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1480
1481
1482
1483
1484
1485
1486
1487
1488
  						nodemask_t *allowednodes)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  	int n;				/* node that zone *z is on */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return 1;
dd1a239f6   Mel Gorman   mm: have zonelist...
1489
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
  	n = zlc->z_to_n[i];
  
  	/* This zone is worth trying if it is allowed but not full */
  	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
  }
  
  /*
   * Given 'z' scanning a zonelist, set the corresponding bit in
   * zlc->fullzones, so that subsequent attempts to allocate a page
   * from that zone don't waste time re-examining it.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1501
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1502
1503
1504
1505
1506
1507
1508
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
dd1a239f6   Mel Gorman   mm: have zonelist...
1509
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
  
  	set_bit(i, zlc->fullzones);
  }
  
  #else	/* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	return NULL;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1520
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1521
1522
1523
1524
  				nodemask_t *allowednodes)
  {
  	return 1;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1525
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1526
1527
1528
  {
  }
  #endif	/* CONFIG_NUMA */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1529
  /*
0798e5193   Paul Jackson   [PATCH] memory pa...
1530
   * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1531
1532
1533
   * a page.
   */
  static struct page *
19770b326   Mel Gorman   mm: filter based ...
1534
  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
5117f45d1   Mel Gorman   page allocator: c...
1535
  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
3dd282669   Mel Gorman   page allocator: c...
1536
  		struct zone *preferred_zone, int migratetype)
753ee7289   Martin Hicks   [PATCH] VM: early...
1537
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1538
  	struct zoneref *z;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1539
  	struct page *page = NULL;
54a6eb5c4   Mel Gorman   mm: use two zonel...
1540
  	int classzone_idx;
5117f45d1   Mel Gorman   page allocator: c...
1541
  	struct zone *zone;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1542
1543
1544
  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
  	int zlc_active = 0;		/* set if using zonelist_cache */
  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
54a6eb5c4   Mel Gorman   mm: use two zonel...
1545

19770b326   Mel Gorman   mm: filter based ...
1546
  	classzone_idx = zone_idx(preferred_zone);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1547
  zonelist_scan:
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1548
  	/*
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1549
  	 * Scan zonelist, looking for a zone with enough free.
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1550
1551
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
19770b326   Mel Gorman   mm: filter based ...
1552
1553
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  						high_zoneidx, nodemask) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1554
1555
1556
  		if (NUMA_BUILD && zlc_active &&
  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1557
  		if ((alloc_flags & ALLOC_CPUSET) &&
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
1558
  			!cpuset_zone_allowed_softwall(zone, gfp_mask))
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1559
  				goto try_next_zone;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1560

418589663   Mel Gorman   page allocator: u...
1561
  		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1562
  		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
3148890bf   Nick Piggin   [PATCH] mm: __all...
1563
  			unsigned long mark;
fa5e084e4   Mel Gorman   vmscan: do not un...
1564
  			int ret;
418589663   Mel Gorman   page allocator: u...
1565
  			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
fa5e084e4   Mel Gorman   vmscan: do not un...
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
  			if (zone_watermark_ok(zone, order, mark,
  				    classzone_idx, alloc_flags))
  				goto try_this_zone;
  
  			if (zone_reclaim_mode == 0)
  				goto this_zone_full;
  
  			ret = zone_reclaim(zone, gfp_mask, order);
  			switch (ret) {
  			case ZONE_RECLAIM_NOSCAN:
  				/* did not scan */
  				goto try_next_zone;
  			case ZONE_RECLAIM_FULL:
  				/* scanned but unreclaimable */
  				goto this_zone_full;
  			default:
  				/* did we reclaim enough */
  				if (!zone_watermark_ok(zone, order, mark,
  						classzone_idx, alloc_flags))
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1585
  					goto this_zone_full;
0798e5193   Paul Jackson   [PATCH] memory pa...
1586
  			}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1587
  		}
fa5e084e4   Mel Gorman   vmscan: do not un...
1588
  try_this_zone:
3dd282669   Mel Gorman   page allocator: c...
1589
1590
  		page = buffered_rmqueue(preferred_zone, zone, order,
  						gfp_mask, migratetype);
0798e5193   Paul Jackson   [PATCH] memory pa...
1591
  		if (page)
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1592
  			break;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1593
1594
1595
1596
  this_zone_full:
  		if (NUMA_BUILD)
  			zlc_mark_zone_full(zonelist, z);
  try_next_zone:
62bc62a87   Christoph Lameter   page allocator: u...
1597
  		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
d395b7342   Mel Gorman   page allocator: d...
1598
1599
1600
1601
  			/*
  			 * we do zlc_setup after the first zone is tried but only
  			 * if there are multiple nodes to make it worthwhile
  			 */
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1602
1603
1604
1605
  			allowednodes = zlc_setup(zonelist, alloc_flags);
  			zlc_active = 1;
  			did_zlc_setup = 1;
  		}
54a6eb5c4   Mel Gorman   mm: use two zonel...
1606
  	}
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1607
1608
1609
1610
1611
1612
  
  	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
  		/* Disable zlc cache for second zonelist scan */
  		zlc_active = 0;
  		goto zonelist_scan;
  	}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1613
  	return page;
753ee7289   Martin Hicks   [PATCH] VM: early...
1614
  }
11e33f6a5   Mel Gorman   page allocator: b...
1615
1616
1617
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
  				unsigned long pages_reclaimed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1618
  {
11e33f6a5   Mel Gorman   page allocator: b...
1619
1620
1621
  	/* Do not loop if specifically requested */
  	if (gfp_mask & __GFP_NORETRY)
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1622

11e33f6a5   Mel Gorman   page allocator: b...
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
  	/*
  	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
  	 * means __GFP_NOFAIL, but that may not be true in other
  	 * implementations.
  	 */
  	if (order <= PAGE_ALLOC_COSTLY_ORDER)
  		return 1;
  
  	/*
  	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
  	 * specified, then we retry until we no longer reclaim any pages
  	 * (above), or we've reclaimed an order of pages at least as
  	 * large as the allocation's order. In both cases, if the
  	 * allocation still fails, we stop retrying.
  	 */
  	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
  		return 1;
cf40bd16f   Nick Piggin   lockdep: annotate...
1640

11e33f6a5   Mel Gorman   page allocator: b...
1641
1642
1643
1644
1645
1646
  	/*
  	 * Don't let big-order allocations loop unless the caller
  	 * explicitly requests that.
  	 */
  	if (gfp_mask & __GFP_NOFAIL)
  		return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1647

11e33f6a5   Mel Gorman   page allocator: b...
1648
1649
  	return 0;
  }
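  
  /*
   * Illustrative note, not from the kernel tree: with the rules above, a
   * GFP_KERNEL order-0 request keeps retrying indefinitely (the costly
   * order short-circuit), while an order-4 GFP_KERNEL | __GFP_REPEAT
   * request stops retrying once at least 1 << 4 = 16 pages have been
   * reclaimed without the allocation succeeding; only __GFP_NOFAIL keeps
   * a costly allocation looping beyond that.
   */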
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1650

11e33f6a5   Mel Gorman   page allocator: b...
1651
1652
1653
  static inline struct page *
  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1654
1655
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1656
1657
1658
1659
1660
1661
  {
  	struct page *page;
  
  	/* Acquire the OOM killer lock for the zones in zonelist */
  	if (!try_set_zone_oom(zonelist, gfp_mask)) {
  		schedule_timeout_uninterruptible(1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1662
1663
  		return NULL;
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
1664

11e33f6a5   Mel Gorman   page allocator: b...
1665
1666
1667
1668
1669
1670
1671
  	/*
  	 * Go through the zonelist yet one more time, keeping a very high
  	 * watermark here; this is only to catch a parallel OOM killing, and
  	 * we must fail if we're still under heavy pressure.
  	 */
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  		order, zonelist, high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1672
  		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
1673
  		preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1674
  	if (page)
11e33f6a5   Mel Gorman   page allocator: b...
1675
  		goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
  	if (!(gfp_mask & __GFP_NOFAIL)) {
  		/* The OOM killer will not help higher order allocs */
  		if (order > PAGE_ALLOC_COSTLY_ORDER)
  			goto out;
  		/*
  		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
  		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
  		 * The caller should handle page allocation failure by itself if
  		 * it specifies __GFP_THISNODE.
  		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
  		 */
  		if (gfp_mask & __GFP_THISNODE)
  			goto out;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1690
  	/* Exhausted what can be done so it's blamo time */
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1691
  	out_of_memory(zonelist, gfp_mask, order, nodemask);
11e33f6a5   Mel Gorman   page allocator: b...
1692
1693
1694
1695
1696
  
  out:
  	clear_zonelist_oom(zonelist, gfp_mask);
  	return page;
  }
56de7263f   Mel Gorman   mm: compaction: d...
1697
1698
1699
1700
1701
1702
1703
1704
1705
  #ifdef CONFIG_COMPACTION
  /* Try memory compaction for high-order allocations before reclaim */
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
  	int migratetype, unsigned long *did_some_progress)
  {
  	struct page *page;
4f92e2586   Mel Gorman   mm: compaction: d...
1706
  	if (!order || compaction_deferred(preferred_zone))
56de7263f   Mel Gorman   mm: compaction: d...
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
  		return NULL;
  
  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
  								nodemask);
  	if (*did_some_progress != COMPACT_SKIPPED) {
  
  		/* Page migration frees to the PCP lists but we want merging */
  		drain_pages(get_cpu());
  		put_cpu();
  
  		page = get_page_from_freelist(gfp_mask, nodemask,
  				order, zonelist, high_zoneidx,
  				alloc_flags, preferred_zone,
  				migratetype);
  		if (page) {
4f92e2586   Mel Gorman   mm: compaction: d...
1722
1723
  			preferred_zone->compact_considered = 0;
  			preferred_zone->compact_defer_shift = 0;
56de7263f   Mel Gorman   mm: compaction: d...
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
  			count_vm_event(COMPACTSUCCESS);
  			return page;
  		}
  
  		/*
  		 * It's bad if a compaction run occurs and fails.
  		 * The most likely reason is that pages exist,
  		 * but not enough to satisfy watermarks.
  		 */
  		count_vm_event(COMPACTFAIL);
4f92e2586   Mel Gorman   mm: compaction: d...
1734
  		defer_compaction(preferred_zone);
56de7263f   Mel Gorman   mm: compaction: d...
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
  
  		cond_resched();
  	}
  
  	return NULL;
  }
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
  	int migratetype, unsigned long *did_some_progress)
  {
  	return NULL;
  }
  #endif /* CONFIG_COMPACTION */
11e33f6a5   Mel Gorman   page allocator: b...
1751
1752
1753
1754
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1755
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1756
  	int migratetype, unsigned long *did_some_progress)
11e33f6a5   Mel Gorman   page allocator: b...
1757
1758
1759
1760
1761
1762
1763
1764
1765
  {
  	struct page *page = NULL;
  	struct reclaim_state reclaim_state;
  	struct task_struct *p = current;
  
  	cond_resched();
  
  	/* We now go into synchronous reclaim */
  	cpuset_memory_pressure_bump();
11e33f6a5   Mel Gorman   page allocator: b...
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
  	p->flags |= PF_MEMALLOC;
  	lockdep_set_current_reclaim_state(gfp_mask);
  	reclaim_state.reclaimed_slab = 0;
  	p->reclaim_state = &reclaim_state;
  
  	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
  
  	p->reclaim_state = NULL;
  	lockdep_clear_current_reclaim_state();
  	p->flags &= ~PF_MEMALLOC;
  
  	cond_resched();
  
  	if (order != 0)
  		drain_all_pages();
  
  	if (likely(*did_some_progress))
  		page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
1784
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1785
1786
  					alloc_flags, preferred_zone,
  					migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
1787
1788
  	return page;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1789
  /*
11e33f6a5   Mel Gorman   page allocator: b...
1790
1791
   * This is called in the allocator slow-path if the allocation request is of
   * sufficient urgency to ignore watermarks and take other desperate measures
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1792
   */
11e33f6a5   Mel Gorman   page allocator: b...
1793
1794
1795
  static inline struct page *
  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1796
1797
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1798
1799
1800
1801
1802
  {
  	struct page *page;
  
  	do {
  		page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
1803
  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
3dd282669   Mel Gorman   page allocator: c...
1804
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
1805
1806
  
  		if (!page && gfp_mask & __GFP_NOFAIL)
8aa7e847d   Jens Axboe   Fix congestion_wa...
1807
  			congestion_wait(BLK_RW_ASYNC, HZ/50);
11e33f6a5   Mel Gorman   page allocator: b...
1808
1809
1810
1811
1812
1813
1814
1815
  	} while (!page && (gfp_mask & __GFP_NOFAIL));
  
  	return page;
  }
  
  static inline
  void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
  						enum zone_type high_zoneidx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1816
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1817
1818
  	struct zoneref *z;
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1819

11e33f6a5   Mel Gorman   page allocator: b...
1820
1821
1822
  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
  		wakeup_kswapd(zone, order);
  }
cf40bd16f   Nick Piggin   lockdep: annotate...
1823

341ce06f6   Peter Zijlstra   page allocator: c...
1824
1825
1826
1827
1828
1829
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
  	struct task_struct *p = current;
  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1830

a56f57ff9   Mel Gorman   page allocator: r...
1831
1832
  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
  	BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1833

341ce06f6   Peter Zijlstra   page allocator: c...
1834
1835
1836
1837
1838
1839
  	/*
  	 * The caller may dip into page reserves a bit more if the caller
  	 * cannot run direct reclaim, or if the caller has realtime scheduling
  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
  	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
  	 */
a56f57ff9   Mel Gorman   page allocator: r...
1840
  	alloc_flags |= (gfp_mask & __GFP_HIGH);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1841

341ce06f6   Peter Zijlstra   page allocator: c...
1842
1843
  	if (!wait) {
  		alloc_flags |= ALLOC_HARDER;
523b94585   Christoph Lameter   Memoryless nodes:...
1844
  		/*
341ce06f6   Peter Zijlstra   page allocator: c...
1845
1846
  		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
523b94585   Christoph Lameter   Memoryless nodes:...
1847
  		 */
341ce06f6   Peter Zijlstra   page allocator: c...
1848
  		alloc_flags &= ~ALLOC_CPUSET;
9d0ed60fe   Mel Gorman   page allocator: D...
1849
  	} else if (unlikely(rt_task(p)) && !in_interrupt())
341ce06f6   Peter Zijlstra   page allocator: c...
1850
1851
1852
1853
1854
1855
1856
  		alloc_flags |= ALLOC_HARDER;
  
  	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
  		if (!in_interrupt() &&
  		    ((p->flags & PF_MEMALLOC) ||
  		     unlikely(test_thread_flag(TIF_MEMDIE))))
  			alloc_flags |= ALLOC_NO_WATERMARKS;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1857
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
1858

341ce06f6   Peter Zijlstra   page allocator: c...
1859
1860
  	return alloc_flags;
  }
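  
  /*
   * Illustrative note, not from the kernel tree: what the function above
   * yields for two common masks. GFP_KERNEL (__GFP_WAIT set) stays at
   * ALLOC_WMARK_MIN | ALLOC_CPUSET, while GFP_ATOMIC (__GFP_HIGH, no
   * __GFP_WAIT) becomes ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER with
   * the cpuset check dropped, letting atomic contexts dip further into
   * the reserves.
   */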
11e33f6a5   Mel Gorman   page allocator: b...
1861
1862
1863
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1864
1865
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1866
1867
1868
1869
1870
1871
1872
  {
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
  	struct page *page = NULL;
  	int alloc_flags;
  	unsigned long pages_reclaimed = 0;
  	unsigned long did_some_progress;
  	struct task_struct *p = current;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1873

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
1874
  	/*
72807a74c   Mel Gorman   page allocator: s...
1875
1876
1877
1878
1879
  	 * In the slowpath, we sanity check order to avoid ever trying to
  	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
  	 * be using allocators in order of preference for an area that is
  	 * too large.
  	 */
1fc28b70f   Mel Gorman   page-allocator: a...
1880
1881
  	if (order >= MAX_ORDER) {
  		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
72807a74c   Mel Gorman   page allocator: s...
1882
  		return NULL;
1fc28b70f   Mel Gorman   page-allocator: a...
1883
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1884

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
  	/*
  	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
  	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
  	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
  	 * using a larger set of nodes after it has established that the
  	 * allowed per node queues are empty and that nodes are
  	 * over allocated.
  	 */
  	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
  		goto nopage;
cc4a68514   Mel Gorman   page allocator: a...
1895
  restart:
11e33f6a5   Mel Gorman   page allocator: b...
1896
  	wake_all_kswapd(order, zonelist, high_zoneidx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1897

9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
1898
  	/*
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1899
1900
1901
  	 * OK, we're below the kswapd watermark and have kicked background
  	 * reclaim. Now things get more complex, so set up alloc_flags according
  	 * to how we want to proceed.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
1902
  	 */
341ce06f6   Peter Zijlstra   page allocator: c...
1903
  	alloc_flags = gfp_to_alloc_flags(gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1904

341ce06f6   Peter Zijlstra   page allocator: c...
1905
  	/* This is the last chance, in general, before the goto nopage. */
19770b326   Mel Gorman   mm: filter based ...
1906
  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
341ce06f6   Peter Zijlstra   page allocator: c...
1907
1908
  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
  			preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1909
1910
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1911

b43a57bb4   Kirill Korotaev   [PATCH] OOM can p...
1912
  rebalance:
11e33f6a5   Mel Gorman   page allocator: b...
1913
  	/* Allocate without watermarks if the context allows */
341ce06f6   Peter Zijlstra   page allocator: c...
1914
1915
1916
1917
1918
1919
  	if (alloc_flags & ALLOC_NO_WATERMARKS) {
  		page = __alloc_pages_high_priority(gfp_mask, order,
  				zonelist, high_zoneidx, nodemask,
  				preferred_zone, migratetype);
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1920
1921
1922
1923
1924
  	}
  
  	/* Atomic allocations - we can't balance anything */
  	if (!wait)
  		goto nopage;
341ce06f6   Peter Zijlstra   page allocator: c...
1925
1926
1927
  	/* Avoid recursion of direct reclaim */
  	if (p->flags & PF_MEMALLOC)
  		goto nopage;
6583bb64f   David Rientjes   mm: avoid endless...
1928
1929
1930
  	/* Avoid allocations with no watermarks from looping endlessly */
  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  		goto nopage;
56de7263f   Mel Gorman   mm: compaction: d...
1931
1932
1933
1934
1935
1936
1937
1938
  	/* Try direct compaction */
  	page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
  					migratetype, &did_some_progress);
  	if (page)
  		goto got_pg;
11e33f6a5   Mel Gorman   page allocator: b...
1939
1940
1941
1942
  	/* Try direct reclaim and then allocating */
  	page = __alloc_pages_direct_reclaim(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
5117f45d1   Mel Gorman   page allocator: c...
1943
  					alloc_flags, preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1944
  					migratetype, &did_some_progress);
11e33f6a5   Mel Gorman   page allocator: b...
1945
1946
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1947

e33c3b5e1   David Rientjes   cpusets: update m...
1948
  	/*
11e33f6a5   Mel Gorman   page allocator: b...
1949
1950
  	 * If we failed to make any progress reclaiming, then we are
  	 * running out of options and have to consider going OOM
e33c3b5e1   David Rientjes   cpusets: update m...
1951
  	 */
11e33f6a5   Mel Gorman   page allocator: b...
1952
1953
  	if (!did_some_progress) {
  		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
1954
1955
  			if (oom_killer_disabled)
  				goto nopage;
11e33f6a5   Mel Gorman   page allocator: b...
1956
1957
  			page = __alloc_pages_may_oom(gfp_mask, order,
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1958
1959
  					nodemask, preferred_zone,
  					migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
1960
1961
  			if (page)
  				goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1962

11e33f6a5   Mel Gorman   page allocator: b...
1963
  			/*
82553a937   David Rientjes   oom: invoke oom k...
1964
1965
1966
1967
  			 * The OOM killer does not trigger for high-order
  			 * ~__GFP_NOFAIL allocations so if no progress is being
  			 * made, there are no other options and retrying is
  			 * unlikely to help.
11e33f6a5   Mel Gorman   page allocator: b...
1968
  			 */
82553a937   David Rientjes   oom: invoke oom k...
1969
1970
  			if (order > PAGE_ALLOC_COSTLY_ORDER &&
  						!(gfp_mask & __GFP_NOFAIL))
11e33f6a5   Mel Gorman   page allocator: b...
1971
  				goto nopage;
e2c55dc87   Mel Gorman   Drain per-cpu lis...
1972

ff0ceb9de   David Rientjes   oom: serialize ou...
1973
1974
  			goto restart;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1975
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1976
  	/* Check if we should retry the allocation */
a41f24ea9   Nishanth Aravamudan   page allocator: s...
1977
  	pages_reclaimed += did_some_progress;
11e33f6a5   Mel Gorman   page allocator: b...
1978
1979
  	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
  		/* Wait for some write requests to complete then retry */
8aa7e847d   Jens Axboe   Fix congestion_wa...
1980
  		congestion_wait(BLK_RW_ASYNC, HZ/50);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
  		goto rebalance;
  	}
  
  nopage:
  	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
  		printk(KERN_WARNING "%s: page allocation failure."
  			" order:%d, mode:0x%x
  ",
  			p->comm, order, gfp_mask);
  		dump_stack();
578c2fd6a   Janet Morgan   [PATCH] add OOM d...
1991
  		show_mem();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1992
  	}
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1993
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1994
  got_pg:
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1995
1996
  	if (kmemcheck_enabled)
  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1997
  	return page;
11e33f6a5   Mel Gorman   page allocator: b...
1998

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1999
  }
11e33f6a5   Mel Gorman   page allocator: b...
2000
2001
2002
2003
2004
2005
2006
2007
2008
  
  /*
   * This is the 'heart' of the zoned buddy allocator.
   */
  struct page *
  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  			struct zonelist *zonelist, nodemask_t *nodemask)
  {
  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5117f45d1   Mel Gorman   page allocator: c...
2009
  	struct zone *preferred_zone;
11e33f6a5   Mel Gorman   page allocator: b...
2010
  	struct page *page;
3dd282669   Mel Gorman   page allocator: c...
2011
  	int migratetype = allocflags_to_migratetype(gfp_mask);
11e33f6a5   Mel Gorman   page allocator: b...
2012

dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
2013
  	gfp_mask &= gfp_allowed_mask;
11e33f6a5   Mel Gorman   page allocator: b...
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
  	lockdep_trace_alloc(gfp_mask);
  
  	might_sleep_if(gfp_mask & __GFP_WAIT);
  
  	if (should_fail_alloc_page(gfp_mask, order))
  		return NULL;
  
  	/*
  	 * Check the zones suitable for the gfp_mask contain at least one
  	 * valid zone. It's possible to have an empty zonelist as a result
  	 * of GFP_THISNODE and a memoryless node
  	 */
  	if (unlikely(!zonelist->_zonerefs->zone))
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2028
  	get_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2029
2030
  	/* The preferred zone is used for statistics later */
  	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2031
2032
  	if (!preferred_zone) {
  		put_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2033
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2034
  	}
5117f45d1   Mel Gorman   page allocator: c...
2035
2036
  
  	/* First allocation attempt */
11e33f6a5   Mel Gorman   page allocator: b...
2037
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2038
  			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
2039
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2040
2041
  	if (unlikely(!page))
  		page = __alloc_pages_slowpath(gfp_mask, order,
5117f45d1   Mel Gorman   page allocator: c...
2042
  				zonelist, high_zoneidx, nodemask,
3dd282669   Mel Gorman   page allocator: c...
2043
  				preferred_zone, migratetype);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2044
  	put_mems_allowed();
11e33f6a5   Mel Gorman   page allocator: b...
2045

4b4f278c0   Mel Gorman   tracing, page-all...
2046
  	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2047
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2048
  }
d239171e4   Mel Gorman   page allocator: r...
2049
  EXPORT_SYMBOL(__alloc_pages_nodemask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2050
2051
2052
2053
  
  /*
   * Common helper functions.
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
2054
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2055
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2056
2057
2058
2059
2060
2061
2062
  	struct page *page;
  
  	/*
  	 * __get_free_pages() returns a kernel virtual address, which cannot
  	 * represent a highmem page
  	 */
  	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2063
2064
2065
2066
2067
  	page = alloc_pages(gfp_mask, order);
  	if (!page)
  		return 0;
  	return (unsigned long) page_address(page);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2068
  EXPORT_SYMBOL(__get_free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2069
  unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2070
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2071
  	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2072
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2073
2074
2075
2076
2077
  EXPORT_SYMBOL(get_zeroed_page);
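  
  /*
   * Illustrative sketch, not from the kernel tree: typical use of the
   * helpers above. They return a kernel virtual address, so highmem GFP
   * masks are rejected by the VM_BUG_ON in __get_free_pages(). The
   * function name is made up.
   */
  static int example_with_scratch_page(void)
  {
  	unsigned long addr = get_zeroed_page(GFP_KERNEL);
  
  	if (!addr)
  		return -ENOMEM;
  
  	/* ... use the zeroed page through (void *)addr ... */
  
  	free_page(addr);
  	return 0;
  }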
  
  void __pagevec_free(struct pagevec *pvec)
  {
  	int i = pagevec_count(pvec);
4b4f278c0   Mel Gorman   tracing, page-all...
2078
2079
  	while (--i >= 0) {
  		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2080
  		free_hot_cold_page(pvec->pages[i], pvec->cold);
4b4f278c0   Mel Gorman   tracing, page-all...
2081
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2082
  }
920c7a5d0   Harvey Harrison   mm: remove fastca...
2083
  void __free_pages(struct page *page, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2084
  {
b5810039a   Nick Piggin   [PATCH] core remo...
2085
  	if (put_page_testzero(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2086
  		if (order == 0)
fc91668ea   Li Hong   mm: remove free_h...
2087
  			free_hot_cold_page(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2088
2089
2090
2091
2092
2093
  		else
  			__free_pages_ok(page, order);
  	}
  }
  
  EXPORT_SYMBOL(__free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2094
  void free_pages(unsigned long addr, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2095
2096
  {
  	if (addr != 0) {
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
2097
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2098
2099
2100
2101
2102
  		__free_pages(virt_to_page((void *)addr), order);
  	}
  }
  
  EXPORT_SYMBOL(free_pages);
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * This function is similar to alloc_pages(), except that it allocates the
   * minimum number of pages to satisfy the request.  alloc_pages() can only
   * allocate memory in power-of-two pages.
   *
   * This function is also limited by MAX_ORDER.
   *
   * Memory allocated by this function must be released by free_pages_exact().
   */
  void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
  {
  	unsigned int order = get_order(size);
  	unsigned long addr;
  
  	addr = __get_free_pages(gfp_mask, order);
  	if (addr) {
  		unsigned long alloc_end = addr + (PAGE_SIZE << order);
  		unsigned long used = addr + PAGE_ALIGN(size);
5bfd75609   Kevin Cernekee   Fix virt_to_phys(...
2125
  		split_page(virt_to_page((void *)addr), order);
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
  		while (used < alloc_end) {
  			free_page(used);
  			used += PAGE_SIZE;
  		}
  	}
  
  	return (void *)addr;
  }
  EXPORT_SYMBOL(alloc_pages_exact);
  
  /**
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
   * @size: size of allocation, same value as passed to alloc_pages_exact().
   *
   * Release the memory allocated by a previous call to alloc_pages_exact.
   */
  void free_pages_exact(void *virt, size_t size)
  {
  	unsigned long addr = (unsigned long)virt;
  	unsigned long end = addr + PAGE_ALIGN(size);
  
  	while (addr < end) {
  		free_page(addr);
  		addr += PAGE_SIZE;
  	}
  }
  EXPORT_SYMBOL(free_pages_exact);
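  /*
   * Illustrative sketch, not part of page_alloc.c: the alloc_pages_exact()/
   * free_pages_exact() pair documented above.  Five pages round up to an
   * order-3 block; the three trailing pages are handed back immediately.
   * example_exact_usage() is a hypothetical caller.
   */
  #if 0
  static void example_exact_usage(void)
  {
  	size_t size = 5 * PAGE_SIZE;	/* not a power-of-two number of pages */
  	void *buf = alloc_pages_exact(size, GFP_KERNEL);
  
  	if (!buf)
  		return;
  	/* ... use buf ... */
  	free_pages_exact(buf, size);	/* must pass back the same size */
  }
  #endif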
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2154
2155
  static unsigned int nr_free_zone_pages(int offset)
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2156
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2157
  	struct zone *zone;
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2158
  	/* Just pick one node, since fallback list is circular */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2159
  	unsigned int sum = 0;
0e88460da   Mel Gorman   mm: introduce nod...
2160
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2161

54a6eb5c4   Mel Gorman   mm: use two zonel...
2162
  	for_each_zone_zonelist(zone, z, zonelist, offset) {
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2163
  		unsigned long size = zone->present_pages;
418589663   Mel Gorman   page allocator: u...
2164
  		unsigned long high = high_wmark_pages(zone);
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2165
2166
  		if (size > high)
  			sum += size - high;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
  	}
  
  	return sum;
  }
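  /*
   * Worked example (illustrative, not part of page_alloc.c): a zone with
   * present_pages = 262144 and a high watermark of, say, 1536 pages
   * contributes 262144 - 1536 = 260608 pages to the sum, while a zone at or
   * below its high watermark contributes nothing.  The result is therefore an
   * estimate of how much can be allocated while still leaving each zone its
   * high-watermark worth of pages.
   */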
  
  /*
   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
   */
  unsigned int nr_free_buffer_pages(void)
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
2177
  	return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2178
  }
c2f1a551d   Meelap Shah   knfsd: nfsd4: var...
2179
  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2180
2181
2182
2183
2184
2185
  
  /*
   * Amount of free RAM allocatable within all zones
   */
  unsigned int nr_free_pagecache_pages(void)
  {
2a1e274ac   Mel Gorman   Create the ZONE_M...
2186
  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2187
  }
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2188
2189
  
  static inline void show_node(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2190
  {
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2191
  	if (NUMA_BUILD)
25ba77c14   Andy Whitcroft   [PATCH] numa node...
2192
  		printk("Node %d ", zone_to_nid(zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2193
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2194

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2195
2196
2197
2198
  void si_meminfo(struct sysinfo *val)
  {
  	val->totalram = totalram_pages;
  	val->sharedram = 0;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2199
  	val->freeram = global_page_state(NR_FREE_PAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2200
  	val->bufferram = nr_blockdev_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2201
2202
  	val->totalhigh = totalhigh_pages;
  	val->freehigh = nr_free_highpages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
  	val->mem_unit = PAGE_SIZE;
  }
  
  EXPORT_SYMBOL(si_meminfo);
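  /*
   * Illustrative sketch, not part of page_alloc.c: how a caller typically
   * consumes si_meminfo().  All counters are in units of val.mem_unit
   * (PAGE_SIZE here); the shift below converts pages to KiB.
   * example_print_meminfo() is a hypothetical function.
   */
  #if 0
  static void example_print_meminfo(void)
  {
  	struct sysinfo si;
  
  	si_meminfo(&si);
  	printk(KERN_INFO "total %lu kB, free %lu kB, buffers %lu kB\n",
  	       si.totalram << (PAGE_SHIFT - 10),
  	       si.freeram << (PAGE_SHIFT - 10),
  	       si.bufferram << (PAGE_SHIFT - 10));
  }
  #endif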
  
  #ifdef CONFIG_NUMA
  void si_meminfo_node(struct sysinfo *val, int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  
  	val->totalram = pgdat->node_present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2214
  	val->freeram = node_page_state(nid, NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2215
  #ifdef CONFIG_HIGHMEM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2216
  	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2217
2218
  	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
  			NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2219
2220
2221
2222
  #else
  	val->totalhigh = 0;
  	val->freehigh = 0;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
  	val->mem_unit = PAGE_SIZE;
  }
  #endif
  
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
   */
  void show_free_areas(void)
  {
c72419138   Jes Sorensen   [PATCH] Condense ...
2236
  	int cpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2237
  	struct zone *zone;
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2238
  	for_each_populated_zone(zone) {
c72419138   Jes Sorensen   [PATCH] Condense ...
2239
2240
2241
  		show_node(zone);
  		printk("%s per-cpu:
  ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2242

6b482c677   Dave Jones   [PATCH] Don't pri...
2243
  		for_each_online_cpu(cpu) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2244
  			struct per_cpu_pageset *pageset;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2245
  			pageset = per_cpu_ptr(zone->pageset, cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2246

3dfa5721f   Christoph Lameter   Page allocator: g...
2247
2248
2249
2250
  			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d
  ",
  			       cpu, pageset->pcp.high,
  			       pageset->pcp.batch, pageset->pcp.count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2251
2252
  		}
  	}
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2253
2254
2255
2256
  	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu
  "
  		" active_file:%lu inactive_file:%lu isolated_file:%lu
  "
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2257
  		" unevictable:%lu"
b76146ed1   Andrew Morton   revert "mm: oom a...
2258
2259
  		" dirty:%lu writeback:%lu unstable:%lu
  "
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2260
2261
  		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu
  "
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2262
2263
  		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu
  ",
4f98a2fee   Rik van Riel   vmscan: split LRU...
2264
  		global_page_state(NR_ACTIVE_ANON),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2265
  		global_page_state(NR_INACTIVE_ANON),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2266
2267
  		global_page_state(NR_ISOLATED_ANON),
  		global_page_state(NR_ACTIVE_FILE),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2268
  		global_page_state(NR_INACTIVE_FILE),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2269
  		global_page_state(NR_ISOLATED_FILE),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2270
  		global_page_state(NR_UNEVICTABLE),
b1e7a8fd8   Christoph Lameter   [PATCH] zoned vm ...
2271
  		global_page_state(NR_FILE_DIRTY),
ce866b34a   Christoph Lameter   [PATCH] zoned vm ...
2272
  		global_page_state(NR_WRITEBACK),
fd39fc856   Christoph Lameter   [PATCH] zoned vm ...
2273
  		global_page_state(NR_UNSTABLE_NFS),
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2274
  		global_page_state(NR_FREE_PAGES),
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2275
2276
  		global_page_state(NR_SLAB_RECLAIMABLE),
  		global_page_state(NR_SLAB_UNRECLAIMABLE),
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
2277
  		global_page_state(NR_FILE_MAPPED),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2278
  		global_page_state(NR_SHMEM),
a25700a53   Andrew Morton   [PATCH] mm: show ...
2279
2280
  		global_page_state(NR_PAGETABLE),
  		global_page_state(NR_BOUNCE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2281

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2282
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2283
2284
2285
2286
2287
2288
2289
2290
  		int i;
  
  		show_node(zone);
  		printk("%s"
  			" free:%lukB"
  			" min:%lukB"
  			" low:%lukB"
  			" high:%lukB"
4f98a2fee   Rik van Riel   vmscan: split LRU...
2291
2292
2293
2294
  			" active_anon:%lukB"
  			" inactive_anon:%lukB"
  			" active_file:%lukB"
  			" inactive_file:%lukB"
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2295
  			" unevictable:%lukB"
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2296
2297
  			" isolated(anon):%lukB"
  			" isolated(file):%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2298
  			" present:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2299
2300
2301
2302
  			" mlocked:%lukB"
  			" dirty:%lukB"
  			" writeback:%lukB"
  			" mapped:%lukB"
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2303
  			" shmem:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2304
2305
  			" slab_reclaimable:%lukB"
  			" slab_unreclaimable:%lukB"
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2306
  			" kernel_stack:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2307
2308
2309
2310
  			" pagetables:%lukB"
  			" unstable:%lukB"
  			" bounce:%lukB"
  			" writeback_tmp:%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2311
2312
2313
2314
2315
  			" pages_scanned:%lu"
  			" all_unreclaimable? %s"
  			"
  ",
  			zone->name,
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2316
  			K(zone_page_state(zone, NR_FREE_PAGES)),
418589663   Mel Gorman   page allocator: u...
2317
2318
2319
  			K(min_wmark_pages(zone)),
  			K(low_wmark_pages(zone)),
  			K(high_wmark_pages(zone)),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2320
2321
2322
2323
  			K(zone_page_state(zone, NR_ACTIVE_ANON)),
  			K(zone_page_state(zone, NR_INACTIVE_ANON)),
  			K(zone_page_state(zone, NR_ACTIVE_FILE)),
  			K(zone_page_state(zone, NR_INACTIVE_FILE)),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2324
  			K(zone_page_state(zone, NR_UNEVICTABLE)),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2325
2326
  			K(zone_page_state(zone, NR_ISOLATED_ANON)),
  			K(zone_page_state(zone, NR_ISOLATED_FILE)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2327
  			K(zone->present_pages),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2328
2329
2330
2331
  			K(zone_page_state(zone, NR_MLOCK)),
  			K(zone_page_state(zone, NR_FILE_DIRTY)),
  			K(zone_page_state(zone, NR_WRITEBACK)),
  			K(zone_page_state(zone, NR_FILE_MAPPED)),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2332
  			K(zone_page_state(zone, NR_SHMEM)),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2333
2334
  			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
  			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2335
2336
  			zone_page_state(zone, NR_KERNEL_STACK) *
  				THREAD_SIZE / 1024,
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2337
2338
2339
2340
  			K(zone_page_state(zone, NR_PAGETABLE)),
  			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
  			K(zone_page_state(zone, NR_BOUNCE)),
  			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2341
  			zone->pages_scanned,
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
2342
  			(zone->all_unreclaimable ? "yes" : "no")
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2343
2344
2345
2346
2347
2348
2349
  			);
  		printk("lowmem_reserve[]:");
  		for (i = 0; i < MAX_NR_ZONES; i++)
  			printk(" %lu", zone->lowmem_reserve[i]);
  		printk("
  ");
  	}
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2350
  	for_each_populated_zone(zone) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2351
   		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2352
2353
2354
  
  		show_node(zone);
  		printk("%s: ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2355
2356
2357
  
  		spin_lock_irqsave(&zone->lock, flags);
  		for (order = 0; order < MAX_ORDER; order++) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2358
2359
  			nr[order] = zone->free_area[order].nr_free;
  			total += nr[order] << order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2360
2361
  		}
  		spin_unlock_irqrestore(&zone->lock, flags);
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2362
2363
  		for (order = 0; order < MAX_ORDER; order++)
  			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2364
2365
2366
  		printk("= %lukB
  ", K(total));
  	}
e6f3602d2   Larry Woodman   Include count of ...
2367
2368
  	printk("%ld total pagecache pages
  ", global_page_state(NR_FILE_PAGES));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2369
2370
  	show_swap_cache_info();
  }
19770b326   Mel Gorman   mm: filter based ...
2371
2372
2373
2374
2375
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
  	zoneref->zone = zone;
  	zoneref->zone_idx = zone_idx(zone);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2376
2377
  /*
   * Builds allocation fallback zone lists.
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2378
2379
   *
   * Add all populated zones of a node to the zonelist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2380
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2381
2382
  static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
  				int nr_zones, enum zone_type zone_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2383
  {
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2384
  	struct zone *zone;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2385
  	BUG_ON(zone_type >= MAX_NR_ZONES);
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2386
  	zone_type++;
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2387
2388
  
  	do {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2389
  		zone_type--;
070f80326   Christoph Lameter   [PATCH] build_zon...
2390
  		zone = pgdat->node_zones + zone_type;
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2391
  		if (populated_zone(zone)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2392
2393
  			zoneref_set_zone(zone,
  				&zonelist->_zonerefs[nr_zones++]);
070f80326   Christoph Lameter   [PATCH] build_zon...
2394
  			check_highest_zone(zone_type);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2395
  		}
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2396

2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2397
  	} while (zone_type);
070f80326   Christoph Lameter   [PATCH] build_zon...
2398
  	return nr_zones;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2399
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
  
  /*
   *  zonelist_order:
   *  0 = automatic detection of better ordering.
   *  1 = order by ([node] distance, -zonetype)
   *  2 = order by (-zonetype, [node] distance)
   *
   *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
   *  the same zonelist. So only NUMA can configure this param.
   */
  #define ZONELIST_ORDER_DEFAULT  0
  #define ZONELIST_ORDER_NODE     1
  #define ZONELIST_ORDER_ZONE     2
  
  /* zonelist order in the kernel.
   * set_zonelist_order() will set this to NODE or ZONE.
   */
  static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
  static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2419
  #ifdef CONFIG_NUMA
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
  /* The ordering the user specified; may be overridden by the boot option or sysctl */
  static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  /* string for sysctl */
  #define NUMA_ZONELIST_ORDER_LEN	16
  char numa_zonelist_order[16] = "default";
  
  /*
   * interface to configure zonelist ordering.
   * command line option "numa_zonelist_order"
   *	= "[dD]efault"	- default, automatic configuration.
   *	= "[nN]ode"	- order by node locality, then by zone within node
   *	= "[zZ]one"	- order by zone, then by node locality within zone
   */
  
  static int __parse_numa_zonelist_order(char *s)
  {
  	if (*s == 'd' || *s == 'D') {
  		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  	} else if (*s == 'n' || *s == 'N') {
  		user_zonelist_order = ZONELIST_ORDER_NODE;
  	} else if (*s == 'z' || *s == 'Z') {
  		user_zonelist_order = ZONELIST_ORDER_ZONE;
  	} else {
  		printk(KERN_WARNING
  			"Ignoring invalid numa_zonelist_order value:  "
  			"%s
  ", s);
  		return -EINVAL;
  	}
  	return 0;
  }
  
  static __init int setup_numa_zonelist_order(char *s)
  {
  	if (s)
  		return __parse_numa_zonelist_order(s);
  	return 0;
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
  /*
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2464
  		void __user *buffer, size_t *length,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2465
2466
2467
2468
  		loff_t *ppos)
  {
  	char saved_string[NUMA_ZONELIST_ORDER_LEN];
  	int ret;
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2469
  	static DEFINE_MUTEX(zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2470

443c6f145   Andi Kleen   SYSCTL: Add a mut...
2471
  	mutex_lock(&zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2472
  	if (write)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2473
  		strcpy(saved_string, (char*)table->data);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2474
  	ret = proc_dostring(table, write, buffer, length, ppos);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2475
  	if (ret)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2476
  		goto out;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2477
2478
2479
2480
2481
2482
2483
2484
2485
  	if (write) {
  		int oldval = user_zonelist_order;
  		if (__parse_numa_zonelist_order((char*)table->data)) {
  			/*
  			 * bogus value.  restore saved string
  			 */
  			strncpy((char*)table->data, saved_string,
  				NUMA_ZONELIST_ORDER_LEN);
  			user_zonelist_order = oldval;
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2486
2487
  		} else if (oldval != user_zonelist_order) {
  			mutex_lock(&zonelists_mutex);
1f522509c   Haicheng Li   mem-hotplug: avoi...
2488
  			build_all_zonelists(NULL);
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2489
2490
  			mutex_unlock(&zonelists_mutex);
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2491
  	}
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2492
2493
2494
  out:
  	mutex_unlock(&zl_order_mutex);
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2495
  }
62bc62a87   Christoph Lameter   page allocator: u...
2496
  #define MAX_NODE_LOAD (nr_online_nodes)
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2497
  static int node_load[MAX_NUMNODES];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2498
  /**
4dc3b16ba   Pavel Pisa   [PATCH] DocBook: ...
2499
   * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
   * @node: node whose fallback list we're appending
   * @used_node_mask: nodemask_t of already used nodes
   *
   * We use a number of factors to determine which is the next node that should
   * appear on a given node's fallback list.  The node should not have appeared
   * already in @node's fallback list, and it should be the next closest node
   * according to the distance array (which contains arbitrary distance values
   * from each node to each node in the system), and should also prefer nodes
   * with no CPUs, since presumably they'll have very little allocation pressure
   * on them otherwise.
   * It returns -1 if no node is found.
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2512
  static int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2513
  {
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2514
  	int n, val;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2515
2516
  	int min_val = INT_MAX;
  	int best_node = -1;
a70f73028   Rusty Russell   cpumask: replace ...
2517
  	const struct cpumask *tmp = cpumask_of_node(0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2518

4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2519
2520
2521
2522
2523
  	/* Use the local node if we haven't already */
  	if (!node_isset(node, *used_node_mask)) {
  		node_set(node, *used_node_mask);
  		return node;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2524

37b07e416   Lee Schermerhorn   memoryless nodes:...
2525
  	for_each_node_state(n, N_HIGH_MEMORY) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2526
2527
2528
2529
  
  		/* Don't want a node to appear more than once */
  		if (node_isset(n, *used_node_mask))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2530
2531
  		/* Use the distance array to find the distance */
  		val = node_distance(node, n);
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2532
2533
  		/* Penalize nodes under us ("prefer the next node") */
  		val += (n < node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2534
  		/* Give preference to headless and unused nodes */
a70f73028   Rusty Russell   cpumask: replace ...
2535
2536
  		tmp = cpumask_of_node(n);
  		if (!cpumask_empty(tmp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
  		val += node_load[n];
  
  		if (val < min_val) {
  			min_val = val;
  			best_node = n;
  		}
  	}
  
  	if (best_node >= 0)
  		node_set(best_node, *used_node_mask);
  
  	return best_node;
  }
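  /*
   * Worked example (illustrative, not part of page_alloc.c): on a two-node
   * machine with node_distance(0, 1) = 20, building node 0's fallback list
   * first returns node 0 itself via the local-node fast path; node 1 is then
   * scored as val = 20, plus PENALTY_FOR_NODE_WITH_CPUS if it has CPUs, then
   * scaled by MAX_NODE_LOAD * MAX_NUMNODES and biased by node_load[1].  The
   * candidate with the smallest val is picked next.
   */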
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2554
2555
2556
2557
2558
2559
2560
  
  /*
   * Build zonelists ordered by node and zones within node.
   * This results in maximum locality--normal zone overflows into local
   * DMA zone, if any--but risks exhausting DMA zone.
   */
  static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2561
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2562
  	int j;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2563
  	struct zonelist *zonelist;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2564

54a6eb5c4   Mel Gorman   mm: use two zonel...
2565
  	zonelist = &pgdat->node_zonelists[0];
dd1a239f6   Mel Gorman   mm: have zonelist...
2566
  	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
54a6eb5c4   Mel Gorman   mm: use two zonel...
2567
2568
2569
  		;
  	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2570
2571
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2572
2573
2574
  }
  
  /*
523b94585   Christoph Lameter   Memoryless nodes:...
2575
2576
2577
2578
   * Build gfp_thisnode zonelists
   */
  static void build_thisnode_zonelists(pg_data_t *pgdat)
  {
523b94585   Christoph Lameter   Memoryless nodes:...
2579
2580
  	int j;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2581
2582
  	zonelist = &pgdat->node_zonelists[1];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2583
2584
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
523b94585   Christoph Lameter   Memoryless nodes:...
2585
2586
2587
  }
  
  /*
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2588
2589
2590
2591
2592
2593
2594
2595
2596
   * Build zonelists ordered by zone and nodes within zones.
   * This results in conserving DMA zone[s] until all Normal memory is
   * exhausted, but results in overflowing to remote node while memory
   * may still exist in local DMA zone.
   */
  static int node_order[MAX_NUMNODES];
  
  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2597
2598
2599
2600
  	int pos, j, node;
  	int zone_type;		/* needs to be signed */
  	struct zone *z;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2601
2602
2603
2604
2605
2606
2607
  	zonelist = &pgdat->node_zonelists[0];
  	pos = 0;
  	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
  		for (j = 0; j < nr_nodes; j++) {
  			node = node_order[j];
  			z = &NODE_DATA(node)->node_zones[zone_type];
  			if (populated_zone(z)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2608
2609
  				zoneref_set_zone(z,
  					&zonelist->_zonerefs[pos++]);
54a6eb5c4   Mel Gorman   mm: use two zonel...
2610
  				check_highest_zone(zone_type);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2611
2612
  			}
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2613
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
2614
2615
  	zonelist->_zonerefs[pos].zone = NULL;
  	zonelist->_zonerefs[pos].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2616
2617
2618
2619
2620
2621
2622
2623
2624
  }
  
  static int default_zonelist_order(void)
  {
  	int nid, zone_type;
  	unsigned long low_kmem_size,total_size;
  	struct zone *z;
  	int average_size;
  	/*
883931612   Thomas Weber   Fix typos in comm...
2625
  	 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2626
2627
  	 * If they are really small and used heavily, the system can fall
  	 * into OOM very easily.
e325c90ff   David Rientjes   mm: default to no...
2628
  	 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
  	 */
  	/* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
  	low_kmem_size = 0;
  	total_size = 0;
  	for_each_online_node(nid) {
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
e325c90ff   David Rientjes   mm: default to no...
2640
2641
2642
2643
2644
2645
2646
2647
2648
  			} else if (zone_type == ZONE_NORMAL) {
  				/*
  				 * If any node has only lowmem, then node order
  				 * is preferred to allow kernel allocations
  				 * locally; otherwise, they can easily infringe
  				 * on other nodes when there is an abundance of
  				 * lowmem available to allocate from.
  				 */
  				return ZONELIST_ORDER_NODE;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
  			}
  		}
  	}
  	if (!low_kmem_size ||  /* there is no DMA area. */
  	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
  		return ZONELIST_ORDER_NODE;
  	/*
  	 * Look into each node's config.
  	 * If there is a node whose DMA/DMA32 memory is a very big area of
  	 * local memory, NODE_ORDER may be suitable.
  	 */
37b07e416   Lee Schermerhorn   memoryless nodes:...
2660
2661
  	average_size = total_size /
  				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
  	for_each_online_node(nid) {
  		low_kmem_size = 0;
  		total_size = 0;
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
  			}
  		}
  		if (low_kmem_size &&
  		    total_size > average_size && /* ignore small node */
  		    low_kmem_size > total_size * 70/100)
  			return ZONELIST_ORDER_NODE;
  	}
  	return ZONELIST_ORDER_ZONE;
  }
  
  static void set_zonelist_order(void)
  {
  	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
  		current_zonelist_order = default_zonelist_order();
  	else
  		current_zonelist_order = user_zonelist_order;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
  {
  	int j, node, load;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2693
  	nodemask_t used_mask;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2694
2695
2696
  	int local_node, prev_node;
  	struct zonelist *zonelist;
  	int order = current_zonelist_order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2697
2698
  
  	/* initialize zonelists */
523b94585   Christoph Lameter   Memoryless nodes:...
2699
  	for (i = 0; i < MAX_ZONELISTS; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2700
  		zonelist = pgdat->node_zonelists + i;
dd1a239f6   Mel Gorman   mm: have zonelist...
2701
2702
  		zonelist->_zonerefs[0].zone = NULL;
  		zonelist->_zonerefs[0].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2703
2704
2705
2706
  	}
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
62bc62a87   Christoph Lameter   page allocator: u...
2707
  	load = nr_online_nodes;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2708
2709
  	prev_node = local_node;
  	nodes_clear(used_mask);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2710

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2711
2712
  	memset(node_order, 0, sizeof(node_order));
  	j = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2713
  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
2714
2715
2716
2717
2718
2719
2720
2721
  		int distance = node_distance(local_node, node);
  
  		/*
  		 * If another node is sufficiently far away then it is better
  		 * to reclaim pages in a zone before going off node.
  		 */
  		if (distance > RECLAIM_DISTANCE)
  			zone_reclaim_mode = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2722
2723
2724
2725
2726
  		/*
  		 * We don't want to pressure a particular node.
  		 * So adding penalty to the first node in same
  		 * distance group to make it round-robin.
  		 */
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
2727
  		if (distance != node_distance(local_node, prev_node))
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2728
  			node_load[node] = load;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2729
2730
  		prev_node = node;
  		load--;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2731
2732
2733
2734
2735
  		if (order == ZONELIST_ORDER_NODE)
  			build_zonelists_in_node_order(pgdat, node);
  		else
  			node_order[j++] = node;	/* remember order */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2736

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2737
2738
2739
  	if (order == ZONELIST_ORDER_ZONE) {
  		/* calculate node order -- i.e., DMA last! */
  		build_zonelists_in_zone_order(pgdat, j);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2740
  	}
523b94585   Christoph Lameter   Memoryless nodes:...
2741
2742
  
  	build_thisnode_zonelists(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2743
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2744
  /* Construct the zonelist performance cache - see further mmzone.h */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2745
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2746
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
2747
2748
  	struct zonelist *zonelist;
  	struct zonelist_cache *zlc;
dd1a239f6   Mel Gorman   mm: have zonelist...
2749
  	struct zoneref *z;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2750

54a6eb5c4   Mel Gorman   mm: use two zonel...
2751
2752
2753
  	zonelist = &pgdat->node_zonelists[0];
  	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
dd1a239f6   Mel Gorman   mm: have zonelist...
2754
2755
  	for (z = zonelist->_zonerefs; z->zone; z++)
  		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2756
  }
7aac78988   Lee Schermerhorn   numa: introduce n...
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
   * I.e., first node id of first zone in arg node's generic zonelist.
   * Used for initializing percpu 'numa_mem', which is used primarily
   * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
   */
  int local_memory_node(int node)
  {
  	struct zone *zone;
  
  	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
  				   gfp_zone(GFP_KERNEL),
  				   NULL,
  				   &zone);
  	return zone->node;
  }
  #endif
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2775

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2776
  #else	/* CONFIG_NUMA */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2777
2778
2779
2780
2781
2782
  static void set_zonelist_order(void)
  {
  	current_zonelist_order = ZONELIST_ORDER_ZONE;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2783
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
2784
  	int node, local_node;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2785
2786
  	enum zone_type j;
  	struct zonelist *zonelist;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2787
2788
  
  	local_node = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2789

54a6eb5c4   Mel Gorman   mm: use two zonel...
2790
2791
  	zonelist = &pgdat->node_zonelists[0];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2792

54a6eb5c4   Mel Gorman   mm: use two zonel...
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
  	/*
  	 * Now we build the zonelist so that it contains the zones
  	 * of all the other nodes.
  	 * We don't want to pressure a particular node, so when
  	 * building the zones for node N, we make sure that the
  	 * zones coming right after the local ones are those from
  	 * node N+1 (modulo N)
  	 */
  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2806
  	}
54a6eb5c4   Mel Gorman   mm: use two zonel...
2807
2808
2809
2810
2811
2812
  	for (node = 0; node < local_node; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
2813
2814
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2815
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2816
  /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2817
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2818
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
2819
  	pgdat->node_zonelists[0].zlcache_ptr = NULL;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2820
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2821
  #endif	/* CONFIG_NUMA */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
  /*
   * Boot pageset table. One per cpu which is going to be used for all
   * zones and all nodes. The parameters will be set in such a way
   * that an item put on a list will immediately be handed over to
   * the buddy list. This is safe since pageset manipulation is done
   * with interrupts disabled.
   *
   * The boot_pagesets must be kept even after bootup is complete for
   * unused processors and/or zones. They do play a role for bootstrapping
   * hotplugged processors.
   *
   * zoneinfo_show() and maybe other functions do
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
   */
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
1f522509c   Haicheng Li   mem-hotplug: avoi...
2839
  static void setup_zone_pageset(struct zone *zone);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2840

4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2841
2842
2843
2844
2845
  /*
   * Global mutex to protect against size modification of zonelists
   * as well as to serialize pageset setup for the new populated zone.
   */
  DEFINE_MUTEX(zonelists_mutex);
9b1a4d383   Rusty Russell   stop_machine: Wea...
2846
  /* The return value is int only because stop_machine() requires it */
1f522509c   Haicheng Li   mem-hotplug: avoi...
2847
  static __init_refok int __build_all_zonelists(void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2848
  {
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2849
  	int nid;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2850
  	int cpu;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2851

7f9cfb310   Bo Liu   mm: build_zonelis...
2852
2853
2854
  #ifdef CONFIG_NUMA
  	memset(node_load, 0, sizeof(node_load));
  #endif
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2855
  	for_each_online_node(nid) {
7ea1530ab   Christoph Lameter   Memoryless nodes:...
2856
2857
2858
2859
  		pg_data_t *pgdat = NODE_DATA(nid);
  
  		build_zonelists(pgdat);
  		build_zonelist_cache(pgdat);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2860
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2861

1f522509c   Haicheng Li   mem-hotplug: avoi...
2862
2863
2864
2865
2866
2867
2868
  #ifdef CONFIG_MEMORY_HOTPLUG
  	/* Setup real pagesets for the new zone */
  	if (data) {
  		struct zone *zone = data;
  		setup_zone_pageset(zone);
  	}
  #endif
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
  	/*
  	 * Initialize the boot_pagesets that are going to be used
  	 * for bootstrapping processors. The real pagesets for
  	 * each zone will be allocated later when the per cpu
  	 * allocator is available.
  	 *
  	 * boot_pagesets are used also for bootstrapping offline
  	 * cpus if the system is already booted because the pagesets
  	 * are needed to initialize allocators on a specific cpu too.
  	 * F.e. the percpu allocator needs the page allocator which
  	 * needs the percpu allocator in order to allocate its pagesets
  	 * (a chicken-egg dilemma).
  	 */
7aac78988   Lee Schermerhorn   numa: introduce n...
2882
  	for_each_possible_cpu(cpu) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2883
  		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
7aac78988   Lee Schermerhorn   numa: introduce n...
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  		/*
  		 * We now know the "local memory node" for each node--
  		 * i.e., the node of the first zone in the generic zonelist.
  		 * Set up numa_mem percpu variable for on-line cpus.  During
  		 * boot, only the boot cpu should be on-line;  we'll init the
  		 * secondary cpus' numa_mem as they come on-line.  During
  		 * node/memory hotplug, we'll fixup all on-line cpus.
  		 */
  		if (cpu_online(cpu))
  			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
  #endif
  	}
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2897
2898
  	return 0;
  }
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2899
2900
2901
2902
  /*
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
1f522509c   Haicheng Li   mem-hotplug: avoi...
2903
  void build_all_zonelists(void *data)
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2904
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2905
  	set_zonelist_order();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2906
  	if (system_state == SYSTEM_BOOTING) {
423b41d77   Randy Dunlap   [PATCH] mm/page_a...
2907
  		__build_all_zonelists(NULL);
68ad8df42   Mel Gorman   mm: print out the...
2908
  		mminit_verify_zonelist();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2909
2910
  		cpuset_init_current_mems_allowed();
  	} else {
183ff22bb   Simon Arlott   spelling fixes: mm/
2911
  		/* we have to stop all cpus to guarantee there is no user
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2912
  		   of zonelist */
1f522509c   Haicheng Li   mem-hotplug: avoi...
2913
  		stop_machine(__build_all_zonelists, data, NULL);
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
2914
2915
  		/* cpuset refresh routine should be here */
  	}
bd1e22b8e   Andrew Morton   [PATCH] initialis...
2916
  	vm_total_pages = nr_free_pagecache_pages();
9ef9acb05   Mel Gorman   Do not group page...
2917
2918
2919
2920
2921
2922
2923
  	/*
  	 * Disable grouping by mobility if the number of pages in the
  	 * system is too low to allow the mechanism to work. It would be
  	 * more accurate, but expensive to check per-zone. This check is
  	 * made on memory-hotadd so a system can start with mobility
  	 * disabled and enable it later
  	 */
d9c234005   Mel Gorman   Do not depend on ...
2924
  	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb05   Mel Gorman   Do not group page...
2925
2926
2927
2928
2929
2930
2931
  		page_group_by_mobility_disabled = 1;
  	else
  		page_group_by_mobility_disabled = 0;
  
  	printk("Built %i zonelists in %s order, mobility grouping %s.  "
  		"Total pages: %ld
  ",
62bc62a87   Christoph Lameter   page allocator: u...
2932
  			nr_online_nodes,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2933
  			zonelist_order_name[current_zonelist_order],
9ef9acb05   Mel Gorman   Do not group page...
2934
  			page_group_by_mobility_disabled ? "off" : "on",
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2935
2936
2937
2938
2939
  			vm_total_pages);
  #ifdef CONFIG_NUMA
  	printk("Policy zone: %s
  ", zone_names[policy_zone]);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
  }
  
  /*
   * Helper functions to size the waitqueue hash table.
   * Essentially these want to choose hash table sizes sufficiently
   * large so that collisions trying to wait on pages are rare.
   * But in fact, the number of active page waitqueues on typical
   * systems is ridiculously low, less than 200. So this is even
   * conservative, even though it seems large.
   *
   * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
   * waitqueues, i.e. the size of the waitq table given the number of pages.
   */
  #define PAGES_PER_WAITQUEUE	256
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
2954
  #ifndef CONFIG_MEMORY_HOTPLUG
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
2955
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
  {
  	unsigned long size = 1;
  
  	pages /= PAGES_PER_WAITQUEUE;
  
  	while (size < pages)
  		size <<= 1;
  
  	/*
  	 * Once we have dozens or even hundreds of threads sleeping
  	 * on IO we've got bigger problems than wait queue collision.
  	 * Limit the size of the wait table to a reasonable size.
  	 */
  	size = min(size, 4096UL);
  
  	return max(size, 4UL);
  }
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
  #else
  /*
   * A zone's size might be changed by hot-add, so it is not possible to determine
   * a suitable size for its wait_table.  So we use the maximum size now.
   *
   * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
   *
   *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
   *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
   *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
   *
   * The maximum number of entries is reached when a zone's memory is
   * (512K + 256) pages or more, computed the traditional way (see above).  It equals:
   *
   *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
   *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
   *    powerpc (64K page size)             : =  (32G +16M)byte.
   */
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
  	return 4096UL;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
  
  /*
   * This is an integer logarithm so that shifts can be used later
   * to extract the more random high bits from the multiplicative
   * hash function before the remainder is taken.
   */
  static inline unsigned long wait_table_bits(unsigned long size)
  {
  	return ffz(~size);
  }
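  /*
   * Worked example (illustrative, not part of page_alloc.c): a 1GiB zone with
   * 4KiB pages spans 262144 pages; 262144 / PAGES_PER_WAITQUEUE = 1024, which
   * is already a power of two and within the 4096 cap, so
   * wait_table_hash_nr_entries() returns 1024 and wait_table_bits(1024)
   * yields a 10-bit hash.
   */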
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
56fd56b86   Mel Gorman   Bias the location...
3008
  /*
d9c234005   Mel Gorman   Do not depend on ...
3009
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
418589663   Mel Gorman   page allocator: u...
3010
3011
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
   * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
56fd56b86   Mel Gorman   Bias the location...
3012
3013
3014
3015
3016
3017
3018
   * higher will lead to a bigger reserve which will get freed as contiguous
   * blocks as reclaim kicks in
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
  	unsigned long start_pfn, pfn, end_pfn;
  	struct page *page;
78986a678   Mel Gorman   page-allocator: l...
3019
3020
  	unsigned long block_migratetype;
  	int reserve;
56fd56b86   Mel Gorman   Bias the location...
3021
3022
3023
3024
  
  	/* Get the start pfn, end pfn and the number of blocks to reserve */
  	start_pfn = zone->zone_start_pfn;
  	end_pfn = start_pfn + zone->spanned_pages;
418589663   Mel Gorman   page allocator: u...
3025
  	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
d9c234005   Mel Gorman   Do not depend on ...
3026
  							pageblock_order;
56fd56b86   Mel Gorman   Bias the location...
3027

78986a678   Mel Gorman   page-allocator: l...
3028
3029
3030
3031
3032
3033
3034
3035
  	/*
  	 * Reserve blocks are generally in place to help high-order atomic
  	 * allocations that are short-lived. A min_free_kbytes value that
  	 * would result in more than 2 reserve blocks for atomic allocations
  	 * is assumed to be in place to help anti-fragmentation for the
  	 * future allocation of hugepages at runtime.
  	 */
  	reserve = min(2, reserve);
d9c234005   Mel Gorman   Do not depend on ...
3036
  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
56fd56b86   Mel Gorman   Bias the location...
3037
3038
3039
  		if (!pfn_valid(pfn))
  			continue;
  		page = pfn_to_page(pfn);
344c790e3   Adam Litke   mm: make setup_zo...
3040
3041
3042
  		/* Watch out for overlapping nodes */
  		if (page_to_nid(page) != zone_to_nid(zone))
  			continue;
56fd56b86   Mel Gorman   Bias the location...
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
  		/* Blocks with reserved pages will never free, skip them. */
  		if (PageReserved(page))
  			continue;
  
  		block_migratetype = get_pageblock_migratetype(page);
  
  		/* If this block is reserved, account for it */
  		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
  			reserve--;
  			continue;
  		}
  
  		/* Suitable for reserving if this block is movable */
  		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
  			set_pageblock_migratetype(page, MIGRATE_RESERVE);
  			move_freepages_block(zone, page, MIGRATE_RESERVE);
  			reserve--;
  			continue;
  		}
  
  		/*
  		 * If the reserve is met and this is a previous reserved block,
  		 * take it back
  		 */
  		if (block_migratetype == MIGRATE_RESERVE) {
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  			move_freepages_block(zone, page, MIGRATE_MOVABLE);
  		}
  	}
  }
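  /*
   * Worked example (illustrative, not part of page_alloc.c): assuming
   * pageblock_order = 10 (1024 pages per pageblock) and
   * min_wmark_pages(zone) = 1024, reserve = roundup(1024, 1024) >> 10 = 1
   * pageblock; min(2, reserve) then caps zones with larger watermarks at two
   * MIGRATE_RESERVE pageblocks.
   */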
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
3073

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3074
3075
3076
3077
3078
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
   * done. Non-atomic initialization, single-pass.
   */
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3079
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3080
  		unsigned long start_pfn, enum memmap_context context)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3081
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3082
  	struct page *page;
29751f699   Andy Whitcroft   [PATCH] sparsemem...
3083
3084
  	unsigned long end_pfn = start_pfn + size;
  	unsigned long pfn;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3085
  	struct zone *z;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3086

22b31eec6   Hugh Dickins   badpage: vm_norma...
3087
3088
  	if (highest_memmap_pfn < end_pfn - 1)
  		highest_memmap_pfn = end_pfn - 1;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3089
  	z = &NODE_DATA(nid)->node_zones[zone];
cbe8dd4af   Greg Ungerer   [PATCH] memmap_in...
3090
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
  		/*
  		 * There can be holes in boot-time mem_map[]s
  		 * handed to this function.  They do not
  		 * exist on hotplugged memory.
  		 */
  		if (context == MEMMAP_EARLY) {
  			if (!early_pfn_valid(pfn))
  				continue;
  			if (!early_pfn_in_nid(pfn, nid))
  				continue;
  		}
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3102
3103
  		page = pfn_to_page(pfn);
  		set_page_links(page, zone, nid, pfn);
708614e61   Mel Gorman   mm: verify the pa...
3104
  		mminit_verify_page_links(page, zone, nid, pfn);
7835e98b2   Nick Piggin   [PATCH] remove se...
3105
  		init_page_count(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3106
3107
  		reset_page_mapcount(page);
  		SetPageReserved(page);
b2a0ac887   Mel Gorman   Split the free li...
3108
3109
3110
3111
3112
  		/*
  		 * Mark the block movable so that blocks are reserved for
  		 * movable at startup. This will force kernel allocations
  		 * to reserve their blocks rather than leaking throughout
  		 * the address space during boot when many long-lived
56fd56b86   Mel Gorman   Bias the location...
3113
3114
3115
  		 * kernel allocations are made. Later some blocks near
  		 * the start are marked MIGRATE_RESERVE by
  		 * setup_zone_migrate_reserve()
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3116
3117
3118
3119
3120
  		 *
  		 * bitmap is created for zone's valid pfn range. but memmap
  		 * can be created for invalid pages (for alignment)
  		 * check here not to call set_pageblock_migratetype() against
  		 * pfn out of zone.
b2a0ac887   Mel Gorman   Split the free li...
3121
  		 */
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3122
3123
3124
  		if ((z->zone_start_pfn <= pfn)
  		    && (pfn < z->zone_start_pfn + z->spanned_pages)
  		    && !(pfn & (pageblock_nr_pages - 1)))
56fd56b86   Mel Gorman   Bias the location...
3125
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
b2a0ac887   Mel Gorman   Split the free li...
3126

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3127
3128
3129
3130
  		INIT_LIST_HEAD(&page->lru);
  #ifdef WANT_PAGE_VIRTUAL
  		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
  		if (!is_highmem_idx(zone))
3212c6be2   Bob Picco   [PATCH] fix WANT_...
3131
  			set_page_address(page, __va(pfn << PAGE_SHIFT));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3132
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3133
3134
  	}
  }
1e548deb5   Andi Kleen   page allocator: r...
3135
  static void __meminit zone_init_free_lists(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3136
  {
b2a0ac887   Mel Gorman   Split the free li...
3137
3138
3139
  	int order, t;
  	for_each_migratetype_order(order, t) {
  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3140
3141
3142
3143
3144
3145
  		zone->free_area[order].nr_free = 0;
  	}
  }
  
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3146
  	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3147
  #endif
1d6f4e60e   Sam Ravnborg   mm: fix section m...
3148
  static int zone_batchsize(struct zone *zone)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3149
  {
3a6be87fd   David Howells   nommu: clamp zone...
3150
  #ifdef CONFIG_MMU
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3151
3152
3153
3154
  	int batch;
  
  	/*
  	 * The per-cpu-pages pools are set to around 1000th of the
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3155
  	 * size of the zone.  But no more than 1/2 of a meg.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3156
3157
3158
3159
  	 *
  	 * OK, so we don't know how big the cache is.  So guess.
  	 */
  	batch = zone->present_pages / 1024;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3160
3161
  	if (batch * PAGE_SIZE > 512 * 1024)
  		batch = (512 * 1024) / PAGE_SIZE;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3162
3163
3164
3165
3166
  	batch /= 4;		/* We effectively *= 4 below */
  	if (batch < 1)
  		batch = 1;
  
  	/*
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3167
3168
3169
  	 * Clamp the batch to a 2^n - 1 value. Having a power
  	 * of 2 value was found to be more likely to have
  	 * suboptimal cache aliasing properties in some cases.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3170
  	 *
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3171
3172
3173
3174
  	 * For example if 2 tasks are alternately allocating
  	 * batches of pages, one task can end up with a lot
  	 * of pages of one half of the possible page colors
  	 * and the other with pages of the other colors.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3175
  	 */
9155203a5   David Howells   mm: use roundown_...
3176
  	batch = rounddown_pow_of_two(batch + batch/2) - 1;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3177

e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3178
  	return batch;
3a6be87fd   David Howells   nommu: clamp zone...
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
  
  #else
  	/* The deferral and batching of frees should be suppressed under NOMMU
  	 * conditions.
  	 *
  	 * The problem is that NOMMU needs to be able to allocate large chunks
  	 * of contiguous memory as there's no hardware page translation to
  	 * assemble apparent contiguous memory from discontiguous pages.
  	 *
  	 * Queueing large contiguous runs of pages for batching, however,
  	 * causes the pages to actually be freed in smaller chunks.  As there
  	 * can be a significant delay between the individual batches being
  	 * recycled, this leads to the once large chunks of space being
  	 * fragmented and becoming unavailable for high-order allocations.
  	 */
  	return 0;
  #endif
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3196
  }
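
  For a concrete feel of the batch sizing above, here is a standalone userspace sketch (not part of page_alloc.c) that mirrors the MMU-case arithmetic; the 4 KiB page size and the 1 GiB zone are assumptions chosen purely for illustration.

  /* Illustrative userspace sketch of the zone_batchsize() arithmetic above.
   * Assumes a 4 KiB page size and a 1 GiB zone only for the example. */
  #include <stdio.h>

  #define EXAMPLE_PAGE_SIZE 4096UL

  static unsigned long example_rounddown_pow_of_two(unsigned long n)
  {
  	unsigned long p = 1;

  	while ((p << 1) <= n)
  		p <<= 1;
  	return p;
  }

  static int example_zone_batchsize(unsigned long present_pages)
  {
  	unsigned long batch = present_pages / 1024;	/* ~1/1000th of the zone */

  	if (batch * EXAMPLE_PAGE_SIZE > 512 * 1024)	/* cap at 512 KiB */
  		batch = (512 * 1024) / EXAMPLE_PAGE_SIZE;
  	batch /= 4;					/* high water mark is a multiple of this */
  	if (batch < 1)
  		batch = 1;
  	return example_rounddown_pow_of_two(batch + batch / 2) - 1;
  }

  int main(void)
  {
  	/* 1 GiB zone = 262144 pages: 256 -> capped to 128 -> 32
  	 * -> rounddown_pow_of_two(48) - 1 = 31 */
  	printf("batch = %d\n", example_zone_batchsize(262144UL));
  	return 0;
  }

  With setup_pageset() just below, such a zone would get pcp->batch = 31 and pcp->high = 6 * 31 = 186.
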
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3197
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3198
3199
  {
  	struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
3200
  	int migratetype;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3201

1c6fe9465   Magnus Damm   [PATCH] NUMA: bro...
3202
  	memset(p, 0, sizeof(*p));
3dfa5721f   Christoph Lameter   Page allocator: g...
3203
  	pcp = &p->pcp;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3204
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3205
3206
  	pcp->high = 6 * batch;
  	pcp->batch = max(1UL, 1 * batch);
5f8dcc212   Mel Gorman   page-allocator: s...
3207
3208
  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3209
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3210
3211
3212
3213
3214
3215
3216
3217
3218
  /*
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
  
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  				unsigned long high)
  {
  	struct per_cpu_pages *pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
3219
  	pcp = &p->pcp;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3220
3221
3222
3223
3224
  	pcp->high = high;
  	pcp->batch = max(1UL, high/4);
  	if ((high/4) > (PAGE_SHIFT * 8))
  		pcp->batch = PAGE_SHIFT * 8;
  }
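
  As a rough worked example of the clamp above (assuming PAGE_SHIFT = 12, i.e. 4 KiB pages): if percpu_pagelist_fraction were set to 8 for a zone with 262144 present pages, setup_zone_pageset() below would pass high = 262144 / 8 = 32768, and because 32768 / 4 = 8192 exceeds PAGE_SHIFT * 8 = 96, pcp->batch would be clamped to 96 while pcp->high stays at 32768.
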
319774e25   Wu Fengguang   mem-hotplug: sepa...
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
  static __meminit void setup_zone_pageset(struct zone *zone)
  {
  	int cpu;
  
  	zone->pageset = alloc_percpu(struct per_cpu_pageset);
  
  	for_each_possible_cpu(cpu) {
  		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
  
  		setup_pageset(pcp, zone_batchsize(zone));
  
  		if (percpu_pagelist_fraction)
  			setup_pagelist_highmark(pcp,
  				(zone->present_pages /
  					percpu_pagelist_fraction));
  	}
  }
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3242
  /*
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3243
3244
   * Allocate per cpu pagesets and initialize them.
   * Before this call only boot pagesets were available.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3245
   */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3246
  void __init setup_per_cpu_pageset(void)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3247
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3248
  	struct zone *zone;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3249

319774e25   Wu Fengguang   mem-hotplug: sepa...
3250
3251
  	for_each_populated_zone(zone)
  		setup_zone_pageset(zone);
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3252
  }
577a32f62   Sam Ravnborg   mm: fix section m...
3253
  static noinline __init_refok
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3254
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3255
3256
3257
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3258
  	size_t alloc_size;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3259
3260
3261
3262
3263
  
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3264
3265
3266
3267
  	zone->wait_table_hash_nr_entries =
  		 wait_table_hash_nr_entries(zone_size_pages);
  	zone->wait_table_bits =
  		wait_table_bits(zone->wait_table_hash_nr_entries);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3268
3269
  	alloc_size = zone->wait_table_hash_nr_entries
  					* sizeof(wait_queue_head_t);
cd94b9dbf   Heiko Carstens   memory hotplug: f...
3270
  	if (!slab_is_available()) {
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
  		zone->wait_table = (wait_queue_head_t *)
  			alloc_bootmem_node(pgdat, alloc_size);
  	} else {
  		/*
  		 * This case means that a zone whose size was 0 gets new memory
  		 * via memory hot-add.
  		 * But it may be the case that a new node was hot-added.  In
  		 * this case vmalloc() will not be able to use this new node's
  		 * memory - this wait_table must be initialized to use this new
  		 * node itself as well.
  		 * To use this new node's memory, further consideration will be
  		 * necessary.
  		 */
8691f3a72   Jesper Juhl   mm: no need to ca...
3284
  		zone->wait_table = vmalloc(alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3285
3286
3287
  	}
  	if (!zone->wait_table)
  		return -ENOMEM;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3288

02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3289
  	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3290
  		init_waitqueue_head(zone->wait_table + i);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3291
3292
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3293
  }
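
  For a sense of scale (hedged, since wait_table_hash_nr_entries() and wait_table_bits() are defined earlier in this file and not shown here): if those helpers size the table at roughly one waitqueue per 256 pages, capped at 4096 entries, a 1 GiB zone of 4 KiB pages (262144 pages) would get 1024 hashed wait queues, so alloc_size is 1024 * sizeof(wait_queue_head_t) and wait_table_bits is 10. The slab_is_available() check only decides where that array lives: bootmem during early boot, vmalloc for the memory hot-add path.
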
112067f09   Shaohua Li   memory hotplug: u...
3294
3295
3296
3297
3298
  static int __zone_pcp_update(void *data)
  {
  	struct zone *zone = data;
  	int cpu;
  	unsigned long batch = zone_batchsize(zone), flags;
2d30a1f63   Thomas Gleixner   mm: do not iterat...
3299
  	for_each_possible_cpu(cpu) {
112067f09   Shaohua Li   memory hotplug: u...
3300
3301
  		struct per_cpu_pageset *pset;
  		struct per_cpu_pages *pcp;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3302
  		pset = per_cpu_ptr(zone->pageset, cpu);
112067f09   Shaohua Li   memory hotplug: u...
3303
3304
3305
  		pcp = &pset->pcp;
  
  		local_irq_save(flags);
5f8dcc212   Mel Gorman   page-allocator: s...
3306
  		free_pcppages_bulk(zone, pcp->count, pcp);
112067f09   Shaohua Li   memory hotplug: u...
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
  		setup_pageset(pset, batch);
  		local_irq_restore(flags);
  	}
  	return 0;
  }
  
  void zone_pcp_update(struct zone *zone)
  {
  	stop_machine(__zone_pcp_update, zone, NULL);
  }
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3317
  static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3318
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3319
3320
3321
3322
3323
3324
  	/*
  	 * per cpu subsystem is not up at this point. The following code
  	 * relies on the ability of the linker to provide the
  	 * offset of a (static) per cpu variable into the per cpu area.
  	 */
  	zone->pageset = &boot_pageset;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3325

f5335c0f1   Anton Blanchard   [PATCH] quieten z...
3326
  	if (zone->present_pages)
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3327
3328
3329
3330
  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u
  ",
  			zone->name, zone->present_pages,
  					 zone_batchsize(zone));
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3331
  }
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3332
3333
  __meminit int init_currently_empty_zone(struct zone *zone,
  					unsigned long zone_start_pfn,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3334
3335
  					unsigned long size,
  					enum memmap_context context)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3336
3337
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3338
3339
3340
3341
  	int ret;
  	ret = zone_wait_table_init(zone, size);
  	if (ret)
  		return ret;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3342
  	pgdat->nr_zones = zone_idx(zone) + 1;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3343
  	zone->zone_start_pfn = zone_start_pfn;
708614e61   Mel Gorman   mm: verify the pa...
3344
3345
3346
3347
3348
3349
  	mminit_dprintk(MMINIT_TRACE, "memmap_init",
  			"Initialising map node %d zone %lu pfns %lu -> %lu
  ",
  			pgdat->node_id,
  			(unsigned long)zone_idx(zone),
  			zone_start_pfn, (zone_start_pfn + size));
1e548deb5   Andi Kleen   page allocator: r...
3350
  	zone_init_free_lists(zone);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3351
3352
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3353
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3354
3355
3356
3357
3358
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /*
   * Basic iterator support. Return the first range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns first region regardless of node
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3359
  static int __meminit first_active_region_index_in_nid(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++)
  		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
  			return i;
  
  	return -1;
  }
  
  /*
   * Basic iterator support. Return the next active range of PFNs for a node
183ff22bb   Simon Arlott   spelling fixes: mm/
3372
   * Note: nid == MAX_NUMNODES returns next region regardless of node
c713216de   Mel Gorman   [PATCH] Introduce...
3373
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3374
  static int __meminit next_active_region_index_in_nid(int index, int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
  {
  	for (index = index + 1; index < nr_nodemap_entries; index++)
  		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
  			return index;
  
  	return -1;
  }
  
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   * Architectures may implement their own version but if add_active_range()
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3390
  int __meminit __early_pfn_to_nid(unsigned long pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		unsigned long start_pfn = early_node_map[i].start_pfn;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (start_pfn <= pfn && pfn < end_pfn)
  			return early_node_map[i].nid;
  	}
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3401
3402
  	/* This is a memory hole */
  	return -1;
c713216de   Mel Gorman   [PATCH] Introduce...
3403
3404
  }
  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3405
3406
  int __meminit early_pfn_to_nid(unsigned long pfn)
  {
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3407
3408
3409
3410
3411
3412
3413
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0)
  		return nid;
  	/* just returns 0 */
  	return 0;
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3414
  }
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
  #ifdef CONFIG_NODES_SPAN_OTHER_NODES
  bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  {
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0 && nid != node)
  		return false;
  	return true;
  }
  #endif
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3426

c713216de   Mel Gorman   [PATCH] Introduce...
3427
3428
3429
3430
3431
3432
3433
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
  	for (i = first_active_region_index_in_nid(nid); i != -1; \
  				i = next_active_region_index_in_nid(i, nid))
  
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3434
3435
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
c713216de   Mel Gorman   [PATCH] Introduce...
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
   * function may be used instead of calling free_bootmem() manually.
   */
  void __init free_bootmem_with_active_regions(int nid,
  						unsigned long max_low_pfn)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		unsigned long size_pages = 0;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (early_node_map[i].start_pfn >= max_low_pfn)
  			continue;
  
  		if (end_pfn > max_low_pfn)
  			end_pfn = max_low_pfn;
  
  		size_pages = end_pfn - early_node_map[i].start_pfn;
  		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
  				PFN_PHYS(early_node_map[i].start_pfn),
  				size_pages << PAGE_SHIFT);
  	}
  }
08677214e   Yinghai Lu   x86: Make 64 bit ...
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
  int __init add_from_early_node_map(struct range *range, int az,
  				   int nr_range, int nid)
  {
  	int i;
  	u64 start, end;
  
  	/* need to go over early_node_map to find out good range for node */
  	for_each_active_range_index_in_nid(i, nid) {
  		start = early_node_map[i].start_pfn;
  		end = early_node_map[i].end_pfn;
  		nr_range = add_range(range, az, nr_range, start, end);
  	}
  	return nr_range;
  }
2ee78f7b1   Yinghai Lu   x86: Fix non-boot...
3476
  #ifdef CONFIG_NO_BOOTMEM
08677214e   Yinghai Lu   x86: Make 64 bit ...
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
  void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
  					u64 goal, u64 limit)
  {
  	int i;
  	void *ptr;
  
  	/* need to go over early_node_map to find out good range for node */
  	for_each_active_range_index_in_nid(i, nid) {
  		u64 addr;
  		u64 ei_start, ei_last;
  
  		ei_last = early_node_map[i].end_pfn;
  		ei_last <<= PAGE_SHIFT;
  		ei_start = early_node_map[i].start_pfn;
  		ei_start <<= PAGE_SHIFT;
  		addr = find_early_area(ei_start, ei_last,
  					 goal, limit, size, align);
  
  		if (addr == -1ULL)
  			continue;
  
  #if 0
  		printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx
  ",
  				nid,
  				ei_start, ei_last, goal, limit, size,
  				align, addr);
  #endif
  
  		ptr = phys_to_virt(addr);
  		memset(ptr, 0, size);
  		reserve_early_without_check(addr, addr + size, "BOOTMEM");
  		return ptr;
  	}
  
  	return NULL;
  }
2ee78f7b1   Yinghai Lu   x86: Fix non-boot...
3514
  #endif
08677214e   Yinghai Lu   x86: Make 64 bit ...
3515

b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3516
3517
3518
  void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
  {
  	int i;
d52d53b8a   Yinghai Lu   RFC x86: try to r...
3519
  	int ret;
b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3520

d52d53b8a   Yinghai Lu   RFC x86: try to r...
3521
3522
3523
3524
3525
3526
  	for_each_active_range_index_in_nid(i, nid) {
  		ret = work_fn(early_node_map[i].start_pfn,
  			      early_node_map[i].end_pfn, data);
  		if (ret)
  			break;
  	}
b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3527
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3528
3529
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3530
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
c713216de   Mel Gorman   [PATCH] Introduce...
3531
3532
3533
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3534
   * function may be used instead of calling memory_present() manually.
c713216de   Mel Gorman   [PATCH] Introduce...
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid)
  		memory_present(early_node_map[i].nid,
  				early_node_map[i].start_pfn,
  				early_node_map[i].end_pfn);
  }
  
  /**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3548
3549
3550
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216de   Mel Gorman   [PATCH] Introduce...
3551
3552
3553
3554
   *
   * It returns the start and end page frame of a node based on information
   * provided by an arch calling add_active_range(). If called for a node
   * with no available memory, a warning is printed and the start and end
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3555
   * PFNs will be 0.
c713216de   Mel Gorman   [PATCH] Introduce...
3556
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3557
  void __meminit get_pfn_range_for_nid(unsigned int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
  			unsigned long *start_pfn, unsigned long *end_pfn)
  {
  	int i;
  	*start_pfn = -1UL;
  	*end_pfn = 0;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
  		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
  	}
633c0666b   Christoph Lameter   Memoryless nodes:...
3568
  	if (*start_pfn == -1UL)
c713216de   Mel Gorman   [PATCH] Introduce...
3569
  		*start_pfn = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
3570
3571
3572
  }
  
  /*
2a1e274ac   Mel Gorman   Create the ZONE_M...
3573
3574
3575
3576
   * This finds a zone that can be used for ZONE_MOVABLE pages. The
   * assumption is made that zones within a node are ordered in monotonically
   * increasing memory addresses so that the "highest" populated zone is used
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3577
  static void __init find_usable_zone_for_movable(void)
2a1e274ac   Mel Gorman   Create the ZONE_M...
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
  {
  	int zone_index;
  	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
  		if (zone_index == ZONE_MOVABLE)
  			continue;
  
  		if (arch_zone_highest_possible_pfn[zone_index] >
  				arch_zone_lowest_possible_pfn[zone_index])
  			break;
  	}
  
  	VM_BUG_ON(zone_index == -1);
  	movable_zone = zone_index;
  }
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
   * because it is sized independently of architecture. Unlike the other zones,
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
   * provided by the architecture for a given node by using the end of the
   * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
   * zones within a node are in order of monotonically increasing memory addresses.
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3603
  static void __meminit adjust_zone_range_for_zone_movable(int nid,
2a1e274ac   Mel Gorman   Create the ZONE_M...
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
  					unsigned long zone_type,
  					unsigned long node_start_pfn,
  					unsigned long node_end_pfn,
  					unsigned long *zone_start_pfn,
  					unsigned long *zone_end_pfn)
  {
  	/* Only adjust if ZONE_MOVABLE is on this node */
  	if (zone_movable_pfn[nid]) {
  		/* Size ZONE_MOVABLE */
  		if (zone_type == ZONE_MOVABLE) {
  			*zone_start_pfn = zone_movable_pfn[nid];
  			*zone_end_pfn = min(node_end_pfn,
  				arch_zone_highest_possible_pfn[movable_zone]);
  
  		/* Adjust for ZONE_MOVABLE starting within this range */
  		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
  				*zone_end_pfn > zone_movable_pfn[nid]) {
  			*zone_end_pfn = zone_movable_pfn[nid];
  
  		/* Check if this whole range is within ZONE_MOVABLE */
  		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
  			*zone_start_pfn = *zone_end_pfn;
  	}
  }
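
  The clipping cases above are easiest to see with numbers. The standalone sketch below (hypothetical PFNs, not kernel code) reproduces the two branches taken for a zone other than ZONE_MOVABLE when ZONE_MOVABLE begins at pfn 0x60000 on the node.

  /* Standalone illustration of the clipping above with made-up PFNs. */
  #include <stdio.h>

  /* Mirrors the two adjustment branches for a zone that is not ZONE_MOVABLE. */
  static void example_adjust(unsigned long movable_start,
  			   unsigned long *zone_start, unsigned long *zone_end)
  {
  	if (*zone_start < movable_start && *zone_end > movable_start)
  		*zone_end = movable_start;	/* ZONE_MOVABLE starts inside this zone */
  	else if (*zone_start >= movable_start)
  		*zone_start = *zone_end;	/* whole range belongs to ZONE_MOVABLE */
  }

  int main(void)
  {
  	unsigned long start = 0x40000, end = 0x80000;

  	/* ZONE_MOVABLE begins at 0x60000: the zone is clipped to 0x40000-0x60000 */
  	example_adjust(0x60000, &start, &end);
  	printf("%#lx-%#lx\n", start, end);

  	/* A zone lying entirely at or above 0x60000 collapses to an empty span */
  	start = 0x60000, end = 0x80000;
  	example_adjust(0x60000, &start, &end);
  	printf("%#lx-%#lx\n", start, end);
  	return 0;
  }
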
  
  /*
c713216de   Mel Gorman   [PATCH] Introduce...
3630
3631
3632
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
6ea6e6887   Paul Mundt   mm: more __memini...
3633
  static unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	/* Get the start and end of the node and zone */
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2a1e274ac   Mel Gorman   Create the ZONE_M...
3644
3645
3646
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  				node_start_pfn, node_end_pfn,
  				&zone_start_pfn, &zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
  
  	/* Check that this node has pages within the zone's required range */
  	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
  		return 0;
  
  	/* Move the zone boundaries inside the node if necessary */
  	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
  	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
  
  	/* Return the spanned pages */
  	return zone_end_pfn - zone_start_pfn;
  }
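
  A hedged example with made-up PFNs (and no ZONE_MOVABLE adjustment on the node): if the node spans PFNs 0x1000-0x9000 and the architecture placed this zone at 0x0-0x4000, the boundaries are clipped to 0x1000-0x4000 and the function reports 0x3000 spanned pages; a zone range lying entirely outside the node would return 0.
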
  
  /*
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3662
   * then all holes in the requested range will be accounted for.
c713216de   Mel Gorman   [PATCH] Introduce...
3663
   */
329962503   Yinghai Lu   x86: Fix checking...
3664
  unsigned long __meminit __absent_pages_in_range(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
  				unsigned long range_start_pfn,
  				unsigned long range_end_pfn)
  {
  	int i = 0;
  	unsigned long prev_end_pfn = 0, hole_pages = 0;
  	unsigned long start_pfn;
  
  	/* Find the end_pfn of the first active range of pfns in the node */
  	i = first_active_region_index_in_nid(nid);
  	if (i == -1)
  		return 0;
b5445f956   Mel Gorman   Allow nodes to ex...
3676
  	prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3677
3678
  	/* Account for ranges before physical memory on this node */
  	if (early_node_map[i].start_pfn > range_start_pfn)
b5445f956   Mel Gorman   Allow nodes to ex...
3679
  		hole_pages = prev_end_pfn - range_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
  
  	/* Find all holes for the zone within the node */
  	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
  
  		/* No need to continue if prev_end_pfn is outside the zone */
  		if (prev_end_pfn >= range_end_pfn)
  			break;
  
  		/* Make sure the end of the zone is not within the hole */
  		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
  		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
  
  		/* Update the hole size count and move on */
  		if (start_pfn > range_start_pfn) {
  			BUG_ON(prev_end_pfn > start_pfn);
  			hole_pages += start_pfn - prev_end_pfn;
  		}
  		prev_end_pfn = early_node_map[i].end_pfn;
  	}
9c7cd6877   Mel Gorman   [PATCH] Account f...
3699
3700
  	/* Account for ranges past physical memory on this node */
  	if (range_end_pfn > prev_end_pfn)
0c6cb9746   Mel Gorman   [PATCH] Calculati...
3701
  		hole_pages += range_end_pfn -
9c7cd6877   Mel Gorman   [PATCH] Account f...
3702
  				max(range_start_pfn, prev_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3703
3704
3705
3706
3707
3708
3709
3710
  	return hole_pages;
  }
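
  A hedged example with made-up ranges: if a node registered active PFNs 0x1000-0x2000 and 0x3000-0x5000 and this function is asked about 0x0-0x6000, it counts 0x1000 pages of hole before the first region, 0x1000 between the two regions, and 0x1000 after the last one, returning hole_pages = 0x3000.
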
  
  /**
   * absent_pages_in_range - Return number of page frames in holes within a range
   * @start_pfn: The start PFN to start searching for holes
   * @end_pfn: The end PFN to stop searching for holes
   *
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3711
   * It returns the number of page frames in memory holes within a range.
c713216de   Mel Gorman   [PATCH] Introduce...
3712
3713
3714
3715
3716
3717
3718
3719
   */
  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  							unsigned long end_pfn)
  {
  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
  }
  
  /* Return the number of page frames in holes in a zone on a node */
6ea6e6887   Paul Mundt   mm: more __memini...
3720
  static unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3721
3722
3723
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
9c7cd6877   Mel Gorman   [PATCH] Account f...
3724
3725
3726
3727
3728
3729
3730
3731
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
  							node_start_pfn);
  	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
  							node_end_pfn);
2a1e274ac   Mel Gorman   Create the ZONE_M...
3732
3733
3734
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  			node_start_pfn, node_end_pfn,
  			&zone_start_pfn, &zone_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3735
  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3736
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3737

c713216de   Mel Gorman   [PATCH] Introduce...
3738
  #else
6ea6e6887   Paul Mundt   mm: more __memini...
3739
  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3740
3741
3742
3743
3744
  					unsigned long zone_type,
  					unsigned long *zones_size)
  {
  	return zones_size[zone_type];
  }
6ea6e6887   Paul Mundt   mm: more __memini...
3745
  static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3746
3747
3748
3749
3750
3751
3752
3753
  						unsigned long zone_type,
  						unsigned long *zholes_size)
  {
  	if (!zholes_size)
  		return 0;
  
  	return zholes_size[zone_type];
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3754

c713216de   Mel Gorman   [PATCH] Introduce...
3755
  #endif
a3142c8e1   Yasunori Goto   Fix section misma...
3756
  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
c713216de   Mel Gorman   [PATCH] Introduce...
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
  	unsigned long realtotalpages, totalpages = 0;
  	enum zone_type i;
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
  								zones_size);
  	pgdat->node_spanned_pages = totalpages;
  
  	realtotalpages = totalpages;
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		realtotalpages -=
  			zone_absent_pages_in_node(pgdat->node_id, i,
  								zholes_size);
  	pgdat->node_present_pages = realtotalpages;
  	printk(KERN_DEBUG "On node %d totalpages: %lu
  ", pgdat->node_id,
  							realtotalpages);
  }
835c134ec   Mel Gorman   Add a bitmap that...
3777
3778
3779
  #ifndef CONFIG_SPARSEMEM
  /*
   * Calculate the size of the zone->blockflags rounded to an unsigned long
d9c234005   Mel Gorman   Do not depend on ...
3780
3781
   * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
   * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
835c134ec   Mel Gorman   Add a bitmap that...
3782
3783
3784
3785
3786
3787
   * round what is now in bits to nearest long in bits, then return it in
   * bytes.
   */
  static unsigned long __init usemap_size(unsigned long zonesize)
  {
  	unsigned long usemapsize;
d9c234005   Mel Gorman   Do not depend on ...
3788
3789
  	usemapsize = roundup(zonesize, pageblock_nr_pages);
  	usemapsize = usemapsize >> pageblock_order;
835c134ec   Mel Gorman   Add a bitmap that...
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
  	usemapsize *= NR_PAGEBLOCK_BITS;
  	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
  
  	return usemapsize / 8;
  }
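
  To make the usemap arithmetic above concrete, the standalone sketch below repeats it for a hypothetical 1 GiB zone; the pageblock order (9) and the four bits per pageblock are assumed, configuration-dependent values chosen only for the example.

  /* Illustrative userspace sketch of the usemap_size() arithmetic above. */
  #include <stdio.h>

  #define EX_PAGEBLOCK_ORDER	9
  #define EX_PAGEBLOCK_NR_PAGES	(1UL << EX_PAGEBLOCK_ORDER)
  #define EX_NR_PAGEBLOCK_BITS	4

  static unsigned long ex_roundup(unsigned long x, unsigned long y)
  {
  	return ((x + y - 1) / y) * y;
  }

  static unsigned long example_usemap_size(unsigned long zonesize)
  {
  	unsigned long usemapsize;

  	usemapsize = ex_roundup(zonesize, EX_PAGEBLOCK_NR_PAGES);
  	usemapsize >>= EX_PAGEBLOCK_ORDER;		/* number of pageblocks */
  	usemapsize *= EX_NR_PAGEBLOCK_BITS;		/* bits needed */
  	usemapsize = ex_roundup(usemapsize, 8 * sizeof(unsigned long));
  	return usemapsize / 8;				/* bytes */
  }

  int main(void)
  {
  	/* A 1 GiB zone of 4 KiB pages: 262144 pages -> 512 pageblocks
  	 * -> 2048 bits -> 256 bytes of pageblock flags. */
  	printf("%lu bytes\n", example_usemap_size(262144UL));
  	return 0;
  }
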
  
  static void __init setup_usemap(struct pglist_data *pgdat,
  				struct zone *zone, unsigned long zonesize)
  {
  	unsigned long usemapsize = usemap_size(zonesize);
  	zone->pageblock_flags = NULL;
58a01a457   Julia Lawall   mm/page_alloc.c: ...
3801
  	if (usemapsize)
835c134ec   Mel Gorman   Add a bitmap that...
3802
  		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
835c134ec   Mel Gorman   Add a bitmap that...
3803
3804
3805
3806
3807
  }
  #else
  static void inline setup_usemap(struct pglist_data *pgdat,
  				struct zone *zone, unsigned long zonesize) {}
  #endif /* CONFIG_SPARSEMEM */
d9c234005   Mel Gorman   Do not depend on ...
3808
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
ba72cb8cb   Mel Gorman   Fix boot problem ...
3809
3810
3811
3812
3813
3814
3815
3816
3817
  
  /* Return a sensible default order for the pageblock size. */
  static inline int pageblock_default_order(void)
  {
  	if (HPAGE_SHIFT > PAGE_SHIFT)
  		return HUGETLB_PAGE_ORDER;
  
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
  static inline void __init set_pageblock_order(unsigned int order)
  {
  	/* Check that pageblock_nr_pages has not already been setup */
  	if (pageblock_order)
  		return;
  
  	/*
  	 * Assume the largest contiguous order of interest is a huge page.
  	 * This value may be variable depending on boot parameters on IA64
  	 */
  	pageblock_order = order;
  }
  #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
ba72cb8cb   Mel Gorman   Fix boot problem ...
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
  /*
   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
   * and pageblock_default_order() are unused as pageblock_order is set
   * at compile-time. See include/linux/pageblock-flags.h for the values of
   * pageblock_order based on the kernel config
   */
  static inline int pageblock_default_order(unsigned int order)
  {
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
3842
3843
3844
  #define set_pageblock_order(x)	do {} while (0)
  
  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3845
3846
3847
3848
3849
3850
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
   */
b5a0e0113   Alexander van Heukelum   Solve section mis...
3851
  static void __paginginit free_area_init_core(struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3852
3853
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
3854
  	enum zone_type j;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3855
  	int nid = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3856
  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3857
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3858

208d54e55   Dave Hansen   [PATCH] memory ho...
3859
  	pgdat_resize_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3860
3861
3862
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3863
  	pgdat_page_cgroup_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3864
3865
3866
  	
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
0e0b864e0   Mel Gorman   [PATCH] Account f...
3867
  		unsigned long size, realsize, memmap_pages;
b69408e88   Christoph Lameter   vmscan: Use an in...
3868
  		enum lru_list l;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3869

c713216de   Mel Gorman   [PATCH] Introduce...
3870
3871
3872
  		size = zone_spanned_pages_in_node(nid, j, zones_size);
  		realsize = size - zone_absent_pages_in_node(nid, j,
  								zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3873

0e0b864e0   Mel Gorman   [PATCH] Account f...
3874
3875
3876
3877
3878
  		/*
  		 * Adjust realsize so that it accounts for how much memory
  		 * is used by this zone for memmap. This affects the watermark
  		 * and per-cpu initialisations
  		 */
f72321541   Johannes Weiner   mm: don't drop a ...
3879
3880
  		memmap_pages =
  			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
0e0b864e0   Mel Gorman   [PATCH] Account f...
3881
3882
  		if (realsize >= memmap_pages) {
  			realsize -= memmap_pages;
5594c8c81   Yinghai Lu   mm: print out mem...
3883
3884
3885
3886
3887
  			if (memmap_pages)
  				printk(KERN_DEBUG
  				       "  %s zone: %lu pages used for memmap
  ",
  				       zone_names[j], memmap_pages);
0e0b864e0   Mel Gorman   [PATCH] Account f...
3888
3889
3890
3891
3892
  		} else
  			printk(KERN_WARNING
  				"  %s zone: %lu pages exceeds realsize %lu
  ",
  				zone_names[j], memmap_pages, realsize);
6267276f3   Christoph Lameter   [PATCH] optional ...
3893
3894
  		/* Account for reserved pages */
  		if (j == 0 && realsize > dma_reserve) {
0e0b864e0   Mel Gorman   [PATCH] Account f...
3895
  			realsize -= dma_reserve;
d903ef9f3   Yinghai Lu   mm: print out mem...
3896
3897
  			printk(KERN_DEBUG "  %s zone: %lu pages reserved
  ",
6267276f3   Christoph Lameter   [PATCH] optional ...
3898
  					zone_names[0], dma_reserve);
0e0b864e0   Mel Gorman   [PATCH] Account f...
3899
  		}
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
3900
  		if (!is_highmem_idx(j))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3901
3902
3903
3904
3905
  			nr_kernel_pages += realsize;
  		nr_all_pages += realsize;
  
  		zone->spanned_pages = size;
  		zone->present_pages = realsize;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
3906
  #ifdef CONFIG_NUMA
d5f541ed6   Christoph Lameter   [PATCH] Add node ...
3907
  		zone->node = nid;
8417bba4b   Christoph Lameter   [PATCH] Replace m...
3908
  		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
3909
  						/ 100;
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
3910
  		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
3911
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3912
3913
3914
  		zone->name = zone_names[j];
  		spin_lock_init(&zone->lock);
  		spin_lock_init(&zone->lru_lock);
bdc8cb984   Dave Hansen   [PATCH] memory ho...
3915
  		zone_seqlock_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3916
  		zone->zone_pgdat = pgdat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3917

3bb1a852a   Martin Bligh   [PATCH] vmscan: F...
3918
  		zone->prev_priority = DEF_PRIORITY;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3919

ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3920
  		zone_pcp_init(zone);
b69408e88   Christoph Lameter   vmscan: Use an in...
3921
3922
  		for_each_lru(l) {
  			INIT_LIST_HEAD(&zone->lru[l].list);
f86296317   Wu Fengguang   mm: do batched sc...
3923
  			zone->reclaim_stat.nr_saved_scan[l] = 0;
b69408e88   Christoph Lameter   vmscan: Use an in...
3924
  		}
6e9015716   KOSAKI Motohiro   mm: introduce zon...
3925
3926
3927
3928
  		zone->reclaim_stat.recent_rotated[0] = 0;
  		zone->reclaim_stat.recent_rotated[1] = 0;
  		zone->reclaim_stat.recent_scanned[0] = 0;
  		zone->reclaim_stat.recent_scanned[1] = 0;
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
3929
  		zap_zone_vm_stats(zone);
e815af95f   David Rientjes   oom: change all_u...
3930
  		zone->flags = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3931
3932
  		if (!size)
  			continue;
ba72cb8cb   Mel Gorman   Fix boot problem ...
3933
  		set_pageblock_order(pageblock_default_order());
835c134ec   Mel Gorman   Add a bitmap that...
3934
  		setup_usemap(pgdat, zone, size);
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3935
3936
  		ret = init_currently_empty_zone(zone, zone_start_pfn,
  						size, MEMMAP_EARLY);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3937
  		BUG_ON(ret);
76cdd58e5   Heiko Carstens   memory_hotplug: a...
3938
  		memmap_init(size, nid, j, zone_start_pfn);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3939
  		zone_start_pfn += size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3940
3941
  	}
  }
577a32f62   Sam Ravnborg   mm: fix section m...
3942
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3943
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3944
3945
3946
  	/* Skip empty nodes */
  	if (!pgdat->node_spanned_pages)
  		return;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3947
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3948
3949
  	/* ia64 gets its own node_mem_map, before this, without bootmem */
  	if (!pgdat->node_mem_map) {
e984bb43f   Bob Picco   [PATCH] Align the...
3950
  		unsigned long size, start, end;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3951
  		struct page *map;
e984bb43f   Bob Picco   [PATCH] Align the...
3952
3953
3954
3955
3956
3957
3958
3959
3960
  		/*
  		 * The zone's endpoints aren't required to be MAX_ORDER
  		 * aligned but the node_mem_map endpoints must be in order
  		 * for the buddy allocator to function correctly.
  		 */
  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
  		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
  		size =  (end - start) * sizeof(struct page);
6f167ec72   Dave Hansen   [PATCH] sparsemem...
3961
3962
3963
  		map = alloc_remap(pgdat->node_id, size);
  		if (!map)
  			map = alloc_bootmem_node(pgdat, size);
e984bb43f   Bob Picco   [PATCH] Align the...
3964
  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3965
  	}
12d810c1b   Roman Zippel   m68k: discontinuo...
3966
  #ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3967
3968
3969
  	/*
  	 * With no DISCONTIG, the global mem_map is just set as node 0's
  	 */
c713216de   Mel Gorman   [PATCH] Introduce...
3970
  	if (pgdat == NODE_DATA(0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3971
  		mem_map = NODE_DATA(0)->node_mem_map;
c713216de   Mel Gorman   [PATCH] Introduce...
3972
3973
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
467bc461d   Thomas Bogendoerfer   Fix crash with FL...
3974
  			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
c713216de   Mel Gorman   [PATCH] Introduce...
3975
3976
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3977
  #endif
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3978
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3979
  }
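
  A hedged worked example of the alignment above (assuming MAX_ORDER_NR_PAGES = 1024): for a node starting at PFN 0x12345 with 0x800 spanned pages, start becomes 0x12345 & ~0x3ff = 0x12000 and end is aligned up to 0x12c00, so the map covers 0xc00 struct pages and node_mem_map ends up pointing 0x345 entries into that allocation.
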
9109fb7b3   Johannes Weiner   mm: drop unneeded...
3980
3981
  void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  		unsigned long node_start_pfn, unsigned long *zholes_size)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3982
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
3983
  	pg_data_t *pgdat = NODE_DATA(nid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3984
3985
  	pgdat->node_id = nid;
  	pgdat->node_start_pfn = node_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
3986
  	calculate_node_totalpages(pgdat, zones_size, zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3987
3988
  
  	alloc_node_mem_map(pgdat);
e8c27ac91   Yinghai Lu   x86, numa, 32-bit...
3989
3990
3991
3992
3993
3994
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
  	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx
  ",
  		nid, (unsigned long)pgdat,
  		(unsigned long)pgdat->node_mem_map);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3995
3996
3997
  
  	free_area_init_core(pgdat, zones_size, zholes_size);
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3998
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
418508c13   Miklos Szeredi   fix unused setup_...
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
  
  #if MAX_NUMNODES > 1
  /*
   * Figure out the number of possible node ids.
   */
  static void __init setup_nr_node_ids(void)
  {
  	unsigned int node;
  	unsigned int highest = 0;
  
  	for_each_node_mask(node, node_possible_map)
  		highest = node;
  	nr_node_ids = highest + 1;
  }
  #else
  static inline void setup_nr_node_ids(void)
  {
  }
  #endif
c713216de   Mel Gorman   [PATCH] Introduce...
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
  /**
   * add_active_range - Register a range of PFNs backed by physical memory
   * @nid: The node ID the range resides on
   * @start_pfn: The start PFN of the available physical memory
   * @end_pfn: The end PFN of the available physical memory
   *
   * These ranges are stored in an early_node_map[] and later used by
   * free_area_init_nodes() to calculate zone sizes and holes. If the
   * range spans a memory hole, it is up to the architecture to ensure
   * the memory is not freed by the bootmem allocator. If possible
   * the range being registered will be merged with existing ranges.
   */
  void __init add_active_range(unsigned int nid, unsigned long start_pfn,
  						unsigned long end_pfn)
  {
  	int i;
6b74ab97b   Mel Gorman   mm: add a basic d...
4034
4035
4036
4037
4038
4039
  	mminit_dprintk(MMINIT_TRACE, "memory_register",
  			"Entering add_active_range(%d, %#lx, %#lx) "
  			"%d entries of %d used
  ",
  			nid, start_pfn, end_pfn,
  			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
c713216de   Mel Gorman   [PATCH] Introduce...
4040

2dbb51c49   Mel Gorman   mm: make defensiv...
4041
  	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
  	/* Merge with existing active regions if possible */
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		if (early_node_map[i].nid != nid)
  			continue;
  
  		/* Skip if an existing region covers this new one */
  		if (start_pfn >= early_node_map[i].start_pfn &&
  				end_pfn <= early_node_map[i].end_pfn)
  			return;
  
  		/* Merge forward if suitable */
  		if (start_pfn <= early_node_map[i].end_pfn &&
  				end_pfn > early_node_map[i].end_pfn) {
  			early_node_map[i].end_pfn = end_pfn;
  			return;
  		}
  
  		/* Merge backward if suitable */
d2dbe08dd   Kazuhisa Ichikawa   mm/page_alloc: fi...
4060
  		if (start_pfn < early_node_map[i].start_pfn &&
c713216de   Mel Gorman   [PATCH] Introduce...
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
  				end_pfn >= early_node_map[i].start_pfn) {
  			early_node_map[i].start_pfn = start_pfn;
  			return;
  		}
  	}
  
  	/* Check that early_node_map is large enough */
  	if (i >= MAX_ACTIVE_REGIONS) {
  		printk(KERN_CRIT "More than %d memory regions, truncating
  ",
  							MAX_ACTIVE_REGIONS);
  		return;
  	}
  
  	early_node_map[i].nid = nid;
  	early_node_map[i].start_pfn = start_pfn;
  	early_node_map[i].end_pfn = end_pfn;
  	nr_nodemap_entries = i + 1;
  }
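
  A hedged example of the merging above: with an existing entry covering PFNs 0x1000-0x2000 on node 0, registering 0x1800-0x3000 merges forward and the entry becomes 0x1000-0x3000; registering 0x0800-0x1800 merges backward to 0x0800-0x2000; registering 0x1200-0x1c00 is already covered and returns immediately; only a range that neither overlaps nor touches an existing entry consumes a new early_node_map[] slot.
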
  
  /**
cc1050baf   Yinghai Lu   x86: replace shri...
4082
   * remove_active_range - Shrink an existing registered range of PFNs
c713216de   Mel Gorman   [PATCH] Introduce...
4083
   * @nid: The node id the range is on that should be shrunk
cc1050baf   Yinghai Lu   x86: replace shri...
4084
4085
   * @start_pfn: The new start PFN of the range
   * @end_pfn: The new end PFN of the range
c713216de   Mel Gorman   [PATCH] Introduce...
4086
4087
   *
   * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4088
4089
4090
   * The map is kept near the end of the physical page range that has already been
   * registered. This function allows an arch to shrink an existing registered
   * range.
c713216de   Mel Gorman   [PATCH] Introduce...
4091
   */
cc1050baf   Yinghai Lu   x86: replace shri...
4092
4093
  void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
  				unsigned long end_pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
4094
  {
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4095
4096
  	int i, j;
  	int removed = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
4097

cc1050baf   Yinghai Lu   x86: replace shri...
4098
4099
4100
  	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)
  ",
  			  nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4101
  	/* Find the old active region end and shrink */
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4102
  	for_each_active_range_index_in_nid(i, nid) {
cc1050baf   Yinghai Lu   x86: replace shri...
4103
4104
  		if (early_node_map[i].start_pfn >= start_pfn &&
  		    early_node_map[i].end_pfn <= end_pfn) {
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4105
  			/* clear it */
cc1050baf   Yinghai Lu   x86: replace shri...
4106
  			early_node_map[i].start_pfn = 0;
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4107
4108
4109
4110
  			early_node_map[i].end_pfn = 0;
  			removed = 1;
  			continue;
  		}
cc1050baf   Yinghai Lu   x86: replace shri...
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
  		if (early_node_map[i].start_pfn < start_pfn &&
  		    early_node_map[i].end_pfn > start_pfn) {
  			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
  			early_node_map[i].end_pfn = start_pfn;
  			if (temp_end_pfn > end_pfn)
  				add_active_range(nid, end_pfn, temp_end_pfn);
  			continue;
  		}
  		if (early_node_map[i].start_pfn >= start_pfn &&
  		    early_node_map[i].end_pfn > end_pfn &&
  		    early_node_map[i].start_pfn < end_pfn) {
  			early_node_map[i].start_pfn = end_pfn;
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4123
  			continue;
c713216de   Mel Gorman   [PATCH] Introduce...
4124
  		}
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
  	}
  
  	if (!removed)
  		return;
  
  	/* remove the blank ones */
  	for (i = nr_nodemap_entries - 1; i > 0; i--) {
  		if (early_node_map[i].nid != nid)
  			continue;
  		if (early_node_map[i].end_pfn)
  			continue;
  		/* we found it, get rid of it */
  		for (j = i; j < nr_nodemap_entries - 1; j++)
  			memcpy(&early_node_map[j], &early_node_map[j+1],
  				sizeof(early_node_map[j]));
  		j = nr_nodemap_entries - 1;
  		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
  		nr_nodemap_entries--;
  	}
c713216de   Mel Gorman   [PATCH] Introduce...
4144
4145
4146
4147
  }
  
  /**
   * remove_all_active_ranges - Remove all currently registered regions
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4148
   *
c713216de   Mel Gorman   [PATCH] Introduce...
4149
4150
4151
4152
   * During discovery, it may be found that a table like SRAT is invalid
   * and an alternative discovery method must be used. This function removes
   * all currently registered regions.
   */
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4153
  void __init remove_all_active_ranges(void)
c713216de   Mel Gorman   [PATCH] Introduce...
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
  {
  	memset(early_node_map, 0, sizeof(early_node_map));
  	nr_nodemap_entries = 0;
  }
  
  /* Compare two active node_active_regions */
  static int __init cmp_node_active_region(const void *a, const void *b)
  {
  	struct node_active_region *arange = (struct node_active_region *)a;
  	struct node_active_region *brange = (struct node_active_region *)b;
  
  	/* Done this way to avoid overflows */
  	if (arange->start_pfn > brange->start_pfn)
  		return 1;
  	if (arange->start_pfn < brange->start_pfn)
  		return -1;
  
  	return 0;
  }
  
  /* sort the node_map by start_pfn */
329962503   Yinghai Lu   x86: Fix checking...
4175
  void __init sort_node_map(void)
c713216de   Mel Gorman   [PATCH] Introduce...
4176
4177
4178
4179
4180
  {
  	sort(early_node_map, (size_t)nr_nodemap_entries,
  			sizeof(struct node_active_region),
  			cmp_node_active_region, NULL);
  }
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4181
  /* Find the lowest pfn for a node */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4182
  static unsigned long __init find_min_pfn_for_node(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
4183
4184
  {
  	int i;
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4185
  	unsigned long min_pfn = ULONG_MAX;
1abbfb412   Mel Gorman   [PATCH] x86_64: f...
4186

c713216de   Mel Gorman   [PATCH] Introduce...
4187
4188
  	/* Assuming a sorted map, the first range found has the starting pfn */
  	for_each_active_range_index_in_nid(i, nid)
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4189
  		min_pfn = min(min_pfn, early_node_map[i].start_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4190

a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4191
4192
  	if (min_pfn == ULONG_MAX) {
  		printk(KERN_WARNING
2bc0d2615   Paul Jackson   x86 boot: more co...
4193
4194
  			"Could not find start_pfn for node %d
  ", nid);
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4195
4196
4197
4198
  		return 0;
  	}
  
  	return min_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4199
4200
4201
4202
4203
4204
  }
  
  /**
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4205
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
4206
4207
4208
4209
4210
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
  	return find_min_pfn_for_node(MAX_NUMNODES);
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4211
4212
4213
4214
4215
  /*
   * early_calculate_totalpages()
   * Sum pages in active regions for movable zone.
   * Populate N_HIGH_MEMORY for calculating usable_nodes.
   */
484f51f82   Adrian Bunk   mm/page_alloc.c: ...
4216
  static unsigned long __init early_calculate_totalpages(void)
7e63efef8   Mel Gorman   Add a movablecore...
4217
4218
4219
  {
  	int i;
  	unsigned long totalpages = 0;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4220
4221
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		unsigned long pages = early_node_map[i].end_pfn -
7e63efef8   Mel Gorman   Add a movablecore...
4222
  						early_node_map[i].start_pfn;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4223
4224
4225
4226
4227
  		totalpages += pages;
  		if (pages)
  			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
  	}
    	return totalpages;
7e63efef8   Mel Gorman   Add a movablecore...
4228
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4229
4230
4231
4232
4233
4234
  /*
   * Find the PFN at which the Movable zone begins in each node. Kernel memory
   * is spread evenly between nodes as long as the nodes have enough
   * memory. When they don't, some nodes will have more kernelcore than
   * others.
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4235
  static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4236
4237
4238
4239
  {
  	int i, nid;
  	unsigned long usable_startpfn;
  	unsigned long kernelcore_node, kernelcore_remaining;
66918dcdf   Yinghai Lu   x86: only clear n...
4240
4241
  	/* save the state before borrow the nodemask */
  	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
37b07e416   Lee Schermerhorn   memoryless nodes:...
4242
4243
  	unsigned long totalpages = early_calculate_totalpages();
  	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4244

7e63efef8   Mel Gorman   Add a movablecore...
4245
4246
4247
4248
4249
4250
4251
4252
4253
  	/*
  	 * If movablecore was specified, calculate what size of
  	 * kernelcore it corresponds to so that memory usable for
  	 * any allocation type is evenly spread. If both kernelcore
  	 * and movablecore are specified, then the value of kernelcore
  	 * will be used for required_kernelcore if it's greater than
  	 * what movablecore would have allowed.
  	 */
  	if (required_movablecore) {
7e63efef8   Mel Gorman   Add a movablecore...
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
  		unsigned long corepages;
  
  		/*
  		 * Round-up so that ZONE_MOVABLE is at least as large as what
  		 * was requested by the user
  		 */
  		required_movablecore =
  			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
  		corepages = totalpages - required_movablecore;
  
  		required_kernelcore = max(required_kernelcore, corepages);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4266
4267
  	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
  	if (!required_kernelcore)
66918dcdf   Yinghai Lu   x86: only clear n...
4268
  		goto out;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4269
4270
4271
4272
4273
4274
4275
4276
  
  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
  	find_usable_zone_for_movable();
  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  
  restart:
  	/* Spread kernelcore memory as evenly as possible throughout nodes */
  	kernelcore_node = required_kernelcore / usable_nodes;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4277
  	for_each_node_state(nid, N_HIGH_MEMORY) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
  		/*
  		 * Recalculate kernelcore_node if the division per node
  		 * now exceeds what is necessary to satisfy the requested
  		 * amount of memory for the kernel
  		 */
  		if (required_kernelcore < kernelcore_node)
  			kernelcore_node = required_kernelcore / usable_nodes;
  
  		/*
  		 * As the map is walked, we track how much memory is usable
  		 * by the kernel using kernelcore_remaining. When it is
  		 * 0, the rest of the node is usable by ZONE_MOVABLE
  		 */
  		kernelcore_remaining = kernelcore_node;
  
  		/* Go through each range of PFNs within this node */
  		for_each_active_range_index_in_nid(i, nid) {
  			unsigned long start_pfn, end_pfn;
  			unsigned long size_pages;
  
  			start_pfn = max(early_node_map[i].start_pfn,
  						zone_movable_pfn[nid]);
  			end_pfn = early_node_map[i].end_pfn;
  			if (start_pfn >= end_pfn)
  				continue;
  
  			/* Account for what is only usable for kernelcore */
  			if (start_pfn < usable_startpfn) {
  				unsigned long kernel_pages;
  				kernel_pages = min(end_pfn, usable_startpfn)
  								- start_pfn;
  
  				kernelcore_remaining -= min(kernel_pages,
  							kernelcore_remaining);
  				required_kernelcore -= min(kernel_pages,
  							required_kernelcore);
  
  				/* Continue if range is now fully accounted */
  				if (end_pfn <= usable_startpfn) {
  
  					/*
  					 * Push zone_movable_pfn to the end so
  					 * that if we have to rebalance
  					 * kernelcore across nodes, we will
  					 * not double account here
  					 */
  					zone_movable_pfn[nid] = end_pfn;
  					continue;
  				}
  				start_pfn = usable_startpfn;
  			}
  
  			/*
  			 * The usable PFN range for ZONE_MOVABLE is from
  			 * start_pfn->end_pfn. Calculate size_pages as the
  			 * number of pages used as kernelcore
  			 */
  			size_pages = end_pfn - start_pfn;
  			if (size_pages > kernelcore_remaining)
  				size_pages = kernelcore_remaining;
  			zone_movable_pfn[nid] = start_pfn + size_pages;
  
  			/*
  			 * Some kernelcore has been met; update the counts and
  			 * break if the kernelcore for this node has been
  			 * satisfied
  			 */
  			required_kernelcore -= min(required_kernelcore,
  								size_pages);
  			kernelcore_remaining -= size_pages;
  			if (!kernelcore_remaining)
  				break;
  		}
  	}
  
  	/*
  	 * If there is still required_kernelcore, we do another pass with one
  	 * less node in the count. This will push zone_movable_pfn[nid] further
  	 * along on the nodes that still have memory until kernelcore is
  	 * satisfied
  	 */
  	usable_nodes--;
  	if (usable_nodes && required_kernelcore > usable_nodes)
  		goto restart;
  
  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
  	for (nid = 0; nid < MAX_NUMNODES; nid++)
  		zone_movable_pfn[nid] =
  			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
  
  out:
  	/* restore the node_state */
  	node_states[N_HIGH_MEMORY] = saved_node_state;
  }
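
  /*
   * Worked example (illustrative numbers, assuming 4KB pages): with
   * totalpages = 1,048,576 pages (4GB) spread over 2 nodes and a boot line
   * containing "movablecore=1G", required_movablecore rounds up to 262,144
   * pages, so corepages = 1,048,576 - 262,144 = 786,432 pages become
   * required_kernelcore.  The restart loop then aims for kernelcore_node =
   * 786,432 / 2 = 393,216 pages of kernelcore per node, and zone_movable_pfn[]
   * ends up just past that much kernel memory on each node, rounded up to
   * MAX_ORDER_NR_PAGES.
   */
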
  /* Any regular memory on that node? */
  static void check_for_regular_memory(pg_data_t *pgdat)
  {
  #ifdef CONFIG_HIGHMEM
  	enum zone_type zone_type;
  
  	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
  		struct zone *zone = &pgdat->node_zones[zone_type];
  		if (zone->present_pages)
  			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
  	}
  #endif
  }
  /**
   * free_area_init_nodes - Initialise all pg_data_t and zone data
   * @max_zone_pfn: an array of max PFNs for each zone
   *
   * This will call free_area_init_node() for each active node in the system.
   * Using the page ranges provided by add_active_range(), the size of each
   * zone in each node and their holes are calculated. If the maximum PFNs of
   * two adjacent zones match, it is assumed that the higher zone is empty.
   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
   * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
   * starts where the previous one ended. For example, ZONE_DMA32 starts
   * at arch_max_dma_pfn.
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
  	unsigned long nid;
  	int i;

  	/* Sort early_node_map as initialisation assumes it is sorted */
  	sort_node_map();
  	/* Record where the zone boundaries are */
  	memset(arch_zone_lowest_possible_pfn, 0,
  				sizeof(arch_zone_lowest_possible_pfn));
  	memset(arch_zone_highest_possible_pfn, 0,
  				sizeof(arch_zone_highest_possible_pfn));
  	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
  	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
  	for (i = 1; i < MAX_NR_ZONES; i++) {
  		if (i == ZONE_MOVABLE)
  			continue;
  		arch_zone_lowest_possible_pfn[i] =
  			arch_zone_highest_possible_pfn[i-1];
  		arch_zone_highest_possible_pfn[i] =
  			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
  	}
  	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
  
  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
  	find_zone_movable_pfns_for_nodes(zone_movable_pfn);

  	/* Print out the zone ranges */
  	printk("Zone PFN ranges:\n");
  	for (i = 0; i < MAX_NR_ZONES; i++) {
  		if (i == ZONE_MOVABLE)
  			continue;
  		printk("  %-8s ", zone_names[i]);
  		if (arch_zone_lowest_possible_pfn[i] ==
  				arch_zone_highest_possible_pfn[i])
  			printk("empty\n");
  		else
  			printk("%0#10lx -> %0#10lx\n",
  				arch_zone_lowest_possible_pfn[i],
  				arch_zone_highest_possible_pfn[i]);
  	}
  
  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
  	printk("Movable zone start PFN for each node\n");
  	for (i = 0; i < MAX_NUMNODES; i++) {
  		if (zone_movable_pfn[i])
  			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
  	}
  
  	/* Print out the early_node_map[] */
  	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
  	for (i = 0; i < nr_nodemap_entries; i++)
  		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
  						early_node_map[i].start_pfn,
  						early_node_map[i].end_pfn);
  
  	/* Initialise every node */
  	mminit_verify_pageflags_layout();
  	setup_nr_node_ids();
  	for_each_online_node(nid) {
  		pg_data_t *pgdat = NODE_DATA(nid);
  		free_area_init_node(nid, NULL,
  				find_min_pfn_for_node(nid), NULL);
  
  		/* Any memory on that node */
  		if (pgdat->node_present_pages)
  			node_set_state(nid, N_HIGH_MEMORY);
  		check_for_regular_memory(pgdat);
  	}
  }
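
  /*
   * Illustrative caller sketch (not part of this file; zone indices and
   * limits below are hypothetical): architecture setup code typically fills
   * a max_zone_pfns[] array and hands it to free_area_init_nodes():
   *
   *	unsigned long max_zone_pfns[MAX_NR_ZONES];
   *
   *	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
   *	max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;
   *	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
   *	free_area_init_nodes(max_zone_pfns);
   *
   * ZONE_MOVABLE is deliberately not listed; its start PFNs are derived in
   * find_zone_movable_pfns_for_nodes() above.
   */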

  static int __init cmdline_parse_core(char *p, unsigned long *core)
  {
  	unsigned long long coremem;
  	if (!p)
  		return -EINVAL;
  
  	coremem = memparse(p, &p);
  	*core = coremem >> PAGE_SHIFT;

  	/* Paranoid check that UL is enough for the coremem value */
  	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
  
  	return 0;
  }

  /*
   * kernelcore=size sets the amount of memory for use for allocations that
   * cannot be reclaimed or migrated.
   */
  static int __init cmdline_parse_kernelcore(char *p)
  {
  	return cmdline_parse_core(p, &required_kernelcore);
  }
  
  /*
   * movablecore=size sets the amount of memory for use for allocations that
   * can be reclaimed or migrated.
   */
  static int __init cmdline_parse_movablecore(char *p)
  {
  	return cmdline_parse_core(p, &required_movablecore);
  }
  early_param("kernelcore", cmdline_parse_kernelcore);
  early_param("movablecore", cmdline_parse_movablecore);
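
  /*
   * Example (hypothetical boot command line): "kernelcore=512M" keeps
   * roughly 512MB usable for non-movable kernel allocations and hands the
   * remainder of each node to ZONE_MOVABLE, while "movablecore=2G" works
   * the other way around.  memparse() accepts the usual K/M/G suffixes.
   */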

  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  /**
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
   *
   * The per-cpu batchsize and zone watermarks are determined by present_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
   * function may optionally be used to account for unfreeable pages in the
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
   */
  void __init set_dma_reserve(unsigned long new_dma_reserve)
  {
  	dma_reserve = new_dma_reserve;
  }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
  struct pglist_data __refdata contig_page_data = {
  #ifndef CONFIG_NO_BOOTMEM
   .bdata = &bootmem_node_data[0]
  #endif
   };
  EXPORT_SYMBOL(contig_page_data);
  #endif
  
  void __init free_area_init(unsigned long *zones_size)
  {
  	free_area_init_node(0, zones_size,
  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }

  static int page_alloc_cpu_notify(struct notifier_block *self,
  				 unsigned long action, void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;

  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
  		drain_pages(cpu);
  
  		/*
  		 * Spill the event counters of the dead processor
  		 * into the current processors event counters.
  		 * This artificially elevates the count of the current
  		 * processor.
  		 */
  		vm_events_fold_cpu(cpu);
  
  		/*
  		 * Zero the differential counters of the dead processor
  		 * so that the vm statistics are consistent.
  		 *
  		 * This is only okay since the processor is dead and cannot
  		 * race with what we are doing.
  		 */
  		refresh_cpu_vm_stats(cpu);
  	}
  	return NOTIFY_OK;
  }
  
  void __init page_alloc_init(void)
  {
  	hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
  /*
   * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
   *	or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  {
  	struct pglist_data *pgdat;
  	unsigned long reserve_pages = 0;
  	enum zone_type i, j;
  
  	for_each_online_pgdat(pgdat) {
  		for (i = 0; i < MAX_NR_ZONES; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  			unsigned long max = 0;
  
  			/* Find valid and maximum lowmem_reserve in the zone */
  			for (j = i; j < MAX_NR_ZONES; j++) {
  				if (zone->lowmem_reserve[j] > max)
  					max = zone->lowmem_reserve[j];
  			}
  			/* we treat the high watermark as reserved pages. */
  			max += high_wmark_pages(zone);
  
  			if (max > zone->present_pages)
  				max = zone->present_pages;
  			reserve_pages += max;
  		}
  	}
  	totalreserve_pages = reserve_pages;
  }
  
  /*
   * setup_per_zone_lowmem_reserve - called whenever
   *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
   *	has a correct pages reserved value, so an adequate number of
   *	pages are left in the zone after a successful __alloc_pages().
   */
  static void setup_per_zone_lowmem_reserve(void)
  {
  	struct pglist_data *pgdat;
  	enum zone_type j, idx;

  	for_each_online_pgdat(pgdat) {
  		for (j = 0; j < MAX_NR_ZONES; j++) {
  			struct zone *zone = pgdat->node_zones + j;
  			unsigned long present_pages = zone->present_pages;
  
  			zone->lowmem_reserve[j] = 0;
  			idx = j;
  			while (idx) {
  				struct zone *lower_zone;
  				idx--;
  				if (sysctl_lowmem_reserve_ratio[idx] < 1)
  					sysctl_lowmem_reserve_ratio[idx] = 1;
  
  				lower_zone = pgdat->node_zones + idx;
  				lower_zone->lowmem_reserve[j] = present_pages /
  					sysctl_lowmem_reserve_ratio[idx];
  				present_pages += lower_zone->present_pages;
  			}
  		}
  	}
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
  }
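
  /*
   * Worked example (illustrative numbers): with a HIGHMEM zone of 2,097,152
   * pages sitting above ZONE_NORMAL and the common default ratio of 32 for
   * ZONE_NORMAL, the loop above sets
   *
   *	normal->lowmem_reserve[ZONE_HIGHMEM] = 2,097,152 / 32 = 65,536 pages
   *
   * i.e. roughly 256MB of lowmem (with 4KB pages) that highmem-capable
   * allocations may not consume.  Lower zones such as ZONE_DMA additionally
   * divide the accumulated higher-zone pages by their own ratio.
   */
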
  /**
   * setup_per_zone_wmarks - called when min_free_kbytes changes
   * or when memory is hot-{added|removed}
   *
   * Ensures that the watermark[min,low,high] values for each zone are set
   * correctly with respect to min_free_kbytes.
   */
  void setup_per_zone_wmarks(void)
  {
  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
  	unsigned long lowmem_pages = 0;
  	struct zone *zone;
  	unsigned long flags;
  
  	/* Calculate total number of !ZONE_HIGHMEM pages */
  	for_each_zone(zone) {
  		if (!is_highmem(zone))
  			lowmem_pages += zone->present_pages;
  	}
  
  	for_each_zone(zone) {
  		u64 tmp;
  		spin_lock_irqsave(&zone->lock, flags);
  		tmp = (u64)pages_min * zone->present_pages;
  		do_div(tmp, lowmem_pages);
  		if (is_highmem(zone)) {
  			/*
  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
  			 * need highmem pages, so cap pages_min to a small
  			 * value here.
  			 *
  			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
  			 * deltas control async page reclaim, and so should
  			 * not be capped for highmem.
  			 */
  			int min_pages;
  
  			min_pages = zone->present_pages / 1024;
  			if (min_pages < SWAP_CLUSTER_MAX)
  				min_pages = SWAP_CLUSTER_MAX;
  			if (min_pages > 128)
  				min_pages = 128;
  			zone->watermark[WMARK_MIN] = min_pages;
  		} else {
  			/*
  			 * If it's a lowmem zone, reserve a number of pages
  			 * proportionate to the zone's size.
  			 */
  			zone->watermark[WMARK_MIN] = tmp;
  		}
  		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
  		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
  		setup_zone_migrate_reserve(zone);
  		spin_unlock_irqrestore(&zone->lock, flags);
  	}
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
  }
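
  /*
   * Worked example (illustrative numbers, assuming 4KB pages): with
   * min_free_kbytes = 4096, pages_min = 4096 >> 2 = 1024 pages.  For a
   * lowmem zone holding half of all lowmem, tmp = 1024 / 2 = 512, so
   *
   *	WMARK_MIN  = 512
   *	WMARK_LOW  = 512 + (512 >> 2) = 640
   *	WMARK_HIGH = 512 + (512 >> 1) = 768
   *
   * Roughly speaking, kswapd is woken when free pages drop below WMARK_LOW
   * and rests again once WMARK_HIGH is reached.
   */
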
  /*
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
   *
   * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
   * INACTIVE_ANON pages on this zone's LRU, maintained by the
   * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
   * the anonymous pages are kept on the inactive list.
   *
   * total     target    max
   * memory    ratio     inactive anon
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
  void calculate_zone_inactive_ratio(struct zone *zone)
  {
  	unsigned int gb, ratio;

  	/* Zone size in gigabytes */
  	gb = zone->present_pages >> (30 - PAGE_SHIFT);
  	if (gb)
  		ratio = int_sqrt(10 * gb);
  	else
  		ratio = 1;

  	zone->inactive_ratio = ratio;
  }
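
  /*
   * Quick check against the table above: a 1TB zone is ~1024GB and
   * int_sqrt(10 * 1024) = int_sqrt(10240) = 101, matching the "1TB -> 101"
   * row; a 1GB zone gives int_sqrt(10) = 3, i.e. roughly 25% of the anon
   * pages kept on the inactive list.
   */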

  static void __init setup_per_zone_inactive_ratio(void)
  {
  	struct zone *zone;
  
  	for_each_zone(zone)
  		calculate_zone_inactive_ratio(zone);
  }
  /*
   * Initialise min_free_kbytes.
   *
   * For small machines we want it small (128k min).  For large machines
   * we want it large (64MB max).  But it is not linear, because network
   * bandwidth does not increase linearly with machine size.  We use
   *
   * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
   *
   * which yields
   *
   * 16MB:	512k
   * 32MB:	724k
   * 64MB:	1024k
   * 128MB:	1448k
   * 256MB:	2048k
   * 512MB:	2896k
   * 1024MB:	4096k
   * 2048MB:	5792k
   * 4096MB:	8192k
   * 8192MB:	11584k
   * 16384MB:	16384k
   */
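  /*
   * Sanity check of the table above: for 16MB of lowmem, lowmem_kbytes =
   * 16384 and int_sqrt(16384 * 16) = int_sqrt(262144) = 512, i.e. 512k;
   * for 4096MB, int_sqrt(4194304 * 16) = int_sqrt(67108864) = 8192, i.e.
   * 8192k.
   */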
  static int __init init_per_zone_wmark_min(void)
  {
  	unsigned long lowmem_kbytes;
  
  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
  
  	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
  	if (min_free_kbytes < 128)
  		min_free_kbytes = 128;
  	if (min_free_kbytes > 65536)
  		min_free_kbytes = 65536;
  	setup_per_zone_wmarks();
  	setup_per_zone_lowmem_reserve();
  	setup_per_zone_inactive_ratio();
  	return 0;
  }
  module_init(init_per_zone_wmark_min)
  
  /*
   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
   *	that we can call setup_per_zone_wmarks() whenever min_free_kbytes
   *	changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
  	void __user *buffer, size_t *length, loff_t *ppos)
  {
  	proc_dointvec(table, write, buffer, length, ppos);
  	if (write)
  		setup_per_zone_wmarks();
  	return 0;
  }
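
  /*
   * Example (illustrative): writing a new value through procfs, e.g.
   * "echo 65536 > /proc/sys/vm/min_free_kbytes", lands here with write != 0
   * and immediately recomputes every zone's watermarks via
   * setup_per_zone_wmarks().
   */
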
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
  	void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	int rc;
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
  		zone->min_unmapped_pages = (zone->present_pages *
  				sysctl_min_unmapped_ratio) / 100;
  	return 0;
  }
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
  	void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	int rc;
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
  		zone->min_slab_pages = (zone->present_pages *
  				sysctl_min_slab_ratio) / 100;
  	return 0;
  }
  #endif
  /*
   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
   *	whenever sysctl_lowmem_reserve_ratio changes.
   *
   * The reserve ratio obviously has absolutely no relation with the
   * minimum watermarks. The lowmem reserve ratio can only make sense
   * as a function of the boot-time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  	void __user *buffer, size_t *length, loff_t *ppos)
  {
  	proc_dointvec_minmax(table, write, buffer, length, ppos);
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
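
  /*
   * Example (illustrative): "sysctl -w vm.lowmem_reserve_ratio='256 32'"
   * rewrites sysctl_lowmem_reserve_ratio[] (the number of fields depends on
   * the configured zones) and this handler then rebuilds every
   * zone->lowmem_reserve[] array via setup_per_zone_lowmem_reserve().
   */
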
  /*
   * percpu_pagelist_fraction - changes pcp->high for each zone on each cpu.
   * It is the fraction of total pages in each zone that a hot per-cpu pagelist
   * can hold before it gets flushed back to the buddy allocator.
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
  	void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	unsigned int cpu;
  	int ret;
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
  	if (!write || (ret == -EINVAL))
  		return ret;
  	for_each_populated_zone(zone) {
  		for_each_possible_cpu(cpu) {
  			unsigned long  high;
  			high = zone->present_pages / percpu_pagelist_fraction;
  			setup_pagelist_highmark(
  				per_cpu_ptr(zone->pageset, cpu), high);
  		}
  	}
  	return 0;
  }
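
  /*
   * Worked example (illustrative): for a zone of 262,144 pages and
   * "sysctl -w vm.percpu_pagelist_fraction=8", high becomes
   * 262,144 / 8 = 32,768 pages, and setup_pagelist_highmark() applies that
   * limit to the pageset of every possible cpu, as in the loop above.
   */
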
  int hashdist = HASHDIST_DEFAULT;
  
  #ifdef CONFIG_NUMA
  static int __init set_hashdist(char *str)
  {
  	if (!str)
  		return 0;
  	hashdist = simple_strtoul(str, &str, 0);
  	return 1;
  }
  __setup("hashdist=", set_hashdist);
  #endif
  
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
   *   quantity of entries
   * - limit is the number of hash buckets, not the total allocation size
   */
  void *__init alloc_large_system_hash(const char *tablename,
  				     unsigned long bucketsize,
  				     unsigned long numentries,
  				     int scale,
  				     int flags,
  				     unsigned int *_hash_shift,
  				     unsigned int *_hash_mask,
  				     unsigned long limit)
  {
  	unsigned long long max = limit;
  	unsigned long log2qty, size;
  	void *table = NULL;
  
  	/* allow the kernel cmdline to have a say */
  	if (!numentries) {
  		/* round applicable memory size up to nearest megabyte */
  		numentries = nr_kernel_pages;
  		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
  		numentries >>= 20 - PAGE_SHIFT;
  		numentries <<= 20 - PAGE_SHIFT;
  
  		/* limit to 1 bucket per 2^scale bytes of low memory */
  		if (scale > PAGE_SHIFT)
  			numentries >>= (scale - PAGE_SHIFT);
  		else
  			numentries <<= (PAGE_SHIFT - scale);
  
  		/* Make sure we've got at least a 0-order allocation.. */
  		if (unlikely(flags & HASH_SMALL)) {
  			/* Makes no sense without HASH_EARLY */
  			WARN_ON(!(flags & HASH_EARLY));
  			if (!(numentries >> *_hash_shift)) {
  				numentries = 1UL << *_hash_shift;
  				BUG_ON(!numentries);
  			}
  		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
  			numentries = PAGE_SIZE / bucketsize;
  	}
  	numentries = roundup_pow_of_two(numentries);
  
  	/* limit allocation size to 1/16 total memory by default */
  	if (max == 0) {
  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
  		do_div(max, bucketsize);
  	}
  
  	if (numentries > max)
  		numentries = max;
  	log2qty = ilog2(numentries);
  
  	do {
  		size = bucketsize << log2qty;
  		if (flags & HASH_EARLY)
  			table = alloc_bootmem_nopanic(size);
  		else if (hashdist)
  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
  		else {
  			/*
  			 * If bucketsize is not a power of two, some pages at
  			 * the end of the hash table may be freed;
  			 * alloc_pages_exact() does that trimming automatically
  			 */
  			if (get_order(size) < MAX_ORDER) {
  				table = alloc_pages_exact(size, GFP_ATOMIC);
  				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
  			}
  		}
  	} while (!table && size > PAGE_SIZE && --log2qty);
  
  	if (!table)
  		panic("Failed to allocate %s hash table\n", tablename);
  	printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
  	       tablename,
  	       (1U << log2qty),
  	       ilog2(size) - PAGE_SHIFT,
  	       size);
  
  	if (_hash_shift)
  		*_hash_shift = log2qty;
  	if (_hash_mask)
  		*_hash_mask = (1 << log2qty) - 1;
  
  	return table;
  }
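
  /*
   * Illustrative caller sketch (hypothetical names): early boot code sizing
   * a subsystem-private hash typically passes 0 for numentries so the table
   * is auto-sized from nr_kernel_pages, e.g.
   *
   *	my_hash = alloc_large_system_hash("My cache",
   *					  sizeof(struct hlist_head),
   *					  0, 14, HASH_EARLY,
   *					  &my_hash_shift, &my_hash_mask, 0);
   *
   * scale = 14 asks for about one bucket per 16KB of low memory, and the
   * returned shift/mask are what callers use to index the table.
   */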

  /* Return a pointer to the bitmap storing bits affecting a block of pages */
  static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
  							unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	return __pfn_to_section(pfn)->pageblock_flags;
  #else
  	return zone->pageblock_flags;
  #endif /* CONFIG_SPARSEMEM */
  }
  
  static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	pfn &= (PAGES_PER_SECTION-1);
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
  #else
  	pfn = pfn - zone->zone_start_pfn;
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
  #endif /* CONFIG_SPARSEMEM */
  }
  
  /**
   * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest to retrieve
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
  unsigned long get_pageblock_flags_group(struct page *page,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long flags = 0;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (test_bit(bitidx + start_bitidx, bitmap))
  			flags |= value;

  	return flags;
  }
  
  /**
   * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
  void set_pageblock_flags_group(struct page *page, unsigned long flags,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
  	VM_BUG_ON(pfn < zone->zone_start_pfn);
  	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (flags & value)
  			__set_bit(bitidx + start_bitidx, bitmap);
  		else
  			__clear_bit(bitidx + start_bitidx, bitmap);
  }
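
  /*
   * Usage note: the migratetype helpers, get_pageblock_migratetype() and
   * set_pageblock_migratetype(), are thin wrappers around the two functions
   * above; they read and write the PB_migrate..PB_migrate_end bit group for
   * the pageblock containing a given page.
   */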
  
  /*
   * This is designed as a helper; please see page_isolation.c as well.
   * It sets/clears a page block's type to ISOLATE.
   * The page allocator never allocates memory from an ISOLATE block.
   */
  
  int set_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
  	struct page *curr_page;
  	unsigned long flags, pfn, iter;
  	unsigned long immobile = 0;
  	struct memory_isolate_notify arg;
  	int notifier_ret;
  	int ret = -EBUSY;
  	int zone_idx;
  
  	zone = page_zone(page);
  	zone_idx = zone_idx(zone);

  	spin_lock_irqsave(&zone->lock, flags);
  	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
  	    zone_idx == ZONE_MOVABLE) {
  		ret = 0;
  		goto out;
  	}
  
  	pfn = page_to_pfn(page);
  	arg.start_pfn = pfn;
  	arg.nr_pages = pageblock_nr_pages;
  	arg.pages_found = 0;
  	/*
  	 * It may be possible to isolate a pageblock even if the
  	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
  	 * notifier chain is used by balloon drivers to return the
  	 * number of pages in a range that are held by the balloon
  	 * driver to shrink memory. If all the pages are accounted for
  	 * by balloons, are free, or on the LRU, isolation can continue.
  	 * Later, for example, when the memory hotplug notifier runs, these
  	 * pages reported as "can be isolated" should be isolated (freed)
  	 * by the balloon driver through the memory notifier chain.
  	 */
  	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
  	notifier_ret = notifier_to_errno(notifier_ret);
  	if (notifier_ret || !arg.pages_found)
  		goto out;
  
  	for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
  		if (!pfn_valid_within(iter))
  			continue;
  
  		curr_page = pfn_to_page(iter);
  		if (!page_count(curr_page) || PageLRU(curr_page))
  			continue;
  
  		immobile++;
  	}
  
  	if (arg.pages_found == immobile)
  		ret = 0;
  out:
  	if (!ret) {
  		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
  		move_freepages_block(zone, page, MIGRATE_ISOLATE);
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  	if (!ret)
  		drain_all_pages();
  	return ret;
  }
  
  void unset_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
  	unsigned long flags;
  	zone = page_zone(page);
  	spin_lock_irqsave(&zone->lock, flags);
  	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
  		goto out;
  	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	move_freepages_block(zone, page, MIGRATE_MOVABLE);
  out:
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
  /*
   * All pages in the range must be isolated before calling this.
   */
  void
  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  {
  	struct page *page;
  	struct zone *zone;
  	int order, i;
  	unsigned long pfn;
  	unsigned long flags;
  	/* find the first valid pfn */
  	for (pfn = start_pfn; pfn < end_pfn; pfn++)
  		if (pfn_valid(pfn))
  			break;
  	if (pfn == end_pfn)
  		return;
  	zone = page_zone(pfn_to_page(pfn));
  	spin_lock_irqsave(&zone->lock, flags);
  	pfn = start_pfn;
  	while (pfn < end_pfn) {
  		if (!pfn_valid(pfn)) {
  			pfn++;
  			continue;
  		}
  		page = pfn_to_page(pfn);
  		BUG_ON(page_count(page));
  		BUG_ON(!PageBuddy(page));
  		order = page_order(page);
  #ifdef CONFIG_DEBUG_VM
  		printk(KERN_INFO "remove from free list %lx %d %lx\n",
  		       pfn, 1 << order, end_pfn);
  #endif
  		list_del(&page->lru);
  		rmv_page_order(page);
  		zone->free_area[order].nr_free--;
  		__mod_zone_page_state(zone, NR_FREE_PAGES,
  				      - (1UL << order));
  		for (i = 0; i < (1 << order); i++)
  			SetPageReserved((page+i));
  		pfn += (1 << order);
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  #endif
  
  #ifdef CONFIG_MEMORY_FAILURE
  bool is_free_buddy_page(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	unsigned long pfn = page_to_pfn(page);
  	unsigned long flags;
  	int order;
  
  	spin_lock_irqsave(&zone->lock, flags);
  	for (order = 0; order < MAX_ORDER; order++) {
  		struct page *page_head = page - (pfn & ((1 << order) - 1));
  
  		if (PageBuddy(page_head) && page_order(page_head) >= order)
  			break;
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  
  	return order < MAX_ORDER;
  }
  #endif
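
  /*
   * Worked example for the buddy-head arithmetic above (illustrative): for
   * a page at pfn 0x12345 with order = 3, pfn & ((1 << 3) - 1) = 5, so
   * page_head points at pfn 0x12340.  If that head is PageBuddy with
   * page_order() >= 3, the whole naturally aligned 8-page block containing
   * our page sits in the free lists, hence the page itself is free.
   */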
  
  static struct trace_print_flags pageflag_names[] = {
  	{1UL << PG_locked,		"locked"	},
  	{1UL << PG_error,		"error"		},
  	{1UL << PG_referenced,		"referenced"	},
  	{1UL << PG_uptodate,		"uptodate"	},
  	{1UL << PG_dirty,		"dirty"		},
  	{1UL << PG_lru,			"lru"		},
  	{1UL << PG_active,		"active"	},
  	{1UL << PG_slab,		"slab"		},
  	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
  	{1UL << PG_arch_1,		"arch_1"	},
  	{1UL << PG_reserved,		"reserved"	},
  	{1UL << PG_private,		"private"	},
  	{1UL << PG_private_2,		"private_2"	},
  	{1UL << PG_writeback,		"writeback"	},
  #ifdef CONFIG_PAGEFLAGS_EXTENDED
  	{1UL << PG_head,		"head"		},
  	{1UL << PG_tail,		"tail"		},
  #else
  	{1UL << PG_compound,		"compound"	},
  #endif
  	{1UL << PG_swapcache,		"swapcache"	},
  	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
  	{1UL << PG_reclaim,		"reclaim"	},
  	{1UL << PG_buddy,		"buddy"		},
  	{1UL << PG_swapbacked,		"swapbacked"	},
  	{1UL << PG_unevictable,		"unevictable"	},
  #ifdef CONFIG_MMU
  	{1UL << PG_mlocked,		"mlocked"	},
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
  	{1UL << PG_uncached,		"uncached"	},
  #endif
  #ifdef CONFIG_MEMORY_FAILURE
  	{1UL << PG_hwpoison,		"hwpoison"	},
  #endif
  	{-1UL,				NULL		},
  };
  
  static void dump_page_flags(unsigned long flags)
  {
  	const char *delim = "";
  	unsigned long mask;
  	int i;
  
  	printk(KERN_ALERT "page flags: %#lx(", flags);
  
  	/* remove zone id */
  	flags &= (1UL << NR_PAGEFLAGS) - 1;
  
  	for (i = 0; pageflag_names[i].name && flags; i++) {
  
  		mask = pageflag_names[i].mask;
  		if ((flags & mask) != mask)
  			continue;
  
  		flags &= ~mask;
  		printk("%s%s", delim, pageflag_names[i].name);
  		delim = "|";
  	}
  
  	/* check for left over flags */
  	if (flags)
  		printk("%s%#lx", delim, flags);
  
  	printk(")\n");
  }
  
  void dump_page(struct page *page)
  {
  	printk(KERN_ALERT
  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
  		page, page_count(page), page_mapcount(page),
  		page->mapping, page->index);
  	dump_page_flags(page->flags);
  }
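
  /*
   * Illustrative output (hypothetical values), pieced together from the two
   * format strings above:
   *
   *	page:ffffea0000001000 count:1 mapcount:0 mapping:ffff880012345678 index:0x0
   *	page flags: 0x1(locked)
   *
   * Any leftover bits without a name are appended in hex after the known
   * flag names.
   */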