Blame view

mm/page_alloc.c 159 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
  /*
   *  linux/mm/page_alloc.c
   *
   *  Manages the free list, the system allocates free pages here.
   *  Note that kmalloc() lives in slab.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
16
17
18
19
20
  #include <linux/stddef.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/interrupt.h>
  #include <linux/pagemap.h>
10ed273f5   KOSAKI Motohiro   zlc_setup(): hand...
21
  #include <linux/jiffies.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
  #include <linux/bootmem.h>
edbe7d23b   Yinghai Lu   memblock: Add fin...
23
  #include <linux/memblock.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
  #include <linux/compiler.h>
9f1583339   Randy Dunlap   [PATCH] use add_t...
25
  #include <linux/kernel.h>
b1eeab676   Vegard Nossum   kmemcheck: add ho...
26
  #include <linux/kmemcheck.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
27
28
29
30
31
  #include <linux/module.h>
  #include <linux/suspend.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
a238ab5b0   Dave Hansen   mm: break out pag...
32
  #include <linux/ratelimit.h>
5a3135c2e   David Rientjes   oom: move prototy...
33
  #include <linux/oom.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
34
35
36
37
38
  #include <linux/notifier.h>
  #include <linux/topology.h>
  #include <linux/sysctl.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
bdc8cb984   Dave Hansen   [PATCH] memory ho...
39
  #include <linux/memory_hotplug.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40
41
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
a6cccdc36   KOSAKI Motohiro   mm, mem-hotplug: ...
42
  #include <linux/vmstat.h>
4be38e351   Christoph Lameter   [PATCH] mm: move ...
43
  #include <linux/mempolicy.h>
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
44
  #include <linux/stop_machine.h>
c713216de   Mel Gorman   [PATCH] Introduce...
45
46
  #include <linux/sort.h>
  #include <linux/pfn.h>
3fcfab16c   Andrew Morton   [PATCH] separate ...
47
  #include <linux/backing-dev.h>
933e312e7   Akinobu Mita   [PATCH] fault-inj...
48
  #include <linux/fault-inject.h>
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
49
  #include <linux/page-isolation.h>
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
50
  #include <linux/page_cgroup.h>
3ac7fe5a4   Thomas Gleixner   infrastructure to...
51
  #include <linux/debugobjects.h>
dbb1f81ca   Catalin Marinas   kmemleak: Add kme...
52
  #include <linux/kmemleak.h>
925cc71e5   Robert Jennings   mm: Add notifier ...
53
  #include <linux/memory.h>
56de7263f   Mel Gorman   mm: compaction: d...
54
  #include <linux/compaction.h>
0d3d062a6   Mel Gorman   tracing, page-all...
55
  #include <trace/events/kmem.h>
718a38211   Wu Fengguang   mm: introduce dum...
56
  #include <linux/ftrace_event.h>
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
57
  #include <linux/memcontrol.h>
268bb0ce3   Linus Torvalds   sanitize <linux/p...
58
  #include <linux/prefetch.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
59
60
  
  #include <asm/tlbflush.h>
ac924c603   Andrew Morton   [PATCH] setup_per...
61
  #include <asm/div64.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
62
  #include "internal.h"
728120192   Lee Schermerhorn   numa: add generic...
63
64
65
66
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
  EXPORT_PER_CPU_SYMBOL(numa_node);
  #endif
7aac78988   Lee Schermerhorn   numa: introduce n...
67
68
69
70
71
72
73
74
75
76
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
   * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
   * defined in <linux/topology.h>.
   */
  DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
  EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
77
  /*
138089107   Christoph Lameter   Memoryless nodes:...
78
   * Array of node states.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
79
   */
138089107   Christoph Lameter   Memoryless nodes:...
80
81
82
83
84
85
86
87
88
89
90
91
  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
  	[N_POSSIBLE] = NODE_MASK_ALL,
  	[N_ONLINE] = { { [0] = 1UL } },
  #ifndef CONFIG_NUMA
  	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
  #ifdef CONFIG_HIGHMEM
  	[N_HIGH_MEMORY] = { { [0] = 1UL } },
  #endif
  	[N_CPU] = { { [0] = 1UL } },
  #endif	/* NUMA */
  };
  EXPORT_SYMBOL(node_states);
6c231b7ba   Ravikiran G Thirumalai   [PATCH] Additions...
92
  unsigned long totalram_pages __read_mostly;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
93
  unsigned long totalreserve_pages __read_mostly;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
94
  int percpu_pagelist_fraction;
dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
95
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
96

452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
97
98
99
100
101
102
103
104
105
  #ifdef CONFIG_PM_SLEEP
  /*
   * The following functions are used by the suspend/hibernate code to temporarily
   * change gfp_allowed_mask in order to avoid using I/O during memory allocations
   * while devices are suspended.  To avoid races with the suspend/hibernate code,
   * they should always be called with pm_mutex held (gfp_allowed_mask also should
   * only be modified with pm_mutex held, unless the suspend/hibernate code is
   * guaranteed not to run in parallel with that modification).
   */
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
106
107
108
109
  
  static gfp_t saved_gfp_mask;
  
  void pm_restore_gfp_mask(void)
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
110
111
  {
  	WARN_ON(!mutex_is_locked(&pm_mutex));
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
112
113
114
115
  	if (saved_gfp_mask) {
  		gfp_allowed_mask = saved_gfp_mask;
  		saved_gfp_mask = 0;
  	}
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
116
  }
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
117
  void pm_restrict_gfp_mask(void)
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
118
  {
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
119
  	WARN_ON(!mutex_is_locked(&pm_mutex));
c9e664f1f   Rafael J. Wysocki   PM / Hibernate: F...
120
121
122
  	WARN_ON(saved_gfp_mask);
  	saved_gfp_mask = gfp_allowed_mask;
  	gfp_allowed_mask &= ~GFP_IOFS;
452aa6999   Rafael J. Wysocki   mm/pm: force GFP_...
123
124
  }
  #endif /* CONFIG_PM_SLEEP */
d9c234005   Mel Gorman   Do not depend on ...
125
126
127
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  int pageblock_order __read_mostly;
  #endif
d98c7a098   Hugh Dickins   [PATCH] compound ...
128
  static void __free_pages_ok(struct page *page, unsigned int order);
a226f6c89   David Howells   [PATCH] FRV: Clea...
129

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
130
131
132
133
134
135
136
  /*
   * results with 256, 32 in the lowmem_reserve sysctl:
   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
   *	1G machine -> (16M dma, 784M normal, 224M high)
   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
   *	HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
a2f1b4249   Andi Kleen   [PATCH] x86_64: A...
137
138
139
   *
   * TBD: should special case ZONE_DMA32 machines here - in those we normally
   * don't need any ZONE_NORMAL reservation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140
   */
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
141
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
4b51d6698   Christoph Lameter   [PATCH] optional ...
142
  #ifdef CONFIG_ZONE_DMA
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
143
  	 256,
4b51d6698   Christoph Lameter   [PATCH] optional ...
144
  #endif
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
145
  #ifdef CONFIG_ZONE_DMA32
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
146
  	 256,
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
147
  #endif
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
148
  #ifdef CONFIG_HIGHMEM
2a1e274ac   Mel Gorman   Create the ZONE_M...
149
  	 32,
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
150
  #endif
2a1e274ac   Mel Gorman   Create the ZONE_M...
151
  	 32,
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
152
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
153
154
  
  EXPORT_SYMBOL(totalram_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155

15ad7cdcf   Helge Deller   [PATCH] struct se...
156
  static char * const zone_names[MAX_NR_ZONES] = {
4b51d6698   Christoph Lameter   [PATCH] optional ...
157
  #ifdef CONFIG_ZONE_DMA
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
158
  	 "DMA",
4b51d6698   Christoph Lameter   [PATCH] optional ...
159
  #endif
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
160
  #ifdef CONFIG_ZONE_DMA32
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
161
  	 "DMA32",
fb0e7942b   Christoph Lameter   [PATCH] reduce MA...
162
  #endif
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
163
  	 "Normal",
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
164
  #ifdef CONFIG_HIGHMEM
2a1e274ac   Mel Gorman   Create the ZONE_M...
165
  	 "HighMem",
e53ef38d0   Christoph Lameter   [PATCH] reduce MA...
166
  #endif
2a1e274ac   Mel Gorman   Create the ZONE_M...
167
  	 "Movable",
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
168
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
169
  int min_free_kbytes = 1024;
2c85f51d2   Jan Beulich   mm: also use allo...
170
171
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
a3142c8e1   Yasunori Goto   Fix section misma...
172
  static unsigned long __meminitdata dma_reserve;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
173

c713216de   Mel Gorman   [PATCH] Introduce...
174
175
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
    /*
183ff22bb   Simon Arlott   spelling fixes: mm/
176
     * MAX_ACTIVE_REGIONS determines the maximum number of distinct
c713216de   Mel Gorman   [PATCH] Introduce...
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
     * ranges of memory (RAM) that may be registered with add_active_range().
     * Ranges passed to add_active_range() will be merged if possible
     * so the number of times add_active_range() can be called is
     * related to the number of nodes and the number of holes
     */
    #ifdef CONFIG_MAX_ACTIVE_REGIONS
      /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
      #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
    #else
      #if MAX_NUMNODES >= 32
        /* If there can be many nodes, allow up to 50 holes per node */
        #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
      #else
        /* By default, allow up to 256 distinct regions */
        #define MAX_ACTIVE_REGIONS 256
      #endif
    #endif
98011f569   Jan Beulich   mm: fix improper ...
194
195
196
197
    static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
    static int __meminitdata nr_nodemap_entries;
    static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
198
    static unsigned long __initdata required_kernelcore;
484f51f82   Adrian Bunk   mm/page_alloc.c: ...
199
    static unsigned long __initdata required_movablecore;
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
200
    static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
2a1e274ac   Mel Gorman   Create the ZONE_M...
201
202
203
204
  
    /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
    int movable_zone;
    EXPORT_SYMBOL(movable_zone);
c713216de   Mel Gorman   [PATCH] Introduce...
205
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
418508c13   Miklos Szeredi   fix unused setup_...
206
207
  #if MAX_NUMNODES > 1
  int nr_node_ids __read_mostly = MAX_NUMNODES;
62bc62a87   Christoph Lameter   page allocator: u...
208
  int nr_online_nodes __read_mostly = 1;
418508c13   Miklos Szeredi   fix unused setup_...
209
  EXPORT_SYMBOL(nr_node_ids);
62bc62a87   Christoph Lameter   page allocator: u...
210
  EXPORT_SYMBOL(nr_online_nodes);
418508c13   Miklos Szeredi   fix unused setup_...
211
  #endif
9ef9acb05   Mel Gorman   Do not group page...
212
  int page_group_by_mobility_disabled __read_mostly;
b2a0ac887   Mel Gorman   Split the free li...
213
214
  static void set_pageblock_migratetype(struct page *page, int migratetype)
  {
49255c619   Mel Gorman   page allocator: m...
215
216
217
  
  	if (unlikely(page_group_by_mobility_disabled))
  		migratetype = MIGRATE_UNMOVABLE;
b2a0ac887   Mel Gorman   Split the free li...
218
219
220
  	set_pageblock_flags_group(page, (unsigned long)migratetype,
  					PB_migrate, PB_migrate_end);
  }
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
221
  bool oom_killer_disabled __read_mostly;
13e7444b0   Nick Piggin   [PATCH] mm: remov...
222
  #ifdef CONFIG_DEBUG_VM
c6a57e19e   Dave Hansen   [PATCH] memory ho...
223
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
224
  {
bdc8cb984   Dave Hansen   [PATCH] memory ho...
225
226
227
  	int ret = 0;
  	unsigned seq;
  	unsigned long pfn = page_to_pfn(page);
c6a57e19e   Dave Hansen   [PATCH] memory ho...
228

bdc8cb984   Dave Hansen   [PATCH] memory ho...
229
230
231
232
233
234
235
236
237
  	do {
  		seq = zone_span_seqbegin(zone);
  		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
  			ret = 1;
  		else if (pfn < zone->zone_start_pfn)
  			ret = 1;
  	} while (zone_span_seqretry(zone, seq));
  
  	return ret;
c6a57e19e   Dave Hansen   [PATCH] memory ho...
238
239
240
241
  }
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
14e072984   Andy Whitcroft   add pfn_valid_wit...
242
  	if (!pfn_valid_within(page_to_pfn(page)))
c6a57e19e   Dave Hansen   [PATCH] memory ho...
243
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
244
  	if (zone != page_zone(page))
c6a57e19e   Dave Hansen   [PATCH] memory ho...
245
246
247
248
249
250
251
252
253
254
  		return 0;
  
  	return 1;
  }
  /*
   * Temporary debugging check for pages not lying within a given zone.
   */
  static int bad_range(struct zone *zone, struct page *page)
  {
  	if (page_outside_zone_boundaries(zone, page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
255
  		return 1;
c6a57e19e   Dave Hansen   [PATCH] memory ho...
256
257
  	if (!page_is_consistent(zone, page))
  		return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
258
259
  	return 0;
  }
13e7444b0   Nick Piggin   [PATCH] mm: remov...
260
261
262
263
264
265
  #else
  static inline int bad_range(struct zone *zone, struct page *page)
  {
  	return 0;
  }
  #endif
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
266
  static void bad_page(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
267
  {
d936cf9b3   Hugh Dickins   badpage: ratelimi...
268
269
270
  	static unsigned long resume;
  	static unsigned long nr_shown;
  	static unsigned long nr_unshown;
2a7684a23   Wu Fengguang   HWPOISON: check a...
271
272
  	/* Don't complain about poisoned pages */
  	if (PageHWPoison(page)) {
ef2b4b95a   Andrea Arcangeli   mm: PageBuddy and...
273
  		reset_page_mapcount(page); /* remove PageBuddy */
2a7684a23   Wu Fengguang   HWPOISON: check a...
274
275
  		return;
  	}
d936cf9b3   Hugh Dickins   badpage: ratelimi...
276
277
278
279
280
281
282
283
284
285
  	/*
  	 * Allow a burst of 60 reports, then keep quiet for that minute;
  	 * or allow a steady drip of one report per second.
  	 */
  	if (nr_shown == 60) {
  		if (time_before(jiffies, resume)) {
  			nr_unshown++;
  			goto out;
  		}
  		if (nr_unshown) {
1e9e63650   Hugh Dickins   badpage: KERN_ALE...
286
287
288
  			printk(KERN_ALERT
  			      "BUG: Bad page state: %lu messages suppressed
  ",
d936cf9b3   Hugh Dickins   badpage: ratelimi...
289
290
291
292
293
294
295
  				nr_unshown);
  			nr_unshown = 0;
  		}
  		nr_shown = 0;
  	}
  	if (nr_shown++ == 0)
  		resume = jiffies + 60 * HZ;
1e9e63650   Hugh Dickins   badpage: KERN_ALE...
296
297
  	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx
  ",
3dc147414   Hugh Dickins   badpage: replace ...
298
  		current->comm, page_to_pfn(page));
718a38211   Wu Fengguang   mm: introduce dum...
299
  	dump_page(page);
3dc147414   Hugh Dickins   badpage: replace ...
300

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
301
  	dump_stack();
d936cf9b3   Hugh Dickins   badpage: ratelimi...
302
  out:
8cc3b3922   Hugh Dickins   badpage: keep any...
303
  	/* Leave bad fields for debug, except PageBuddy could make trouble */
ef2b4b95a   Andrea Arcangeli   mm: PageBuddy and...
304
  	reset_page_mapcount(page); /* remove PageBuddy */
9f1583339   Randy Dunlap   [PATCH] use add_t...
305
  	add_taint(TAINT_BAD_PAGE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
307
308
309
310
311
312
313
314
315
316
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
   * The first PAGE_SIZE page is called the "head page".
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
   * All pages have PG_compound set.  All pages have their ->private pointing at
   * the head page (even the head page has this).
   *
41d78ba55   Hugh Dickins   [PATCH] compound ...
317
318
319
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
320
   */
d98c7a098   Hugh Dickins   [PATCH] compound ...
321
322
323
  
  static void free_compound_page(struct page *page)
  {
d85f33855   Christoph Lameter   Make page->privat...
324
  	__free_pages_ok(page, compound_order(page));
d98c7a098   Hugh Dickins   [PATCH] compound ...
325
  }
01ad1c082   Andi Kleen   mm: export prep_c...
326
  void prep_compound_page(struct page *page, unsigned long order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
327
328
329
  {
  	int i;
  	int nr_pages = 1 << order;
18229df5b   Andy Whitcroft   hugetlb: pull gig...
330
331
332
333
334
335
336
337
338
339
340
  
  	set_compound_page_dtor(page, free_compound_page);
  	set_compound_order(page, order);
  	__SetPageHead(page);
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
  
  		__SetPageTail(p);
  		p->first_page = page;
  	}
  }
59ff42163   Andrea Arcangeli   thp: comment remi...
341
  /* update __split_huge_page_refcount if you change this function */
8cc3b3922   Hugh Dickins   badpage: keep any...
342
  static int destroy_compound_page(struct page *page, unsigned long order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
343
344
345
  {
  	int i;
  	int nr_pages = 1 << order;
8cc3b3922   Hugh Dickins   badpage: keep any...
346
  	int bad = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
347

8cc3b3922   Hugh Dickins   badpage: keep any...
348
349
  	if (unlikely(compound_order(page) != order) ||
  	    unlikely(!PageHead(page))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
350
  		bad_page(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
351
352
  		bad++;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
353

6d7779538   Christoph Lameter   mm: optimize comp...
354
  	__ClearPageHead(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
355

18229df5b   Andy Whitcroft   hugetlb: pull gig...
356
357
  	for (i = 1; i < nr_pages; i++) {
  		struct page *p = page + i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
358

e713a21d8   Alexey Zaytsev   trivial: Fix dubi...
359
  		if (unlikely(!PageTail(p) || (p->first_page != page))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
360
  			bad_page(page);
8cc3b3922   Hugh Dickins   badpage: keep any...
361
362
  			bad++;
  		}
d85f33855   Christoph Lameter   Make page->privat...
363
  		__ClearPageTail(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
364
  	}
8cc3b3922   Hugh Dickins   badpage: keep any...
365
366
  
  	return bad;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
367
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
368

17cf44064   Nick Piggin   [PATCH] mm: clean...
369
370
371
  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
6626c5d53   Andrew Morton   [PATCH] mm: prep_...
372
373
374
375
  	/*
  	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
  	 * and __GFP_HIGHMEM from hard or soft interrupt context.
  	 */
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
376
  	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
17cf44064   Nick Piggin   [PATCH] mm: clean...
377
378
379
  	for (i = 0; i < (1 << order); i++)
  		clear_highpage(page + i);
  }
6aa3001b2   Andrew Morton   [PATCH] page_allo...
380
381
  static inline void set_page_order(struct page *page, int order)
  {
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
382
  	set_page_private(page, order);
676165a8a   Nick Piggin   [PATCH] Fix buddy...
383
  	__SetPageBuddy(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
384
385
386
387
  }
  
  static inline void rmv_page_order(struct page *page)
  {
676165a8a   Nick Piggin   [PATCH] Fix buddy...
388
  	__ClearPageBuddy(page);
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
389
  	set_page_private(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
  }
  
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
   *
   * 1) Any buddy B1 will have an order O twin B2 which satisfies
   * the following equation:
   *     B2 = B1 ^ (1 << O)
   * For example, if the starting buddy (buddy2) is #8 its order
   * 1 buddy is #10:
   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
   *
   * 2) Any buddy B will have an order O+1 parent P which
   * satisfies the following equation:
   *     P = B & ~(1 << O)
   *
d6e05edc5   Andreas Mohr   spelling fixes
407
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
408
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
  static inline unsigned long
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
410
  __find_buddy_index(unsigned long page_idx, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
411
  {
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
412
  	return page_idx ^ (1 << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
413
414
415
416
417
  }
  
  /*
   * This function checks whether a page is free && is the buddy
   * we can do coalesce a page and its buddy if
13e7444b0   Nick Piggin   [PATCH] mm: remov...
418
   * (a) the buddy is not in a hole &&
676165a8a   Nick Piggin   [PATCH] Fix buddy...
419
   * (b) the buddy is in the buddy system &&
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
420
421
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
676165a8a   Nick Piggin   [PATCH] Fix buddy...
422
   *
5f24ce5fd   Andrea Arcangeli   thp: remove PG_buddy
423
424
   * For recording whether a page is in the buddy system, we set ->_mapcount -2.
   * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
425
   *
676165a8a   Nick Piggin   [PATCH] Fix buddy...
426
   * For recording page's order, we use page_private(page).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
427
   */
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
428
429
  static inline int page_is_buddy(struct page *page, struct page *buddy,
  								int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430
  {
14e072984   Andy Whitcroft   add pfn_valid_wit...
431
  	if (!pfn_valid_within(page_to_pfn(buddy)))
13e7444b0   Nick Piggin   [PATCH] mm: remov...
432
  		return 0;
13e7444b0   Nick Piggin   [PATCH] mm: remov...
433

cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
434
435
436
437
  	if (page_zone_id(page) != page_zone_id(buddy))
  		return 0;
  
  	if (PageBuddy(buddy) && page_order(buddy) == order) {
a3af9c389   Nick Piggin   page allocator: d...
438
  		VM_BUG_ON(page_count(buddy) != 0);
6aa3001b2   Andrew Morton   [PATCH] page_allo...
439
  		return 1;
676165a8a   Nick Piggin   [PATCH] Fix buddy...
440
  	}
6aa3001b2   Andrew Morton   [PATCH] page_allo...
441
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
  }
  
  /*
   * Freeing function for a buddy system allocator.
   *
   * The concept of a buddy system is to maintain direct-mapped table
   * (containing bit values) for memory blocks of various "orders".
   * The bottom level table contains the map for the smallest allocatable
   * units of memory (here, pages), and each level above it describes
   * pairs of units from the levels below, hence, "buddies".
   * At a high level, all that happens here is marking the table entry
   * at the bottom level available, and propagating the changes upward
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
5f24ce5fd   Andrea Arcangeli   thp: remove PG_buddy
457
   * free pages of length of (1 << order) and marked with _mapcount -2. Page's
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
458
   * order is recorded in page_private(page) field.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
459
460
461
462
463
464
465
466
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
   * free, the remainder of the region must be split into blocks.   
   * If a block is freed, and its buddy is also free, then this
   * triggers coalescing into a block of larger size.            
   *
   * -- wli
   */
48db57f8f   Nick Piggin   [PATCH] mm: free_...
467
  static inline void __free_one_page(struct page *page,
ed0ae21dc   Mel Gorman   page allocator: d...
468
469
  		struct zone *zone, unsigned int order,
  		int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
470
471
  {
  	unsigned long page_idx;
6dda9d55b   Corrado Zoccolo   page allocator: r...
472
  	unsigned long combined_idx;
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
473
  	unsigned long uninitialized_var(buddy_idx);
6dda9d55b   Corrado Zoccolo   page allocator: r...
474
  	struct page *buddy;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
475

224abf92b   Nick Piggin   [PATCH] mm: bad_p...
476
  	if (unlikely(PageCompound(page)))
8cc3b3922   Hugh Dickins   badpage: keep any...
477
478
  		if (unlikely(destroy_compound_page(page, order)))
  			return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
479

ed0ae21dc   Mel Gorman   page allocator: d...
480
  	VM_BUG_ON(migratetype == -1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
481
  	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
f2260e6b1   Mel Gorman   page allocator: u...
482
  	VM_BUG_ON(page_idx & ((1 << order) - 1));
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
483
  	VM_BUG_ON(bad_range(zone, page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
484

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
485
  	while (order < MAX_ORDER-1) {
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
486
487
  		buddy_idx = __find_buddy_index(page_idx, order);
  		buddy = page + (buddy_idx - page_idx);
cb2b95e1c   Andy Whitcroft   [PATCH] zone hand...
488
  		if (!page_is_buddy(page, buddy, order))
3c82d0ce2   Andy Whitcroft   buddy: clarify co...
489
  			break;
13e7444b0   Nick Piggin   [PATCH] mm: remov...
490

3c82d0ce2   Andy Whitcroft   buddy: clarify co...
491
  		/* Our buddy is free, merge with it and move up one order. */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
492
  		list_del(&buddy->lru);
b2a0ac887   Mel Gorman   Split the free li...
493
  		zone->free_area[order].nr_free--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
494
  		rmv_page_order(buddy);
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
495
  		combined_idx = buddy_idx & page_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
496
497
498
499
500
  		page = page + (combined_idx - page_idx);
  		page_idx = combined_idx;
  		order++;
  	}
  	set_page_order(page, order);
6dda9d55b   Corrado Zoccolo   page allocator: r...
501
502
503
504
505
506
507
508
509
  
  	/*
  	 * If this is not the largest possible page, check if the buddy
  	 * of the next-highest order is free. If it is, it's possible
  	 * that pages are being freed that will coalesce soon. In case,
  	 * that is happening, add the free page to the tail of the list
  	 * so it's less likely to be used soon and more likely to be merged
  	 * as a higher order page
  	 */
b7f50cfa3   Mel Gorman   mm, page-allocato...
510
  	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
6dda9d55b   Corrado Zoccolo   page allocator: r...
511
  		struct page *higher_page, *higher_buddy;
43506fad2   KyongHo Cho   mm/page_alloc.c: ...
512
513
514
515
  		combined_idx = buddy_idx & page_idx;
  		higher_page = page + (combined_idx - page_idx);
  		buddy_idx = __find_buddy_index(combined_idx, order + 1);
  		higher_buddy = page + (buddy_idx - combined_idx);
6dda9d55b   Corrado Zoccolo   page allocator: r...
516
517
518
519
520
521
522
523
524
  		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
  			list_add_tail(&page->lru,
  				&zone->free_area[order].free_list[migratetype]);
  			goto out;
  		}
  	}
  
  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
525
526
  	zone->free_area[order].nr_free++;
  }
092cead61   KOSAKI Motohiro   page allocator: m...
527
528
529
530
531
532
533
  /*
   * free_page_mlock() -- clean up attempts to free and mlocked() page.
   * Page should not be on lru, so no need to fix that up.
   * free_pages_check() will verify...
   */
  static inline void free_page_mlock(struct page *page)
  {
092cead61   KOSAKI Motohiro   page allocator: m...
534
535
536
  	__dec_zone_page_state(page, NR_MLOCK);
  	__count_vm_event(UNEVICTABLE_MLOCKFREED);
  }
092cead61   KOSAKI Motohiro   page allocator: m...
537

224abf92b   Nick Piggin   [PATCH] mm: bad_p...
538
  static inline int free_pages_check(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
539
  {
92be2e33b   Nick Piggin   [PATCH] mm: micro...
540
541
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
a3af9c389   Nick Piggin   page allocator: d...
542
  		(atomic_read(&page->_count) != 0) |
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
543
544
  		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
  		(mem_cgroup_bad_page_check(page)))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
545
  		bad_page(page);
79f4b7bf3   Hugh Dickins   badpage: simplify...
546
  		return 1;
8cc3b3922   Hugh Dickins   badpage: keep any...
547
  	}
79f4b7bf3   Hugh Dickins   badpage: simplify...
548
549
550
  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
551
552
553
  }
  
  /*
5f8dcc212   Mel Gorman   page-allocator: s...
554
   * Frees a number of pages from the PCP lists
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
555
   * Assumes all pages on list are in same zone, and of same order.
207f36eec   Renaud Lienhart   [PATCH] remove in...
556
   * count is the number of pages to free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
557
558
559
560
561
562
563
   *
   * If the zone was previously in an "all pages pinned" state then look to
   * see if this freeing clears that state.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
5f8dcc212   Mel Gorman   page-allocator: s...
564
565
  static void free_pcppages_bulk(struct zone *zone, int count,
  					struct per_cpu_pages *pcp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
566
  {
5f8dcc212   Mel Gorman   page-allocator: s...
567
  	int migratetype = 0;
a6f9edd65   Mel Gorman   page-allocator: m...
568
  	int batch_free = 0;
72853e299   Mel Gorman   mm: page allocato...
569
  	int to_free = count;
5f8dcc212   Mel Gorman   page-allocator: s...
570

c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
571
  	spin_lock(&zone->lock);
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
572
  	zone->all_unreclaimable = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
573
  	zone->pages_scanned = 0;
f2260e6b1   Mel Gorman   page allocator: u...
574

72853e299   Mel Gorman   mm: page allocato...
575
  	while (to_free) {
48db57f8f   Nick Piggin   [PATCH] mm: free_...
576
  		struct page *page;
5f8dcc212   Mel Gorman   page-allocator: s...
577
578
579
  		struct list_head *list;
  
  		/*
a6f9edd65   Mel Gorman   page-allocator: m...
580
581
582
583
584
  		 * Remove pages from lists in a round-robin fashion. A
  		 * batch_free count is maintained that is incremented when an
  		 * empty list is encountered.  This is so more pages are freed
  		 * off fuller lists instead of spinning excessively around empty
  		 * lists
5f8dcc212   Mel Gorman   page-allocator: s...
585
586
  		 */
  		do {
a6f9edd65   Mel Gorman   page-allocator: m...
587
  			batch_free++;
5f8dcc212   Mel Gorman   page-allocator: s...
588
589
590
591
  			if (++migratetype == MIGRATE_PCPTYPES)
  				migratetype = 0;
  			list = &pcp->lists[migratetype];
  		} while (list_empty(list));
48db57f8f   Nick Piggin   [PATCH] mm: free_...
592

1d16871d8   Namhyung Kim   mm: batch-free pc...
593
594
595
  		/* This is the only non-empty list. Free them all. */
  		if (batch_free == MIGRATE_PCPTYPES)
  			batch_free = to_free;
a6f9edd65   Mel Gorman   page-allocator: m...
596
597
598
599
  		do {
  			page = list_entry(list->prev, struct page, lru);
  			/* must delete as __free_one_page list manipulates */
  			list_del(&page->lru);
a7016235a   Hugh Dickins   mm: fix migratety...
600
601
602
  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  			__free_one_page(page, zone, 0, page_private(page));
  			trace_mm_page_pcpu_drain(page, 0, page_private(page));
72853e299   Mel Gorman   mm: page allocato...
603
  		} while (--to_free && --batch_free && !list_empty(list));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
604
  	}
72853e299   Mel Gorman   mm: page allocato...
605
  	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
606
  	spin_unlock(&zone->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
607
  }
ed0ae21dc   Mel Gorman   page allocator: d...
608
609
  static void free_one_page(struct zone *zone, struct page *page, int order,
  				int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
610
  {
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
611
  	spin_lock(&zone->lock);
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
612
  	zone->all_unreclaimable = 0;
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
613
  	zone->pages_scanned = 0;
f2260e6b1   Mel Gorman   page allocator: u...
614

ed0ae21dc   Mel Gorman   page allocator: d...
615
  	__free_one_page(page, zone, order, migratetype);
72853e299   Mel Gorman   mm: page allocato...
616
  	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
006d22d9b   Christoph Lameter   [PATCH] Optimize ...
617
  	spin_unlock(&zone->lock);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
618
  }
ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
619
  static bool free_pages_prepare(struct page *page, unsigned int order)
48db57f8f   Nick Piggin   [PATCH] mm: free_...
620
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
621
  	int i;
8cc3b3922   Hugh Dickins   badpage: keep any...
622
  	int bad = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
623

f650316c8   Li Hong   mm/page_alloc.c: ...
624
  	trace_mm_page_free_direct(page, order);
b1eeab676   Vegard Nossum   kmemcheck: add ho...
625
  	kmemcheck_free_shadow(page, order);
8dd60a3a6   Andrea Arcangeli   thp: clear compou...
626
627
628
629
  	if (PageAnon(page))
  		page->mapping = NULL;
  	for (i = 0; i < (1 << order); i++)
  		bad += free_pages_check(page + i);
8cc3b3922   Hugh Dickins   badpage: keep any...
630
  	if (bad)
ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
631
  		return false;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
632

3ac7fe5a4   Thomas Gleixner   infrastructure to...
633
  	if (!PageHighMem(page)) {
9858db504   Nick Piggin   [PATCH] mm: locks...
634
  		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
3ac7fe5a4   Thomas Gleixner   infrastructure to...
635
636
637
  		debug_check_no_obj_freed(page_address(page),
  					   PAGE_SIZE << order);
  	}
dafb13673   Nick Piggin   [PATCH] mm: arch_...
638
  	arch_free_page(page, order);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
639
  	kernel_map_pages(page, 1 << order, 0);
dafb13673   Nick Piggin   [PATCH] mm: arch_...
640

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
641
642
643
644
645
646
647
648
649
650
  	return true;
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
  	unsigned long flags;
  	int wasMlocked = __TestClearPageMlocked(page);
  
  	if (!free_pages_prepare(page, order))
  		return;
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
651
  	local_irq_save(flags);
c277331d5   Johannes Weiner   mm: page_alloc: c...
652
  	if (unlikely(wasMlocked))
da456f14d   Mel Gorman   page allocator: d...
653
  		free_page_mlock(page);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
654
  	__count_vm_events(PGFREE, 1 << order);
ed0ae21dc   Mel Gorman   page allocator: d...
655
656
  	free_one_page(page_zone(page), page, order,
  					get_pageblock_migratetype(page));
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
657
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
  }
a226f6c89   David Howells   [PATCH] FRV: Clea...
659
660
661
  /*
   * permit the bootmem allocator to evade page validation on high-order frees
   */
af370fb8c   Yasunori Goto   memory hotplug: s...
662
  void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
a226f6c89   David Howells   [PATCH] FRV: Clea...
663
664
665
666
  {
  	if (order == 0) {
  		__ClearPageReserved(page);
  		set_page_count(page, 0);
7835e98b2   Nick Piggin   [PATCH] remove se...
667
  		set_page_refcounted(page);
545b1ea9b   Nick Piggin   [PATCH] mm: clean...
668
  		__free_page(page);
a226f6c89   David Howells   [PATCH] FRV: Clea...
669
  	} else {
a226f6c89   David Howells   [PATCH] FRV: Clea...
670
  		int loop;
545b1ea9b   Nick Piggin   [PATCH] mm: clean...
671
  		prefetchw(page);
a226f6c89   David Howells   [PATCH] FRV: Clea...
672
673
  		for (loop = 0; loop < BITS_PER_LONG; loop++) {
  			struct page *p = &page[loop];
545b1ea9b   Nick Piggin   [PATCH] mm: clean...
674
675
  			if (loop + 1 < BITS_PER_LONG)
  				prefetchw(p + 1);
a226f6c89   David Howells   [PATCH] FRV: Clea...
676
677
678
  			__ClearPageReserved(p);
  			set_page_count(p, 0);
  		}
7835e98b2   Nick Piggin   [PATCH] remove se...
679
  		set_page_refcounted(page);
545b1ea9b   Nick Piggin   [PATCH] mm: clean...
680
  		__free_pages(page, order);
a226f6c89   David Howells   [PATCH] FRV: Clea...
681
682
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
  
  /*
   * The order of subdivision here is critical for the IO subsystem.
   * Please do not alter this order without good reasons and regression
   * testing. Specifically, as large blocks of memory are subdivided,
   * the order in which smaller blocks are delivered depends on the order
   * they're subdivided in this function. This is the primary factor
   * influencing the order in which pages are delivered to the IO
   * subsystem according to empirical testing, and this is also justified
   * by considering the behavior of a buddy system containing a single
   * large block of memory acted on by a series of small allocations.
   * This behavior is a critical factor in sglist merging's success.
   *
   * -- wli
   */
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
698
  static inline void expand(struct zone *zone, struct page *page,
b2a0ac887   Mel Gorman   Split the free li...
699
700
  	int low, int high, struct free_area *area,
  	int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
701
702
703
704
705
706
707
  {
  	unsigned long size = 1 << high;
  
  	while (high > low) {
  		area--;
  		high--;
  		size >>= 1;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
708
  		VM_BUG_ON(bad_range(zone, &page[size]));
b2a0ac887   Mel Gorman   Split the free li...
709
  		list_add(&page[size].lru, &area->free_list[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
710
711
712
  		area->nr_free++;
  		set_page_order(&page[size], high);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
713
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
714
715
716
  /*
   * This page is about to be returned from the page allocator
   */
2a7684a23   Wu Fengguang   HWPOISON: check a...
717
  static inline int check_new_page(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
718
  {
92be2e33b   Nick Piggin   [PATCH] mm: micro...
719
720
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
a3af9c389   Nick Piggin   page allocator: d...
721
  		(atomic_read(&page->_count) != 0)  |
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
722
723
  		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
  		(mem_cgroup_bad_page_check(page)))) {
224abf92b   Nick Piggin   [PATCH] mm: bad_p...
724
  		bad_page(page);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
725
  		return 1;
8cc3b3922   Hugh Dickins   badpage: keep any...
726
  	}
2a7684a23   Wu Fengguang   HWPOISON: check a...
727
728
729
730
731
732
733
734
735
736
737
738
  	return 0;
  }
  
  static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  
  	for (i = 0; i < (1 << order); i++) {
  		struct page *p = page + i;
  		if (unlikely(check_new_page(p)))
  			return 1;
  	}
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
739

4c21e2f24   Hugh Dickins   [PATCH] mm: split...
740
  	set_page_private(page, 0);
7835e98b2   Nick Piggin   [PATCH] remove se...
741
  	set_page_refcounted(page);
cc1025090   Nick Piggin   [PATCH] mm: add a...
742
743
  
  	arch_alloc_page(page, order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
744
  	kernel_map_pages(page, 1 << order, 1);
17cf44064   Nick Piggin   [PATCH] mm: clean...
745
746
747
748
749
750
  
  	if (gfp_flags & __GFP_ZERO)
  		prep_zero_page(page, order, gfp_flags);
  
  	if (order && (gfp_flags & __GFP_COMP))
  		prep_compound_page(page, order);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
751
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
752
  }
56fd56b86   Mel Gorman   Bias the location...
753
754
755
756
  /*
   * Go through the free lists for the given migratetype and remove
   * the smallest available page from the freelists
   */
728ec980f   Mel Gorman   page allocator: i...
757
758
  static inline
  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
56fd56b86   Mel Gorman   Bias the location...
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
  						int migratetype)
  {
  	unsigned int current_order;
  	struct free_area * area;
  	struct page *page;
  
  	/* Find a page of the appropriate size in the preferred list */
  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  		area = &(zone->free_area[current_order]);
  		if (list_empty(&area->free_list[migratetype]))
  			continue;
  
  		page = list_entry(area->free_list[migratetype].next,
  							struct page, lru);
  		list_del(&page->lru);
  		rmv_page_order(page);
  		area->nr_free--;
56fd56b86   Mel Gorman   Bias the location...
776
777
778
779
780
781
  		expand(zone, page, order, current_order, area, migratetype);
  		return page;
  	}
  
  	return NULL;
  }
b2a0ac887   Mel Gorman   Split the free li...
782
783
784
785
786
  /*
   * This array describes the order lists are fallen back to when
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
64c5e135b   Mel Gorman   don't group high ...
787
788
789
790
  	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
  	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
  	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
b2a0ac887   Mel Gorman   Split the free li...
791
  };
c361be55b   Mel Gorman   Move free pages b...
792
793
  /*
   * Move the free pages in a range to the free lists of the requested type.
d9c234005   Mel Gorman   Do not depend on ...
794
   * Note that start_page and end_pages are not aligned on a pageblock
c361be55b   Mel Gorman   Move free pages b...
795
796
   * boundary. If alignment is required, use move_freepages_block()
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
797
798
799
  static int move_freepages(struct zone *zone,
  			  struct page *start_page, struct page *end_page,
  			  int migratetype)
c361be55b   Mel Gorman   Move free pages b...
800
801
802
  {
  	struct page *page;
  	unsigned long order;
d100313fd   Mel Gorman   Fix calculation i...
803
  	int pages_moved = 0;
c361be55b   Mel Gorman   Move free pages b...
804
805
806
807
808
809
810
  
  #ifndef CONFIG_HOLES_IN_ZONE
  	/*
  	 * page_zone is not safe to call in this context when
  	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
  	 * anyway as we check zone boundaries in move_freepages_block().
  	 * Remove at a later date when no bug reports exist related to
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
811
  	 * grouping pages by mobility
c361be55b   Mel Gorman   Move free pages b...
812
813
814
815
816
  	 */
  	BUG_ON(page_zone(start_page) != page_zone(end_page));
  #endif
  
  	for (page = start_page; page <= end_page;) {
344c790e3   Adam Litke   mm: make setup_zo...
817
818
  		/* Make sure we are not inadvertently changing nodes */
  		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
c361be55b   Mel Gorman   Move free pages b...
819
820
821
822
823
824
825
826
827
828
829
  		if (!pfn_valid_within(page_to_pfn(page))) {
  			page++;
  			continue;
  		}
  
  		if (!PageBuddy(page)) {
  			page++;
  			continue;
  		}
  
  		order = page_order(page);
84be48d84   Kirill A. Shutemov   mm/page_alloc.c: ...
830
831
  		list_move(&page->lru,
  			  &zone->free_area[order].free_list[migratetype]);
c361be55b   Mel Gorman   Move free pages b...
832
  		page += 1 << order;
d100313fd   Mel Gorman   Fix calculation i...
833
  		pages_moved += 1 << order;
c361be55b   Mel Gorman   Move free pages b...
834
  	}
d100313fd   Mel Gorman   Fix calculation i...
835
  	return pages_moved;
c361be55b   Mel Gorman   Move free pages b...
836
  }
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
837
838
  static int move_freepages_block(struct zone *zone, struct page *page,
  				int migratetype)
c361be55b   Mel Gorman   Move free pages b...
839
840
841
842
843
  {
  	unsigned long start_pfn, end_pfn;
  	struct page *start_page, *end_page;
  
  	start_pfn = page_to_pfn(page);
d9c234005   Mel Gorman   Do not depend on ...
844
  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
c361be55b   Mel Gorman   Move free pages b...
845
  	start_page = pfn_to_page(start_pfn);
d9c234005   Mel Gorman   Do not depend on ...
846
847
  	end_page = start_page + pageblock_nr_pages - 1;
  	end_pfn = start_pfn + pageblock_nr_pages - 1;
c361be55b   Mel Gorman   Move free pages b...
848
849
850
851
852
853
854
855
856
  
  	/* Do not cross zone boundaries */
  	if (start_pfn < zone->zone_start_pfn)
  		start_page = page;
  	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
  		return 0;
  
  	return move_freepages(zone, start_page, end_page, migratetype);
  }
2f66a68f3   Mel Gorman   page-allocator: c...
857
858
859
860
861
862
863
864
865
866
  static void change_pageblock_range(struct page *pageblock_page,
  					int start_order, int migratetype)
  {
  	int nr_pageblocks = 1 << (start_order - pageblock_order);
  
  	while (nr_pageblocks--) {
  		set_pageblock_migratetype(pageblock_page, migratetype);
  		pageblock_page += pageblock_nr_pages;
  	}
  }
b2a0ac887   Mel Gorman   Split the free li...
867
  /* Remove an element from the buddy allocator from the fallback list */
0ac3a4099   Mel Gorman   page allocator: i...
868
869
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
b2a0ac887   Mel Gorman   Split the free li...
870
871
872
873
874
875
876
877
878
879
880
  {
  	struct free_area * area;
  	int current_order;
  	struct page *page;
  	int migratetype, i;
  
  	/* Find the largest possible block of pages in the other list */
  	for (current_order = MAX_ORDER-1; current_order >= order;
  						--current_order) {
  		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
  			migratetype = fallbacks[start_migratetype][i];
56fd56b86   Mel Gorman   Bias the location...
881
882
883
  			/* MIGRATE_RESERVE handled later if necessary */
  			if (migratetype == MIGRATE_RESERVE)
  				continue;
e010487db   Mel Gorman   Group high-order ...
884

b2a0ac887   Mel Gorman   Split the free li...
885
886
887
888
889
890
891
892
893
  			area = &(zone->free_area[current_order]);
  			if (list_empty(&area->free_list[migratetype]))
  				continue;
  
  			page = list_entry(area->free_list[migratetype].next,
  					struct page, lru);
  			area->nr_free--;
  
  			/*
c361be55b   Mel Gorman   Move free pages b...
894
  			 * If breaking a large block of pages, move all free
46dafbca2   Mel Gorman   Be more agressive...
895
896
  			 * pages to the preferred allocation list. If falling
  			 * back for a reclaimable kernel allocation, be more
25985edce   Lucas De Marchi   Fix common misspe...
897
  			 * aggressive about taking ownership of free pages
b2a0ac887   Mel Gorman   Split the free li...
898
  			 */
d9c234005   Mel Gorman   Do not depend on ...
899
  			if (unlikely(current_order >= (pageblock_order >> 1)) ||
dd5d241ea   Mel Gorman   page-allocator: a...
900
901
  					start_migratetype == MIGRATE_RECLAIMABLE ||
  					page_group_by_mobility_disabled) {
46dafbca2   Mel Gorman   Be more agressive...
902
903
904
905
906
  				unsigned long pages;
  				pages = move_freepages_block(zone, page,
  								start_migratetype);
  
  				/* Claim the whole block if over half of it is free */
dd5d241ea   Mel Gorman   page-allocator: a...
907
908
  				if (pages >= (1 << (pageblock_order-1)) ||
  						page_group_by_mobility_disabled)
46dafbca2   Mel Gorman   Be more agressive...
909
910
  					set_pageblock_migratetype(page,
  								start_migratetype);
b2a0ac887   Mel Gorman   Split the free li...
911
  				migratetype = start_migratetype;
c361be55b   Mel Gorman   Move free pages b...
912
  			}
b2a0ac887   Mel Gorman   Split the free li...
913
914
915
916
  
  			/* Remove the page from the freelists */
  			list_del(&page->lru);
  			rmv_page_order(page);
b2a0ac887   Mel Gorman   Split the free li...
917

2f66a68f3   Mel Gorman   page-allocator: c...
918
919
920
  			/* Take ownership for orders >= pageblock_order */
  			if (current_order >= pageblock_order)
  				change_pageblock_range(page, current_order,
b2a0ac887   Mel Gorman   Split the free li...
921
922
923
  							start_migratetype);
  
  			expand(zone, page, order, current_order, area, migratetype);
e0fff1bd1   Mel Gorman   tracing, page-all...
924
925
926
  
  			trace_mm_page_alloc_extfrag(page, order, current_order,
  				start_migratetype, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
927
928
929
  			return page;
  		}
  	}
728ec980f   Mel Gorman   page allocator: i...
930
  	return NULL;
b2a0ac887   Mel Gorman   Split the free li...
931
  }
56fd56b86   Mel Gorman   Bias the location...
932
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
933
934
935
   * Do the hard work of removing an element from the buddy allocator.
   * Call me with the zone->lock already held.
   */
b2a0ac887   Mel Gorman   Split the free li...
936
937
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
  						int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
939
  	struct page *page;
728ec980f   Mel Gorman   page allocator: i...
940
  retry_reserve:
56fd56b86   Mel Gorman   Bias the location...
941
  	page = __rmqueue_smallest(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
942

728ec980f   Mel Gorman   page allocator: i...
943
  	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
56fd56b86   Mel Gorman   Bias the location...
944
  		page = __rmqueue_fallback(zone, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
945

728ec980f   Mel Gorman   page allocator: i...
946
947
948
949
950
951
952
953
954
955
  		/*
  		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
  		 * is used because __rmqueue_smallest is an inline function
  		 * and we want just one call site
  		 */
  		if (!page) {
  			migratetype = MIGRATE_RESERVE;
  			goto retry_reserve;
  		}
  	}
0d3d062a6   Mel Gorman   tracing, page-all...
956
  	trace_mm_page_alloc_zone_locked(page, order, migratetype);
b2a0ac887   Mel Gorman   Split the free li...
957
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
958
959
960
961
962
963
964
965
  }
  
  /* 
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
   * Returns the number of new pages which were placed at *list.
   */
  static int rmqueue_bulk(struct zone *zone, unsigned int order, 
b2a0ac887   Mel Gorman   Split the free li...
966
  			unsigned long count, struct list_head *list,
e084b2d95   Mel Gorman   page-allocator: p...
967
  			int migratetype, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
968
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
969
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
970
  	
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
971
  	spin_lock(&zone->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
972
  	for (i = 0; i < count; ++i) {
b2a0ac887   Mel Gorman   Split the free li...
973
  		struct page *page = __rmqueue(zone, order, migratetype);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
974
  		if (unlikely(page == NULL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
975
  			break;
81eabcbe0   Mel Gorman   mm: fix page allo...
976
977
978
979
980
981
982
983
984
985
  
  		/*
  		 * Split buddy pages returned by expand() are received here
  		 * in physical page order. The page is added to the callers and
  		 * list and the list head then moves forward. From the callers
  		 * perspective, the linked list is ordered by page number in
  		 * some conditions. This is useful for IO devices that can
  		 * merge IO requests if the physical pages are ordered
  		 * properly.
  		 */
e084b2d95   Mel Gorman   page-allocator: p...
986
987
988
989
  		if (likely(cold == 0))
  			list_add(&page->lru, list);
  		else
  			list_add_tail(&page->lru, list);
535131e69   Mel Gorman   Choose pages from...
990
  		set_page_private(page, migratetype);
81eabcbe0   Mel Gorman   mm: fix page allo...
991
  		list = &page->lru;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
992
  	}
f2260e6b1   Mel Gorman   page allocator: u...
993
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
994
  	spin_unlock(&zone->lock);
085cc7d5d   Nick Piggin   [PATCH] mm: page_...
995
  	return i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
996
  }
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
997
  #ifdef CONFIG_NUMA
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
998
  /*
4037d4522   Christoph Lameter   Move remote node ...
999
1000
1001
1002
   * Called from the vmstat counter updater to drain pagesets of this
   * currently executing processor on remote nodes after they have
   * expired.
   *
879336c39   Christoph Lameter   [PATCH] drain_nod...
1003
1004
   * Note that this function must be called with the thread pinned to
   * a single processor.
8fce4d8e3   Christoph Lameter   [PATCH] slab: Nod...
1005
   */
4037d4522   Christoph Lameter   Move remote node ...
1006
  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1007
  {
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1008
  	unsigned long flags;
4037d4522   Christoph Lameter   Move remote node ...
1009
  	int to_drain;
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1010

4037d4522   Christoph Lameter   Move remote node ...
1011
1012
1013
1014
1015
  	local_irq_save(flags);
  	if (pcp->count >= pcp->batch)
  		to_drain = pcp->batch;
  	else
  		to_drain = pcp->count;
5f8dcc212   Mel Gorman   page-allocator: s...
1016
  	free_pcppages_bulk(zone, to_drain, pcp);
4037d4522   Christoph Lameter   Move remote node ...
1017
1018
  	pcp->count -= to_drain;
  	local_irq_restore(flags);
4ae7c0394   Christoph Lameter   [PATCH] Periodica...
1019
1020
  }
  #endif
9f8f21725   Christoph Lameter   Page allocator: c...
1021
1022
1023
1024
1025
1026
1027
1028
  /*
   * Drain pages of the indicated processor.
   *
   * The processor must either be the current processor and the
   * thread pinned to the current processor or a processor that
   * is not online.
   */
  static void drain_pages(unsigned int cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1029
  {
c54ad30c7   Nick Piggin   [PATCH] mm: pagea...
1030
  	unsigned long flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1031
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1032

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
1033
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1034
  		struct per_cpu_pageset *pset;
3dfa5721f   Christoph Lameter   Page allocator: g...
1035
  		struct per_cpu_pages *pcp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1036

99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1037
1038
  		local_irq_save(flags);
  		pset = per_cpu_ptr(zone->pageset, cpu);
3dfa5721f   Christoph Lameter   Page allocator: g...
1039
1040
  
  		pcp = &pset->pcp;
2ff754fa8   David Rientjes   mm: clear pages_s...
1041
1042
1043
1044
  		if (pcp->count) {
  			free_pcppages_bulk(zone, pcp->count, pcp);
  			pcp->count = 0;
  		}
3dfa5721f   Christoph Lameter   Page allocator: g...
1045
  		local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1046
1047
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1048

9f8f21725   Christoph Lameter   Page allocator: c...
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
  /*
   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
   */
  void drain_local_pages(void *arg)
  {
  	drain_pages(smp_processor_id());
  }
  
  /*
   * Spill all the per-cpu pages from all CPUs back into the buddy allocator
   */
  void drain_all_pages(void)
  {
15c8b6c1a   Jens Axboe   on_each_cpu(): ki...
1062
  	on_each_cpu(drain_local_pages, NULL, 1);
9f8f21725   Christoph Lameter   Page allocator: c...
1063
  }
296699de6   Rafael J. Wysocki   Introduce CONFIG_...
1064
  #ifdef CONFIG_HIBERNATION
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1065
1066
1067
  
  void mark_free_pages(struct zone *zone)
  {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1068
1069
  	unsigned long pfn, max_zone_pfn;
  	unsigned long flags;
b2a0ac887   Mel Gorman   Split the free li...
1070
  	int order, t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1071
1072
1073
1074
1075
1076
  	struct list_head *curr;
  
  	if (!zone->spanned_pages)
  		return;
  
  	spin_lock_irqsave(&zone->lock, flags);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1077
1078
1079
1080
1081
  
  	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
  		if (pfn_valid(pfn)) {
  			struct page *page = pfn_to_page(pfn);
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1082
1083
  			if (!swsusp_page_is_forbidden(page))
  				swsusp_unset_page_free(page);
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1084
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1085

b2a0ac887   Mel Gorman   Split the free li...
1086
1087
  	for_each_migratetype_order(order, t) {
  		list_for_each(curr, &zone->free_area[order].free_list[t]) {
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1088
  			unsigned long i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1089

f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1090
1091
  			pfn = page_to_pfn(list_entry(curr, struct page, lru));
  			for (i = 0; i < (1UL << order); i++)
7be982349   Rafael J. Wysocki   swsusp: use inlin...
1092
  				swsusp_set_page_free(pfn_to_page(pfn + i));
f623f0db8   Rafael J. Wysocki   [PATCH] swsusp: F...
1093
  		}
b2a0ac887   Mel Gorman   Split the free li...
1094
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1095
1096
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
e2c55dc87   Mel Gorman   Drain per-cpu lis...
1097
  #endif /* CONFIG_PM */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1098
1099
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1100
   * Free a 0-order page
fc91668ea   Li Hong   mm: remove free_h...
1101
   * cold == 1 ? free a cold page : free a hot page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1102
   */
fc91668ea   Li Hong   mm: remove free_h...
1103
  void free_hot_cold_page(struct page *page, int cold)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1104
1105
1106
1107
  {
  	struct zone *zone = page_zone(page);
  	struct per_cpu_pages *pcp;
  	unsigned long flags;
5f8dcc212   Mel Gorman   page-allocator: s...
1108
  	int migratetype;
451ea25da   Johannes Weiner   mm: perform non-a...
1109
  	int wasMlocked = __TestClearPageMlocked(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1110

ec95f53aa   KOSAKI Motohiro   mm: introduce fre...
1111
  	if (!free_pages_prepare(page, 0))
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1112
  		return;
5f8dcc212   Mel Gorman   page-allocator: s...
1113
1114
  	migratetype = get_pageblock_migratetype(page);
  	set_page_private(page, migratetype);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1115
  	local_irq_save(flags);
c277331d5   Johannes Weiner   mm: page_alloc: c...
1116
  	if (unlikely(wasMlocked))
da456f14d   Mel Gorman   page allocator: d...
1117
  		free_page_mlock(page);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1118
  	__count_vm_event(PGFREE);
da456f14d   Mel Gorman   page allocator: d...
1119

5f8dcc212   Mel Gorman   page-allocator: s...
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
  	/*
  	 * We only track unmovable, reclaimable and movable on pcp lists.
  	 * Free ISOLATE pages back to the allocator because they are being
  	 * offlined but treat RESERVE as movable pages so we can get those
  	 * areas back if necessary. Otherwise, we may have to free
  	 * excessively into the page allocator
  	 */
  	if (migratetype >= MIGRATE_PCPTYPES) {
  		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
  			free_one_page(zone, page, 0, migratetype);
  			goto out;
  		}
  		migratetype = MIGRATE_MOVABLE;
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1134
  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
1135
  	if (cold)
5f8dcc212   Mel Gorman   page-allocator: s...
1136
  		list_add_tail(&page->lru, &pcp->lists[migratetype]);
3dfa5721f   Christoph Lameter   Page allocator: g...
1137
  	else
5f8dcc212   Mel Gorman   page-allocator: s...
1138
  		list_add(&page->lru, &pcp->lists[migratetype]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1139
  	pcp->count++;
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1140
  	if (pcp->count >= pcp->high) {
5f8dcc212   Mel Gorman   page-allocator: s...
1141
  		free_pcppages_bulk(zone, pcp->batch, pcp);
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1142
1143
  		pcp->count -= pcp->batch;
  	}
5f8dcc212   Mel Gorman   page-allocator: s...
1144
1145
  
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1146
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1147
  }
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
   * Each sub-page must be freed individually.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  void split_page(struct page *page, unsigned int order)
  {
  	int i;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1159
1160
  	VM_BUG_ON(PageCompound(page));
  	VM_BUG_ON(!page_count(page));
b1eeab676   Vegard Nossum   kmemcheck: add ho...
1161
1162
1163
1164
1165
1166
1167
1168
1169
  
  #ifdef CONFIG_KMEMCHECK
  	/*
  	 * Split shadow pages too, because free(page[0]) would
  	 * otherwise free the whole shadow.
  	 */
  	if (kmemcheck_page_is_tracked(page))
  		split_page(virt_to_page(page[0].shadow), order);
  #endif
7835e98b2   Nick Piggin   [PATCH] remove se...
1170
1171
  	for (i = 1; i < (1 << order); i++)
  		set_page_refcounted(page + i);
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1172
  }
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1173

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174
  /*
748446bb6   Mel Gorman   mm: compaction: m...
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
   * Similar to split_page except the page is already free. As this is only
   * being used for migration, the migratetype of the block also changes.
   * As this is called with interrupts disabled, the caller is responsible
   * for calling arch_alloc_page() and kernel_map_page() after interrupts
   * are enabled.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  int split_free_page(struct page *page)
  {
  	unsigned int order;
  	unsigned long watermark;
  	struct zone *zone;
  
  	BUG_ON(!PageBuddy(page));
  
  	zone = page_zone(page);
  	order = page_order(page);
  
  	/* Obey watermarks as if the page was being allocated */
  	watermark = low_wmark_pages(zone) + (1 << order);
  	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
  		return 0;
  
  	/* Remove page from free list */
  	list_del(&page->lru);
  	zone->free_area[order].nr_free--;
  	rmv_page_order(page);
  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
  
  	/* Split into individual pages */
  	set_page_refcounted(page);
  	split_page(page, order);
  
  	if (order >= pageblock_order - 1) {
  		struct page *endpage = page + (1 << order) - 1;
  		for (; page < endpage; page += pageblock_nr_pages)
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	}
  
  	return 1 << order;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1220
1221
1222
1223
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
   * or two.
   */
0a15c3e9f   Mel Gorman   page allocator: i...
1224
1225
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1226
1227
  			struct zone *zone, int order, gfp_t gfp_flags,
  			int migratetype)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1228
1229
  {
  	unsigned long flags;
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1230
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1231
  	int cold = !!(gfp_flags & __GFP_COLD);
689bcebfd   Hugh Dickins   [PATCH] unpaged: ...
1232
  again:
48db57f8f   Nick Piggin   [PATCH] mm: free_...
1233
  	if (likely(order == 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1234
  		struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
1235
  		struct list_head *list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1236

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1237
  		local_irq_save(flags);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
1238
1239
  		pcp = &this_cpu_ptr(zone->pageset)->pcp;
  		list = &pcp->lists[migratetype];
5f8dcc212   Mel Gorman   page-allocator: s...
1240
  		if (list_empty(list)) {
535131e69   Mel Gorman   Choose pages from...
1241
  			pcp->count += rmqueue_bulk(zone, 0,
5f8dcc212   Mel Gorman   page-allocator: s...
1242
  					pcp->batch, list,
e084b2d95   Mel Gorman   page-allocator: p...
1243
  					migratetype, cold);
5f8dcc212   Mel Gorman   page-allocator: s...
1244
  			if (unlikely(list_empty(list)))
6fb332fab   Shaohua Li   memory hotplug: e...
1245
  				goto failed;
535131e69   Mel Gorman   Choose pages from...
1246
  		}
b92a6edd4   Mel Gorman   Add a configure o...
1247

5f8dcc212   Mel Gorman   page-allocator: s...
1248
1249
1250
1251
  		if (cold)
  			page = list_entry(list->prev, struct page, lru);
  		else
  			page = list_entry(list->next, struct page, lru);
b92a6edd4   Mel Gorman   Add a configure o...
1252
1253
  		list_del(&page->lru);
  		pcp->count--;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1254
  	} else {
dab48dab3   Andrew Morton   page-allocator: w...
1255
1256
1257
1258
1259
1260
1261
1262
  		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
  			/*
  			 * __GFP_NOFAIL is not to be used in new code.
  			 *
  			 * All __GFP_NOFAIL callers should be fixed so that they
  			 * properly detect and handle allocation failures.
  			 *
  			 * We most definitely don't want callers attempting to
4923abf9f   Linus Torvalds   Don't warn about ...
1263
  			 * allocate greater than order-1 page units with
dab48dab3   Andrew Morton   page-allocator: w...
1264
1265
  			 * __GFP_NOFAIL.
  			 */
4923abf9f   Linus Torvalds   Don't warn about ...
1266
  			WARN_ON_ONCE(order > 1);
dab48dab3   Andrew Morton   page-allocator: w...
1267
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1268
  		spin_lock_irqsave(&zone->lock, flags);
b2a0ac887   Mel Gorman   Split the free li...
1269
  		page = __rmqueue(zone, order, migratetype);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1270
1271
1272
  		spin_unlock(&zone->lock);
  		if (!page)
  			goto failed;
6ccf80eb1   KOSAKI Motohiro   page allocator: u...
1273
  		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1274
  	}
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1275
  	__count_zone_vm_events(PGALLOC, zone, 1 << order);
78afd5612   Andi Kleen   mm: add __GFP_OTH...
1276
  	zone_statistics(preferred_zone, zone, gfp_flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1277
  	local_irq_restore(flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1278

725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1279
  	VM_BUG_ON(bad_range(zone, page));
17cf44064   Nick Piggin   [PATCH] mm: clean...
1280
  	if (prep_new_page(page, order, gfp_flags))
a74609faf   Nick Piggin   [PATCH] mm: page_...
1281
  		goto again;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1282
  	return page;
a74609faf   Nick Piggin   [PATCH] mm: page_...
1283
1284
1285
  
  failed:
  	local_irq_restore(flags);
a74609faf   Nick Piggin   [PATCH] mm: page_...
1286
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1287
  }
418589663   Mel Gorman   page allocator: u...
1288
1289
1290
1291
1292
1293
1294
1295
  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
  #define ALLOC_WMARK_MIN		WMARK_MIN
  #define ALLOC_WMARK_LOW		WMARK_LOW
  #define ALLOC_WMARK_HIGH	WMARK_HIGH
  #define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
  
  /* Mask to get the watermark bits */
  #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
3148890bf   Nick Piggin   [PATCH] mm: __all...
1296
1297
1298
  #define ALLOC_HARDER		0x10 /* try to alloc harder */
  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1299

933e312e7   Akinobu Mita   [PATCH] fault-inj...
1300
  #ifdef CONFIG_FAIL_PAGE_ALLOC
b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1301
  static struct {
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1302
1303
1304
1305
  	struct fault_attr attr;
  
  	u32 ignore_gfp_highmem;
  	u32 ignore_gfp_wait;
54114994f   Akinobu Mita   fault-injection: ...
1306
  	u32 min_order;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1307
1308
  } fail_page_alloc = {
  	.attr = FAULT_ATTR_INITIALIZER,
6b1b60f41   Don Mullis   [PATCH] fault-inj...
1309
1310
  	.ignore_gfp_wait = 1,
  	.ignore_gfp_highmem = 1,
54114994f   Akinobu Mita   fault-injection: ...
1311
  	.min_order = 1,
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
  };
  
  static int __init setup_fail_page_alloc(char *str)
  {
  	return setup_fault_attr(&fail_page_alloc.attr, str);
  }
  __setup("fail_page_alloc=", setup_fail_page_alloc);
  
  static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
54114994f   Akinobu Mita   fault-injection: ...
1322
1323
  	if (order < fail_page_alloc.min_order)
  		return 0;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
  	if (gfp_mask & __GFP_NOFAIL)
  		return 0;
  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
  		return 0;
  	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
  		return 0;
  
  	return should_fail(&fail_page_alloc.attr, 1 << order);
  }
  
  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  
  static int __init fail_page_alloc_debugfs(void)
  {
  	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
  	struct dentry *dir;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1340

dd48c085c   Akinobu Mita   fault-injection: ...
1341
1342
1343
1344
  	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
  					&fail_page_alloc.attr);
  	if (IS_ERR(dir))
  		return PTR_ERR(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1345

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
  	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
  				&fail_page_alloc.ignore_gfp_wait))
  		goto fail;
  	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
  				&fail_page_alloc.ignore_gfp_highmem))
  		goto fail;
  	if (!debugfs_create_u32("min-order", mode, dir,
  				&fail_page_alloc.min_order))
  		goto fail;
  
  	return 0;
  fail:
dd48c085c   Akinobu Mita   fault-injection: ...
1358
  	debugfs_remove_recursive(dir);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1359

b2588c4b4   Akinobu Mita   fail_page_alloc: ...
1360
  	return -ENOMEM;
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
  }
  
  late_initcall(fail_page_alloc_debugfs);
  
  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
  
  #else /* CONFIG_FAIL_PAGE_ALLOC */
  
  static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  {
  	return 0;
  }
  
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1375
  /*
88f5acf88   Mel Gorman   mm: page allocato...
1376
   * Return true if free pages are above 'mark'. This takes into account the order
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1377
1378
   * of the allocation.
   */
88f5acf88   Mel Gorman   mm: page allocato...
1379
1380
  static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags, long free_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1381
1382
  {
  	/* free_pages my go negative - that's OK */
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
1383
  	long min = mark;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1384
  	int o;
88f5acf88   Mel Gorman   mm: page allocato...
1385
  	free_pages -= (1 << order) + 1;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1386
  	if (alloc_flags & ALLOC_HIGH)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1387
  		min -= min / 2;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1388
  	if (alloc_flags & ALLOC_HARDER)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1389
1390
1391
  		min -= min / 4;
  
  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
88f5acf88   Mel Gorman   mm: page allocato...
1392
  		return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1393
1394
1395
1396
1397
1398
1399
1400
  	for (o = 0; o < order; o++) {
  		/* At the next order, this order's pages become unavailable */
  		free_pages -= z->free_area[o].nr_free << o;
  
  		/* Require fewer higher order pages to be free */
  		min >>= 1;
  
  		if (free_pages <= min)
88f5acf88   Mel Gorman   mm: page allocato...
1401
  			return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1402
  	}
88f5acf88   Mel Gorman   mm: page allocato...
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
  	return true;
  }
  
  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  					zone_page_state(z, NR_FREE_PAGES));
  }
  
  bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
  	long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  								free_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1423
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1424
1425
1426
1427
1428
1429
  #ifdef CONFIG_NUMA
  /*
   * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
   * skip over zones that are not allowed by the cpuset, or that have
   * been recently (in last second) found to be nearly full.  See further
   * comments in mmzone.h.  Reduces cache footprint of zonelist scans
183ff22bb   Simon Arlott   spelling fixes: mm/
1430
   * that have to skip over a lot of full or unallowed zones.
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1431
1432
1433
   *
   * If the zonelist cache is present in the passed in zonelist, then
   * returns a pointer to the allowed node mask (either the current
37b07e416   Lee Schermerhorn   memoryless nodes:...
1434
   * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
   *
   * If the zonelist cache is not available for this zonelist, does
   * nothing and returns NULL.
   *
   * If the fullzones BITMAP in the zonelist cache is stale (more than
   * a second since last zap'd) then we zap it out (clear its bits.)
   *
   * We hold off even calling zlc_setup, until after we've checked the
   * first zone in the zonelist, on the theory that most allocations will
   * be satisfied from that first zone, so best to examine that zone as
   * quickly as we can.
   */
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	nodemask_t *allowednodes;	/* zonelist_cache approximation */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return NULL;
f05111f50   S.ÇaÄŸlar Onur   mm/page_alloc.c: ...
1455
  	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1456
1457
1458
1459
1460
1461
  		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  		zlc->last_full_zap = jiffies;
  	}
  
  	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
  					&cpuset_current_mems_allowed :
37b07e416   Lee Schermerhorn   memoryless nodes:...
1462
  					&node_states[N_HIGH_MEMORY];
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
  	return allowednodes;
  }
  
  /*
   * Given 'z' scanning a zonelist, run a couple of quick checks to see
   * if it is worth looking at further for free memory:
   *  1) Check that the zone isn't thought to be full (doesn't have its
   *     bit set in the zonelist_cache fullzones BITMAP).
   *  2) Check that the zones node (obtained from the zonelist_cache
   *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
   * Return true (non-zero) if zone is worth looking at further, or
   * else return false (zero) if it is not.
   *
   * This check -ignores- the distinction between various watermarks,
   * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
   * found to be full for any variation of these watermarks, it will
   * be considered full for up to one second by all requests, unless
   * we are so low on memory on all allowed nodes that we are forced
   * into the second scan of the zonelist.
   *
   * In the second scan we ignore this zonelist cache and exactly
   * apply the watermarks to all zones, even it is slower to do so.
   * We are low on memory in the second scan, and should leave no stone
   * unturned looking for a free page.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1488
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1489
1490
1491
1492
1493
1494
1495
1496
1497
  						nodemask_t *allowednodes)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  	int n;				/* node that zone *z is on */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return 1;
dd1a239f6   Mel Gorman   mm: have zonelist...
1498
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
  	n = zlc->z_to_n[i];
  
  	/* This zone is worth trying if it is allowed but not full */
  	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
  }
  
  /*
   * Given 'z' scanning a zonelist, set the corresponding bit in
   * zlc->fullzones, so that subsequent attempts to allocate a page
   * from that zone don't waste time re-examining it.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
1510
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1511
1512
1513
1514
1515
1516
1517
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  	int i;				/* index of *z in zonelist zones */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
dd1a239f6   Mel Gorman   mm: have zonelist...
1518
  	i = z - zonelist->_zonerefs;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1519
1520
1521
  
  	set_bit(i, zlc->fullzones);
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
  /*
   * clear all zones full, called after direct reclaim makes progress so that
   * a zone that was recently full is not skipped over for up to a second
   */
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
  
  	zlc = zonelist->zlcache_ptr;
  	if (!zlc)
  		return;
  
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1536
1537
1538
1539
1540
1541
  #else	/* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  {
  	return NULL;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1542
  static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1543
1544
1545
1546
  				nodemask_t *allowednodes)
  {
  	return 1;
  }
dd1a239f6   Mel Gorman   mm: have zonelist...
1547
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1548
1549
  {
  }
76d3fbf8f   Mel Gorman   mm: page allocato...
1550
1551
1552
1553
  
  static void zlc_clear_zones_full(struct zonelist *zonelist)
  {
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1554
  #endif	/* CONFIG_NUMA */
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1555
  /*
0798e5193   Paul Jackson   [PATCH] memory pa...
1556
   * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1557
1558
1559
   * a page.
   */
  static struct page *
19770b326   Mel Gorman   mm: filter based ...
1560
  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
5117f45d1   Mel Gorman   page allocator: c...
1561
  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
3dd282669   Mel Gorman   page allocator: c...
1562
  		struct zone *preferred_zone, int migratetype)
753ee7289   Martin Hicks   [PATCH] VM: early...
1563
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1564
  	struct zoneref *z;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1565
  	struct page *page = NULL;
54a6eb5c4   Mel Gorman   mm: use two zonel...
1566
  	int classzone_idx;
5117f45d1   Mel Gorman   page allocator: c...
1567
  	struct zone *zone;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1568
1569
1570
  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
  	int zlc_active = 0;		/* set if using zonelist_cache */
  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
54a6eb5c4   Mel Gorman   mm: use two zonel...
1571

19770b326   Mel Gorman   mm: filter based ...
1572
  	classzone_idx = zone_idx(preferred_zone);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1573
  zonelist_scan:
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1574
  	/*
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1575
  	 * Scan zonelist, looking for a zone with enough free.
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1576
1577
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
19770b326   Mel Gorman   mm: filter based ...
1578
1579
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  						high_zoneidx, nodemask) {
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1580
1581
1582
  		if (NUMA_BUILD && zlc_active &&
  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1583
  		if ((alloc_flags & ALLOC_CPUSET) &&
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
1584
  			!cpuset_zone_allowed_softwall(zone, gfp_mask))
cd38b115d   Mel Gorman   mm: page allocato...
1585
  				continue;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1586

418589663   Mel Gorman   page allocator: u...
1587
  		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1588
  		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
3148890bf   Nick Piggin   [PATCH] mm: __all...
1589
  			unsigned long mark;
fa5e084e4   Mel Gorman   vmscan: do not un...
1590
  			int ret;
418589663   Mel Gorman   page allocator: u...
1591
  			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
fa5e084e4   Mel Gorman   vmscan: do not un...
1592
1593
1594
  			if (zone_watermark_ok(zone, order, mark,
  				    classzone_idx, alloc_flags))
  				goto try_this_zone;
cd38b115d   Mel Gorman   mm: page allocato...
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
  			if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
  				/*
  				 * we do zlc_setup if there are multiple nodes
  				 * and before considering the first zone allowed
  				 * by the cpuset.
  				 */
  				allowednodes = zlc_setup(zonelist, alloc_flags);
  				zlc_active = 1;
  				did_zlc_setup = 1;
  			}
fa5e084e4   Mel Gorman   vmscan: do not un...
1605
1606
  			if (zone_reclaim_mode == 0)
  				goto this_zone_full;
cd38b115d   Mel Gorman   mm: page allocato...
1607
1608
1609
1610
1611
1612
1613
  			/*
  			 * As we may have just activated ZLC, check if the first
  			 * eligible zone has failed zone_reclaim recently.
  			 */
  			if (NUMA_BUILD && zlc_active &&
  				!zlc_zone_worth_trying(zonelist, z, allowednodes))
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1614
1615
1616
1617
  			ret = zone_reclaim(zone, gfp_mask, order);
  			switch (ret) {
  			case ZONE_RECLAIM_NOSCAN:
  				/* did not scan */
cd38b115d   Mel Gorman   mm: page allocato...
1618
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1619
1620
  			case ZONE_RECLAIM_FULL:
  				/* scanned but unreclaimable */
cd38b115d   Mel Gorman   mm: page allocato...
1621
  				continue;
fa5e084e4   Mel Gorman   vmscan: do not un...
1622
1623
1624
1625
  			default:
  				/* did we reclaim enough */
  				if (!zone_watermark_ok(zone, order, mark,
  						classzone_idx, alloc_flags))
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1626
  					goto this_zone_full;
0798e5193   Paul Jackson   [PATCH] memory pa...
1627
  			}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1628
  		}
fa5e084e4   Mel Gorman   vmscan: do not un...
1629
  try_this_zone:
3dd282669   Mel Gorman   page allocator: c...
1630
1631
  		page = buffered_rmqueue(preferred_zone, zone, order,
  						gfp_mask, migratetype);
0798e5193   Paul Jackson   [PATCH] memory pa...
1632
  		if (page)
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1633
  			break;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1634
1635
1636
  this_zone_full:
  		if (NUMA_BUILD)
  			zlc_mark_zone_full(zonelist, z);
54a6eb5c4   Mel Gorman   mm: use two zonel...
1637
  	}
9276b1bc9   Paul Jackson   [PATCH] memory pa...
1638
1639
1640
1641
1642
1643
  
  	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
  		/* Disable zlc cache for second zonelist scan */
  		zlc_active = 0;
  		goto zonelist_scan;
  	}
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1644
  	return page;
753ee7289   Martin Hicks   [PATCH] VM: early...
1645
  }
29423e77c   David Rientjes   oom: suppress sho...
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
  /*
   * Large machines with many possible nodes should not always dump per-node
   * meminfo in irq context.
   */
  static inline bool should_suppress_show_mem(void)
  {
  	bool ret = false;
  
  #if NODES_SHIFT > 8
  	ret = in_interrupt();
  #endif
  	return ret;
  }
a238ab5b0   Dave Hansen   mm: break out pag...
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
  static DEFINE_RATELIMIT_STATE(nopage_rs,
  		DEFAULT_RATELIMIT_INTERVAL,
  		DEFAULT_RATELIMIT_BURST);
  
  void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
  {
  	va_list args;
  	unsigned int filter = SHOW_MEM_FILTER_NODES;
  
  	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
  		return;
  
  	/*
  	 * This documents exceptions given to allocations in certain
  	 * contexts that are allowed to allocate outside current's set
  	 * of allowed nodes.
  	 */
  	if (!(gfp_mask & __GFP_NOMEMALLOC))
  		if (test_thread_flag(TIF_MEMDIE) ||
  		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
  			filter &= ~SHOW_MEM_FILTER_NODES;
  	if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
  		filter &= ~SHOW_MEM_FILTER_NODES;
  
  	if (fmt) {
  		printk(KERN_WARNING);
  		va_start(args, fmt);
  		vprintk(fmt, args);
  		va_end(args);
  	}
  
  	pr_warning("%s: page allocation failure: order:%d, mode:0x%x
  ",
  		   current->comm, order, gfp_mask);
  
  	dump_stack();
  	if (!should_suppress_show_mem())
  		show_mem(filter);
  }
11e33f6a5   Mel Gorman   page allocator: b...
1698
1699
1700
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
  				unsigned long pages_reclaimed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1701
  {
11e33f6a5   Mel Gorman   page allocator: b...
1702
1703
1704
  	/* Do not loop if specifically requested */
  	if (gfp_mask & __GFP_NORETRY)
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1705

11e33f6a5   Mel Gorman   page allocator: b...
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
  	/*
  	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
  	 * means __GFP_NOFAIL, but that may not be true in other
  	 * implementations.
  	 */
  	if (order <= PAGE_ALLOC_COSTLY_ORDER)
  		return 1;
  
  	/*
  	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
  	 * specified, then we retry until we no longer reclaim any pages
  	 * (above), or we've reclaimed an order of pages at least as
  	 * large as the allocation's order. In both cases, if the
  	 * allocation still fails, we stop retrying.
  	 */
  	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
  		return 1;
cf40bd16f   Nick Piggin   lockdep: annotate...
1723

11e33f6a5   Mel Gorman   page allocator: b...
1724
1725
1726
1727
1728
1729
  	/*
  	 * Don't let big-order allocations loop unless the caller
  	 * explicitly requests that.
  	 */
  	if (gfp_mask & __GFP_NOFAIL)
  		return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1730

11e33f6a5   Mel Gorman   page allocator: b...
1731
1732
  	return 0;
  }
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1733

11e33f6a5   Mel Gorman   page allocator: b...
1734
1735
1736
  static inline struct page *
  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1737
1738
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1739
1740
1741
1742
  {
  	struct page *page;
  
  	/* Acquire the OOM killer lock for the zones in zonelist */
ff321feac   Minchan Kim   mm: rename try_se...
1743
  	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
11e33f6a5   Mel Gorman   page allocator: b...
1744
  		schedule_timeout_uninterruptible(1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1745
1746
  		return NULL;
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
1747

11e33f6a5   Mel Gorman   page allocator: b...
1748
1749
1750
1751
1752
1753
1754
  	/*
  	 * Go through the zonelist yet one more time, keep very high watermark
  	 * here, this is only to catch a parallel oom killing, we must fail if
  	 * we're still under heavy pressure.
  	 */
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  		order, zonelist, high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1755
  		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
1756
  		preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1757
  	if (page)
11e33f6a5   Mel Gorman   page allocator: b...
1758
  		goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1759
1760
1761
1762
  	if (!(gfp_mask & __GFP_NOFAIL)) {
  		/* The OOM killer will not help higher order allocs */
  		if (order > PAGE_ALLOC_COSTLY_ORDER)
  			goto out;
03668b3ce   David Rientjes   oom: avoid oom ki...
1763
1764
1765
  		/* The OOM killer does not needlessly kill tasks for lowmem */
  		if (high_zoneidx < ZONE_NORMAL)
  			goto out;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
  		/*
  		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
  		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
  		 * The caller should handle page allocation failure by itself if
  		 * it specifies __GFP_THISNODE.
  		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
  		 */
  		if (gfp_mask & __GFP_THISNODE)
  			goto out;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1776
  	/* Exhausted what can be done so it's blamo time */
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
1777
  	out_of_memory(zonelist, gfp_mask, order, nodemask);
11e33f6a5   Mel Gorman   page allocator: b...
1778
1779
1780
1781
1782
  
  out:
  	clear_zonelist_oom(zonelist, gfp_mask);
  	return page;
  }
56de7263f   Mel Gorman   mm: compaction: d...
1783
1784
1785
1786
1787
1788
  #ifdef CONFIG_COMPACTION
  /* Try memory compaction for high-order allocations before reclaim */
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
77f1fe6b0   Mel Gorman   mm: migration: al...
1789
1790
  	int migratetype, unsigned long *did_some_progress,
  	bool sync_migration)
56de7263f   Mel Gorman   mm: compaction: d...
1791
1792
  {
  	struct page *page;
4f92e2586   Mel Gorman   mm: compaction: d...
1793
  	if (!order || compaction_deferred(preferred_zone))
56de7263f   Mel Gorman   mm: compaction: d...
1794
  		return NULL;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1795
  	current->flags |= PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
1796
  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
77f1fe6b0   Mel Gorman   mm: migration: al...
1797
  						nodemask, sync_migration);
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1798
  	current->flags &= ~PF_MEMALLOC;
56de7263f   Mel Gorman   mm: compaction: d...
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
  	if (*did_some_progress != COMPACT_SKIPPED) {
  
  		/* Page migration frees to the PCP lists but we want merging */
  		drain_pages(get_cpu());
  		put_cpu();
  
  		page = get_page_from_freelist(gfp_mask, nodemask,
  				order, zonelist, high_zoneidx,
  				alloc_flags, preferred_zone,
  				migratetype);
  		if (page) {
4f92e2586   Mel Gorman   mm: compaction: d...
1810
1811
  			preferred_zone->compact_considered = 0;
  			preferred_zone->compact_defer_shift = 0;
56de7263f   Mel Gorman   mm: compaction: d...
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
  			count_vm_event(COMPACTSUCCESS);
  			return page;
  		}
  
  		/*
  		 * It's bad if compaction run occurs and fails.
  		 * The most likely reason is that pages exist,
  		 * but not enough to satisfy watermarks.
  		 */
  		count_vm_event(COMPACTFAIL);
4f92e2586   Mel Gorman   mm: compaction: d...
1822
  		defer_compaction(preferred_zone);
56de7263f   Mel Gorman   mm: compaction: d...
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
  
  		cond_resched();
  	}
  
  	return NULL;
  }
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
77f1fe6b0   Mel Gorman   mm: migration: al...
1834
1835
  	int migratetype, unsigned long *did_some_progress,
  	bool sync_migration)
56de7263f   Mel Gorman   mm: compaction: d...
1836
1837
1838
1839
  {
  	return NULL;
  }
  #endif /* CONFIG_COMPACTION */
11e33f6a5   Mel Gorman   page allocator: b...
1840
1841
1842
1843
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
5117f45d1   Mel Gorman   page allocator: c...
1844
  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
1845
  	int migratetype, unsigned long *did_some_progress)
11e33f6a5   Mel Gorman   page allocator: b...
1846
1847
1848
  {
  	struct page *page = NULL;
  	struct reclaim_state reclaim_state;
9ee493ce0   Mel Gorman   mm: page allocato...
1849
  	bool drained = false;
11e33f6a5   Mel Gorman   page allocator: b...
1850
1851
1852
1853
1854
  
  	cond_resched();
  
  	/* We now go into synchronous reclaim */
  	cpuset_memory_pressure_bump();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1855
  	current->flags |= PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
1856
1857
  	lockdep_set_current_reclaim_state(gfp_mask);
  	reclaim_state.reclaimed_slab = 0;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1858
  	current->reclaim_state = &reclaim_state;
11e33f6a5   Mel Gorman   page allocator: b...
1859
1860
  
  	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1861
  	current->reclaim_state = NULL;
11e33f6a5   Mel Gorman   page allocator: b...
1862
  	lockdep_clear_current_reclaim_state();
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1863
  	current->flags &= ~PF_MEMALLOC;
11e33f6a5   Mel Gorman   page allocator: b...
1864
1865
  
  	cond_resched();
9ee493ce0   Mel Gorman   mm: page allocato...
1866
1867
  	if (unlikely(!(*did_some_progress)))
  		return NULL;
11e33f6a5   Mel Gorman   page allocator: b...
1868

76d3fbf8f   Mel Gorman   mm: page allocato...
1869
1870
1871
  	/* After successful reclaim, reconsider all zones for allocation */
  	if (NUMA_BUILD)
  		zlc_clear_zones_full(zonelist);
9ee493ce0   Mel Gorman   mm: page allocato...
1872
1873
  retry:
  	page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
1874
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1875
1876
  					alloc_flags, preferred_zone,
  					migratetype);
9ee493ce0   Mel Gorman   mm: page allocato...
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
  
  	/*
  	 * If an allocation failed after direct reclaim, it could be because
  	 * pages are pinned on the per-cpu lists. Drain them and try again
  	 */
  	if (!page && !drained) {
  		drain_all_pages();
  		drained = true;
  		goto retry;
  	}
11e33f6a5   Mel Gorman   page allocator: b...
1887
1888
  	return page;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1889
  /*
11e33f6a5   Mel Gorman   page allocator: b...
1890
1891
   * This is called in the allocator slow-path if the allocation request is of
   * sufficient urgency to ignore watermarks and take other desperate measures
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1892
   */
11e33f6a5   Mel Gorman   page allocator: b...
1893
1894
1895
  static inline struct page *
  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1896
1897
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1898
1899
1900
1901
1902
  {
  	struct page *page;
  
  	do {
  		page = get_page_from_freelist(gfp_mask, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
1903
  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
3dd282669   Mel Gorman   page allocator: c...
1904
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
1905
1906
  
  		if (!page && gfp_mask & __GFP_NOFAIL)
0e093d997   Mel Gorman   writeback: do not...
1907
  			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
11e33f6a5   Mel Gorman   page allocator: b...
1908
1909
1910
1911
1912
1913
1914
  	} while (!page && (gfp_mask & __GFP_NOFAIL));
  
  	return page;
  }
  
  static inline
  void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
995047488   Mel Gorman   mm: kswapd: stop ...
1915
1916
  						enum zone_type high_zoneidx,
  						enum zone_type classzone_idx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1917
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
1918
1919
  	struct zoneref *z;
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1920

11e33f6a5   Mel Gorman   page allocator: b...
1921
  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
995047488   Mel Gorman   mm: kswapd: stop ...
1922
  		wakeup_kswapd(zone, order, classzone_idx);
11e33f6a5   Mel Gorman   page allocator: b...
1923
  }
cf40bd16f   Nick Piggin   lockdep: annotate...
1924

341ce06f6   Peter Zijlstra   page allocator: c...
1925
1926
1927
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
341ce06f6   Peter Zijlstra   page allocator: c...
1928
1929
  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1930

a56f57ff9   Mel Gorman   page allocator: r...
1931
  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
e6223a3b1   Namhyung Kim   mm: add casts to/...
1932
  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
933e312e7   Akinobu Mita   [PATCH] fault-inj...
1933

341ce06f6   Peter Zijlstra   page allocator: c...
1934
1935
1936
1937
1938
1939
  	/*
  	 * The caller may dip into page reserves a bit more if the caller
  	 * cannot run direct reclaim, or if the caller has realtime scheduling
  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
  	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
  	 */
e6223a3b1   Namhyung Kim   mm: add casts to/...
1940
  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1941

341ce06f6   Peter Zijlstra   page allocator: c...
1942
  	if (!wait) {
5c3240d92   Andrea Arcangeli   thp: don't alloc ...
1943
1944
1945
1946
1947
1948
  		/*
  		 * Not worth trying to allocate harder for
  		 * __GFP_NOMEMALLOC even if it can't schedule.
  		 */
  		if  (!(gfp_mask & __GFP_NOMEMALLOC))
  			alloc_flags |= ALLOC_HARDER;
523b94585   Christoph Lameter   Memoryless nodes:...
1949
  		/*
341ce06f6   Peter Zijlstra   page allocator: c...
1950
1951
  		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
523b94585   Christoph Lameter   Memoryless nodes:...
1952
  		 */
341ce06f6   Peter Zijlstra   page allocator: c...
1953
  		alloc_flags &= ~ALLOC_CPUSET;
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1954
  	} else if (unlikely(rt_task(current)) && !in_interrupt())
341ce06f6   Peter Zijlstra   page allocator: c...
1955
1956
1957
1958
  		alloc_flags |= ALLOC_HARDER;
  
  	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
  		if (!in_interrupt() &&
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
1959
  		    ((current->flags & PF_MEMALLOC) ||
341ce06f6   Peter Zijlstra   page allocator: c...
1960
1961
  		     unlikely(test_thread_flag(TIF_MEMDIE))))
  			alloc_flags |= ALLOC_NO_WATERMARKS;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1962
  	}
6b1de9161   Jens Axboe   [PATCH] VM: fix z...
1963

341ce06f6   Peter Zijlstra   page allocator: c...
1964
1965
  	return alloc_flags;
  }
11e33f6a5   Mel Gorman   page allocator: b...
1966
1967
1968
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  	struct zonelist *zonelist, enum zone_type high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
1969
1970
  	nodemask_t *nodemask, struct zone *preferred_zone,
  	int migratetype)
11e33f6a5   Mel Gorman   page allocator: b...
1971
1972
1973
1974
1975
1976
  {
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
  	struct page *page = NULL;
  	int alloc_flags;
  	unsigned long pages_reclaimed = 0;
  	unsigned long did_some_progress;
77f1fe6b0   Mel Gorman   mm: migration: al...
1977
  	bool sync_migration = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1978

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
1979
  	/*
72807a74c   Mel Gorman   page allocator: s...
1980
1981
1982
1983
1984
  	 * In the slowpath, we sanity check order to avoid ever trying to
  	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
  	 * be using allocators in order of preference for an area that is
  	 * too large.
  	 */
1fc28b70f   Mel Gorman   page-allocator: a...
1985
1986
  	if (order >= MAX_ORDER) {
  		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
72807a74c   Mel Gorman   page allocator: s...
1987
  		return NULL;
1fc28b70f   Mel Gorman   page-allocator: a...
1988
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1989

952f3b51b   Christoph Lameter   [PATCH] GFP_THISN...
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
  	/*
  	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
  	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
  	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
  	 * using a larger set of nodes after it has established that the
  	 * allowed per node queues are empty and that nodes are
  	 * over allocated.
  	 */
  	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
  		goto nopage;
cc4a68514   Mel Gorman   page allocator: a...
2000
  restart:
32dba98e0   Andrea Arcangeli   thp: _GFP_NO_KSWAPD
2001
2002
  	if (!(gfp_mask & __GFP_NO_KSWAPD))
  		wake_all_kswapd(order, zonelist, high_zoneidx,
995047488   Mel Gorman   mm: kswapd: stop ...
2003
  						zone_idx(preferred_zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2004

9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2005
  	/*
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2006
2007
2008
  	 * OK, we're below the kswapd watermark and have kicked background
  	 * reclaim. Now things get more complex, so set up alloc_flags according
  	 * to how we want to proceed.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2009
  	 */
341ce06f6   Peter Zijlstra   page allocator: c...
2010
  	alloc_flags = gfp_to_alloc_flags(gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2011

f33261d75   David Rientjes   mm: fix deferred ...
2012
2013
2014
2015
2016
2017
2018
  	/*
  	 * Find the true preferred zone if the allocation is unconstrained by
  	 * cpusets.
  	 */
  	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
  		first_zones_zonelist(zonelist, high_zoneidx, NULL,
  					&preferred_zone);
cfa54a0fc   Andrew Barry   mm/page_alloc.c: ...
2019
  rebalance:
341ce06f6   Peter Zijlstra   page allocator: c...
2020
  	/* This is the last chance, in general, before the goto nopage. */
19770b326   Mel Gorman   mm: filter based ...
2021
  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
341ce06f6   Peter Zijlstra   page allocator: c...
2022
2023
  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
  			preferred_zone, migratetype);
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
2024
2025
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2026

11e33f6a5   Mel Gorman   page allocator: b...
2027
  	/* Allocate without watermarks if the context allows */
341ce06f6   Peter Zijlstra   page allocator: c...
2028
2029
2030
2031
2032
2033
  	if (alloc_flags & ALLOC_NO_WATERMARKS) {
  		page = __alloc_pages_high_priority(gfp_mask, order,
  				zonelist, high_zoneidx, nodemask,
  				preferred_zone, migratetype);
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2034
2035
2036
2037
2038
  	}
  
  	/* Atomic allocations - we can't balance anything */
  	if (!wait)
  		goto nopage;
341ce06f6   Peter Zijlstra   page allocator: c...
2039
  	/* Avoid recursion of direct reclaim */
c06b1fca1   Andrew Morton   mm/page_alloc.c: ...
2040
  	if (current->flags & PF_MEMALLOC)
341ce06f6   Peter Zijlstra   page allocator: c...
2041
  		goto nopage;
6583bb64f   David Rientjes   mm: avoid endless...
2042
2043
2044
  	/* Avoid allocations with no watermarks from looping endlessly */
  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  		goto nopage;
77f1fe6b0   Mel Gorman   mm: migration: al...
2045
2046
2047
2048
  	/*
  	 * Try direct compaction. The first pass is asynchronous. Subsequent
  	 * attempts after direct reclaim are synchronous
  	 */
56de7263f   Mel Gorman   mm: compaction: d...
2049
2050
2051
2052
  	page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
77f1fe6b0   Mel Gorman   mm: migration: al...
2053
2054
  					migratetype, &did_some_progress,
  					sync_migration);
56de7263f   Mel Gorman   mm: compaction: d...
2055
2056
  	if (page)
  		goto got_pg;
c6a140bf1   Andrea Arcangeli   mm/compaction: re...
2057
  	sync_migration = true;
56de7263f   Mel Gorman   mm: compaction: d...
2058

11e33f6a5   Mel Gorman   page allocator: b...
2059
2060
2061
2062
  	/* Try direct reclaim and then allocating */
  	page = __alloc_pages_direct_reclaim(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
5117f45d1   Mel Gorman   page allocator: c...
2063
  					alloc_flags, preferred_zone,
3dd282669   Mel Gorman   page allocator: c...
2064
  					migratetype, &did_some_progress);
11e33f6a5   Mel Gorman   page allocator: b...
2065
2066
  	if (page)
  		goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2067

e33c3b5e1   David Rientjes   cpusets: update m...
2068
  	/*
11e33f6a5   Mel Gorman   page allocator: b...
2069
2070
  	 * If we failed to make any progress reclaiming, then we are
  	 * running out of options and have to consider going OOM
e33c3b5e1   David Rientjes   cpusets: update m...
2071
  	 */
11e33f6a5   Mel Gorman   page allocator: b...
2072
2073
  	if (!did_some_progress) {
  		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
7f33d49a2   Rafael J. Wysocki   mm, PM/Freezer: D...
2074
2075
  			if (oom_killer_disabled)
  				goto nopage;
11e33f6a5   Mel Gorman   page allocator: b...
2076
2077
  			page = __alloc_pages_may_oom(gfp_mask, order,
  					zonelist, high_zoneidx,
3dd282669   Mel Gorman   page allocator: c...
2078
2079
  					nodemask, preferred_zone,
  					migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2080
2081
  			if (page)
  				goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2082

03668b3ce   David Rientjes   oom: avoid oom ki...
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
  			if (!(gfp_mask & __GFP_NOFAIL)) {
  				/*
  				 * The oom killer is not called for high-order
  				 * allocations that may fail, so if no progress
  				 * is being made, there are no other options and
  				 * retrying is unlikely to help.
  				 */
  				if (order > PAGE_ALLOC_COSTLY_ORDER)
  					goto nopage;
  				/*
  				 * The oom killer is not called for lowmem
  				 * allocations to prevent needlessly killing
  				 * innocent tasks.
  				 */
  				if (high_zoneidx < ZONE_NORMAL)
  					goto nopage;
  			}
e2c55dc87   Mel Gorman   Drain per-cpu lis...
2100

ff0ceb9de   David Rientjes   oom: serialize ou...
2101
2102
  			goto restart;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2103
  	}
11e33f6a5   Mel Gorman   page allocator: b...
2104
  	/* Check if we should retry the allocation */
a41f24ea9   Nishanth Aravamudan   page allocator: s...
2105
  	pages_reclaimed += did_some_progress;
11e33f6a5   Mel Gorman   page allocator: b...
2106
2107
  	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
  		/* Wait for some write requests to complete then retry */
0e093d997   Mel Gorman   writeback: do not...
2108
  		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2109
  		goto rebalance;
3e7d34497   Mel Gorman   mm: vmscan: recla...
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
  	} else {
  		/*
  		 * High-order allocations do not necessarily loop after
  		 * direct reclaim and reclaim/compaction depends on compaction
  		 * being called after reclaim so call directly if necessary
  		 */
  		page = __alloc_pages_direct_compact(gfp_mask, order,
  					zonelist, high_zoneidx,
  					nodemask,
  					alloc_flags, preferred_zone,
77f1fe6b0   Mel Gorman   mm: migration: al...
2120
2121
  					migratetype, &did_some_progress,
  					sync_migration);
3e7d34497   Mel Gorman   mm: vmscan: recla...
2122
2123
  		if (page)
  			goto got_pg;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124
2125
2126
  	}
  
  nopage:
a238ab5b0   Dave Hansen   mm: break out pag...
2127
  	warn_alloc_failed(gfp_mask, order, NULL);
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2128
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2129
  got_pg:
b1eeab676   Vegard Nossum   kmemcheck: add ho...
2130
2131
  	if (kmemcheck_enabled)
  		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2132
  	return page;
11e33f6a5   Mel Gorman   page allocator: b...
2133

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2134
  }
11e33f6a5   Mel Gorman   page allocator: b...
2135
2136
2137
2138
2139
2140
2141
2142
2143
  
  /*
   * This is the 'heart' of the zoned buddy allocator.
   */
  struct page *
  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  			struct zonelist *zonelist, nodemask_t *nodemask)
  {
  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5117f45d1   Mel Gorman   page allocator: c...
2144
  	struct zone *preferred_zone;
11e33f6a5   Mel Gorman   page allocator: b...
2145
  	struct page *page;
3dd282669   Mel Gorman   page allocator: c...
2146
  	int migratetype = allocflags_to_migratetype(gfp_mask);
11e33f6a5   Mel Gorman   page allocator: b...
2147

dcce284a2   Benjamin Herrenschmidt   mm: Extend gfp ma...
2148
  	gfp_mask &= gfp_allowed_mask;
11e33f6a5   Mel Gorman   page allocator: b...
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
  	lockdep_trace_alloc(gfp_mask);
  
  	might_sleep_if(gfp_mask & __GFP_WAIT);
  
  	if (should_fail_alloc_page(gfp_mask, order))
  		return NULL;
  
  	/*
  	 * Check the zones suitable for the gfp_mask contain at least one
  	 * valid zone. It's possible to have an empty zonelist as a result
  	 * of GFP_THISNODE and a memoryless node
  	 */
  	if (unlikely(!zonelist->_zonerefs->zone))
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2163
  	get_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2164
  	/* The preferred zone is used for statistics later */
f33261d75   David Rientjes   mm: fix deferred ...
2165
2166
2167
  	first_zones_zonelist(zonelist, high_zoneidx,
  				nodemask ? : &cpuset_current_mems_allowed,
  				&preferred_zone);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2168
2169
  	if (!preferred_zone) {
  		put_mems_allowed();
5117f45d1   Mel Gorman   page allocator: c...
2170
  		return NULL;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2171
  	}
5117f45d1   Mel Gorman   page allocator: c...
2172
2173
  
  	/* First allocation attempt */
11e33f6a5   Mel Gorman   page allocator: b...
2174
  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
5117f45d1   Mel Gorman   page allocator: c...
2175
  			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
3dd282669   Mel Gorman   page allocator: c...
2176
  			preferred_zone, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2177
2178
  	if (unlikely(!page))
  		page = __alloc_pages_slowpath(gfp_mask, order,
5117f45d1   Mel Gorman   page allocator: c...
2179
  				zonelist, high_zoneidx, nodemask,
3dd282669   Mel Gorman   page allocator: c...
2180
  				preferred_zone, migratetype);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2181
  	put_mems_allowed();
11e33f6a5   Mel Gorman   page allocator: b...
2182

4b4f278c0   Mel Gorman   tracing, page-all...
2183
  	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
11e33f6a5   Mel Gorman   page allocator: b...
2184
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2185
  }
d239171e4   Mel Gorman   page allocator: r...
2186
  EXPORT_SYMBOL(__alloc_pages_nodemask);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2187
2188
2189
2190
  
  /*
   * Common helper functions.
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
2191
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2192
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2193
2194
2195
2196
2197
2198
2199
  	struct page *page;
  
  	/*
  	 * __get_free_pages() returns a 32-bit address, which cannot represent
  	 * a highmem page
  	 */
  	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2200
2201
2202
2203
2204
  	page = alloc_pages(gfp_mask, order);
  	if (!page)
  		return 0;
  	return (unsigned long) page_address(page);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2205
  EXPORT_SYMBOL(__get_free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2206
  unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2207
  {
945a11136   Akinobu Mita   mm: add gfp mask ...
2208
  	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2209
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2210
2211
2212
2213
2214
  EXPORT_SYMBOL(get_zeroed_page);
  
  void __pagevec_free(struct pagevec *pvec)
  {
  	int i = pagevec_count(pvec);
4b4f278c0   Mel Gorman   tracing, page-all...
2215
2216
  	while (--i >= 0) {
  		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2217
  		free_hot_cold_page(pvec->pages[i], pvec->cold);
4b4f278c0   Mel Gorman   tracing, page-all...
2218
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2219
  }
920c7a5d0   Harvey Harrison   mm: remove fastca...
2220
  void __free_pages(struct page *page, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2221
  {
b5810039a   Nick Piggin   [PATCH] core remo...
2222
  	if (put_page_testzero(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2223
  		if (order == 0)
fc91668ea   Li Hong   mm: remove free_h...
2224
  			free_hot_cold_page(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2225
2226
2227
2228
2229
2230
  		else
  			__free_pages_ok(page, order);
  	}
  }
  
  EXPORT_SYMBOL(__free_pages);
920c7a5d0   Harvey Harrison   mm: remove fastca...
2231
  void free_pages(unsigned long addr, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2232
2233
  {
  	if (addr != 0) {
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
2234
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2235
2236
2237
2238
2239
  		__free_pages(virt_to_page((void *)addr), order);
  	}
  }
  
  EXPORT_SYMBOL(free_pages);
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
  static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
  {
  	if (addr) {
  		unsigned long alloc_end = addr + (PAGE_SIZE << order);
  		unsigned long used = addr + PAGE_ALIGN(size);
  
  		split_page(virt_to_page((void *)addr), order);
  		while (used < alloc_end) {
  			free_page(used);
  			used += PAGE_SIZE;
  		}
  	}
  	return (void *)addr;
  }
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * This function is similar to alloc_pages(), except that it allocates the
   * minimum number of pages to satisfy the request.  alloc_pages() can only
   * allocate memory in power-of-two pages.
   *
   * This function is also limited by MAX_ORDER.
   *
   * Memory allocated by this function must be released by free_pages_exact().
   */
  void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
  {
  	unsigned int order = get_order(size);
  	unsigned long addr;
  
  	addr = __get_free_pages(gfp_mask, order);
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2273
  	return make_alloc_exact(addr, order, size);
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2274
2275
2276
2277
  }
  EXPORT_SYMBOL(alloc_pages_exact);
  
  /**
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2278
2279
   * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
   *			   pages on a node.
b5e6ab589   Randy Dunlap   mm: fix kernel-do...
2280
   * @nid: the preferred node ID where memory should be allocated
ee85c2e14   Andi Kleen   mm: add alloc_pag...
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
   * @size: the number of bytes to allocate
   * @gfp_mask: GFP flags for the allocation
   *
   * Like alloc_pages_exact(), but try to allocate on node nid first before falling
   * back.
   * Note this is not alloc_pages_exact_node() which allocates on a specific node,
   * but is not exact.
   */
  void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
  	unsigned order = get_order(size);
  	struct page *p = alloc_pages_node(nid, gfp_mask, order);
  	if (!p)
  		return NULL;
  	return make_alloc_exact((unsigned long)page_address(p), order, size);
  }
  EXPORT_SYMBOL(alloc_pages_exact_nid);
  
  /**
2be0ffe2b   Timur Tabi   mm: add alloc_pag...
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
   * @size: size of allocation, same value as passed to alloc_pages_exact().
   *
   * Release the memory allocated by a previous call to alloc_pages_exact.
   */
  void free_pages_exact(void *virt, size_t size)
  {
  	unsigned long addr = (unsigned long)virt;
  	unsigned long end = addr + PAGE_ALIGN(size);
  
  	while (addr < end) {
  		free_page(addr);
  		addr += PAGE_SIZE;
  	}
  }
  EXPORT_SYMBOL(free_pages_exact);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2317
2318
  static unsigned int nr_free_zone_pages(int offset)
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
2319
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2320
  	struct zone *zone;
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2321
  	/* Just pick one node, since fallback list is circular */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2322
  	unsigned int sum = 0;
0e88460da   Mel Gorman   mm: introduce nod...
2323
  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2324

54a6eb5c4   Mel Gorman   mm: use two zonel...
2325
  	for_each_zone_zonelist(zone, z, zonelist, offset) {
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2326
  		unsigned long size = zone->present_pages;
418589663   Mel Gorman   page allocator: u...
2327
  		unsigned long high = high_wmark_pages(zone);
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
2328
2329
  		if (size > high)
  			sum += size - high;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
  	}
  
  	return sum;
  }
  
  /*
   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
   */
  unsigned int nr_free_buffer_pages(void)
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
2340
  	return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2341
  }
c2f1a551d   Meelap Shah   knfsd: nfsd4: var...
2342
  EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2343
2344
2345
2346
2347
2348
  
  /*
   * Amount of free RAM allocatable within all zones
   */
  unsigned int nr_free_pagecache_pages(void)
  {
2a1e274ac   Mel Gorman   Create the ZONE_M...
2349
  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2350
  }
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2351
2352
  
  static inline void show_node(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2353
  {
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
2354
  	if (NUMA_BUILD)
25ba77c14   Andy Whitcroft   [PATCH] numa node...
2355
  		printk("Node %d ", zone_to_nid(zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2356
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2357

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2358
2359
2360
2361
  void si_meminfo(struct sysinfo *val)
  {
  	val->totalram = totalram_pages;
  	val->sharedram = 0;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2362
  	val->freeram = global_page_state(NR_FREE_PAGES);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2363
  	val->bufferram = nr_blockdev_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2364
2365
  	val->totalhigh = totalhigh_pages;
  	val->freehigh = nr_free_highpages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
  	val->mem_unit = PAGE_SIZE;
  }
  
  EXPORT_SYMBOL(si_meminfo);
  
  #ifdef CONFIG_NUMA
  void si_meminfo_node(struct sysinfo *val, int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  
  	val->totalram = pgdat->node_present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2377
  	val->freeram = node_page_state(nid, NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2378
  #ifdef CONFIG_HIGHMEM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2379
  	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2380
2381
  	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
  			NR_FREE_PAGES);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2382
2383
2384
2385
  #else
  	val->totalhigh = 0;
  	val->freehigh = 0;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2386
2387
2388
  	val->mem_unit = PAGE_SIZE;
  }
  #endif
ddd588b5d   David Rientjes   oom: suppress nod...
2389
  /*
7bf02ea22   David Rientjes   arch, mm: filter ...
2390
2391
   * Determine whether the node should be displayed or not, depending on whether
   * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
ddd588b5d   David Rientjes   oom: suppress nod...
2392
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2393
  bool skip_free_areas_node(unsigned int flags, int nid)
ddd588b5d   David Rientjes   oom: suppress nod...
2394
2395
2396
2397
2398
2399
2400
  {
  	bool ret = false;
  
  	if (!(flags & SHOW_MEM_FILTER_NODES))
  		goto out;
  
  	get_mems_allowed();
7bf02ea22   David Rientjes   arch, mm: filter ...
2401
  	ret = !node_isset(nid, cpuset_current_mems_allowed);
ddd588b5d   David Rientjes   oom: suppress nod...
2402
2403
2404
2405
  	put_mems_allowed();
  out:
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2406
2407
2408
2409
2410
2411
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
ddd588b5d   David Rientjes   oom: suppress nod...
2412
2413
   * Suppresses nodes that are not allowed by current's cpuset if
   * SHOW_MEM_FILTER_NODES is passed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2414
   */
7bf02ea22   David Rientjes   arch, mm: filter ...
2415
  void show_free_areas(unsigned int filter)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2416
  {
c72419138   Jes Sorensen   [PATCH] Condense ...
2417
  	int cpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2418
  	struct zone *zone;
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2419
  	for_each_populated_zone(zone) {
7bf02ea22   David Rientjes   arch, mm: filter ...
2420
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2421
  			continue;
c72419138   Jes Sorensen   [PATCH] Condense ...
2422
2423
2424
  		show_node(zone);
  		printk("%s per-cpu:
  ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2425

6b482c677   Dave Jones   [PATCH] Don't pri...
2426
  		for_each_online_cpu(cpu) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2427
  			struct per_cpu_pageset *pageset;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
2428
  			pageset = per_cpu_ptr(zone->pageset, cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2429

3dfa5721f   Christoph Lameter   Page allocator: g...
2430
2431
2432
2433
  			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d
  ",
  			       cpu, pageset->pcp.high,
  			       pageset->pcp.batch, pageset->pcp.count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2434
2435
  		}
  	}
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2436
2437
2438
2439
  	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu
  "
  		" active_file:%lu inactive_file:%lu isolated_file:%lu
  "
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2440
  		" unevictable:%lu"
b76146ed1   Andrew Morton   revert "mm: oom a...
2441
2442
  		" dirty:%lu writeback:%lu unstable:%lu
  "
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2443
2444
  		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu
  "
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2445
2446
  		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu
  ",
4f98a2fee   Rik van Riel   vmscan: split LRU...
2447
  		global_page_state(NR_ACTIVE_ANON),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2448
  		global_page_state(NR_INACTIVE_ANON),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2449
2450
  		global_page_state(NR_ISOLATED_ANON),
  		global_page_state(NR_ACTIVE_FILE),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2451
  		global_page_state(NR_INACTIVE_FILE),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2452
  		global_page_state(NR_ISOLATED_FILE),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2453
  		global_page_state(NR_UNEVICTABLE),
b1e7a8fd8   Christoph Lameter   [PATCH] zoned vm ...
2454
  		global_page_state(NR_FILE_DIRTY),
ce866b34a   Christoph Lameter   [PATCH] zoned vm ...
2455
  		global_page_state(NR_WRITEBACK),
fd39fc856   Christoph Lameter   [PATCH] zoned vm ...
2456
  		global_page_state(NR_UNSTABLE_NFS),
d23ad4232   Christoph Lameter   [PATCH] Use ZVC f...
2457
  		global_page_state(NR_FREE_PAGES),
3701b0332   KOSAKI Motohiro   mm: show_free_are...
2458
2459
  		global_page_state(NR_SLAB_RECLAIMABLE),
  		global_page_state(NR_SLAB_UNRECLAIMABLE),
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
2460
  		global_page_state(NR_FILE_MAPPED),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2461
  		global_page_state(NR_SHMEM),
a25700a53   Andrew Morton   [PATCH] mm: show ...
2462
2463
  		global_page_state(NR_PAGETABLE),
  		global_page_state(NR_BOUNCE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2464

ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2465
  	for_each_populated_zone(zone) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2466
  		int i;
7bf02ea22   David Rientjes   arch, mm: filter ...
2467
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2468
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2469
2470
2471
2472
2473
2474
  		show_node(zone);
  		printk("%s"
  			" free:%lukB"
  			" min:%lukB"
  			" low:%lukB"
  			" high:%lukB"
4f98a2fee   Rik van Riel   vmscan: split LRU...
2475
2476
2477
2478
  			" active_anon:%lukB"
  			" inactive_anon:%lukB"
  			" active_file:%lukB"
  			" inactive_file:%lukB"
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2479
  			" unevictable:%lukB"
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2480
2481
  			" isolated(anon):%lukB"
  			" isolated(file):%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2482
  			" present:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2483
2484
2485
2486
  			" mlocked:%lukB"
  			" dirty:%lukB"
  			" writeback:%lukB"
  			" mapped:%lukB"
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2487
  			" shmem:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2488
2489
  			" slab_reclaimable:%lukB"
  			" slab_unreclaimable:%lukB"
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2490
  			" kernel_stack:%lukB"
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2491
2492
2493
2494
  			" pagetables:%lukB"
  			" unstable:%lukB"
  			" bounce:%lukB"
  			" writeback_tmp:%lukB"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2495
2496
2497
2498
2499
  			" pages_scanned:%lu"
  			" all_unreclaimable? %s"
  			"
  ",
  			zone->name,
88f5acf88   Mel Gorman   mm: page allocato...
2500
  			K(zone_page_state(zone, NR_FREE_PAGES)),
418589663   Mel Gorman   page allocator: u...
2501
2502
2503
  			K(min_wmark_pages(zone)),
  			K(low_wmark_pages(zone)),
  			K(high_wmark_pages(zone)),
4f98a2fee   Rik van Riel   vmscan: split LRU...
2504
2505
2506
2507
  			K(zone_page_state(zone, NR_ACTIVE_ANON)),
  			K(zone_page_state(zone, NR_INACTIVE_ANON)),
  			K(zone_page_state(zone, NR_ACTIVE_FILE)),
  			K(zone_page_state(zone, NR_INACTIVE_FILE)),
7b854121e   Lee Schermerhorn   Unevictable LRU P...
2508
  			K(zone_page_state(zone, NR_UNEVICTABLE)),
a731286de   KOSAKI Motohiro   mm: vmstat: add i...
2509
2510
  			K(zone_page_state(zone, NR_ISOLATED_ANON)),
  			K(zone_page_state(zone, NR_ISOLATED_FILE)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2511
  			K(zone->present_pages),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2512
2513
2514
2515
  			K(zone_page_state(zone, NR_MLOCK)),
  			K(zone_page_state(zone, NR_FILE_DIRTY)),
  			K(zone_page_state(zone, NR_WRITEBACK)),
  			K(zone_page_state(zone, NR_FILE_MAPPED)),
4b02108ac   KOSAKI Motohiro   mm: oom analysis:...
2516
  			K(zone_page_state(zone, NR_SHMEM)),
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2517
2518
  			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
  			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
2519
2520
  			zone_page_state(zone, NR_KERNEL_STACK) *
  				THREAD_SIZE / 1024,
4a0aa73f1   KOSAKI Motohiro   mm: oom analysis:...
2521
2522
2523
2524
  			K(zone_page_state(zone, NR_PAGETABLE)),
  			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
  			K(zone_page_state(zone, NR_BOUNCE)),
  			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2525
  			zone->pages_scanned,
93e4a89a8   KOSAKI Motohiro   mm: restore zone-...
2526
  			(zone->all_unreclaimable ? "yes" : "no")
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2527
2528
2529
2530
2531
2532
2533
  			);
  		printk("lowmem_reserve[]:");
  		for (i = 0; i < MAX_NR_ZONES; i++)
  			printk(" %lu", zone->lowmem_reserve[i]);
  		printk("
  ");
  	}
ee99c71c5   KOSAKI Motohiro   mm: introduce for...
2534
  	for_each_populated_zone(zone) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2535
   		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2536

7bf02ea22   David Rientjes   arch, mm: filter ...
2537
  		if (skip_free_areas_node(filter, zone_to_nid(zone)))
ddd588b5d   David Rientjes   oom: suppress nod...
2538
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2539
2540
  		show_node(zone);
  		printk("%s: ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2541
2542
2543
  
  		spin_lock_irqsave(&zone->lock, flags);
  		for (order = 0; order < MAX_ORDER; order++) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2544
2545
  			nr[order] = zone->free_area[order].nr_free;
  			total += nr[order] << order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2546
2547
  		}
  		spin_unlock_irqrestore(&zone->lock, flags);
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
2548
2549
  		for (order = 0; order < MAX_ORDER; order++)
  			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2550
2551
2552
  		printk("= %lukB
  ", K(total));
  	}
e6f3602d2   Larry Woodman   Include count of ...
2553
2554
  	printk("%ld total pagecache pages
  ", global_page_state(NR_FILE_PAGES));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2555
2556
  	show_swap_cache_info();
  }
19770b326   Mel Gorman   mm: filter based ...
2557
2558
2559
2560
2561
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
  	zoneref->zone = zone;
  	zoneref->zone_idx = zone_idx(zone);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2562
2563
  /*
   * Builds allocation fallback zone lists.
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2564
2565
   *
   * Add all populated zones of a node to the zonelist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2566
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2567
2568
  static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
  				int nr_zones, enum zone_type zone_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2569
  {
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2570
  	struct zone *zone;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2571
  	BUG_ON(zone_type >= MAX_NR_ZONES);
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2572
  	zone_type++;
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2573
2574
  
  	do {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2575
  		zone_type--;
070f80326   Christoph Lameter   [PATCH] build_zon...
2576
  		zone = pgdat->node_zones + zone_type;
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
2577
  		if (populated_zone(zone)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2578
2579
  			zoneref_set_zone(zone,
  				&zonelist->_zonerefs[nr_zones++]);
070f80326   Christoph Lameter   [PATCH] build_zon...
2580
  			check_highest_zone(zone_type);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2581
  		}
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
2582

2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2583
  	} while (zone_type);
070f80326   Christoph Lameter   [PATCH] build_zon...
2584
  	return nr_zones;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2585
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
  
  /*
   *  zonelist_order:
   *  0 = automatic detection of better ordering.
   *  1 = order by ([node] distance, -zonetype)
   *  2 = order by (-zonetype, [node] distance)
   *
   *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
   *  the same zonelist. So only NUMA can configure this param.
   */
  #define ZONELIST_ORDER_DEFAULT  0
  #define ZONELIST_ORDER_NODE     1
  #define ZONELIST_ORDER_ZONE     2
  
  /* zonelist order in the kernel.
   * set_zonelist_order() will set this to NODE or ZONE.
   */
  static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
  static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2605
  #ifdef CONFIG_NUMA
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
  /* The value user specified ....changed by config */
  static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  /* string for sysctl */
  #define NUMA_ZONELIST_ORDER_LEN	16
  char numa_zonelist_order[16] = "default";
  
  /*
   * interface for configure zonelist ordering.
   * command line option "numa_zonelist_order"
   *	= "[dD]efault	- default, automatic configuration.
   *	= "[nN]ode 	- order by node locality, then by zone within node
   *	= "[zZ]one      - order by zone, then by locality within zone
   */
  
  static int __parse_numa_zonelist_order(char *s)
  {
  	if (*s == 'd' || *s == 'D') {
  		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
  	} else if (*s == 'n' || *s == 'N') {
  		user_zonelist_order = ZONELIST_ORDER_NODE;
  	} else if (*s == 'z' || *s == 'Z') {
  		user_zonelist_order = ZONELIST_ORDER_ZONE;
  	} else {
  		printk(KERN_WARNING
  			"Ignoring invalid numa_zonelist_order value:  "
  			"%s
  ", s);
  		return -EINVAL;
  	}
  	return 0;
  }
  
  static __init int setup_numa_zonelist_order(char *s)
  {
ecb256f81   Volodymyr G. Lukiianyk   mm: set correct n...
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
  	int ret;
  
  	if (!s)
  		return 0;
  
  	ret = __parse_numa_zonelist_order(s);
  	if (ret == 0)
  		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
  
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2650
2651
2652
2653
2654
2655
2656
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
  /*
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2657
  		void __user *buffer, size_t *length,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2658
2659
2660
2661
  		loff_t *ppos)
  {
  	char saved_string[NUMA_ZONELIST_ORDER_LEN];
  	int ret;
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2662
  	static DEFINE_MUTEX(zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2663

443c6f145   Andi Kleen   SYSCTL: Add a mut...
2664
  	mutex_lock(&zl_order_mutex);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2665
  	if (write)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2666
  		strcpy(saved_string, (char*)table->data);
8d65af789   Alexey Dobriyan   sysctl: remove "s...
2667
  	ret = proc_dostring(table, write, buffer, length, ppos);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2668
  	if (ret)
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2669
  		goto out;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2670
2671
2672
2673
2674
2675
2676
2677
2678
  	if (write) {
  		int oldval = user_zonelist_order;
  		if (__parse_numa_zonelist_order((char*)table->data)) {
  			/*
  			 * bogus value.  restore saved string
  			 */
  			strncpy((char*)table->data, saved_string,
  				NUMA_ZONELIST_ORDER_LEN);
  			user_zonelist_order = oldval;
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2679
2680
  		} else if (oldval != user_zonelist_order) {
  			mutex_lock(&zonelists_mutex);
1f522509c   Haicheng Li   mem-hotplug: avoi...
2681
  			build_all_zonelists(NULL);
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
2682
2683
  			mutex_unlock(&zonelists_mutex);
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2684
  	}
443c6f145   Andi Kleen   SYSCTL: Add a mut...
2685
2686
2687
  out:
  	mutex_unlock(&zl_order_mutex);
  	return ret;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2688
  }
62bc62a87   Christoph Lameter   page allocator: u...
2689
  #define MAX_NODE_LOAD (nr_online_nodes)
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2690
  static int node_load[MAX_NUMNODES];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2691
  /**
4dc3b16ba   Pavel Pisa   [PATCH] DocBook: ...
2692
   * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
   * @node: node whose fallback list we're appending
   * @used_node_mask: nodemask_t of already used nodes
   *
   * We use a number of factors to determine which is the next node that should
   * appear on a given node's fallback list.  The node should not have appeared
   * already in @node's fallback list, and it should be the next closest node
   * according to the distance array (which contains arbitrary distance values
   * from each node to each node in the system), and should also prefer nodes
   * with no CPUs, since presumably they'll have very little allocation pressure
   * on them otherwise.
   * It returns -1 if no node is found.
   */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2705
  static int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2706
  {
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2707
  	int n, val;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2708
2709
  	int min_val = INT_MAX;
  	int best_node = -1;
a70f73028   Rusty Russell   cpumask: replace ...
2710
  	const struct cpumask *tmp = cpumask_of_node(0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2711

4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2712
2713
2714
2715
2716
  	/* Use the local node if we haven't already */
  	if (!node_isset(node, *used_node_mask)) {
  		node_set(node, *used_node_mask);
  		return node;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2717

37b07e416   Lee Schermerhorn   memoryless nodes:...
2718
  	for_each_node_state(n, N_HIGH_MEMORY) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2719
2720
2721
2722
  
  		/* Don't want a node to appear more than once */
  		if (node_isset(n, *used_node_mask))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2723
2724
  		/* Use the distance array to find the distance */
  		val = node_distance(node, n);
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
2725
2726
  		/* Penalize nodes under us ("prefer the next node") */
  		val += (n < node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2727
  		/* Give preference to headless and unused nodes */
a70f73028   Rusty Russell   cpumask: replace ...
2728
2729
  		tmp = cpumask_of_node(n);
  		if (!cpumask_empty(tmp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
  		val += node_load[n];
  
  		if (val < min_val) {
  			min_val = val;
  			best_node = n;
  		}
  	}
  
  	if (best_node >= 0)
  		node_set(best_node, *used_node_mask);
  
  	return best_node;
  }
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2747
2748
2749
2750
2751
2752
2753
  
  /*
   * Build zonelists ordered by node and zones within node.
   * This results in maximum locality--normal zone overflows into local
   * DMA zone, if any--but risks exhausting DMA zone.
   */
  static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2754
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2755
  	int j;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2756
  	struct zonelist *zonelist;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2757

54a6eb5c4   Mel Gorman   mm: use two zonel...
2758
  	zonelist = &pgdat->node_zonelists[0];
dd1a239f6   Mel Gorman   mm: have zonelist...
2759
  	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
54a6eb5c4   Mel Gorman   mm: use two zonel...
2760
2761
2762
  		;
  	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2763
2764
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2765
2766
2767
  }
  
  /*
523b94585   Christoph Lameter   Memoryless nodes:...
2768
2769
2770
2771
   * Build gfp_thisnode zonelists
   */
  static void build_thisnode_zonelists(pg_data_t *pgdat)
  {
523b94585   Christoph Lameter   Memoryless nodes:...
2772
2773
  	int j;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2774
2775
  	zonelist = &pgdat->node_zonelists[1];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
dd1a239f6   Mel Gorman   mm: have zonelist...
2776
2777
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
523b94585   Christoph Lameter   Memoryless nodes:...
2778
2779
2780
  }
  
  /*
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2781
2782
2783
2784
2785
2786
2787
2788
2789
   * Build zonelists ordered by zone and nodes within zones.
   * This results in conserving DMA zone[s] until all Normal memory is
   * exhausted, but results in overflowing to remote node while memory
   * may still exist in local DMA zone.
   */
  static int node_order[MAX_NUMNODES];
  
  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2790
2791
2792
2793
  	int pos, j, node;
  	int zone_type;		/* needs to be signed */
  	struct zone *z;
  	struct zonelist *zonelist;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2794
2795
2796
2797
2798
2799
2800
  	zonelist = &pgdat->node_zonelists[0];
  	pos = 0;
  	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
  		for (j = 0; j < nr_nodes; j++) {
  			node = node_order[j];
  			z = &NODE_DATA(node)->node_zones[zone_type];
  			if (populated_zone(z)) {
dd1a239f6   Mel Gorman   mm: have zonelist...
2801
2802
  				zoneref_set_zone(z,
  					&zonelist->_zonerefs[pos++]);
54a6eb5c4   Mel Gorman   mm: use two zonel...
2803
  				check_highest_zone(zone_type);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2804
2805
  			}
  		}
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2806
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
2807
2808
  	zonelist->_zonerefs[pos].zone = NULL;
  	zonelist->_zonerefs[pos].zone_idx = 0;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2809
2810
2811
2812
2813
2814
2815
2816
2817
  }
  
  static int default_zonelist_order(void)
  {
  	int nid, zone_type;
  	unsigned long low_kmem_size,total_size;
  	struct zone *z;
  	int average_size;
  	/*
883931612   Thomas Weber   Fix typos in comm...
2818
           * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2819
2820
  	 * If they are really small and used heavily, the system can fall
  	 * into OOM very easily.
e325c90ff   David Rientjes   mm: default to no...
2821
  	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
  	 */
  	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
  	low_kmem_size = 0;
  	total_size = 0;
  	for_each_online_node(nid) {
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
e325c90ff   David Rientjes   mm: default to no...
2833
2834
2835
2836
2837
2838
2839
2840
2841
  			} else if (zone_type == ZONE_NORMAL) {
  				/*
  				 * If any node has only lowmem, then node order
  				 * is preferred to allow kernel allocations
  				 * locally; otherwise, they can easily infringe
  				 * on other nodes when there is an abundance of
  				 * lowmem available to allocate from.
  				 */
  				return ZONELIST_ORDER_NODE;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
  			}
  		}
  	}
  	if (!low_kmem_size ||  /* there are no DMA area. */
  	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
  		return ZONELIST_ORDER_NODE;
  	/*
  	 * look into each node's config.
    	 * If there is a node whose DMA/DMA32 memory is very big area on
   	 * local memory, NODE_ORDER may be suitable.
           */
37b07e416   Lee Schermerhorn   memoryless nodes:...
2853
2854
  	average_size = total_size /
  				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
  	for_each_online_node(nid) {
  		low_kmem_size = 0;
  		total_size = 0;
  		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
  			z = &NODE_DATA(nid)->node_zones[zone_type];
  			if (populated_zone(z)) {
  				if (zone_type < ZONE_NORMAL)
  					low_kmem_size += z->present_pages;
  				total_size += z->present_pages;
  			}
  		}
  		if (low_kmem_size &&
  		    total_size > average_size && /* ignore small node */
  		    low_kmem_size > total_size * 70/100)
  			return ZONELIST_ORDER_NODE;
  	}
  	return ZONELIST_ORDER_ZONE;
  }
  
  static void set_zonelist_order(void)
  {
  	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
  		current_zonelist_order = default_zonelist_order();
  	else
  		current_zonelist_order = user_zonelist_order;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
  {
  	int j, node, load;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2886
  	nodemask_t used_mask;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2887
2888
2889
  	int local_node, prev_node;
  	struct zonelist *zonelist;
  	int order = current_zonelist_order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2890
2891
  
  	/* initialize zonelists */
523b94585   Christoph Lameter   Memoryless nodes:...
2892
  	for (i = 0; i < MAX_ZONELISTS; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2893
  		zonelist = pgdat->node_zonelists + i;
dd1a239f6   Mel Gorman   mm: have zonelist...
2894
2895
  		zonelist->_zonerefs[0].zone = NULL;
  		zonelist->_zonerefs[0].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2896
2897
2898
2899
  	}
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
62bc62a87   Christoph Lameter   page allocator: u...
2900
  	load = nr_online_nodes;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2901
2902
  	prev_node = local_node;
  	nodes_clear(used_mask);
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2903

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2904
2905
  	memset(node_order, 0, sizeof(node_order));
  	j = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2906
  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
2907
2908
2909
2910
2911
2912
2913
2914
  		int distance = node_distance(local_node, node);
  
  		/*
  		 * If another node is sufficiently far away then it is better
  		 * to reclaim pages in a zone before going off node.
  		 */
  		if (distance > RECLAIM_DISTANCE)
  			zone_reclaim_mode = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2915
2916
2917
2918
2919
  		/*
  		 * We don't want to pressure a particular node.
  		 * So adding penalty to the first node in same
  		 * distance group to make it round-robin.
  		 */
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
2920
  		if (distance != node_distance(local_node, prev_node))
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2921
  			node_load[node] = load;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2922
2923
  		prev_node = node;
  		load--;
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2924
2925
2926
2927
2928
  		if (order == ZONELIST_ORDER_NODE)
  			build_zonelists_in_node_order(pgdat, node);
  		else
  			node_order[j++] = node;	/* remember order */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2929

f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2930
2931
2932
  	if (order == ZONELIST_ORDER_ZONE) {
  		/* calculate node order -- i.e., DMA last! */
  		build_zonelists_in_zone_order(pgdat, j);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2933
  	}
523b94585   Christoph Lameter   Memoryless nodes:...
2934
2935
  
  	build_thisnode_zonelists(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2936
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2937
  /* Construct the zonelist performance cache - see further mmzone.h */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2938
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2939
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
2940
2941
  	struct zonelist *zonelist;
  	struct zonelist_cache *zlc;
dd1a239f6   Mel Gorman   mm: have zonelist...
2942
  	struct zoneref *z;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2943

54a6eb5c4   Mel Gorman   mm: use two zonel...
2944
2945
2946
  	zonelist = &pgdat->node_zonelists[0];
  	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
  	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
dd1a239f6   Mel Gorman   mm: have zonelist...
2947
2948
  	for (z = zonelist->_zonerefs; z->zone; z++)
  		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
2949
  }
7aac78988   Lee Schermerhorn   numa: introduce n...
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
   * I.e., first node id of first zone in arg node's generic zonelist.
   * Used for initializing percpu 'numa_mem', which is used primarily
   * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
   */
  int local_memory_node(int node)
  {
  	struct zone *zone;
  
  	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
  				   gfp_zone(GFP_KERNEL),
  				   NULL,
  				   &zone);
  	return zone->node;
  }
  #endif
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2968

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2969
  #else	/* CONFIG_NUMA */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
2970
2971
2972
2973
2974
2975
  static void set_zonelist_order(void)
  {
  	current_zonelist_order = ZONELIST_ORDER_ZONE;
  }
  
  static void build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2976
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
2977
  	int node, local_node;
54a6eb5c4   Mel Gorman   mm: use two zonel...
2978
2979
  	enum zone_type j;
  	struct zonelist *zonelist;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2980
2981
  
  	local_node = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2982

54a6eb5c4   Mel Gorman   mm: use two zonel...
2983
2984
  	zonelist = &pgdat->node_zonelists[0];
  	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2985

54a6eb5c4   Mel Gorman   mm: use two zonel...
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
  	/*
  	 * Now we build the zonelist so that it contains the zones
  	 * of all the other nodes.
  	 * We don't want to pressure a particular node, so when
  	 * building the zones for node N, we make sure that the
  	 * zones coming right after the local ones are those from
  	 * node N+1 (modulo N)
  	 */
  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2999
  	}
54a6eb5c4   Mel Gorman   mm: use two zonel...
3000
3001
3002
3003
3004
3005
  	for (node = 0; node < local_node; node++) {
  		if (!node_online(node))
  			continue;
  		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
  							MAX_NR_ZONES - 1);
  	}
dd1a239f6   Mel Gorman   mm: have zonelist...
3006
3007
  	zonelist->_zonerefs[j].zone = NULL;
  	zonelist->_zonerefs[j].zone_idx = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3008
  }
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3009
  /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3010
  static void build_zonelist_cache(pg_data_t *pgdat)
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3011
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
3012
  	pgdat->node_zonelists[0].zlcache_ptr = NULL;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3013
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3014
  #endif	/* CONFIG_NUMA */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
  /*
   * Boot pageset table. One per cpu which is going to be used for all
   * zones and all nodes. The parameters will be set in such a way
   * that an item put on a list will immediately be handed over to
   * the buddy list. This is safe since pageset manipulation is done
   * with interrupts disabled.
   *
   * The boot_pagesets must be kept even after bootup is complete for
   * unused processors and/or zones. They do play a role for bootstrapping
   * hotplugged processors.
   *
   * zoneinfo_show() and maybe other functions do
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
   */
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
1f522509c   Haicheng Li   mem-hotplug: avoi...
3032
  static void setup_zone_pageset(struct zone *zone);
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3033

4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3034
3035
3036
3037
3038
  /*
   * Global mutex to protect against size modification of zonelists
   * as well as to serialize pageset setup for the new populated zone.
   */
  DEFINE_MUTEX(zonelists_mutex);
9b1a4d383   Rusty Russell   stop_machine: Wea...
3039
  /* return values int ....just for stop_machine() */
1f522509c   Haicheng Li   mem-hotplug: avoi...
3040
  static __init_refok int __build_all_zonelists(void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3041
  {
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3042
  	int nid;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3043
  	int cpu;
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3044

7f9cfb310   Bo Liu   mm: build_zonelis...
3045
3046
3047
  #ifdef CONFIG_NUMA
  	memset(node_load, 0, sizeof(node_load));
  #endif
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3048
  	for_each_online_node(nid) {
7ea1530ab   Christoph Lameter   Memoryless nodes:...
3049
3050
3051
3052
  		pg_data_t *pgdat = NODE_DATA(nid);
  
  		build_zonelists(pgdat);
  		build_zonelist_cache(pgdat);
9276b1bc9   Paul Jackson   [PATCH] memory pa...
3053
  	}
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
  
  	/*
  	 * Initialize the boot_pagesets that are going to be used
  	 * for bootstrapping processors. The real pagesets for
  	 * each zone will be allocated later when the per cpu
  	 * allocator is available.
  	 *
  	 * boot_pagesets are used also for bootstrapping offline
  	 * cpus if the system is already booted because the pagesets
  	 * are needed to initialize allocators on a specific cpu too.
  	 * F.e. the percpu allocator needs the page allocator which
  	 * needs the percpu allocator in order to allocate its pagesets
  	 * (a chicken-egg dilemma).
  	 */
7aac78988   Lee Schermerhorn   numa: introduce n...
3068
  	for_each_possible_cpu(cpu) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3069
  		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
7aac78988   Lee Schermerhorn   numa: introduce n...
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  		/*
  		 * We now know the "local memory node" for each node--
  		 * i.e., the node of the first zone in the generic zonelist.
  		 * Set up numa_mem percpu variable for on-line cpus.  During
  		 * boot, only the boot cpu should be on-line;  we'll init the
  		 * secondary cpus' numa_mem as they come on-line.  During
  		 * node/memory hotplug, we'll fixup all on-line cpus.
  		 */
  		if (cpu_online(cpu))
  			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
  #endif
  	}
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3083
3084
  	return 0;
  }
4eaf3f643   Haicheng Li   mem-hotplug: fix ...
3085
3086
3087
3088
  /*
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
9f6ae448b   Paul Mundt   mm/page_alloc.c: ...
3089
  void __ref build_all_zonelists(void *data)
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3090
  {
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3091
  	set_zonelist_order();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3092
  	if (system_state == SYSTEM_BOOTING) {
423b41d77   Randy Dunlap   [PATCH] mm/page_a...
3093
  		__build_all_zonelists(NULL);
68ad8df42   Mel Gorman   mm: print out the...
3094
  		mminit_verify_zonelist();
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3095
3096
  		cpuset_init_current_mems_allowed();
  	} else {
183ff22bb   Simon Arlott   spelling fixes: mm/
3097
  		/* we have to stop all cpus to guarantee there is no user
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3098
  		   of zonelist */
e9959f0f3   KAMEZAWA Hiroyuki   mm/page_alloc.c: ...
3099
3100
3101
3102
3103
  #ifdef CONFIG_MEMORY_HOTPLUG
  		if (data)
  			setup_zone_pageset((struct zone *)data);
  #endif
  		stop_machine(__build_all_zonelists, NULL, NULL);
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
3104
3105
  		/* cpuset refresh routine should be here */
  	}
bd1e22b8e   Andrew Morton   [PATCH] initialis...
3106
  	vm_total_pages = nr_free_pagecache_pages();
9ef9acb05   Mel Gorman   Do not group page...
3107
3108
3109
3110
3111
3112
3113
  	/*
  	 * Disable grouping by mobility if the number of pages in the
  	 * system is too low to allow the mechanism to work. It would be
  	 * more accurate, but expensive to check per-zone. This check is
  	 * made on memory-hotadd so a system can start with mobility
  	 * disabled and enable it later
  	 */
d9c234005   Mel Gorman   Do not depend on ...
3114
  	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb05   Mel Gorman   Do not group page...
3115
3116
3117
3118
3119
3120
3121
  		page_group_by_mobility_disabled = 1;
  	else
  		page_group_by_mobility_disabled = 0;
  
  	printk("Built %i zonelists in %s order, mobility grouping %s.  "
  		"Total pages: %ld
  ",
62bc62a87   Christoph Lameter   page allocator: u...
3122
  			nr_online_nodes,
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3123
  			zonelist_order_name[current_zonelist_order],
9ef9acb05   Mel Gorman   Do not group page...
3124
  			page_group_by_mobility_disabled ? "off" : "on",
f0c0b2b80   KAMEZAWA Hiroyuki   change zonelist o...
3125
3126
3127
3128
3129
  			vm_total_pages);
  #ifdef CONFIG_NUMA
  	printk("Policy zone: %s
  ", zone_names[policy_zone]);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
  }
  
  /*
   * Helper functions to size the waitqueue hash table.
   * Essentially these want to choose hash table sizes sufficiently
   * large so that collisions trying to wait on pages are rare.
   * But in fact, the number of active page waitqueues on typical
   * systems is ridiculously low, less than 200. So this is even
   * conservative, even though it seems large.
   *
   * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
   * waitqueues, i.e. the size of the waitq table given the number of pages.
   */
  #define PAGES_PER_WAITQUEUE	256
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3144
  #ifndef CONFIG_MEMORY_HOTPLUG
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3145
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
  {
  	unsigned long size = 1;
  
  	pages /= PAGES_PER_WAITQUEUE;
  
  	while (size < pages)
  		size <<= 1;
  
  	/*
  	 * Once we have dozens or even hundreds of threads sleeping
  	 * on IO we've got bigger problems than wait queue collision.
  	 * Limit the size of the wait table to a reasonable size.
  	 */
  	size = min(size, 4096UL);
  
  	return max(size, 4UL);
  }
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
  #else
  /*
   * A zone's size might be changed by hot-add, so it is not possible to determine
   * a suitable size for its wait_table.  So we use the maximum size now.
   *
   * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
   *
   *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
   *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
   *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
   *
   * The maximum entries are prepared when a zone's memory is (512K + 256) pages
   * or more by the traditional way. (See above).  It equals:
   *
   *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
   *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
   *    powerpc (64K page size)             : =  (32G +16M)byte.
   */
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
  	return 4096UL;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
  
  /*
   * This is an integer logarithm so that shifts can be used later
   * to extract the more random high bits from the multiplicative
   * hash function before the remainder is taken.
   */
  static inline unsigned long wait_table_bits(unsigned long size)
  {
  	return ffz(~size);
  }
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
56fd56b86   Mel Gorman   Bias the location...
3198
  /*
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
   * Check if a pageblock contains reserved pages
   */
  static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
  {
  	unsigned long pfn;
  
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
  		if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
  			return 1;
  	}
  	return 0;
  }
  
  /*
d9c234005   Mel Gorman   Do not depend on ...
3213
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
418589663   Mel Gorman   page allocator: u...
3214
3215
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
   * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
56fd56b86   Mel Gorman   Bias the location...
3216
3217
3218
3219
3220
   * higher will lead to a bigger reserve which will get freed as contiguous
   * blocks as reclaim kicks in
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3221
  	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
56fd56b86   Mel Gorman   Bias the location...
3222
  	struct page *page;
78986a678   Mel Gorman   page-allocator: l...
3223
3224
  	unsigned long block_migratetype;
  	int reserve;
56fd56b86   Mel Gorman   Bias the location...
3225
3226
3227
3228
  
  	/* Get the start pfn, end pfn and the number of blocks to reserve */
  	start_pfn = zone->zone_start_pfn;
  	end_pfn = start_pfn + zone->spanned_pages;
418589663   Mel Gorman   page allocator: u...
3229
  	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
d9c234005   Mel Gorman   Do not depend on ...
3230
  							pageblock_order;
56fd56b86   Mel Gorman   Bias the location...
3231

78986a678   Mel Gorman   page-allocator: l...
3232
3233
3234
3235
3236
3237
3238
3239
  	/*
  	 * Reserve blocks are generally in place to help high-order atomic
  	 * allocations that are short-lived. A min_free_kbytes value that
  	 * would result in more than 2 reserve blocks for atomic allocations
  	 * is assumed to be in place to help anti-fragmentation for the
  	 * future allocation of hugepages at runtime.
  	 */
  	reserve = min(2, reserve);
d9c234005   Mel Gorman   Do not depend on ...
3240
  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
56fd56b86   Mel Gorman   Bias the location...
3241
3242
3243
  		if (!pfn_valid(pfn))
  			continue;
  		page = pfn_to_page(pfn);
344c790e3   Adam Litke   mm: make setup_zo...
3244
3245
3246
  		/* Watch out for overlapping nodes */
  		if (page_to_nid(page) != zone_to_nid(zone))
  			continue;
56fd56b86   Mel Gorman   Bias the location...
3247
  		/* Blocks with reserved pages will never free, skip them. */
6d3163ce8   Arve HjønnevÃ¥g   mm: check if any ...
3248
3249
  		block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
  		if (pageblock_is_reserved(pfn, block_end_pfn))
56fd56b86   Mel Gorman   Bias the location...
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
  			continue;
  
  		block_migratetype = get_pageblock_migratetype(page);
  
  		/* If this block is reserved, account for it */
  		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
  			reserve--;
  			continue;
  		}
  
  		/* Suitable for reserving if this block is movable */
  		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
  			set_pageblock_migratetype(page, MIGRATE_RESERVE);
  			move_freepages_block(zone, page, MIGRATE_RESERVE);
  			reserve--;
  			continue;
  		}
  
  		/*
  		 * If the reserve is met and this is a previous reserved block,
  		 * take it back
  		 */
  		if (block_migratetype == MIGRATE_RESERVE) {
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  			move_freepages_block(zone, page, MIGRATE_MOVABLE);
  		}
  	}
  }
ac0e5b7a6   Mel Gorman   remove PAGE_GROUP...
3278

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3279
3280
3281
3282
3283
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
   * done. Non-atomic initialization, single-pass.
   */
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3284
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3285
  		unsigned long start_pfn, enum memmap_context context)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3286
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3287
  	struct page *page;
29751f699   Andy Whitcroft   [PATCH] sparsemem...
3288
3289
  	unsigned long end_pfn = start_pfn + size;
  	unsigned long pfn;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3290
  	struct zone *z;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3291

22b31eec6   Hugh Dickins   badpage: vm_norma...
3292
3293
  	if (highest_memmap_pfn < end_pfn - 1)
  		highest_memmap_pfn = end_pfn - 1;
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3294
  	z = &NODE_DATA(nid)->node_zones[zone];
cbe8dd4af   Greg Ungerer   [PATCH] memmap_in...
3295
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
  		/*
  		 * There can be holes in boot-time mem_map[]s
  		 * handed to this function.  They do not
  		 * exist on hotplugged memory.
  		 */
  		if (context == MEMMAP_EARLY) {
  			if (!early_pfn_valid(pfn))
  				continue;
  			if (!early_pfn_in_nid(pfn, nid))
  				continue;
  		}
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
3307
3308
  		page = pfn_to_page(pfn);
  		set_page_links(page, zone, nid, pfn);
708614e61   Mel Gorman   mm: verify the pa...
3309
  		mminit_verify_page_links(page, zone, nid, pfn);
7835e98b2   Nick Piggin   [PATCH] remove se...
3310
  		init_page_count(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3311
3312
  		reset_page_mapcount(page);
  		SetPageReserved(page);
b2a0ac887   Mel Gorman   Split the free li...
3313
3314
3315
3316
3317
  		/*
  		 * Mark the block movable so that blocks are reserved for
  		 * movable at startup. This will force kernel allocations
  		 * to reserve their blocks rather than leaking throughout
  		 * the address space during boot when many long-lived
56fd56b86   Mel Gorman   Bias the location...
3318
3319
3320
  		 * kernel allocations are made. Later some blocks near
  		 * the start are marked MIGRATE_RESERVE by
  		 * setup_zone_migrate_reserve()
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3321
3322
3323
3324
3325
  		 *
  		 * bitmap is created for zone's valid pfn range. but memmap
  		 * can be created for invalid pages (for alignment)
  		 * check here not to call set_pageblock_migratetype() against
  		 * pfn out of zone.
b2a0ac887   Mel Gorman   Split the free li...
3326
  		 */
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
3327
3328
3329
  		if ((z->zone_start_pfn <= pfn)
  		    && (pfn < z->zone_start_pfn + z->spanned_pages)
  		    && !(pfn & (pageblock_nr_pages - 1)))
56fd56b86   Mel Gorman   Bias the location...
3330
  			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
b2a0ac887   Mel Gorman   Split the free li...
3331

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3332
3333
3334
3335
  		INIT_LIST_HEAD(&page->lru);
  #ifdef WANT_PAGE_VIRTUAL
  		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
  		if (!is_highmem_idx(zone))
3212c6be2   Bob Picco   [PATCH] fix WANT_...
3336
  			set_page_address(page, __va(pfn << PAGE_SHIFT));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3337
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3338
3339
  	}
  }
1e548deb5   Andi Kleen   page allocator: r...
3340
  static void __meminit zone_init_free_lists(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3341
  {
b2a0ac887   Mel Gorman   Split the free li...
3342
3343
3344
  	int order, t;
  	for_each_migratetype_order(order, t) {
  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3345
3346
3347
3348
3349
3350
  		zone->free_area[order].nr_free = 0;
  	}
  }
  
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3351
  	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3352
  #endif
1d6f4e60e   Sam Ravnborg   mm: fix section m...
3353
  static int zone_batchsize(struct zone *zone)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3354
  {
3a6be87fd   David Howells   nommu: clamp zone...
3355
  #ifdef CONFIG_MMU
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3356
3357
3358
3359
  	int batch;
  
  	/*
  	 * The per-cpu-pages pools are set to around 1000th of the
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3360
  	 * size of the zone.  But no more than 1/2 of a meg.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3361
3362
3363
3364
  	 *
  	 * OK, so we don't know how big the cache is.  So guess.
  	 */
  	batch = zone->present_pages / 1024;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3365
3366
  	if (batch * PAGE_SIZE > 512 * 1024)
  		batch = (512 * 1024) / PAGE_SIZE;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3367
3368
3369
3370
3371
  	batch /= 4;		/* We effectively *= 4 below */
  	if (batch < 1)
  		batch = 1;
  
  	/*
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3372
3373
3374
  	 * Clamp the batch to a 2^n - 1 value. Having a power
  	 * of 2 value was found to be more likely to have
  	 * suboptimal cache aliasing properties in some cases.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3375
  	 *
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
3376
3377
3378
3379
  	 * For example if 2 tasks are alternately allocating
  	 * batches of pages, one task can end up with a lot
  	 * of pages of one half of the possible page colors
  	 * and the other with pages of the other colors.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3380
  	 */
9155203a5   David Howells   mm: use roundown_...
3381
  	batch = rounddown_pow_of_two(batch + batch/2) - 1;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
3382

e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3383
  	return batch;
3a6be87fd   David Howells   nommu: clamp zone...
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
  
  #else
  	/* The deferral and batching of frees should be suppressed under NOMMU
  	 * conditions.
  	 *
  	 * The problem is that NOMMU needs to be able to allocate large chunks
  	 * of contiguous memory as there's no hardware page translation to
  	 * assemble apparent contiguous memory from discontiguous pages.
  	 *
  	 * Queueing large contiguous runs of pages for batching, however,
  	 * causes the pages to actually be freed in smaller chunks.  As there
  	 * can be a significant delay between the individual batches being
  	 * recycled, this leads to the once large chunks of space being
  	 * fragmented and becoming unavailable for high-order allocations.
  	 */
  	return 0;
  #endif
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3401
  }
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3402
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3403
3404
  {
  	struct per_cpu_pages *pcp;
5f8dcc212   Mel Gorman   page-allocator: s...
3405
  	int migratetype;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3406

1c6fe9465   Magnus Damm   [PATCH] NUMA: bro...
3407
  	memset(p, 0, sizeof(*p));
3dfa5721f   Christoph Lameter   Page allocator: g...
3408
  	pcp = &p->pcp;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3409
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3410
3411
  	pcp->high = 6 * batch;
  	pcp->batch = max(1UL, 1 * batch);
5f8dcc212   Mel Gorman   page-allocator: s...
3412
3413
  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3414
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3415
3416
3417
3418
3419
3420
3421
3422
3423
  /*
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
  
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  				unsigned long high)
  {
  	struct per_cpu_pages *pcp;
3dfa5721f   Christoph Lameter   Page allocator: g...
3424
  	pcp = &p->pcp;
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
3425
3426
3427
3428
3429
  	pcp->high = high;
  	pcp->batch = max(1UL, high/4);
  	if ((high/4) > (PAGE_SHIFT * 8))
  		pcp->batch = PAGE_SHIFT * 8;
  }
58c2ee400   Nikanth Karthikesan   mm: Fix section m...
3430
  static void setup_zone_pageset(struct zone *zone)
319774e25   Wu Fengguang   mem-hotplug: sepa...
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
  {
  	int cpu;
  
  	zone->pageset = alloc_percpu(struct per_cpu_pageset);
  
  	for_each_possible_cpu(cpu) {
  		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
  
  		setup_pageset(pcp, zone_batchsize(zone));
  
  		if (percpu_pagelist_fraction)
  			setup_pagelist_highmark(pcp,
  				(zone->present_pages /
  					percpu_pagelist_fraction));
  	}
  }
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
3447
  /*
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3448
3449
   * Allocate per cpu pagesets and initialize them.
   * Before this call only boot pagesets were available.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3450
   */
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3451
  void __init setup_per_cpu_pageset(void)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3452
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3453
  	struct zone *zone;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3454

319774e25   Wu Fengguang   mem-hotplug: sepa...
3455
3456
  	for_each_populated_zone(zone)
  		setup_zone_pageset(zone);
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
3457
  }
577a32f62   Sam Ravnborg   mm: fix section m...
3458
  static noinline __init_refok
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3459
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3460
3461
3462
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3463
  	size_t alloc_size;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3464
3465
3466
3467
3468
  
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3469
3470
3471
3472
  	zone->wait_table_hash_nr_entries =
  		 wait_table_hash_nr_entries(zone_size_pages);
  	zone->wait_table_bits =
  		wait_table_bits(zone->wait_table_hash_nr_entries);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3473
3474
  	alloc_size = zone->wait_table_hash_nr_entries
  					* sizeof(wait_queue_head_t);
cd94b9dbf   Heiko Carstens   memory hotplug: f...
3475
  	if (!slab_is_available()) {
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3476
  		zone->wait_table = (wait_queue_head_t *)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
3477
  			alloc_bootmem_node_nopanic(pgdat, alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
  	} else {
  		/*
  		 * This case means that a zone whose size was 0 gets new memory
  		 * via memory hot-add.
  		 * But it may be the case that a new node was hot-added.  In
  		 * this case vmalloc() will not be able to use this new node's
  		 * memory - this wait_table must be initialized to use this new
  		 * node itself as well.
  		 * To use this new node's memory, further consideration will be
  		 * necessary.
  		 */
8691f3a72   Jesper Juhl   mm: no need to ca...
3489
  		zone->wait_table = vmalloc(alloc_size);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3490
3491
3492
  	}
  	if (!zone->wait_table)
  		return -ENOMEM;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3493

02b694dea   Yasunori Goto   [PATCH] wait_tabl...
3494
  	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3495
  		init_waitqueue_head(zone->wait_table + i);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3496
3497
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3498
  }
112067f09   Shaohua Li   memory hotplug: u...
3499
3500
3501
3502
3503
  static int __zone_pcp_update(void *data)
  {
  	struct zone *zone = data;
  	int cpu;
  	unsigned long batch = zone_batchsize(zone), flags;
2d30a1f63   Thomas Gleixner   mm: do not iterat...
3504
  	for_each_possible_cpu(cpu) {
112067f09   Shaohua Li   memory hotplug: u...
3505
3506
  		struct per_cpu_pageset *pset;
  		struct per_cpu_pages *pcp;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3507
  		pset = per_cpu_ptr(zone->pageset, cpu);
112067f09   Shaohua Li   memory hotplug: u...
3508
3509
3510
  		pcp = &pset->pcp;
  
  		local_irq_save(flags);
5f8dcc212   Mel Gorman   page-allocator: s...
3511
  		free_pcppages_bulk(zone, pcp->count, pcp);
112067f09   Shaohua Li   memory hotplug: u...
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
  		setup_pageset(pset, batch);
  		local_irq_restore(flags);
  	}
  	return 0;
  }
  
  void zone_pcp_update(struct zone *zone)
  {
  	stop_machine(__zone_pcp_update, zone, NULL);
  }
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
3522
  static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3523
  {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3524
3525
3526
3527
3528
3529
  	/*
  	 * per cpu subsystem is not up at this point. The following code
  	 * relies on the ability of the linker to provide the
  	 * offset of a (static) per cpu variable into the per cpu area.
  	 */
  	zone->pageset = &boot_pageset;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3530

f5335c0f1   Anton Blanchard   [PATCH] quieten z...
3531
  	if (zone->present_pages)
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
3532
3533
3534
3535
  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u
  ",
  			zone->name, zone->present_pages,
  					 zone_batchsize(zone));
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3536
  }
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3537
3538
  __meminit int init_currently_empty_zone(struct zone *zone,
  					unsigned long zone_start_pfn,
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
3539
3540
  					unsigned long size,
  					enum memmap_context context)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3541
3542
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
3543
3544
3545
3546
  	int ret;
  	ret = zone_wait_table_init(zone, size);
  	if (ret)
  		return ret;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3547
  	pgdat->nr_zones = zone_idx(zone) + 1;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3548
  	zone->zone_start_pfn = zone_start_pfn;
708614e61   Mel Gorman   mm: verify the pa...
3549
3550
3551
3552
3553
3554
  	mminit_dprintk(MMINIT_TRACE, "memmap_init",
  			"Initialising map node %d zone %lu pfns %lu -> %lu
  ",
  			pgdat->node_id,
  			(unsigned long)zone_idx(zone),
  			zone_start_pfn, (zone_start_pfn + size));
1e548deb5   Andi Kleen   page allocator: r...
3555
  	zone_init_free_lists(zone);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
3556
3557
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
3558
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3559
3560
3561
3562
3563
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /*
   * Basic iterator support. Return the first range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns first region regardless of node
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3564
  static int __meminit first_active_region_index_in_nid(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++)
  		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
  			return i;
  
  	return -1;
  }
  
  /*
   * Basic iterator support. Return the next active range of PFNs for a node
183ff22bb   Simon Arlott   spelling fixes: mm/
3577
   * Note: nid == MAX_NUMNODES returns next region regardless of node
c713216de   Mel Gorman   [PATCH] Introduce...
3578
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3579
  static int __meminit next_active_region_index_in_nid(int index, int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
  {
  	for (index = index + 1; index < nr_nodemap_entries; index++)
  		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
  			return index;
  
  	return -1;
  }
  
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   * Architectures may implement their own version but if add_active_range()
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3595
  int __meminit __early_pfn_to_nid(unsigned long pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		unsigned long start_pfn = early_node_map[i].start_pfn;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (start_pfn <= pfn && pfn < end_pfn)
  			return early_node_map[i].nid;
  	}
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3606
3607
  	/* This is a memory hole */
  	return -1;
c713216de   Mel Gorman   [PATCH] Introduce...
3608
3609
  }
  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3610
3611
  int __meminit early_pfn_to_nid(unsigned long pfn)
  {
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3612
3613
3614
3615
3616
3617
3618
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0)
  		return nid;
  	/* just returns 0 */
  	return 0;
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3619
  }
cc2559bcc   KAMEZAWA Hiroyuki   mm: fix memmap in...
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
  #ifdef CONFIG_NODES_SPAN_OTHER_NODES
  bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  {
  	int nid;
  
  	nid = __early_pfn_to_nid(pfn);
  	if (nid >= 0 && nid != node)
  		return false;
  	return true;
  }
  #endif
f2dbcfa73   KAMEZAWA Hiroyuki   mm: clean up for ...
3631

c713216de   Mel Gorman   [PATCH] Introduce...
3632
3633
3634
3635
3636
3637
3638
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
  	for (i = first_active_region_index_in_nid(nid); i != -1; \
  				i = next_active_region_index_in_nid(i, nid))
  
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3639
3640
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
c713216de   Mel Gorman   [PATCH] Introduce...
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
   * this function may be used instead of calling free_bootmem() manually.
   */
  void __init free_bootmem_with_active_regions(int nid,
  						unsigned long max_low_pfn)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		unsigned long size_pages = 0;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (early_node_map[i].start_pfn >= max_low_pfn)
  			continue;
  
  		if (end_pfn > max_low_pfn)
  			end_pfn = max_low_pfn;
  
  		size_pages = end_pfn - early_node_map[i].start_pfn;
  		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
  				PFN_PHYS(early_node_map[i].start_pfn),
  				size_pages << PAGE_SHIFT);
  	}
  }
edbe7d23b   Yinghai Lu   memblock: Add fin...
3667
  #ifdef CONFIG_HAVE_MEMBLOCK
cc2898943   Yinghai Lu   mm: Move early_no...
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
  /*
   * Basic iterator support. Return the last range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns last region regardless of node
   */
  static int __meminit last_active_region_index_in_nid(int nid)
  {
  	int i;
  
  	for (i = nr_nodemap_entries - 1; i >= 0; i--)
  		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
  			return i;
  
  	return -1;
  }
  
  /*
   * Basic iterator support. Return the previous active range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns next region regardless of node
   */
  static int __meminit previous_active_region_index_in_nid(int index, int nid)
  {
  	for (index = index - 1; index >= 0; index--)
  		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
  			return index;
  
  	return -1;
  }
  
  #define for_each_active_range_index_in_nid_reverse(i, nid) \
  	for (i = last_active_region_index_in_nid(nid); i != -1; \
  				i = previous_active_region_index_in_nid(i, nid))
edbe7d23b   Yinghai Lu   memblock: Add fin...
3699
3700
3701
3702
3703
3704
  u64 __init find_memory_core_early(int nid, u64 size, u64 align,
  					u64 goal, u64 limit)
  {
  	int i;
  
  	/* Need to go over early_node_map to find out good range for node */
1a4a678b1   Yinghai Lu   memblock: Make fi...
3705
  	for_each_active_range_index_in_nid_reverse(i, nid) {
edbe7d23b   Yinghai Lu   memblock: Add fin...
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
  		u64 addr;
  		u64 ei_start, ei_last;
  		u64 final_start, final_end;
  
  		ei_last = early_node_map[i].end_pfn;
  		ei_last <<= PAGE_SHIFT;
  		ei_start = early_node_map[i].start_pfn;
  		ei_start <<= PAGE_SHIFT;
  
  		final_start = max(ei_start, goal);
  		final_end = min(ei_last, limit);
  
  		if (final_start >= final_end)
  			continue;
  
  		addr = memblock_find_in_range(final_start, final_end, size, align);
  
  		if (addr == MEMBLOCK_ERROR)
  			continue;
  
  		return addr;
  	}
  
  	return MEMBLOCK_ERROR;
  }
  #endif
08677214e   Yinghai Lu   x86: Make 64 bit ...
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
  int __init add_from_early_node_map(struct range *range, int az,
  				   int nr_range, int nid)
  {
  	int i;
  	u64 start, end;
  
  	/* need to go over early_node_map to find out good range for node */
  	for_each_active_range_index_in_nid(i, nid) {
  		start = early_node_map[i].start_pfn;
  		end = early_node_map[i].end_pfn;
  		nr_range = add_range(range, az, nr_range, start, end);
  	}
  	return nr_range;
  }
b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3746
3747
3748
  void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
  {
  	int i;
d52d53b8a   Yinghai Lu   RFC x86: try to r...
3749
  	int ret;
b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3750

d52d53b8a   Yinghai Lu   RFC x86: try to r...
3751
3752
3753
3754
3755
3756
  	for_each_active_range_index_in_nid(i, nid) {
  		ret = work_fn(early_node_map[i].start_pfn,
  			      early_node_map[i].end_pfn, data);
  		if (ret)
  			break;
  	}
b5bc6c0e5   Yinghai Lu   x86, mm: use add_...
3757
  }
c713216de   Mel Gorman   [PATCH] Introduce...
3758
3759
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3760
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
c713216de   Mel Gorman   [PATCH] Introduce...
3761
3762
3763
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3764
   * function may be used instead of calling memory_present() manually.
c713216de   Mel Gorman   [PATCH] Introduce...
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid)
  		memory_present(early_node_map[i].nid,
  				early_node_map[i].start_pfn,
  				early_node_map[i].end_pfn);
  }
  
  /**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3778
3779
3780
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216de   Mel Gorman   [PATCH] Introduce...
3781
3782
3783
3784
   *
   * It returns the start and end page frame of a node based on information
   * provided by an arch calling add_active_range(). If called for a node
   * with no available memory, a warning is printed and the start and end
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3785
   * PFNs will be 0.
c713216de   Mel Gorman   [PATCH] Introduce...
3786
   */
a3142c8e1   Yasunori Goto   Fix section misma...
3787
  void __meminit get_pfn_range_for_nid(unsigned int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
  			unsigned long *start_pfn, unsigned long *end_pfn)
  {
  	int i;
  	*start_pfn = -1UL;
  	*end_pfn = 0;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
  		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
  	}
633c0666b   Christoph Lameter   Memoryless nodes:...
3798
  	if (*start_pfn == -1UL)
c713216de   Mel Gorman   [PATCH] Introduce...
3799
  		*start_pfn = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
3800
3801
3802
  }
  
  /*
2a1e274ac   Mel Gorman   Create the ZONE_M...
3803
3804
3805
3806
   * This finds a zone that can be used for ZONE_MOVABLE pages. The
   * assumption is made that zones within a node are ordered in monotonic
   * increasing memory addresses so that the "highest" populated zone is used
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3807
  static void __init find_usable_zone_for_movable(void)
2a1e274ac   Mel Gorman   Create the ZONE_M...
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
  {
  	int zone_index;
  	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
  		if (zone_index == ZONE_MOVABLE)
  			continue;
  
  		if (arch_zone_highest_possible_pfn[zone_index] >
  				arch_zone_lowest_possible_pfn[zone_index])
  			break;
  	}
  
  	VM_BUG_ON(zone_index == -1);
  	movable_zone = zone_index;
  }
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
25985edce   Lucas De Marchi   Fix common misspe...
3825
   * because it is sized independent of architecture. Unlike the other zones,
2a1e274ac   Mel Gorman   Create the ZONE_M...
3826
3827
3828
3829
3830
3831
3832
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
   * provided by the architecture for a given node by using the end of the
   * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
   * zones within a node are in order of monotonic increases memory addresses
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
3833
  static void __meminit adjust_zone_range_for_zone_movable(int nid,
2a1e274ac   Mel Gorman   Create the ZONE_M...
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
  					unsigned long zone_type,
  					unsigned long node_start_pfn,
  					unsigned long node_end_pfn,
  					unsigned long *zone_start_pfn,
  					unsigned long *zone_end_pfn)
  {
  	/* Only adjust if ZONE_MOVABLE is on this node */
  	if (zone_movable_pfn[nid]) {
  		/* Size ZONE_MOVABLE */
  		if (zone_type == ZONE_MOVABLE) {
  			*zone_start_pfn = zone_movable_pfn[nid];
  			*zone_end_pfn = min(node_end_pfn,
  				arch_zone_highest_possible_pfn[movable_zone]);
  
  		/* Adjust for ZONE_MOVABLE starting within this range */
  		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
  				*zone_end_pfn > zone_movable_pfn[nid]) {
  			*zone_end_pfn = zone_movable_pfn[nid];
  
  		/* Check if this whole range is within ZONE_MOVABLE */
  		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
  			*zone_start_pfn = *zone_end_pfn;
  	}
  }
  
  /*
c713216de   Mel Gorman   [PATCH] Introduce...
3860
3861
3862
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
6ea6e6887   Paul Mundt   mm: more __memini...
3863
  static unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	/* Get the start and end of the node and zone */
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2a1e274ac   Mel Gorman   Create the ZONE_M...
3874
3875
3876
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  				node_start_pfn, node_end_pfn,
  				&zone_start_pfn, &zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
  
  	/* Check that this node has pages within the zone's required range */
  	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
  		return 0;
  
  	/* Move the zone boundaries inside the node if necessary */
  	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
  	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
  
  	/* Return the spanned pages */
  	return zone_end_pfn - zone_start_pfn;
  }
  
  /*
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3892
   * then all holes in the requested range will be accounted for.
c713216de   Mel Gorman   [PATCH] Introduce...
3893
   */
329962503   Yinghai Lu   x86: Fix checking...
3894
  unsigned long __meminit __absent_pages_in_range(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
  				unsigned long range_start_pfn,
  				unsigned long range_end_pfn)
  {
  	int i = 0;
  	unsigned long prev_end_pfn = 0, hole_pages = 0;
  	unsigned long start_pfn;
  
  	/* Find the end_pfn of the first active range of pfns in the node */
  	i = first_active_region_index_in_nid(nid);
  	if (i == -1)
  		return 0;
b5445f956   Mel Gorman   Allow nodes to ex...
3906
  	prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3907
3908
  	/* Account for ranges before physical memory on this node */
  	if (early_node_map[i].start_pfn > range_start_pfn)
b5445f956   Mel Gorman   Allow nodes to ex...
3909
  		hole_pages = prev_end_pfn - range_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
  
  	/* Find all holes for the zone within the node */
  	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
  
  		/* No need to continue if prev_end_pfn is outside the zone */
  		if (prev_end_pfn >= range_end_pfn)
  			break;
  
  		/* Make sure the end of the zone is not within the hole */
  		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
  		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
  
  		/* Update the hole size cound and move on */
  		if (start_pfn > range_start_pfn) {
  			BUG_ON(prev_end_pfn > start_pfn);
  			hole_pages += start_pfn - prev_end_pfn;
  		}
  		prev_end_pfn = early_node_map[i].end_pfn;
  	}
9c7cd6877   Mel Gorman   [PATCH] Account f...
3929
3930
  	/* Account for ranges past physical memory on this node */
  	if (range_end_pfn > prev_end_pfn)
0c6cb9746   Mel Gorman   [PATCH] Calculati...
3931
  		hole_pages += range_end_pfn -
9c7cd6877   Mel Gorman   [PATCH] Account f...
3932
  				max(range_start_pfn, prev_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3933
3934
3935
3936
3937
3938
3939
3940
  	return hole_pages;
  }
  
  /**
   * absent_pages_in_range - Return number of page frames in holes within a range
   * @start_pfn: The start PFN to start searching for holes
   * @end_pfn: The end PFN to stop searching for holes
   *
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
3941
   * It returns the number of pages frames in memory holes within a range.
c713216de   Mel Gorman   [PATCH] Introduce...
3942
3943
3944
3945
3946
3947
3948
3949
   */
  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  							unsigned long end_pfn)
  {
  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
  }
  
  /* Return the number of page frames in holes in a zone on a node */
6ea6e6887   Paul Mundt   mm: more __memini...
3950
  static unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3951
3952
3953
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
9c7cd6877   Mel Gorman   [PATCH] Account f...
3954
3955
3956
3957
3958
3959
3960
3961
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
  							node_start_pfn);
  	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
  							node_end_pfn);
2a1e274ac   Mel Gorman   Create the ZONE_M...
3962
3963
3964
  	adjust_zone_range_for_zone_movable(nid, zone_type,
  			node_start_pfn, node_end_pfn,
  			&zone_start_pfn, &zone_end_pfn);
9c7cd6877   Mel Gorman   [PATCH] Account f...
3965
  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
3966
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3967

c713216de   Mel Gorman   [PATCH] Introduce...
3968
  #else
6ea6e6887   Paul Mundt   mm: more __memini...
3969
  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3970
3971
3972
3973
3974
  					unsigned long zone_type,
  					unsigned long *zones_size)
  {
  	return zones_size[zone_type];
  }
6ea6e6887   Paul Mundt   mm: more __memini...
3975
  static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
c713216de   Mel Gorman   [PATCH] Introduce...
3976
3977
3978
3979
3980
3981
3982
3983
  						unsigned long zone_type,
  						unsigned long *zholes_size)
  {
  	if (!zholes_size)
  		return 0;
  
  	return zholes_size[zone_type];
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
3984

c713216de   Mel Gorman   [PATCH] Introduce...
3985
  #endif
a3142c8e1   Yasunori Goto   Fix section misma...
3986
  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
c713216de   Mel Gorman   [PATCH] Introduce...
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
  	unsigned long realtotalpages, totalpages = 0;
  	enum zone_type i;
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
  								zones_size);
  	pgdat->node_spanned_pages = totalpages;
  
  	realtotalpages = totalpages;
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		realtotalpages -=
  			zone_absent_pages_in_node(pgdat->node_id, i,
  								zholes_size);
  	pgdat->node_present_pages = realtotalpages;
  	printk(KERN_DEBUG "On node %d totalpages: %lu
  ", pgdat->node_id,
  							realtotalpages);
  }
835c134ec   Mel Gorman   Add a bitmap that...
4007
4008
4009
  #ifndef CONFIG_SPARSEMEM
  /*
   * Calculate the size of the zone->blockflags rounded to an unsigned long
d9c234005   Mel Gorman   Do not depend on ...
4010
4011
   * Start by making sure zonesize is a multiple of pageblock_order by rounding
   * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
835c134ec   Mel Gorman   Add a bitmap that...
4012
4013
4014
4015
4016
4017
   * round what is now in bits to nearest long in bits, then return it in
   * bytes.
   */
  static unsigned long __init usemap_size(unsigned long zonesize)
  {
  	unsigned long usemapsize;
d9c234005   Mel Gorman   Do not depend on ...
4018
4019
  	usemapsize = roundup(zonesize, pageblock_nr_pages);
  	usemapsize = usemapsize >> pageblock_order;
835c134ec   Mel Gorman   Add a bitmap that...
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
  	usemapsize *= NR_PAGEBLOCK_BITS;
  	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
  
  	return usemapsize / 8;
  }
  
  static void __init setup_usemap(struct pglist_data *pgdat,
  				struct zone *zone, unsigned long zonesize)
  {
  	unsigned long usemapsize = usemap_size(zonesize);
  	zone->pageblock_flags = NULL;
58a01a457   Julia Lawall   mm/page_alloc.c: ...
4031
  	if (usemapsize)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4032
4033
  		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
  								   usemapsize);
835c134ec   Mel Gorman   Add a bitmap that...
4034
4035
  }
  #else
fa9f90be7   Jesper Juhl   Kill off a bunch ...
4036
  static inline void setup_usemap(struct pglist_data *pgdat,
835c134ec   Mel Gorman   Add a bitmap that...
4037
4038
  				struct zone *zone, unsigned long zonesize) {}
  #endif /* CONFIG_SPARSEMEM */
d9c234005   Mel Gorman   Do not depend on ...
4039
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
ba72cb8cb   Mel Gorman   Fix boot problem ...
4040
4041
4042
4043
4044
4045
4046
4047
4048
  
  /* Return a sensible default order for the pageblock size. */
  static inline int pageblock_default_order(void)
  {
  	if (HPAGE_SHIFT > PAGE_SHIFT)
  		return HUGETLB_PAGE_ORDER;
  
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
  static inline void __init set_pageblock_order(unsigned int order)
  {
  	/* Check that pageblock_nr_pages has not already been setup */
  	if (pageblock_order)
  		return;
  
  	/*
  	 * Assume the largest contiguous order of interest is a huge page.
  	 * This value may be variable depending on boot parameters on IA64
  	 */
  	pageblock_order = order;
  }
  #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
ba72cb8cb   Mel Gorman   Fix boot problem ...
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
  /*
   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
   * and pageblock_default_order() are unused as pageblock_order is set
   * at compile-time. See include/linux/pageblock-flags.h for the values of
   * pageblock_order based on the kernel config
   */
  static inline int pageblock_default_order(unsigned int order)
  {
  	return MAX_ORDER-1;
  }
d9c234005   Mel Gorman   Do not depend on ...
4073
4074
4075
  #define set_pageblock_order(x)	do {} while (0)
  
  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4076
4077
4078
4079
4080
4081
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
   */
b5a0e0113   Alexander van Heukelum   Solve section mis...
4082
  static void __paginginit free_area_init_core(struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4083
4084
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
4085
  	enum zone_type j;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4086
  	int nid = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4087
  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4088
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4089

208d54e55   Dave Hansen   [PATCH] memory ho...
4090
  	pgdat_resize_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4091
4092
4093
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
4094
  	pgdat_page_cgroup_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4095
4096
4097
  	
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
0e0b864e0   Mel Gorman   [PATCH] Account f...
4098
  		unsigned long size, realsize, memmap_pages;
b69408e88   Christoph Lameter   vmscan: Use an in...
4099
  		enum lru_list l;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4100

c713216de   Mel Gorman   [PATCH] Introduce...
4101
4102
4103
  		size = zone_spanned_pages_in_node(nid, j, zones_size);
  		realsize = size - zone_absent_pages_in_node(nid, j,
  								zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4104

0e0b864e0   Mel Gorman   [PATCH] Account f...
4105
4106
4107
4108
4109
  		/*
  		 * Adjust realsize so that it accounts for how much memory
  		 * is used by this zone for memmap. This affects the watermark
  		 * and per-cpu initialisations
  		 */
f72321541   Johannes Weiner   mm: don't drop a ...
4110
4111
  		memmap_pages =
  			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
0e0b864e0   Mel Gorman   [PATCH] Account f...
4112
4113
  		if (realsize >= memmap_pages) {
  			realsize -= memmap_pages;
5594c8c81   Yinghai Lu   mm: print out mem...
4114
4115
4116
4117
4118
  			if (memmap_pages)
  				printk(KERN_DEBUG
  				       "  %s zone: %lu pages used for memmap
  ",
  				       zone_names[j], memmap_pages);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4119
4120
4121
4122
4123
  		} else
  			printk(KERN_WARNING
  				"  %s zone: %lu pages exceeds realsize %lu
  ",
  				zone_names[j], memmap_pages, realsize);
6267276f3   Christoph Lameter   [PATCH] optional ...
4124
4125
  		/* Account for reserved pages */
  		if (j == 0 && realsize > dma_reserve) {
0e0b864e0   Mel Gorman   [PATCH] Account f...
4126
  			realsize -= dma_reserve;
d903ef9f3   Yinghai Lu   mm: print out mem...
4127
4128
  			printk(KERN_DEBUG "  %s zone: %lu pages reserved
  ",
6267276f3   Christoph Lameter   [PATCH] optional ...
4129
  					zone_names[0], dma_reserve);
0e0b864e0   Mel Gorman   [PATCH] Account f...
4130
  		}
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
4131
  		if (!is_highmem_idx(j))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4132
4133
4134
4135
4136
  			nr_kernel_pages += realsize;
  		nr_all_pages += realsize;
  
  		zone->spanned_pages = size;
  		zone->present_pages = realsize;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4137
  #ifdef CONFIG_NUMA
d5f541ed6   Christoph Lameter   [PATCH] Add node ...
4138
  		zone->node = nid;
8417bba4b   Christoph Lameter   [PATCH] Replace m...
4139
  		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4140
  						/ 100;
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
4141
  		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
4142
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4143
4144
4145
  		zone->name = zone_names[j];
  		spin_lock_init(&zone->lock);
  		spin_lock_init(&zone->lru_lock);
bdc8cb984   Dave Hansen   [PATCH] memory ho...
4146
  		zone_seqlock_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4147
  		zone->zone_pgdat = pgdat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4148

ed8ece2ec   Dave Hansen   [PATCH] memory ho...
4149
  		zone_pcp_init(zone);
246e87a93   KAMEZAWA Hiroyuki   memcg: fix get_sc...
4150
  		for_each_lru(l)
b69408e88   Christoph Lameter   vmscan: Use an in...
4151
  			INIT_LIST_HEAD(&zone->lru[l].list);
6e9015716   KOSAKI Motohiro   mm: introduce zon...
4152
4153
4154
4155
  		zone->reclaim_stat.recent_rotated[0] = 0;
  		zone->reclaim_stat.recent_rotated[1] = 0;
  		zone->reclaim_stat.recent_scanned[0] = 0;
  		zone->reclaim_stat.recent_scanned[1] = 0;
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
4156
  		zap_zone_vm_stats(zone);
e815af95f   David Rientjes   oom: change all_u...
4157
  		zone->flags = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4158
4159
  		if (!size)
  			continue;
ba72cb8cb   Mel Gorman   Fix boot problem ...
4160
  		set_pageblock_order(pageblock_default_order());
835c134ec   Mel Gorman   Add a bitmap that...
4161
  		setup_usemap(pgdat, zone, size);
a2f3aa025   Dave Hansen   [PATCH] Fix spars...
4162
4163
  		ret = init_currently_empty_zone(zone, zone_start_pfn,
  						size, MEMMAP_EARLY);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
4164
  		BUG_ON(ret);
76cdd58e5   Heiko Carstens   memory_hotplug: a...
4165
  		memmap_init(size, nid, j, zone_start_pfn);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4166
  		zone_start_pfn += size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4167
4168
  	}
  }
577a32f62   Sam Ravnborg   mm: fix section m...
4169
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4170
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4171
4172
4173
  	/* Skip empty nodes */
  	if (!pgdat->node_spanned_pages)
  		return;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4174
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4175
4176
  	/* ia64 gets its own node_mem_map, before this, without bootmem */
  	if (!pgdat->node_mem_map) {
e984bb43f   Bob Picco   [PATCH] Align the...
4177
  		unsigned long size, start, end;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4178
  		struct page *map;
e984bb43f   Bob Picco   [PATCH] Align the...
4179
4180
4181
4182
4183
4184
4185
4186
4187
  		/*
  		 * The zone's endpoints aren't required to be MAX_ORDER
  		 * aligned but the node_mem_map endpoints must be in order
  		 * for the buddy allocator to function correctly.
  		 */
  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
  		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
  		size =  (end - start) * sizeof(struct page);
6f167ec72   Dave Hansen   [PATCH] sparsemem...
4188
4189
  		map = alloc_remap(pgdat->node_id, size);
  		if (!map)
8f389a99b   Yinghai Lu   mm: use alloc_boo...
4190
  			map = alloc_bootmem_node_nopanic(pgdat, size);
e984bb43f   Bob Picco   [PATCH] Align the...
4191
  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4192
  	}
12d810c1b   Roman Zippel   m68k: discontinuo...
4193
  #ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4194
4195
4196
  	/*
  	 * With no DISCONTIG, the global mem_map is just set as node 0's
  	 */
c713216de   Mel Gorman   [PATCH] Introduce...
4197
  	if (pgdat == NODE_DATA(0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4198
  		mem_map = NODE_DATA(0)->node_mem_map;
c713216de   Mel Gorman   [PATCH] Introduce...
4199
4200
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
467bc461d   Thomas Bogendoerfer   Fix crash with FL...
4201
  			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
c713216de   Mel Gorman   [PATCH] Introduce...
4202
4203
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4204
  #endif
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
4205
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4206
  }
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4207
4208
  void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  		unsigned long node_start_pfn, unsigned long *zholes_size)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4209
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4210
  	pg_data_t *pgdat = NODE_DATA(nid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4211
4212
  	pgdat->node_id = nid;
  	pgdat->node_start_pfn = node_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4213
  	calculate_node_totalpages(pgdat, zones_size, zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4214
4215
  
  	alloc_node_mem_map(pgdat);
e8c27ac91   Yinghai Lu   x86, numa, 32-bit...
4216
4217
4218
4219
4220
4221
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
  	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx
  ",
  		nid, (unsigned long)pgdat,
  		(unsigned long)pgdat->node_mem_map);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4222
4223
4224
  
  	free_area_init_core(pgdat, zones_size, zholes_size);
  }
c713216de   Mel Gorman   [PATCH] Introduce...
4225
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
418508c13   Miklos Szeredi   fix unused setup_...
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
  
  #if MAX_NUMNODES > 1
  /*
   * Figure out the number of possible node ids.
   */
  static void __init setup_nr_node_ids(void)
  {
  	unsigned int node;
  	unsigned int highest = 0;
  
  	for_each_node_mask(node, node_possible_map)
  		highest = node;
  	nr_node_ids = highest + 1;
  }
  #else
  static inline void setup_nr_node_ids(void)
  {
  }
  #endif
c713216de   Mel Gorman   [PATCH] Introduce...
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
  /**
   * add_active_range - Register a range of PFNs backed by physical memory
   * @nid: The node ID the range resides on
   * @start_pfn: The start PFN of the available physical memory
   * @end_pfn: The end PFN of the available physical memory
   *
   * These ranges are stored in an early_node_map[] and later used by
   * free_area_init_nodes() to calculate zone sizes and holes. If the
   * range spans a memory hole, it is up to the architecture to ensure
   * the memory is not freed by the bootmem allocator. If possible
   * the range being registered will be merged with existing ranges.
   */
  void __init add_active_range(unsigned int nid, unsigned long start_pfn,
  						unsigned long end_pfn)
  {
  	int i;
6b74ab97b   Mel Gorman   mm: add a basic d...
4261
4262
4263
4264
4265
4266
  	mminit_dprintk(MMINIT_TRACE, "memory_register",
  			"Entering add_active_range(%d, %#lx, %#lx) "
  			"%d entries of %d used
  ",
  			nid, start_pfn, end_pfn,
  			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
c713216de   Mel Gorman   [PATCH] Introduce...
4267

2dbb51c49   Mel Gorman   mm: make defensiv...
4268
  	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
  	/* Merge with existing active regions if possible */
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		if (early_node_map[i].nid != nid)
  			continue;
  
  		/* Skip if an existing region covers this new one */
  		if (start_pfn >= early_node_map[i].start_pfn &&
  				end_pfn <= early_node_map[i].end_pfn)
  			return;
  
  		/* Merge forward if suitable */
  		if (start_pfn <= early_node_map[i].end_pfn &&
  				end_pfn > early_node_map[i].end_pfn) {
  			early_node_map[i].end_pfn = end_pfn;
  			return;
  		}
  
  		/* Merge backward if suitable */
d2dbe08dd   Kazuhisa Ichikawa   mm/page_alloc: fi...
4287
  		if (start_pfn < early_node_map[i].start_pfn &&
c713216de   Mel Gorman   [PATCH] Introduce...
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
  				end_pfn >= early_node_map[i].start_pfn) {
  			early_node_map[i].start_pfn = start_pfn;
  			return;
  		}
  	}
  
  	/* Check that early_node_map is large enough */
  	if (i >= MAX_ACTIVE_REGIONS) {
  		printk(KERN_CRIT "More than %d memory regions, truncating
  ",
  							MAX_ACTIVE_REGIONS);
  		return;
  	}
  
  	early_node_map[i].nid = nid;
  	early_node_map[i].start_pfn = start_pfn;
  	early_node_map[i].end_pfn = end_pfn;
  	nr_nodemap_entries = i + 1;
  }
  
  /**
cc1050baf   Yinghai Lu   x86: replace shri...
4309
   * remove_active_range - Shrink an existing registered range of PFNs
c713216de   Mel Gorman   [PATCH] Introduce...
4310
   * @nid: The node id the range is on that should be shrunk
cc1050baf   Yinghai Lu   x86: replace shri...
4311
4312
   * @start_pfn: The new PFN of the range
   * @end_pfn: The new PFN of the range
c713216de   Mel Gorman   [PATCH] Introduce...
4313
4314
   *
   * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4315
4316
4317
   * The map is kept near the end physical page range that has already been
   * registered. This function allows an arch to shrink an existing registered
   * range.
c713216de   Mel Gorman   [PATCH] Introduce...
4318
   */
cc1050baf   Yinghai Lu   x86: replace shri...
4319
4320
  void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
  				unsigned long end_pfn)
c713216de   Mel Gorman   [PATCH] Introduce...
4321
  {
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4322
4323
  	int i, j;
  	int removed = 0;
c713216de   Mel Gorman   [PATCH] Introduce...
4324

cc1050baf   Yinghai Lu   x86: replace shri...
4325
4326
4327
  	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)
  ",
  			  nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4328
  	/* Find the old active region end and shrink */
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4329
  	for_each_active_range_index_in_nid(i, nid) {
cc1050baf   Yinghai Lu   x86: replace shri...
4330
4331
  		if (early_node_map[i].start_pfn >= start_pfn &&
  		    early_node_map[i].end_pfn <= end_pfn) {
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4332
  			/* clear it */
cc1050baf   Yinghai Lu   x86: replace shri...
4333
  			early_node_map[i].start_pfn = 0;
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4334
4335
4336
4337
  			early_node_map[i].end_pfn = 0;
  			removed = 1;
  			continue;
  		}
cc1050baf   Yinghai Lu   x86: replace shri...
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
  		if (early_node_map[i].start_pfn < start_pfn &&
  		    early_node_map[i].end_pfn > start_pfn) {
  			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
  			early_node_map[i].end_pfn = start_pfn;
  			if (temp_end_pfn > end_pfn)
  				add_active_range(nid, end_pfn, temp_end_pfn);
  			continue;
  		}
  		if (early_node_map[i].start_pfn >= start_pfn &&
  		    early_node_map[i].end_pfn > end_pfn &&
  		    early_node_map[i].start_pfn < end_pfn) {
  			early_node_map[i].start_pfn = end_pfn;
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4350
  			continue;
c713216de   Mel Gorman   [PATCH] Introduce...
4351
  		}
cc1a9d86c   Yinghai Lu   mm, x86: shrink_a...
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
  	}
  
  	if (!removed)
  		return;
  
  	/* remove the blank ones */
  	for (i = nr_nodemap_entries - 1; i > 0; i--) {
  		if (early_node_map[i].nid != nid)
  			continue;
  		if (early_node_map[i].end_pfn)
  			continue;
  		/* we found it, get rid of it */
  		for (j = i; j < nr_nodemap_entries - 1; j++)
  			memcpy(&early_node_map[j], &early_node_map[j+1],
  				sizeof(early_node_map[j]));
  		j = nr_nodemap_entries - 1;
  		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
  		nr_nodemap_entries--;
  	}
c713216de   Mel Gorman   [PATCH] Introduce...
4371
4372
4373
4374
  }
  
  /**
   * remove_all_active_ranges - Remove all currently registered regions
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4375
   *
c713216de   Mel Gorman   [PATCH] Introduce...
4376
4377
4378
4379
   * During discovery, it may be found that a table like SRAT is invalid
   * and an alternative discovery method must be used. This function removes
   * all currently registered regions.
   */
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4380
  void __init remove_all_active_ranges(void)
c713216de   Mel Gorman   [PATCH] Introduce...
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
  {
  	memset(early_node_map, 0, sizeof(early_node_map));
  	nr_nodemap_entries = 0;
  }
  
  /* Compare two active node_active_regions */
  static int __init cmp_node_active_region(const void *a, const void *b)
  {
  	struct node_active_region *arange = (struct node_active_region *)a;
  	struct node_active_region *brange = (struct node_active_region *)b;
  
  	/* Done this way to avoid overflows */
  	if (arange->start_pfn > brange->start_pfn)
  		return 1;
  	if (arange->start_pfn < brange->start_pfn)
  		return -1;
  
  	return 0;
  }
  
  /* sort the node_map by start_pfn */
329962503   Yinghai Lu   x86: Fix checking...
4402
  void __init sort_node_map(void)
c713216de   Mel Gorman   [PATCH] Introduce...
4403
4404
4405
4406
4407
  {
  	sort(early_node_map, (size_t)nr_nodemap_entries,
  			sizeof(struct node_active_region),
  			cmp_node_active_region, NULL);
  }
1e01979c8   Tejun Heo   x86, numa: Implem...
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
  /**
   * node_map_pfn_alignment - determine the maximum internode alignment
   *
   * This function should be called after node map is populated and sorted.
   * It calculates the maximum power of two alignment which can distinguish
   * all the nodes.
   *
   * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
   * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
   * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
   * shifted, 1GiB is enough and this function will indicate so.
   *
   * This is used to test whether pfn -> nid mapping of the chosen memory
   * model has fine enough granularity to avoid incorrect mapping for the
   * populated node map.
   *
   * Returns the determined alignment in pfn's.  0 if there is no alignment
   * requirement (single node).
   */
  unsigned long __init node_map_pfn_alignment(void)
  {
  	unsigned long accl_mask = 0, last_end = 0;
  	int last_nid = -1;
  	int i;
  
  	for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
  		int nid = early_node_map[i].nid;
  		unsigned long start = early_node_map[i].start_pfn;
  		unsigned long end = early_node_map[i].end_pfn;
  		unsigned long mask;
  
  		if (!start || last_nid < 0 || last_nid == nid) {
  			last_nid = nid;
  			last_end = end;
  			continue;
  		}
  
  		/*
  		 * Start with a mask granular enough to pin-point to the
  		 * start pfn and tick off bits one-by-one until it becomes
  		 * too coarse to separate the current node from the last.
  		 */
  		mask = ~((1 << __ffs(start)) - 1);
  		while (mask && last_end <= (start & (mask << 1)))
  			mask <<= 1;
  
  		/* accumulate all internode masks */
  		accl_mask |= mask;
  	}
  
  	/* convert mask to number of pages */
  	return ~accl_mask + 1;
  }
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4461
  /* Find the lowest pfn for a node */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4462
  static unsigned long __init find_min_pfn_for_node(int nid)
c713216de   Mel Gorman   [PATCH] Introduce...
4463
4464
  {
  	int i;
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4465
  	unsigned long min_pfn = ULONG_MAX;
1abbfb412   Mel Gorman   [PATCH] x86_64: f...
4466

c713216de   Mel Gorman   [PATCH] Introduce...
4467
4468
  	/* Assuming a sorted map, the first range found has the starting pfn */
  	for_each_active_range_index_in_nid(i, nid)
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4469
  		min_pfn = min(min_pfn, early_node_map[i].start_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4470

a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4471
4472
  	if (min_pfn == ULONG_MAX) {
  		printk(KERN_WARNING
2bc0d2615   Paul Jackson   x86 boot: more co...
4473
4474
  			"Could not find start_pfn for node %d
  ", nid);
a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4475
4476
4477
4478
  		return 0;
  	}
  
  	return min_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
4479
4480
4481
4482
4483
4484
  }
  
  /**
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4485
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
4486
4487
4488
4489
4490
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
  	return find_min_pfn_for_node(MAX_NUMNODES);
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4491
4492
4493
4494
4495
  /*
   * early_calculate_totalpages()
   * Sum pages in active regions for movable zone.
   * Populate N_HIGH_MEMORY for calculating usable_nodes.
   */
484f51f82   Adrian Bunk   mm/page_alloc.c: ...
4496
  static unsigned long __init early_calculate_totalpages(void)
7e63efef8   Mel Gorman   Add a movablecore...
4497
4498
4499
  {
  	int i;
  	unsigned long totalpages = 0;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4500
4501
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		unsigned long pages = early_node_map[i].end_pfn -
7e63efef8   Mel Gorman   Add a movablecore...
4502
  						early_node_map[i].start_pfn;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4503
4504
4505
4506
4507
  		totalpages += pages;
  		if (pages)
  			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
  	}
    	return totalpages;
7e63efef8   Mel Gorman   Add a movablecore...
4508
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4509
4510
4511
4512
4513
4514
  /*
   * Find the PFN the Movable zone begins in each node. Kernel memory
   * is spread evenly between nodes as long as the nodes have enough
   * memory. When they don't, some nodes will have more kernelcore than
   * others
   */
b69a7288e   Adrian Bunk   mm/page_alloc.c: ...
4515
  static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4516
4517
4518
4519
  {
  	int i, nid;
  	unsigned long usable_startpfn;
  	unsigned long kernelcore_node, kernelcore_remaining;
66918dcdf   Yinghai Lu   x86: only clear n...
4520
4521
  	/* save the state before borrow the nodemask */
  	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
37b07e416   Lee Schermerhorn   memoryless nodes:...
4522
4523
  	unsigned long totalpages = early_calculate_totalpages();
  	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4524

7e63efef8   Mel Gorman   Add a movablecore...
4525
4526
4527
4528
4529
4530
4531
4532
4533
  	/*
  	 * If movablecore was specified, calculate what size of
  	 * kernelcore that corresponds so that memory usable for
  	 * any allocation type is evenly spread. If both kernelcore
  	 * and movablecore are specified, then the value of kernelcore
  	 * will be used for required_kernelcore if it's greater than
  	 * what movablecore would have allowed.
  	 */
  	if (required_movablecore) {
7e63efef8   Mel Gorman   Add a movablecore...
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
  		unsigned long corepages;
  
  		/*
  		 * Round-up so that ZONE_MOVABLE is at least as large as what
  		 * was requested by the user
  		 */
  		required_movablecore =
  			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
  		corepages = totalpages - required_movablecore;
  
  		required_kernelcore = max(required_kernelcore, corepages);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4546
4547
  	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
  	if (!required_kernelcore)
66918dcdf   Yinghai Lu   x86: only clear n...
4548
  		goto out;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4549
4550
4551
4552
4553
4554
4555
4556
  
  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
  	find_usable_zone_for_movable();
  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  
  restart:
  	/* Spread kernelcore memory as evenly as possible throughout nodes */
  	kernelcore_node = required_kernelcore / usable_nodes;
37b07e416   Lee Schermerhorn   memoryless nodes:...
4557
  	for_each_node_state(nid, N_HIGH_MEMORY) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
  		/*
  		 * Recalculate kernelcore_node if the division per node
  		 * now exceeds what is necessary to satisfy the requested
  		 * amount of memory for the kernel
  		 */
  		if (required_kernelcore < kernelcore_node)
  			kernelcore_node = required_kernelcore / usable_nodes;
  
  		/*
  		 * As the map is walked, we track how much memory is usable
  		 * by the kernel using kernelcore_remaining. When it is
  		 * 0, the rest of the node is usable by ZONE_MOVABLE
  		 */
  		kernelcore_remaining = kernelcore_node;
  
  		/* Go through each range of PFNs within this node */
  		for_each_active_range_index_in_nid(i, nid) {
  			unsigned long start_pfn, end_pfn;
  			unsigned long size_pages;
  
  			start_pfn = max(early_node_map[i].start_pfn,
  						zone_movable_pfn[nid]);
  			end_pfn = early_node_map[i].end_pfn;
  			if (start_pfn >= end_pfn)
  				continue;
  
  			/* Account for what is only usable for kernelcore */
  			if (start_pfn < usable_startpfn) {
  				unsigned long kernel_pages;
  				kernel_pages = min(end_pfn, usable_startpfn)
  								- start_pfn;
  
  				kernelcore_remaining -= min(kernel_pages,
  							kernelcore_remaining);
  				required_kernelcore -= min(kernel_pages,
  							required_kernelcore);
  
  				/* Continue if range is now fully accounted */
  				if (end_pfn <= usable_startpfn) {
  
  					/*
  					 * Push zone_movable_pfn to the end so
  					 * that if we have to rebalance
  					 * kernelcore across nodes, we will
  					 * not double account here
  					 */
  					zone_movable_pfn[nid] = end_pfn;
  					continue;
  				}
  				start_pfn = usable_startpfn;
  			}
  
  			/*
  			 * The usable PFN range for ZONE_MOVABLE is from
  			 * start_pfn->end_pfn. Calculate size_pages as the
  			 * number of pages used as kernelcore
  			 */
  			size_pages = end_pfn - start_pfn;
  			if (size_pages > kernelcore_remaining)
  				size_pages = kernelcore_remaining;
  			zone_movable_pfn[nid] = start_pfn + size_pages;
  
  			/*
  			 * Some kernelcore has been met, update counts and
  			 * break if the kernelcore for this node has been
  			 * satisified
  			 */
  			required_kernelcore -= min(required_kernelcore,
  								size_pages);
  			kernelcore_remaining -= size_pages;
  			if (!kernelcore_remaining)
  				break;
  		}
  	}
  
  	/*
  	 * If there is still required_kernelcore, we do another pass with one
  	 * less node in the count. This will push zone_movable_pfn[nid] further
  	 * along on the nodes that still have memory until kernelcore is
  	 * satisified
  	 */
  	usable_nodes--;
  	if (usable_nodes && required_kernelcore > usable_nodes)
  		goto restart;
  
  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
  	for (nid = 0; nid < MAX_NUMNODES; nid++)
  		zone_movable_pfn[nid] =
  			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
66918dcdf   Yinghai Lu   x86: only clear n...
4647
4648
4649
4650
  
  out:
  	/* restore the node_state */
  	node_states[N_HIGH_MEMORY] = saved_node_state;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4651
  }
37b07e416   Lee Schermerhorn   memoryless nodes:...
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
  /* Any regular memory on that node ? */
  static void check_for_regular_memory(pg_data_t *pgdat)
  {
  #ifdef CONFIG_HIGHMEM
  	enum zone_type zone_type;
  
  	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
  		struct zone *zone = &pgdat->node_zones[zone_type];
  		if (zone->present_pages)
  			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
  	}
  #endif
  }
c713216de   Mel Gorman   [PATCH] Introduce...
4665
4666
  /**
   * free_area_init_nodes - Initialise all pg_data_t and zone data
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4667
   * @max_zone_pfn: an array of max PFNs for each zone
c713216de   Mel Gorman   [PATCH] Introduce...
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
   *
   * This will call free_area_init_node() for each active node in the system.
   * Using the page ranges provided by add_active_range(), the size of each
   * zone in each node and their holes is calculated. If the maximum PFN
   * between two adjacent zones match, it is assumed that the zone is empty.
   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
   * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
   * starts where the previous one ended. For example, ZONE_DMA32 starts
   * at arch_max_dma_pfn.
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
  	unsigned long nid;
db99100d2   Andrew Morton   mm/page_alloc.c:f...
4681
  	int i;
c713216de   Mel Gorman   [PATCH] Introduce...
4682

a6af2bc3d   Mel Gorman   [PATCH] Avoid exc...
4683
4684
  	/* Sort early_node_map as initialisation assumes it is sorted */
  	sort_node_map();
c713216de   Mel Gorman   [PATCH] Introduce...
4685
4686
4687
4688
4689
4690
4691
4692
  	/* Record where the zone boundaries are */
  	memset(arch_zone_lowest_possible_pfn, 0,
  				sizeof(arch_zone_lowest_possible_pfn));
  	memset(arch_zone_highest_possible_pfn, 0,
  				sizeof(arch_zone_highest_possible_pfn));
  	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
  	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
  	for (i = 1; i < MAX_NR_ZONES; i++) {
2a1e274ac   Mel Gorman   Create the ZONE_M...
4693
4694
  		if (i == ZONE_MOVABLE)
  			continue;
c713216de   Mel Gorman   [PATCH] Introduce...
4695
4696
4697
4698
4699
  		arch_zone_lowest_possible_pfn[i] =
  			arch_zone_highest_possible_pfn[i-1];
  		arch_zone_highest_possible_pfn[i] =
  			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
  	}
2a1e274ac   Mel Gorman   Create the ZONE_M...
4700
4701
4702
4703
4704
4705
  	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
  
  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
  	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
4706

c713216de   Mel Gorman   [PATCH] Introduce...
4707
4708
4709
  	/* Print out the zone ranges */
  	printk("Zone PFN ranges:
  ");
2a1e274ac   Mel Gorman   Create the ZONE_M...
4710
4711
4712
  	for (i = 0; i < MAX_NR_ZONES; i++) {
  		if (i == ZONE_MOVABLE)
  			continue;
72f0ba025   David Rientjes   mm: suppress pfn ...
4713
4714
4715
4716
4717
4718
4719
4720
  		printk("  %-8s ", zone_names[i]);
  		if (arch_zone_lowest_possible_pfn[i] ==
  				arch_zone_highest_possible_pfn[i])
  			printk("empty
  ");
  		else
  			printk("%0#10lx -> %0#10lx
  ",
c713216de   Mel Gorman   [PATCH] Introduce...
4721
4722
  				arch_zone_lowest_possible_pfn[i],
  				arch_zone_highest_possible_pfn[i]);
2a1e274ac   Mel Gorman   Create the ZONE_M...
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
  	}
  
  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
  	printk("Movable zone start PFN for each node
  ");
  	for (i = 0; i < MAX_NUMNODES; i++) {
  		if (zone_movable_pfn[i])
  			printk("  Node %d: %lu
  ", i, zone_movable_pfn[i]);
  	}
c713216de   Mel Gorman   [PATCH] Introduce...
4733
4734
4735
4736
4737
  
  	/* Print out the early_node_map[] */
  	printk("early_node_map[%d] active PFN ranges
  ", nr_nodemap_entries);
  	for (i = 0; i < nr_nodemap_entries; i++)
5dab8ec13   Paul Jackson   mm, generic, x86 ...
4738
4739
  		printk("  %3d: %0#10lx -> %0#10lx
  ", early_node_map[i].nid,
c713216de   Mel Gorman   [PATCH] Introduce...
4740
4741
4742
4743
  						early_node_map[i].start_pfn,
  						early_node_map[i].end_pfn);
  
  	/* Initialise every node */
708614e61   Mel Gorman   mm: verify the pa...
4744
  	mminit_verify_pageflags_layout();
8ef828668   Christoph Lameter   [PATCH] slab: red...
4745
  	setup_nr_node_ids();
c713216de   Mel Gorman   [PATCH] Introduce...
4746
4747
  	for_each_online_node(nid) {
  		pg_data_t *pgdat = NODE_DATA(nid);
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4748
  		free_area_init_node(nid, NULL,
c713216de   Mel Gorman   [PATCH] Introduce...
4749
  				find_min_pfn_for_node(nid), NULL);
37b07e416   Lee Schermerhorn   memoryless nodes:...
4750
4751
4752
4753
4754
  
  		/* Any memory on that node */
  		if (pgdat->node_present_pages)
  			node_set_state(nid, N_HIGH_MEMORY);
  		check_for_regular_memory(pgdat);
c713216de   Mel Gorman   [PATCH] Introduce...
4755
4756
  	}
  }
2a1e274ac   Mel Gorman   Create the ZONE_M...
4757

7e63efef8   Mel Gorman   Add a movablecore...
4758
  static int __init cmdline_parse_core(char *p, unsigned long *core)
2a1e274ac   Mel Gorman   Create the ZONE_M...
4759
4760
4761
4762
4763
4764
  {
  	unsigned long long coremem;
  	if (!p)
  		return -EINVAL;
  
  	coremem = memparse(p, &p);
7e63efef8   Mel Gorman   Add a movablecore...
4765
  	*core = coremem >> PAGE_SHIFT;
2a1e274ac   Mel Gorman   Create the ZONE_M...
4766

7e63efef8   Mel Gorman   Add a movablecore...
4767
  	/* Paranoid check that UL is enough for the coremem value */
2a1e274ac   Mel Gorman   Create the ZONE_M...
4768
4769
4770
4771
  	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
  
  	return 0;
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4772

7e63efef8   Mel Gorman   Add a movablecore...
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
  /*
   * kernelcore=size sets the amount of memory for use for allocations that
   * cannot be reclaimed or migrated.
   */
  static int __init cmdline_parse_kernelcore(char *p)
  {
  	return cmdline_parse_core(p, &required_kernelcore);
  }
  
  /*
   * movablecore=size sets the amount of memory for use for allocations that
   * can be reclaimed or migrated.
   */
  static int __init cmdline_parse_movablecore(char *p)
  {
  	return cmdline_parse_core(p, &required_movablecore);
  }
ed7ed3651   Mel Gorman   handle kernelcore...
4790
  early_param("kernelcore", cmdline_parse_kernelcore);
7e63efef8   Mel Gorman   Add a movablecore...
4791
  early_param("movablecore", cmdline_parse_movablecore);
ed7ed3651   Mel Gorman   handle kernelcore...
4792

c713216de   Mel Gorman   [PATCH] Introduce...
4793
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
0e0b864e0   Mel Gorman   [PATCH] Account f...
4794
  /**
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4795
4796
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
0e0b864e0   Mel Gorman   [PATCH] Account f...
4797
4798
4799
4800
   *
   * The per-cpu batchsize and zone watermarks are determined by present_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4801
4802
4803
   * function may optionally be used to account for unfreeable pages in the
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
0e0b864e0   Mel Gorman   [PATCH] Account f...
4804
4805
4806
4807
4808
   */
  void __init set_dma_reserve(unsigned long new_dma_reserve)
  {
  	dma_reserve = new_dma_reserve;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4809
4810
  void __init free_area_init(unsigned long *zones_size)
  {
9109fb7b3   Johannes Weiner   mm: drop unneeded...
4811
  	free_area_init_node(0, zones_size,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4812
4813
  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4814

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4815
4816
4817
4818
  static int page_alloc_cpu_notify(struct notifier_block *self,
  				 unsigned long action, void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4819

8bb784428   Rafael J. Wysocki   Add suspend-relat...
4820
  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
9f8f21725   Christoph Lameter   Page allocator: c...
4821
4822
4823
4824
4825
4826
4827
4828
  		drain_pages(cpu);
  
  		/*
  		 * Spill the event counters of the dead processor
  		 * into the current processors event counters.
  		 * This artificially elevates the count of the current
  		 * processor.
  		 */
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
4829
  		vm_events_fold_cpu(cpu);
9f8f21725   Christoph Lameter   Page allocator: c...
4830
4831
4832
4833
4834
4835
4836
4837
  
  		/*
  		 * Zero the differential counters of the dead processor
  		 * so that the vm statistics are consistent.
  		 *
  		 * This is only okay since the processor is dead and cannot
  		 * race with what we are doing.
  		 */
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
4838
  		refresh_cpu_vm_stats(cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4839
4840
4841
  	}
  	return NOTIFY_OK;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4842
4843
4844
4845
4846
4847
4848
  
  void __init page_alloc_init(void)
  {
  	hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
  /*
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4849
4850
4851
4852
4853
4854
4855
   * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
   *	or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  {
  	struct pglist_data *pgdat;
  	unsigned long reserve_pages = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4856
  	enum zone_type i, j;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
  
  	for_each_online_pgdat(pgdat) {
  		for (i = 0; i < MAX_NR_ZONES; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  			unsigned long max = 0;
  
  			/* Find valid and maximum lowmem_reserve in the zone */
  			for (j = i; j < MAX_NR_ZONES; j++) {
  				if (zone->lowmem_reserve[j] > max)
  					max = zone->lowmem_reserve[j];
  			}
418589663   Mel Gorman   page allocator: u...
4868
4869
  			/* we treat the high watermark as reserved pages. */
  			max += high_wmark_pages(zone);
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
  
  			if (max > zone->present_pages)
  				max = zone->present_pages;
  			reserve_pages += max;
  		}
  	}
  	totalreserve_pages = reserve_pages;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4880
4881
4882
4883
4884
4885
4886
4887
   * setup_per_zone_lowmem_reserve - called whenever
   *	sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
   *	has a correct pages reserved value, so an adequate number of
   *	pages are left in the zone after a successful __alloc_pages().
   */
  static void setup_per_zone_lowmem_reserve(void)
  {
  	struct pglist_data *pgdat;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4888
  	enum zone_type j, idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4889

ec936fc56   KAMEZAWA Hiroyuki   [PATCH] for_each_...
4890
  	for_each_online_pgdat(pgdat) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4891
4892
4893
4894
4895
  		for (j = 0; j < MAX_NR_ZONES; j++) {
  			struct zone *zone = pgdat->node_zones + j;
  			unsigned long present_pages = zone->present_pages;
  
  			zone->lowmem_reserve[j] = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4896
4897
  			idx = j;
  			while (idx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4898
  				struct zone *lower_zone;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
4899
  				idx--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
  				if (sysctl_lowmem_reserve_ratio[idx] < 1)
  					sysctl_lowmem_reserve_ratio[idx] = 1;
  
  				lower_zone = pgdat->node_zones + idx;
  				lower_zone->lowmem_reserve[j] = present_pages /
  					sysctl_lowmem_reserve_ratio[idx];
  				present_pages += lower_zone->present_pages;
  			}
  		}
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4910
4911
4912
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4913
  }
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4914
  /**
bc75d33f0   Minchan Kim   page-allocator: c...
4915
   * setup_per_zone_wmarks - called when min_free_kbytes changes
bce7394a3   Minchan Kim   page-allocator: r...
4916
   * or when memory is hot-{added|removed}
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
4917
   *
bc75d33f0   Minchan Kim   page-allocator: c...
4918
4919
   * Ensures that the watermark[min,low,high] values for each zone are set
   * correctly with respect to min_free_kbytes.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4920
   */
bc75d33f0   Minchan Kim   page-allocator: c...
4921
  void setup_per_zone_wmarks(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
  {
  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
  	unsigned long lowmem_pages = 0;
  	struct zone *zone;
  	unsigned long flags;
  
  	/* Calculate total number of !ZONE_HIGHMEM pages */
  	for_each_zone(zone) {
  		if (!is_highmem(zone))
  			lowmem_pages += zone->present_pages;
  	}
  
  	for_each_zone(zone) {
ac924c603   Andrew Morton   [PATCH] setup_per...
4935
  		u64 tmp;
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
4936
  		spin_lock_irqsave(&zone->lock, flags);
ac924c603   Andrew Morton   [PATCH] setup_per...
4937
4938
  		tmp = (u64)pages_min * zone->present_pages;
  		do_div(tmp, lowmem_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4939
4940
  		if (is_highmem(zone)) {
  			/*
669ed1752   Nick Piggin   [PATCH] mm: highm...
4941
4942
4943
4944
  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
  			 * need highmem pages, so cap pages_min to a small
  			 * value here.
  			 *
418589663   Mel Gorman   page allocator: u...
4945
  			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
669ed1752   Nick Piggin   [PATCH] mm: highm...
4946
4947
  			 * deltas controls asynch page reclaim, and so should
  			 * not be capped for highmem.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4948
4949
4950
4951
4952
4953
4954
4955
  			 */
  			int min_pages;
  
  			min_pages = zone->present_pages / 1024;
  			if (min_pages < SWAP_CLUSTER_MAX)
  				min_pages = SWAP_CLUSTER_MAX;
  			if (min_pages > 128)
  				min_pages = 128;
418589663   Mel Gorman   page allocator: u...
4956
  			zone->watermark[WMARK_MIN] = min_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4957
  		} else {
669ed1752   Nick Piggin   [PATCH] mm: highm...
4958
4959
  			/*
  			 * If it's a lowmem zone, reserve a number of pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4960
4961
  			 * proportionate to the zone's size.
  			 */
418589663   Mel Gorman   page allocator: u...
4962
  			zone->watermark[WMARK_MIN] = tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4963
  		}
418589663   Mel Gorman   page allocator: u...
4964
4965
  		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
  		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
56fd56b86   Mel Gorman   Bias the location...
4966
  		setup_zone_migrate_reserve(zone);
1125b4e39   Gerald Schaefer   setup_per_zone_pa...
4967
  		spin_unlock_irqrestore(&zone->lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4968
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
4969
4970
4971
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4972
  }
55a4462af   Randy Dunlap   page_alloc: fix k...
4973
  /*
556adecba   Rik van Riel   vmscan: second ch...
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
   *
   * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
   * INACTIVE_ANON pages on this zone's LRU, maintained by the
   * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
   * the anonymous pages are kept on the inactive list.
   *
   * total     target    max
   * memory    ratio     inactive anon
   * -------------------------------------
   *   10MB       1         5MB
   *  100MB       1        50MB
   *    1GB       3       250MB
   *   10GB      10       0.9GB
   *  100GB      31         3GB
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
4994
  static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
556adecba   Rik van Riel   vmscan: second ch...
4995
  {
96cb4df5d   Minchan Kim   page-allocator: a...
4996
  	unsigned int gb, ratio;
556adecba   Rik van Riel   vmscan: second ch...
4997

96cb4df5d   Minchan Kim   page-allocator: a...
4998
4999
5000
  	/* Zone size in gigabytes */
  	gb = zone->present_pages >> (30 - PAGE_SHIFT);
  	if (gb)
556adecba   Rik van Riel   vmscan: second ch...
5001
  		ratio = int_sqrt(10 * gb);
96cb4df5d   Minchan Kim   page-allocator: a...
5002
5003
  	else
  		ratio = 1;
556adecba   Rik van Riel   vmscan: second ch...
5004

96cb4df5d   Minchan Kim   page-allocator: a...
5005
5006
  	zone->inactive_ratio = ratio;
  }
556adecba   Rik van Riel   vmscan: second ch...
5007

839a4fcc8   KOSAKI Motohiro   mm, mem-hotplug: ...
5008
  static void __meminit setup_per_zone_inactive_ratio(void)
96cb4df5d   Minchan Kim   page-allocator: a...
5009
5010
5011
5012
5013
  {
  	struct zone *zone;
  
  	for_each_zone(zone)
  		calculate_zone_inactive_ratio(zone);
556adecba   Rik van Riel   vmscan: second ch...
5014
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
  /*
   * Initialise min_free_kbytes.
   *
   * For small machines we want it small (128k min).  For large machines
   * we want it large (64MB max).  But it is not linear, because network
   * bandwidth does not increase linearly with machine size.  We use
   *
   * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
   *
   * which yields
   *
   * 16MB:	512k
   * 32MB:	724k
   * 64MB:	1024k
   * 128MB:	1448k
   * 256MB:	2048k
   * 512MB:	2896k
   * 1024MB:	4096k
   * 2048MB:	5792k
   * 4096MB:	8192k
   * 8192MB:	11584k
   * 16384MB:	16384k
   */
1b79acc91   KOSAKI Motohiro   mm, mem-hotplug: ...
5039
  int __meminit init_per_zone_wmark_min(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
  {
  	unsigned long lowmem_kbytes;
  
  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
  
  	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
  	if (min_free_kbytes < 128)
  		min_free_kbytes = 128;
  	if (min_free_kbytes > 65536)
  		min_free_kbytes = 65536;
bc75d33f0   Minchan Kim   page-allocator: c...
5050
  	setup_per_zone_wmarks();
a6cccdc36   KOSAKI Motohiro   mm, mem-hotplug: ...
5051
  	refresh_zone_stat_thresholds();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5052
  	setup_per_zone_lowmem_reserve();
556adecba   Rik van Riel   vmscan: second ch...
5053
  	setup_per_zone_inactive_ratio();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5054
5055
  	return 0;
  }
bc75d33f0   Minchan Kim   page-allocator: c...
5056
  module_init(init_per_zone_wmark_min)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5057
5058
5059
5060
5061
5062
5063
  
  /*
   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
   *	that we can call two helper functions whenever min_free_kbytes
   *	changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5064
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5065
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5066
  	proc_dointvec(table, write, buffer, length, ppos);
3b1d92c56   Mel Gorman   Do not disable in...
5067
  	if (write)
bc75d33f0   Minchan Kim   page-allocator: c...
5068
  		setup_per_zone_wmarks();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5069
5070
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5071
5072
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5073
  	void __user *buffer, size_t *length, loff_t *ppos)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5074
5075
5076
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5077
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5078
5079
5080
5081
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
8417bba4b   Christoph Lameter   [PATCH] Replace m...
5082
  		zone->min_unmapped_pages = (zone->present_pages *
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5083
5084
5085
  				sysctl_min_unmapped_ratio) / 100;
  	return 0;
  }
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5086
5087
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5088
  	void __user *buffer, size_t *length, loff_t *ppos)
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5089
5090
5091
  {
  	struct zone *zone;
  	int rc;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5092
  	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
5093
5094
5095
5096
5097
5098
5099
5100
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
  		zone->min_slab_pages = (zone->present_pages *
  				sysctl_min_slab_ratio) / 100;
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
5101
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5102
5103
5104
5105
5106
5107
  /*
   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
   *	whenever sysctl_lowmem_reserve_ratio changes.
   *
   * The reserve ratio obviously has absolutely no relation with the
418589663   Mel Gorman   page allocator: u...
5108
   * minimum watermarks. The lowmem reserve ratio can only make sense
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5109
5110
5111
   * if in function of the boot time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5112
  	void __user *buffer, size_t *length, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5113
  {
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5114
  	proc_dointvec_minmax(table, write, buffer, length, ppos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5115
5116
5117
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5118
5119
5120
5121
5122
5123
5124
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
   * can have before it gets flushed back to buddy allocator.
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5125
  	void __user *buffer, size_t *length, loff_t *ppos)
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5126
5127
5128
5129
  {
  	struct zone *zone;
  	unsigned int cpu;
  	int ret;
8d65af789   Alexey Dobriyan   sysctl: remove "s...
5130
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5131
5132
  	if (!write || (ret == -EINVAL))
  		return ret;
364df0ebf   Dimitri Sivanich   mm: fix handling ...
5133
  	for_each_populated_zone(zone) {
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
5134
  		for_each_possible_cpu(cpu) {
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5135
5136
  			unsigned long  high;
  			high = zone->present_pages / percpu_pagelist_fraction;
99dcc3e5a   Christoph Lameter   this_cpu: Page al...
5137
5138
  			setup_pagelist_highmark(
  				per_cpu_ptr(zone->pageset, cpu), high);
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
5139
5140
5141
5142
  		}
  	}
  	return 0;
  }
f034b5d4e   David S. Miller   [XFRM]: Dynamic x...
5143
  int hashdist = HASHDIST_DEFAULT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
  
  #ifdef CONFIG_NUMA
  static int __init set_hashdist(char *str)
  {
  	if (!str)
  		return 0;
  	hashdist = simple_strtoul(str, &str, 0);
  	return 1;
  }
  __setup("hashdist=", set_hashdist);
  #endif
  
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
   *   quantity of entries
   * - limit is the number of hash buckets, not the total allocation size
   */
  void *__init alloc_large_system_hash(const char *tablename,
  				     unsigned long bucketsize,
  				     unsigned long numentries,
  				     int scale,
  				     int flags,
  				     unsigned int *_hash_shift,
  				     unsigned int *_hash_mask,
  				     unsigned long limit)
  {
  	unsigned long long max = limit;
  	unsigned long log2qty, size;
  	void *table = NULL;
  
  	/* allow the kernel cmdline to have a say */
  	if (!numentries) {
  		/* round applicable memory size up to nearest megabyte */
049036643   Andrew Morton   [PATCH] remove HA...
5178
  		numentries = nr_kernel_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5179
5180
5181
5182
5183
5184
5185
5186
5187
  		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
  		numentries >>= 20 - PAGE_SHIFT;
  		numentries <<= 20 - PAGE_SHIFT;
  
  		/* limit to 1 bucket per 2^scale bytes of low memory */
  		if (scale > PAGE_SHIFT)
  			numentries >>= (scale - PAGE_SHIFT);
  		else
  			numentries <<= (PAGE_SHIFT - scale);
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5188
5189
  
  		/* Make sure we've got at least a 0-order allocation.. */
2c85f51d2   Jan Beulich   mm: also use allo...
5190
5191
5192
5193
5194
5195
5196
5197
  		if (unlikely(flags & HASH_SMALL)) {
  			/* Makes no sense without HASH_EARLY */
  			WARN_ON(!(flags & HASH_EARLY));
  			if (!(numentries >> *_hash_shift)) {
  				numentries = 1UL << *_hash_shift;
  				BUG_ON(!numentries);
  			}
  		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
9ab37b8f2   Paul Mundt   [PATCH] Sanely si...
5198
  			numentries = PAGE_SIZE / bucketsize;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5199
  	}
6e692ed37   John Hawkes   [PATCH] fix alloc...
5200
  	numentries = roundup_pow_of_two(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5201
5202
5203
5204
5205
5206
5207
5208
5209
  
  	/* limit allocation size to 1/16 total memory by default */
  	if (max == 0) {
  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
  		do_div(max, bucketsize);
  	}
  
  	if (numentries > max)
  		numentries = max;
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5210
  	log2qty = ilog2(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5211
5212
5213
5214
  
  	do {
  		size = bucketsize << log2qty;
  		if (flags & HASH_EARLY)
74768ed83   Jan Beulich   page allocator: u...
5215
  			table = alloc_bootmem_nopanic(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5216
5217
5218
  		else if (hashdist)
  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
  		else {
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5219
5220
  			/*
  			 * If bucketsize is not a power-of-two, we may free
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5221
5222
  			 * some pages at the end of hash table which
  			 * alloc_pages_exact() automatically does
1037b83bd   Eric Dumazet   MM: alloc_large_s...
5223
  			 */
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5224
  			if (get_order(size) < MAX_ORDER) {
a1dd268cf   Mel Gorman   mm: use alloc_pag...
5225
  				table = alloc_pages_exact(size, GFP_ATOMIC);
264ef8a90   Catalin Marinas   kmemleak: Remove ...
5226
5227
  				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5228
5229
5230
5231
5232
5233
  		}
  	} while (!table && size > PAGE_SIZE && --log2qty);
  
  	if (!table)
  		panic("Failed to allocate %s hash table
  ", tablename);
f241e6607   Robin Holt   mm: alloc_large_s...
5234
5235
  	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)
  ",
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5236
  	       tablename,
f241e6607   Robin Holt   mm: alloc_large_s...
5237
  	       (1UL << log2qty),
f0d1b0b30   David Howells   [PATCH] LOG2: Imp...
5238
  	       ilog2(size) - PAGE_SHIFT,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5239
5240
5241
5242
5243
5244
5245
5246
5247
  	       size);
  
  	if (_hash_shift)
  		*_hash_shift = log2qty;
  	if (_hash_mask)
  		*_hash_mask = (1 << log2qty) - 1;
  
  	return table;
  }
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
5248

835c134ec   Mel Gorman   Add a bitmap that...
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
  /* Return a pointer to the bitmap storing bits affecting a block of pages */
  static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
  							unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	return __pfn_to_section(pfn)->pageblock_flags;
  #else
  	return zone->pageblock_flags;
  #endif /* CONFIG_SPARSEMEM */
  }
  
  static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
  {
  #ifdef CONFIG_SPARSEMEM
  	pfn &= (PAGES_PER_SECTION-1);
d9c234005   Mel Gorman   Do not depend on ...
5264
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5265
5266
  #else
  	pfn = pfn - zone->zone_start_pfn;
d9c234005   Mel Gorman   Do not depend on ...
5267
  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
835c134ec   Mel Gorman   Add a bitmap that...
5268
5269
5270
5271
  #endif /* CONFIG_SPARSEMEM */
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5272
   * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest to retrieve
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
  unsigned long get_pageblock_flags_group(struct page *page,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long flags = 0;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (test_bit(bitidx + start_bitidx, bitmap))
  			flags |= value;
6220ec784   Andrew Morton   [PATCH] highest_p...
5295

835c134ec   Mel Gorman   Add a bitmap that...
5296
5297
5298
5299
  	return flags;
  }
  
  /**
d9c234005   Mel Gorman   Do not depend on ...
5300
   * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
835c134ec   Mel Gorman   Add a bitmap that...
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
  void set_pageblock_flags_group(struct page *page, unsigned long flags,
  					int start_bitidx, int end_bitidx)
  {
  	struct zone *zone;
  	unsigned long *bitmap;
  	unsigned long pfn, bitidx;
  	unsigned long value = 1;
  
  	zone = page_zone(page);
  	pfn = page_to_pfn(page);
  	bitmap = get_pageblock_bitmap(zone, pfn);
  	bitidx = pfn_to_bitidx(zone, pfn);
86051ca5e   KAMEZAWA Hiroyuki   mm: fix usemap in...
5318
5319
  	VM_BUG_ON(pfn < zone->zone_start_pfn);
  	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
835c134ec   Mel Gorman   Add a bitmap that...
5320
5321
5322
5323
5324
5325
5326
  
  	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
  		if (flags & value)
  			__set_bit(bitidx + start_bitidx, bitmap);
  		else
  			__clear_bit(bitidx + start_bitidx, bitmap);
  }
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5327
5328
5329
5330
5331
5332
  
  /*
   * This is designed as sub function...plz see page_isolation.c also.
   * set/clear page block's type to be ISOLATE.
   * page allocater never alloc memory from ISOLATE block.
   */
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
  static int
  __count_immobile_pages(struct zone *zone, struct page *page, int count)
  {
  	unsigned long pfn, iter, found;
  	/*
  	 * For avoiding noise data, lru_add_drain_all() should be called
  	 * If ZONE_MOVABLE, the zone never contains immobile pages
  	 */
  	if (zone_idx(zone) == ZONE_MOVABLE)
  		return true;
  
  	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
  		return true;
  
  	pfn = page_to_pfn(page);
  	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
  		unsigned long check = pfn + iter;
29723fccc   Namhyung Kim   mm: fix dubious c...
5350
  		if (!pfn_valid_within(check))
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5351
  			continue;
29723fccc   Namhyung Kim   mm: fix dubious c...
5352

49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
  		page = pfn_to_page(check);
  		if (!page_count(page)) {
  			if (PageBuddy(page))
  				iter += (1 << page_order(page)) - 1;
  			continue;
  		}
  		if (!PageLRU(page))
  			found++;
  		/*
  		 * If there are RECLAIMABLE pages, we need to check it.
  		 * But now, memory offline itself doesn't call shrink_slab()
  		 * and it still to be fixed.
  		 */
  		/*
  		 * If the page is not RAM, page_count()should be 0.
  		 * we don't need more check. This is an _used_ not-movable page.
  		 *
  		 * The problematic thing here is PG_reserved pages. PG_reserved
  		 * is set to both of a memory hole page and a _used_ kernel
  		 * page at boot.
  		 */
  		if (found > count)
  			return false;
  	}
  	return true;
  }
  
  bool is_pageblock_removable_nolock(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	return __count_immobile_pages(zone, page, 0);
  }
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5385
5386
5387
  int set_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5388
  	unsigned long flags, pfn;
925cc71e5   Robert Jennings   mm: Add notifier ...
5389
5390
  	struct memory_isolate_notify arg;
  	int notifier_ret;
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5391
5392
5393
  	int ret = -EBUSY;
  
  	zone = page_zone(page);
925cc71e5   Robert Jennings   mm: Add notifier ...
5394

a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5395
  	spin_lock_irqsave(&zone->lock, flags);
925cc71e5   Robert Jennings   mm: Add notifier ...
5396
5397
5398
5399
5400
  
  	pfn = page_to_pfn(page);
  	arg.start_pfn = pfn;
  	arg.nr_pages = pageblock_nr_pages;
  	arg.pages_found = 0;
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5401
  	/*
925cc71e5   Robert Jennings   mm: Add notifier ...
5402
5403
5404
5405
5406
5407
5408
5409
5410
  	 * It may be possible to isolate a pageblock even if the
  	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
  	 * notifier chain is used by balloon drivers to return the
  	 * number of pages in a range that are held by the balloon
  	 * driver to shrink memory. If all the pages are accounted for
  	 * by balloons, are free, or on the LRU, isolation can continue.
  	 * Later, for example, when memory hotplug notifier runs, these
  	 * pages reported as "can be isolated" should be isolated(freed)
  	 * by the balloon driver through the memory notifier chain.
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5411
  	 */
925cc71e5   Robert Jennings   mm: Add notifier ...
5412
5413
  	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
  	notifier_ret = notifier_to_errno(notifier_ret);
4b20477f5   KAMEZAWA Hiroyuki   memory hotplug: f...
5414
  	if (notifier_ret)
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5415
  		goto out;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5416
5417
5418
5419
5420
  	/*
  	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
  	 * We just check MOVABLE pages.
  	 */
  	if (__count_immobile_pages(zone, page, arg.pages_found))
925cc71e5   Robert Jennings   mm: Add notifier ...
5421
  		ret = 0;
49ac82558   KAMEZAWA Hiroyuki   memory hotplug: u...
5422
5423
5424
5425
  	/*
  	 * immobile means "not-on-lru" paes. If immobile is larger than
  	 * removable-by-driver pages reported by notifier, we'll fail.
  	 */
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5426
  out:
925cc71e5   Robert Jennings   mm: Add notifier ...
5427
5428
5429
5430
  	if (!ret) {
  		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
  		move_freepages_block(zone, page, MIGRATE_ISOLATE);
  	}
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5431
5432
  	spin_unlock_irqrestore(&zone->lock, flags);
  	if (!ret)
9f8f21725   Christoph Lameter   Page allocator: c...
5433
  		drain_all_pages();
a5d76b54a   KAMEZAWA Hiroyuki   memory unplug: pa...
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
  	return ret;
  }
  
  void unset_migratetype_isolate(struct page *page)
  {
  	struct zone *zone;
  	unsigned long flags;
  	zone = page_zone(page);
  	spin_lock_irqsave(&zone->lock, flags);
  	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
  		goto out;
  	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
  	move_freepages_block(zone, page, MIGRATE_MOVABLE);
  out:
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
0c0e61958   KAMEZAWA Hiroyuki   memory unplug: pa...
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
  /*
   * All pages in the range must be isolated before calling this.
   */
  void
  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  {
  	struct page *page;
  	struct zone *zone;
  	int order, i;
  	unsigned long pfn;
  	unsigned long flags;
  	/* find the first valid pfn */
  	for (pfn = start_pfn; pfn < end_pfn; pfn++)
  		if (pfn_valid(pfn))
  			break;
  	if (pfn == end_pfn)
  		return;
  	zone = page_zone(pfn_to_page(pfn));
  	spin_lock_irqsave(&zone->lock, flags);
  	pfn = start_pfn;
  	while (pfn < end_pfn) {
  		if (!pfn_valid(pfn)) {
  			pfn++;
  			continue;
  		}
  		page = pfn_to_page(pfn);
  		BUG_ON(page_count(page));
  		BUG_ON(!PageBuddy(page));
  		order = page_order(page);
  #ifdef CONFIG_DEBUG_VM
  		printk(KERN_INFO "remove from free list %lx %d %lx
  ",
  		       pfn, 1 << order, end_pfn);
  #endif
  		list_del(&page->lru);
  		rmv_page_order(page);
  		zone->free_area[order].nr_free--;
  		__mod_zone_page_state(zone, NR_FREE_PAGES,
  				      - (1UL << order));
  		for (i = 0; i < (1 << order); i++)
  			SetPageReserved((page+i));
  		pfn += (1 << order);
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  #endif
8d22ba1b7   Wu Fengguang   HWPOISON: detect ...
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
  
  #ifdef CONFIG_MEMORY_FAILURE
  bool is_free_buddy_page(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	unsigned long pfn = page_to_pfn(page);
  	unsigned long flags;
  	int order;
  
  	spin_lock_irqsave(&zone->lock, flags);
  	for (order = 0; order < MAX_ORDER; order++) {
  		struct page *page_head = page - (pfn & ((1 << order) - 1));
  
  		if (PageBuddy(page_head) && page_order(page_head) >= order)
  			break;
  	}
  	spin_unlock_irqrestore(&zone->lock, flags);
  
  	return order < MAX_ORDER;
  }
  #endif
718a38211   Wu Fengguang   mm: introduce dum...
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
  
  static struct trace_print_flags pageflag_names[] = {
  	{1UL << PG_locked,		"locked"	},
  	{1UL << PG_error,		"error"		},
  	{1UL << PG_referenced,		"referenced"	},
  	{1UL << PG_uptodate,		"uptodate"	},
  	{1UL << PG_dirty,		"dirty"		},
  	{1UL << PG_lru,			"lru"		},
  	{1UL << PG_active,		"active"	},
  	{1UL << PG_slab,		"slab"		},
  	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
  	{1UL << PG_arch_1,		"arch_1"	},
  	{1UL << PG_reserved,		"reserved"	},
  	{1UL << PG_private,		"private"	},
  	{1UL << PG_private_2,		"private_2"	},
  	{1UL << PG_writeback,		"writeback"	},
  #ifdef CONFIG_PAGEFLAGS_EXTENDED
  	{1UL << PG_head,		"head"		},
  	{1UL << PG_tail,		"tail"		},
  #else
  	{1UL << PG_compound,		"compound"	},
  #endif
  	{1UL << PG_swapcache,		"swapcache"	},
  	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
  	{1UL << PG_reclaim,		"reclaim"	},
718a38211   Wu Fengguang   mm: introduce dum...
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
  	{1UL << PG_swapbacked,		"swapbacked"	},
  	{1UL << PG_unevictable,		"unevictable"	},
  #ifdef CONFIG_MMU
  	{1UL << PG_mlocked,		"mlocked"	},
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
  	{1UL << PG_uncached,		"uncached"	},
  #endif
  #ifdef CONFIG_MEMORY_FAILURE
  	{1UL << PG_hwpoison,		"hwpoison"	},
  #endif
  	{-1UL,				NULL		},
  };
  
  static void dump_page_flags(unsigned long flags)
  {
  	const char *delim = "";
  	unsigned long mask;
  	int i;
  
  	printk(KERN_ALERT "page flags: %#lx(", flags);
  
  	/* remove zone id */
  	flags &= (1UL << NR_PAGEFLAGS) - 1;
  
  	for (i = 0; pageflag_names[i].name && flags; i++) {
  
  		mask = pageflag_names[i].mask;
  		if ((flags & mask) != mask)
  			continue;
  
  		flags &= ~mask;
  		printk("%s%s", delim, pageflag_names[i].name);
  		delim = "|";
  	}
  
  	/* check for left over flags */
  	if (flags)
  		printk("%s%#lx", delim, flags);
  
  	printk(")
  ");
  }
  
  void dump_page(struct page *page)
  {
  	printk(KERN_ALERT
  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx
  ",
4e9f64c42   Andrea Arcangeli   thp: fix bad_page...
5593
  		page, atomic_read(&page->_count), page_mapcount(page),
718a38211   Wu Fengguang   mm: introduce dum...
5594
5595
  		page->mapping, page->index);
  	dump_page_flags(page->flags);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
5596
  	mem_cgroup_print_bad_page(page);
718a38211   Wu Fengguang   mm: introduce dum...
5597
  }