mm/page_alloc.c 83.2 KB
  /*
   *  linux/mm/page_alloc.c
   *
   *  Manages the free list, the system allocates free pages here.
   *  Note that kmalloc() lives in slab.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *  Swap reorganised 29.12.95, Stephen Tweedie
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
   *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
   */
  #include <linux/stddef.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/interrupt.h>
  #include <linux/pagemap.h>
  #include <linux/bootmem.h>
  #include <linux/compiler.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
  #include <linux/notifier.h>
  #include <linux/topology.h>
  #include <linux/sysctl.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
  #include <linux/mempolicy.h>
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
  #include <linux/pfn.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  #include "internal.h"
  
  /*
   * MCD - HACK: Find somewhere to initialize this EARLY, or make this
   * initializer cleaner
   */
  nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
  EXPORT_SYMBOL(node_online_map);
  nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
  EXPORT_SYMBOL(node_possible_map);
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
  long nr_swap_pages;
  int percpu_pagelist_fraction;

  static void __free_pages_ok(struct page *page, unsigned int order);

  /*
   * results with 256, 32 in the lowmem_reserve sysctl:
   *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
   *	1G machine -> (16M dma, 784M normal, 224M high)
   *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
   *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
   *
   * TBD: should special case ZONE_DMA32 machines here - in those we normally
   * don't need any ZONE_NORMAL reservation
   */
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  	 256,
  #ifdef CONFIG_ZONE_DMA32
  	 256,
  #endif
  #ifdef CONFIG_HIGHMEM
  	 32
  #endif
  };
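
/*
 * Illustration of the ratios above (the made-up sizes from the comment, not
 * measured values): with 16M of ZONE_DMA and 784M of ZONE_NORMAL, a ratio of
 * 256 keeps roughly 784M/256 ~= 3M of ZONE_DMA out of reach of a GFP_KERNEL
 * allocation, so the tiny DMA zone is not eaten by allocations that
 * ZONE_NORMAL could have satisfied.
 */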
  
  EXPORT_SYMBOL(totalram_pages);
  
  /*
   * Used by page_zone() to look up the address of the struct zone whose
   * id is encoded in the upper bits of page->flags
   */
  struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
  EXPORT_SYMBOL(zone_table);
  static char *zone_names[MAX_NR_ZONES] = {
  	 "DMA",
  #ifdef CONFIG_ZONE_DMA32
  	 "DMA32",
  #endif
  	 "Normal",
  #ifdef CONFIG_HIGHMEM
  	 "HighMem"
  #endif
  };
  int min_free_kbytes = 1024;
  unsigned long __meminitdata nr_kernel_pages;
  unsigned long __meminitdata nr_all_pages;
  static unsigned long __initdata dma_reserve;

  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
    /*
   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
     * ranges of memory (RAM) that may be registered with add_active_range().
     * Ranges passed to add_active_range() will be merged if possible
     * so the number of times add_active_range() can be called is
     * related to the number of nodes and the number of holes
     */
    #ifdef CONFIG_MAX_ACTIVE_REGIONS
      /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
      #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
    #else
      #if MAX_NUMNODES >= 32
        /* If there can be many nodes, allow up to 50 holes per node */
        #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
      #else
        /* By default, allow up to 256 distinct regions */
        #define MAX_ACTIVE_REGIONS 256
      #endif
    #endif
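    /* e.g. a build with MAX_NUMNODES == 64 ends up with 64 * 50 == 3200 entries */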
  
    struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
    int __initdata nr_nodemap_entries;
    unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
  #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
    unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
    unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
  #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  #ifdef CONFIG_DEBUG_VM
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
  {
  	int ret = 0;
  	unsigned seq;
  	unsigned long pfn = page_to_pfn(page);

  	do {
  		seq = zone_span_seqbegin(zone);
  		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
  			ret = 1;
  		else if (pfn < zone->zone_start_pfn)
  			ret = 1;
  	} while (zone_span_seqretry(zone, seq));
  
  	return ret;
  }
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
  #ifdef CONFIG_HOLES_IN_ZONE
  	if (!pfn_valid(page_to_pfn(page)))
  		return 0;
  #endif
  	if (zone != page_zone(page))
  		return 0;
  
  	return 1;
  }
  /*
   * Temporary debugging check for pages not lying within a given zone.
   */
  static int bad_range(struct zone *zone, struct page *page)
  {
  	if (page_outside_zone_boundaries(zone, page))
  		return 1;
  	if (!page_is_consistent(zone, page))
  		return 1;
  	return 0;
  }
  #else
  static inline int bad_range(struct zone *zone, struct page *page)
  {
  	return 0;
  }
  #endif
  static void bad_page(struct page *page)
  {
	printk(KERN_EMERG "Bad page state in process '%s'\n"
		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
		KERN_EMERG "Backtrace:\n",
  		current->comm, page, (int)(2*sizeof(unsigned long)),
  		(unsigned long)page->flags, page->mapping,
  		page_mapcount(page), page_count(page));
  	dump_stack();
  	page->flags &= ~(1 << PG_lru	|
  			1 << PG_private |
  			1 << PG_locked	|
  			1 << PG_active	|
  			1 << PG_dirty	|
  			1 << PG_reclaim |
  			1 << PG_slab    |
  			1 << PG_swapcache |
  			1 << PG_writeback |
  			1 << PG_buddy );
  	set_page_count(page, 0);
  	reset_page_mapcount(page);
  	page->mapping = NULL;
  	add_taint(TAINT_BAD_PAGE);
  }
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
   * The first PAGE_SIZE page is called the "head page".
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
   * All pages have PG_compound set.  All pages have their ->private pointing at
   * the head page (even the head page has this).
   *
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
   */
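
/*
 * Concrete illustration: for an order-2 __GFP_COMP allocation, page[0] is
 * the head and page[1..3] are tails; prep_compound_page() below stores
 * free_compound_page in page[1].lru.next and the order (2) in
 * page[1].lru.prev.
 */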
  
  static void free_compound_page(struct page *page)
  {
  	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
  }
  static void prep_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
  	page[1].lru.prev = (void *)order;
  	for (i = 0; i < nr_pages; i++) {
  		struct page *p = page + i;
  		__SetPageCompound(p);
  		set_page_private(p, (unsigned long)page);
  	}
  }
  
  static void destroy_compound_page(struct page *page, unsigned long order)
  {
  	int i;
  	int nr_pages = 1 << order;
  	if (unlikely((unsigned long)page[1].lru.prev != order))
  		bad_page(page);
  
  	for (i = 0; i < nr_pages; i++) {
  		struct page *p = page + i;
  		if (unlikely(!PageCompound(p) |
  				(page_private(p) != (unsigned long)page)))
  			bad_page(page);
  		__ClearPageCompound(p);
  	}
  }

  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	int i;
  	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
  	/*
  	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
  	 * and __GFP_HIGHMEM from hard or soft interrupt context.
  	 */
  	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
  	for (i = 0; i < (1 << order); i++)
  		clear_highpage(page + i);
  }
  /*
   * function for dealing with page's order in buddy system.
   * zone->lock is already acquired when we use these.
   * So, we don't need atomic page->flags operations here.
   */
  static inline unsigned long page_order(struct page *page)
  {
  	return page_private(page);
  }
  static inline void set_page_order(struct page *page, int order)
  {
  	set_page_private(page, order);
  	__SetPageBuddy(page);
  }
  
  static inline void rmv_page_order(struct page *page)
  {
  	__ClearPageBuddy(page);
  	set_page_private(page, 0);
  }
  
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
   *
   * 1) Any buddy B1 will have an order O twin B2 which satisfies
   * the following equation:
   *     B2 = B1 ^ (1 << O)
   * For example, if the starting buddy (buddy2) is #8 its order
   * 1 buddy is #10:
   *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
   *
   * 2) Any buddy B will have an order O+1 parent P which
   * satisfies the following equation:
   *     P = B & ~(1 << O)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
  static inline struct page *
  __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
  {
  	unsigned long buddy_idx = page_idx ^ (1 << order);
  
  	return page + (buddy_idx - page_idx);
  }
  
  static inline unsigned long
  __find_combined_index(unsigned long page_idx, unsigned int order)
  {
  	return (page_idx & ~(1 << order));
  }
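
/*
 * Worked example of the two relations above (indexes are illustrative):
 * the order-1 buddy of the block at index 8 is 8 ^ (1 << 1) = 10, and the
 * pair combines at 8 & ~(1 << 1) = 8; at order 2 the buddy of 8 is 12 and
 * the combined block again starts at 8.
 */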
  
  /*
   * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
   * (a) the buddy is not in a hole &&
   * (b) the buddy is in the buddy system &&
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
   * For recording whether a page is in the buddy system, we use PG_buddy.
   * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
  static inline int page_is_buddy(struct page *page, struct page *buddy,
  								int order)
  {
  #ifdef CONFIG_HOLES_IN_ZONE
  	if (!pfn_valid(page_to_pfn(buddy)))
  		return 0;
  #endif
  	if (page_zone_id(page) != page_zone_id(buddy))
  		return 0;
  
  	if (PageBuddy(buddy) && page_order(buddy) == order) {
  		BUG_ON(page_count(buddy) != 0);
  		return 1;
  	}
  	return 0;
  }
  
  /*
   * Freeing function for a buddy system allocator.
   *
   * The concept of a buddy system is to maintain direct-mapped table
   * (containing bit values) for memory blocks of various "orders".
   * The bottom level table contains the map for the smallest allocatable
   * units of memory (here, pages), and each level above it describes
   * pairs of units from the levels below, hence, "buddies".
   * At a high level, all that happens here is marking the table entry
   * at the bottom level available, and propagating the changes upward
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
   * free pages of length of (1 << order) and marked with PG_buddy. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
   * free, the remainder of the region must be split into blocks.   
   * If a block is freed, and its buddy is also free, then this
   * triggers coalescing into a block of larger size.            
   *
   * -- wli
   */
  static inline void __free_one_page(struct page *page,
  		struct zone *zone, unsigned int order)
  {
  	unsigned long page_idx;
  	int order_size = 1 << order;
  	if (unlikely(PageCompound(page)))
  		destroy_compound_page(page, order);
  
  	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
  	VM_BUG_ON(page_idx & (order_size - 1));
  	VM_BUG_ON(bad_range(zone, page));
  
  	zone->free_pages += order_size;
  	while (order < MAX_ORDER-1) {
  		unsigned long combined_idx;
  		struct free_area *area;
  		struct page *buddy;
  		buddy = __page_find_buddy(page, page_idx, order);
  		if (!page_is_buddy(page, buddy, order))
  			break;		/* Move the buddy up one level. */

  		list_del(&buddy->lru);
  		area = zone->free_area + order;
  		area->nr_free--;
  		rmv_page_order(buddy);
  		combined_idx = __find_combined_index(page_idx, order);
  		page = page + (combined_idx - page_idx);
  		page_idx = combined_idx;
  		order++;
  	}
  	set_page_order(page, order);
  	list_add(&page->lru, &zone->free_area[order].free_list);
  	zone->free_area[order].nr_free++;
  }
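
/*
 * Coalescing example (illustrative indexes): freeing the order-0 page at
 * index 10 while its buddy 11 sits on the free list yields an order-1 block
 * at 10; if the order-1 buddy at 8 is free as well, that merges into an
 * order-2 block at 8, and so on until a buddy is busy or MAX_ORDER-1 is hit.
 */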
  static inline int free_pages_check(struct page *page)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(page_count(page) != 0)  |
  		(page->flags & (
  			1 << PG_lru	|
  			1 << PG_private |
  			1 << PG_locked	|
  			1 << PG_active	|
  			1 << PG_reclaim	|
  			1 << PG_slab	|
  			1 << PG_swapcache |
  			1 << PG_writeback |
  			1 << PG_reserved |
  			1 << PG_buddy ))))
  		bad_page(page);
  	if (PageDirty(page))
  		__ClearPageDirty(page);
  	/*
  	 * For now, we report if PG_reserved was found set, but do not
  	 * clear it, and do not free the page.  But we shall soon need
  	 * to do more, for when the ZERO_PAGE count wraps negative.
  	 */
  	return PageReserved(page);
  }
  
  /*
   * Frees a list of pages. 
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free.
   *
   * If the zone was previously in an "all pages pinned" state then look to
   * see if this freeing clears that state.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
  static void free_pages_bulk(struct zone *zone, int count,
  					struct list_head *list, int order)
  {
  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;
  	while (count--) {
  		struct page *page;
  		VM_BUG_ON(list_empty(list));
  		page = list_entry(list->prev, struct page, lru);
  		/* have to delete it as __free_one_page list manipulates */
  		list_del(&page->lru);
  		__free_one_page(page, zone, order);
  	}
  	spin_unlock(&zone->lock);
  }
  static void free_one_page(struct zone *zone, struct page *page, int order)
  {
  	spin_lock(&zone->lock);
  	zone->all_unreclaimable = 0;
  	zone->pages_scanned = 0;
  	__free_one_page(page, zone ,order);
  	spin_unlock(&zone->lock);
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
  	unsigned long flags;
  	int i;
  	int reserved = 0;

  	for (i = 0 ; i < (1 << order) ; ++i)
  		reserved += free_pages_check(page + i);
  	if (reserved)
  		return;
  	if (!PageHighMem(page))
  		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
  	arch_free_page(page, order);
  	kernel_map_pages(page, 1 << order, 0);

  	local_irq_save(flags);
  	__count_vm_events(PGFREE, 1 << order);
  	free_one_page(page_zone(page), page, order);
  	local_irq_restore(flags);
  }
  /*
   * permit the bootmem allocator to evade page validation on high-order frees
   */
  void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
  {
  	if (order == 0) {
  		__ClearPageReserved(page);
  		set_page_count(page, 0);
  		set_page_refcounted(page);
  		__free_page(page);
  	} else {
  		int loop;
  		prefetchw(page);
  		for (loop = 0; loop < BITS_PER_LONG; loop++) {
  			struct page *p = &page[loop];
  			if (loop + 1 < BITS_PER_LONG)
  				prefetchw(p + 1);
  			__ClearPageReserved(p);
  			set_page_count(p, 0);
  		}
  		set_page_refcounted(page);
  		__free_pages(page, order);
  	}
  }
  
  /*
   * The order of subdivision here is critical for the IO subsystem.
   * Please do not alter this order without good reasons and regression
   * testing. Specifically, as large blocks of memory are subdivided,
   * the order in which smaller blocks are delivered depends on the order
   * they're subdivided in this function. This is the primary factor
   * influencing the order in which pages are delivered to the IO
   * subsystem according to empirical testing, and this is also justified
   * by considering the behavior of a buddy system containing a single
   * large block of memory acted on by a series of small allocations.
   * This behavior is a critical factor in sglist merging's success.
   *
   * -- wli
   */
  static inline void expand(struct zone *zone, struct page *page,
   	int low, int high, struct free_area *area)
  {
  	unsigned long size = 1 << high;
  
  	while (high > low) {
  		area--;
  		high--;
  		size >>= 1;
  		VM_BUG_ON(bad_range(zone, &page[size]));
  		list_add(&page[size].lru, &area->free_list);
  		area->nr_free++;
  		set_page_order(&page[size], high);
  	}
  }
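
/*
 * Example of the subdivision above: carving an order-0 page out of an
 * order-3 block of 8 pages returns pages 4-7 as an order-2 block, pages 2-3
 * as an order-1 block and page 1 as an order-0 block to the free lists,
 * leaving page 0 for the caller.
 */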
  /*
   * This page is about to be returned from the page allocator
   */
  static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
  	if (unlikely(page_mapcount(page) |
  		(page->mapping != NULL)  |
  		(page_count(page) != 0)  |
  		(page->flags & (
  			1 << PG_lru	|
  			1 << PG_private	|
  			1 << PG_locked	|
  			1 << PG_active	|
  			1 << PG_dirty	|
  			1 << PG_reclaim	|
  			1 << PG_slab    |
  			1 << PG_swapcache |
  			1 << PG_writeback |
  			1 << PG_reserved |
  			1 << PG_buddy ))))
  		bad_page(page);

  	/*
  	 * For now, we report if PG_reserved was found set, but do not
  	 * clear it, and do not allocate the page: as a safety net.
  	 */
  	if (PageReserved(page))
  		return 1;
  	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
  			1 << PG_referenced | 1 << PG_arch_1 |
  			1 << PG_checked | 1 << PG_mappedtodisk);
  	set_page_private(page, 0);
  	set_page_refcounted(page);
  	kernel_map_pages(page, 1 << order, 1);
  
  	if (gfp_flags & __GFP_ZERO)
  		prep_zero_page(page, order, gfp_flags);
  
  	if (order && (gfp_flags & __GFP_COMP))
  		prep_compound_page(page, order);
  	return 0;
  }
  
  /* 
   * Do the hard work of removing an element from the buddy allocator.
   * Call me with the zone->lock already held.
   */
  static struct page *__rmqueue(struct zone *zone, unsigned int order)
  {
  	struct free_area * area;
  	unsigned int current_order;
  	struct page *page;
  
  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  		area = zone->free_area + current_order;
  		if (list_empty(&area->free_list))
  			continue;
  
  		page = list_entry(area->free_list.next, struct page, lru);
  		list_del(&page->lru);
  		rmv_page_order(page);
  		area->nr_free--;
  		zone->free_pages -= 1UL << order;
  		expand(zone, page, order, current_order, area);
  		return page;
  	}
  
  	return NULL;
  }
  
  /* 
   * Obtain a specified number of elements from the buddy allocator, all under
   * a single hold of the lock, for efficiency.  Add them to the supplied list.
   * Returns the number of new pages which were placed at *list.
   */
  static int rmqueue_bulk(struct zone *zone, unsigned int order, 
  			unsigned long count, struct list_head *list)
  {
  	int i;
  	
  	spin_lock(&zone->lock);
  	for (i = 0; i < count; ++i) {
  		struct page *page = __rmqueue(zone, order);
  		if (unlikely(page == NULL))
  			break;
  		list_add_tail(&page->lru, list);
  	}
  	spin_unlock(&zone->lock);
  	return i;
  }
  #ifdef CONFIG_NUMA
  /*
   * Called from the slab reaper to drain pagesets on a particular node that
   * belongs to the currently executing processor.
   * Note that this function must be called with the thread pinned to
   * a single processor.
   */
  void drain_node_pages(int nodeid)
  {
  	int i;
  	enum zone_type z;
  	unsigned long flags;
  	for (z = 0; z < MAX_NR_ZONES; z++) {
  		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
  		struct per_cpu_pageset *pset;
  		if (!populated_zone(zone))
  			continue;
  		pset = zone_pcp(zone, smp_processor_id());
  		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
  			struct per_cpu_pages *pcp;
  
  			pcp = &pset->pcp[i];
  			if (pcp->count) {
  				local_irq_save(flags);
  				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
  				pcp->count = 0;
  				local_irq_restore(flags);
  			}
  		}
  	}
  }
  #endif
  #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  static void __drain_pages(unsigned int cpu)
  {
  	unsigned long flags;
  	struct zone *zone;
  	int i;
  
  	for_each_zone(zone) {
  		struct per_cpu_pageset *pset;
  		pset = zone_pcp(zone, cpu);
  		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
  			struct per_cpu_pages *pcp;
  
  			pcp = &pset->pcp[i];
  			local_irq_save(flags);
  			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
  			pcp->count = 0;
  			local_irq_restore(flags);
  		}
  	}
  }
  #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_PM
  
  void mark_free_pages(struct zone *zone)
  {
  	unsigned long pfn, max_zone_pfn;
  	unsigned long flags;
  	int order;
  	struct list_head *curr;
  
  	if (!zone->spanned_pages)
  		return;
  
  	spin_lock_irqsave(&zone->lock, flags);
  
  	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
  		if (pfn_valid(pfn)) {
  			struct page *page = pfn_to_page(pfn);
  
  			if (!PageNosave(page))
  				ClearPageNosaveFree(page);
  		}
  
  	for (order = MAX_ORDER - 1; order >= 0; --order)
  		list_for_each(curr, &zone->free_area[order].free_list) {
  			unsigned long i;

  			pfn = page_to_pfn(list_entry(curr, struct page, lru));
  			for (i = 0; i < (1UL << order); i++)
  				SetPageNosaveFree(pfn_to_page(pfn + i));
  		}

  	spin_unlock_irqrestore(&zone->lock, flags);
  }
  
  /*
   * Spill all of this CPU's per-cpu pages back into the buddy allocator.
   */
  void drain_local_pages(void)
  {
  	unsigned long flags;
  
  	local_irq_save(flags);	
  	__drain_pages(smp_processor_id());
  	local_irq_restore(flags);	
  }
  #endif /* CONFIG_PM */
  /*
   * Free a 0-order page
   */
  static void fastcall free_hot_cold_page(struct page *page, int cold)
  {
  	struct zone *zone = page_zone(page);
  	struct per_cpu_pages *pcp;
  	unsigned long flags;
  	if (PageAnon(page))
  		page->mapping = NULL;
  	if (free_pages_check(page))
  		return;
  	if (!PageHighMem(page))
  		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
  	arch_free_page(page, 0);
  	kernel_map_pages(page, 1, 0);
  	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
  	local_irq_save(flags);
  	__count_vm_event(PGFREE);
  	list_add(&page->lru, &pcp->list);
  	pcp->count++;
  	if (pcp->count >= pcp->high) {
  		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
  		pcp->count -= pcp->batch;
  	}
  	local_irq_restore(flags);
  	put_cpu();
  }
  
  void fastcall free_hot_page(struct page *page)
  {
  	free_hot_cold_page(page, 0);
  }
  	
  void fastcall free_cold_page(struct page *page)
  {
  	free_hot_cold_page(page, 1);
  }
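
/*
 * Note on the lists used above: each CPU keeps a per-zone pageset with two
 * lists, pcp[0] for cache-hot and pcp[1] for cache-cold order-0 pages; once
 * a list grows past pcp->high, free_pages_bulk() hands a batch of
 * pcp->batch pages back to the buddy lists.
 */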
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
   * Each sub-page must be freed individually.
   *
   * Note: this is probably too low level an operation for use in drivers.
   * Please consult with lkml before using this in your driver.
   */
  void split_page(struct page *page, unsigned int order)
  {
  	int i;
  	VM_BUG_ON(PageCompound(page));
  	VM_BUG_ON(!page_count(page));
  	for (i = 1; i < (1 << order); i++)
  		set_page_refcounted(page + i);
  }

  /*
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
   * or two.
   */
  static struct page *buffered_rmqueue(struct zonelist *zonelist,
  			struct zone *zone, int order, gfp_t gfp_flags)
  {
  	unsigned long flags;
  	struct page *page;
  	int cold = !!(gfp_flags & __GFP_COLD);
  	int cpu;

  again:
  	cpu  = get_cpu();
  	if (likely(order == 0)) {
  		struct per_cpu_pages *pcp;
  		pcp = &zone_pcp(zone, cpu)->pcp[cold];
  		local_irq_save(flags);
  		if (!pcp->count) {
  			pcp->count += rmqueue_bulk(zone, 0,
  						pcp->batch, &pcp->list);
  			if (unlikely(!pcp->count))
  				goto failed;
  		}
  		page = list_entry(pcp->list.next, struct page, lru);
  		list_del(&page->lru);
  		pcp->count--;
  	} else {
  		spin_lock_irqsave(&zone->lock, flags);
  		page = __rmqueue(zone, order);
  		spin_unlock(&zone->lock);
  		if (!page)
  			goto failed;
  	}
  	__count_zone_vm_events(PGALLOC, zone, 1 << order);
  	zone_statistics(zonelist, zone);
  	local_irq_restore(flags);
  	put_cpu();

  	VM_BUG_ON(bad_range(zone, page));
  	if (prep_new_page(page, order, gfp_flags))
  		goto again;
  	return page;
  
  failed:
  	local_irq_restore(flags);
  	put_cpu();
  	return NULL;
  }
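
/*
 * Allocation-side counterpart of the per-cpu scheme: order-0 requests are
 * served from the local pcp list (refilled pcp->batch pages at a time via
 * rmqueue_bulk()), while order > 0 requests take zone->lock and call
 * __rmqueue() directly.
 */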
  #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
  #define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
  #define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
  #define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
  #define ALLOC_HARDER		0x10 /* try to alloc harder */
  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */

  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
   * of the allocation.
   */
  int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  		      int classzone_idx, int alloc_flags)
  {
	/* free_pages may go negative - that's OK */
  	unsigned long min = mark;
  	long free_pages = z->free_pages - (1 << order) + 1;
  	int o;
  	if (alloc_flags & ALLOC_HIGH)
  		min -= min / 2;
  	if (alloc_flags & ALLOC_HARDER)
  		min -= min / 4;
  
  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
  		return 0;
  	for (o = 0; o < order; o++) {
  		/* At the next order, this order's pages become unavailable */
  		free_pages -= z->free_area[o].nr_free << o;
  
  		/* Require fewer higher order pages to be free */
  		min >>= 1;
  
  		if (free_pages <= min)
  			return 0;
  	}
  	return 1;
  }
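
/*
 * Worked example with made-up numbers: order = 2, mark = 128, neither
 * ALLOC_HIGH nor ALLOC_HARDER, lowmem_reserve = 0, 1000 free pages of which
 * 100 are free order-0 pages and 50 are free order-1 blocks (another 100
 * pages).  The entry check sees 997 > 128; the loop then discounts the 100
 * order-0 pages (897 left, min 64) and the 100 pages tied up in order-1
 * blocks (797 left, min 32), so the order-2 request passes.
 */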
  /*
 * get_page_from_freelist goes through the zonelist trying to allocate
   * a page.
   */
  static struct page *
  get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
  		struct zonelist *zonelist, int alloc_flags)
  {
  	struct zone **z = zonelist->zones;
  	struct page *page = NULL;
  	int classzone_idx = zone_idx(*z);
  	struct zone *zone;
  
  	/*
  	 * Go through the zonelist once, looking for a zone with enough free.
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
  	do {
  		zone = *z;
  		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
  			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
  				break;
  		if ((alloc_flags & ALLOC_CPUSET) &&
  				!cpuset_zone_allowed(zone, gfp_mask))
  			continue;
  
  		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
  			unsigned long mark;
  			if (alloc_flags & ALLOC_WMARK_MIN)
  				mark = zone->pages_min;
  			else if (alloc_flags & ALLOC_WMARK_LOW)
  				mark = zone->pages_low;
  			else
  				mark = zone->pages_high;
  			if (!zone_watermark_ok(zone , order, mark,
  				    classzone_idx, alloc_flags))
  				if (!zone_reclaim_mode ||
  				    !zone_reclaim(zone, gfp_mask, order))
  					continue;
  		}
  		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
  		if (page) {
  			break;
  		}
  	} while (*(++z) != NULL);
  	return page;
  }
  /*
   * This is the 'heart' of the zoned buddy allocator.
   */
  struct page * fastcall
  __alloc_pages(gfp_t gfp_mask, unsigned int order,
  		struct zonelist *zonelist)
  {
  	const gfp_t wait = gfp_mask & __GFP_WAIT;
  	struct zone **z;
  	struct page *page;
  	struct reclaim_state reclaim_state;
  	struct task_struct *p = current;
  	int do_retry;
  	int alloc_flags;
  	int did_some_progress;
  
  	might_sleep_if(wait);
  restart:
  	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */

  	if (unlikely(*z == NULL)) {
  		/* Should this ever happen?? */
  		return NULL;
  	}

  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
  				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
  	if (page)
  		goto got_pg;

  	do {
  		wakeup_kswapd(*z, order);
  	} while (*(++z));

  	/*
  	 * OK, we're below the kswapd watermark and have kicked background
  	 * reclaim. Now things get more complex, so set up alloc_flags according
  	 * to how we want to proceed.
  	 *
  	 * The caller may dip into page reserves a bit more if the caller
  	 * cannot run direct reclaim, or if the caller has realtime scheduling
  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
  	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
  	 */
  	alloc_flags = ALLOC_WMARK_MIN;
  	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
  		alloc_flags |= ALLOC_HARDER;
  	if (gfp_mask & __GFP_HIGH)
  		alloc_flags |= ALLOC_HIGH;
  	if (wait)
  		alloc_flags |= ALLOC_CPUSET;
  
  	/*
  	 * Go through the zonelist again. Let __GFP_HIGH and allocations
  	 * coming from realtime tasks go deeper into reserves.
  	 *
  	 * This is the last chance, in general, before the goto nopage.
  	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  	 */
  	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
  	if (page)
  		goto got_pg;
  
  	/* This allocation should allow future memory freeing. */
  
  	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
  			&& !in_interrupt()) {
  		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
  nofail_alloc:
  			/* go through the zonelist yet again, ignoring mins */
  			page = get_page_from_freelist(gfp_mask, order,
  				zonelist, ALLOC_NO_WATERMARKS);
  			if (page)
  				goto got_pg;
  			if (gfp_mask & __GFP_NOFAIL) {
  				blk_congestion_wait(WRITE, HZ/50);
  				goto nofail_alloc;
  			}
  		}
  		goto nopage;
  	}
  
  	/* Atomic allocations - we can't balance anything */
  	if (!wait)
  		goto nopage;
  
  rebalance:
  	cond_resched();
  
  	/* We now go into synchronous reclaim */
  	cpuset_memory_pressure_bump();
  	p->flags |= PF_MEMALLOC;
  	reclaim_state.reclaimed_slab = 0;
  	p->reclaim_state = &reclaim_state;
  	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
  
  	p->reclaim_state = NULL;
  	p->flags &= ~PF_MEMALLOC;
  
  	cond_resched();
  
  	if (likely(did_some_progress)) {
  		page = get_page_from_freelist(gfp_mask, order,
  						zonelist, alloc_flags);
  		if (page)
  			goto got_pg;
  	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
  		/*
  		 * Go through the zonelist yet one more time, keep
  		 * very high watermark here, this is only to catch
  		 * a parallel oom killing, we must fail if we're still
  		 * under heavy pressure.
  		 */
  		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
  				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
  		if (page)
  			goto got_pg;

  		out_of_memory(zonelist, gfp_mask, order);
  		goto restart;
  	}
  
  	/*
  	 * Don't let big-order allocations loop unless the caller explicitly
  	 * requests that.  Wait for some write requests to complete then retry.
  	 *
  	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
  	 * <= 3, but that may not be true in other implementations.
  	 */
  	do_retry = 0;
  	if (!(gfp_mask & __GFP_NORETRY)) {
  		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
  			do_retry = 1;
  		if (gfp_mask & __GFP_NOFAIL)
  			do_retry = 1;
  	}
  	if (do_retry) {
  		blk_congestion_wait(WRITE, HZ/50);
  		goto rebalance;
  	}
  
  nopage:
  	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
  		printk(KERN_WARNING "%s: page allocation failure."
  			" order:%d, mode:0x%x
  ",
  			p->comm, order, gfp_mask);
  		dump_stack();
578c2fd6a   Janet Morgan   [PATCH] add OOM d...
1077
  		show_mem();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1078
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1079
  got_pg:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1080
1081
1082
1083
1084
1085
1086
1087
  	return page;
  }
  
  EXPORT_SYMBOL(__alloc_pages);
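The retry policy just above can be read as a small predicate: unless the caller passed __GFP_NORETRY, retry for order <= 3 or __GFP_REPEAT, and always retry for __GFP_NOFAIL. A minimal userspace sketch of that decision, using invented flag values rather than the real gfp.h bits:

#include <stdio.h>

/* Stand-in flag bits for illustration only; not the kernel's gfp.h values. */
#define GFP_NORETRY 0x1
#define GFP_REPEAT  0x2
#define GFP_NOFAIL  0x4

/* Mirrors the do_retry decision in the allocator slow path:
 * small orders (<= 3) and __GFP_REPEAT callers keep retrying,
 * __GFP_NOFAIL callers never stop, __GFP_NORETRY callers never retry. */
static int should_retry(unsigned int gfp_mask, unsigned int order)
{
	int do_retry = 0;

	if (!(gfp_mask & GFP_NORETRY)) {
		if (order <= 3 || (gfp_mask & GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & GFP_NOFAIL)
			do_retry = 1;
	}
	return do_retry;
}

int main(void)
{
	printf("order 0, no flags      -> retry=%d\n", should_retry(0, 0));
	printf("order 5, no flags      -> retry=%d\n", should_retry(0, 5));
	printf("order 5, __GFP_REPEAT  -> retry=%d\n", should_retry(GFP_REPEAT, 5));
	printf("order 0, __GFP_NORETRY -> retry=%d\n", should_retry(GFP_NORETRY, 0));
	return 0;
}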
  
  /*
   * Common helper functions.
   */
dd0fc66fb   Al Viro   [PATCH] gfp flags...
1088
  fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1089
1090
1091
1092
1093
1094
1095
1096
1097
  {
  	struct page * page;
  	page = alloc_pages(gfp_mask, order);
  	if (!page)
  		return 0;
  	return (unsigned long) page_address(page);
  }
  
  EXPORT_SYMBOL(__get_free_pages);
dd0fc66fb   Al Viro   [PATCH] gfp flags...
1098
  fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1099
1100
1101
1102
1103
1104
1105
  {
  	struct page * page;
  
  	/*
  	 * get_zeroed_page() returns a 32-bit address, which cannot represent
  	 * a highmem page
  	 */
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1106
  	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
  
  	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
  	if (page)
  		return (unsigned long) page_address(page);
  	return 0;
  }
  
  EXPORT_SYMBOL(get_zeroed_page);
  
  void __pagevec_free(struct pagevec *pvec)
  {
  	int i = pagevec_count(pvec);
  
  	while (--i >= 0)
  		free_hot_cold_page(pvec->pages[i], pvec->cold);
  }
  
  fastcall void __free_pages(struct page *page, unsigned int order)
  {
b5810039a   Nick Piggin   [PATCH] core remo...
1126
  	if (put_page_testzero(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
  		if (order == 0)
  			free_hot_page(page);
  		else
  			__free_pages_ok(page, order);
  	}
  }
  
  EXPORT_SYMBOL(__free_pages);
  
  fastcall void free_pages(unsigned long addr, unsigned int order)
  {
  	if (addr != 0) {
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
1139
  		VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
  		__free_pages(virt_to_page((void *)addr), order);
  	}
  }
  
  EXPORT_SYMBOL(free_pages);
  
  /*
   * Total amount of free (allocatable) RAM:
   */
  unsigned int nr_free_pages(void)
  {
  	unsigned int sum = 0;
  	struct zone *zone;
  
  	for_each_zone(zone)
  		sum += zone->free_pages;
  
  	return sum;
  }
  
  EXPORT_SYMBOL(nr_free_pages);
  
  #ifdef CONFIG_NUMA
  unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
  {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
1165
1166
  	unsigned int sum = 0;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		sum += pgdat->node_zones[i].free_pages;
  
  	return sum;
  }
  #endif
  
  static unsigned int nr_free_zone_pages(int offset)
  {
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
1177
1178
  	/* Just pick one node, since fallback list is circular */
  	pg_data_t *pgdat = NODE_DATA(numa_node_id());
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1179
  	unsigned int sum = 0;
e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
1180
1181
1182
  	struct zonelist *zonelist = pgdat->node_zonelists + offset;
  	struct zone **zonep = zonelist->zones;
  	struct zone *zone;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1183

e310fd432   Martin J. Bligh   [PATCH] Fix NUMA ...
1184
1185
1186
1187
1188
  	for (zone = *zonep++; zone; zone = *zonep++) {
  		unsigned long size = zone->present_pages;
  		unsigned long high = zone->pages_high;
  		if (size > high)
  			sum += size - high;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
  	}
  
  	return sum;
  }
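nr_free_zone_pages() above only counts, for each zone on one node's fallback list, the pages above the zone's high watermark, since pages below it are kept in reserve for reclaim. A small userspace sketch of the same sum over two invented zones:

#include <stdio.h>

/* Toy zone descriptor; the real struct zone carries much more state. */
struct toy_zone {
	const char *name;
	unsigned long present_pages;
	unsigned long pages_high;
};

/* Same arithmetic as nr_free_zone_pages(): count only the portion of each
 * zone above its high watermark, i.e. what can be handed out without
 * immediately dropping below the reclaim threshold. */
static unsigned long toy_free_zone_pages(struct toy_zone *zones, int n)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < n; i++)
		if (zones[i].present_pages > zones[i].pages_high)
			sum += zones[i].present_pages - zones[i].pages_high;
	return sum;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ "DMA",    4096,   128 },	/* sizes are invented */
		{ "Normal", 225280, 1024 },
	};

	printf("allocatable pages: %lu\n", toy_free_zone_pages(zones, 2));
	return 0;
}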
  
  /*
   * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
   */
  unsigned int nr_free_buffer_pages(void)
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
1199
  	return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1200
1201
1202
1203
1204
1205
1206
  }
  
  /*
   * Amount of free RAM allocatable within all zones
   */
  unsigned int nr_free_pagecache_pages(void)
  {
af4ca457e   Al Viro   [PATCH] gfp_t: in...
1207
  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1208
  }
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
1209
1210
  
  static inline void show_node(struct zone *zone)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1211
  {
08e0f6a97   Christoph Lameter   [PATCH] Add NUMA_...
1212
1213
  	if (NUMA_BUILD)
  		printk("Node %ld ", zone_to_nid(zone));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1214
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1215

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1216
1217
1218
1219
1220
1221
  void si_meminfo(struct sysinfo *val)
  {
  	val->totalram = totalram_pages;
  	val->sharedram = 0;
  	val->freeram = nr_free_pages();
  	val->bufferram = nr_blockdev_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222
1223
  	val->totalhigh = totalhigh_pages;
  	val->freehigh = nr_free_highpages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
  	val->mem_unit = PAGE_SIZE;
  }
  
  EXPORT_SYMBOL(si_meminfo);
  
  #ifdef CONFIG_NUMA
  void si_meminfo_node(struct sysinfo *val, int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  
  	val->totalram = pgdat->node_present_pages;
  	val->freeram = nr_free_pages_pgdat(pgdat);
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
1236
  #ifdef CONFIG_HIGHMEM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1237
1238
  	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
  	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
1239
1240
1241
1242
  #else
  	val->totalhigh = 0;
  	val->freehigh = 0;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
  	val->mem_unit = PAGE_SIZE;
  }
  #endif
  
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
   */
  void show_free_areas(void)
  {
c72419138   Jes Sorensen   [PATCH] Condense ...
1256
  	int cpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1257
1258
1259
1260
1261
1262
  	unsigned long active;
  	unsigned long inactive;
  	unsigned long free;
  	struct zone *zone;
  
  	for_each_zone(zone) {
c72419138   Jes Sorensen   [PATCH] Condense ...
1263
  		if (!populated_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1264
  			continue;
c72419138   Jes Sorensen   [PATCH] Condense ...
1265
1266
1267
1268
  
  		show_node(zone);
  		printk("%s per-cpu:
  ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1269

6b482c677   Dave Jones   [PATCH] Don't pri...
1270
  		for_each_online_cpu(cpu) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1271
  			struct per_cpu_pageset *pageset;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1272
  			pageset = zone_pcp(zone, cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1273

c72419138   Jes Sorensen   [PATCH] Condense ...
1274
1275
1276
1277
1278
1279
1280
  			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
  			       "Cold: hi:%5d, btch:%4d usd:%4d
  ",
  			       cpu, pageset->pcp[0].high,
  			       pageset->pcp[0].batch, pageset->pcp[0].count,
  			       pageset->pcp[1].high, pageset->pcp[1].batch,
  			       pageset->pcp[1].count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1281
1282
  		}
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1283
  	get_zone_counts(&active, &inactive, &free);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1284
1285
1286
1287
1288
  	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
  		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu
  ",
  		active,
  		inactive,
b1e7a8fd8   Christoph Lameter   [PATCH] zoned vm ...
1289
  		global_page_state(NR_FILE_DIRTY),
ce866b34a   Christoph Lameter   [PATCH] zoned vm ...
1290
  		global_page_state(NR_WRITEBACK),
fd39fc856   Christoph Lameter   [PATCH] zoned vm ...
1291
  		global_page_state(NR_UNSTABLE_NFS),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1292
  		nr_free_pages(),
972d1a7b1   Christoph Lameter   [PATCH] ZVC: Supp...
1293
1294
  		global_page_state(NR_SLAB_RECLAIMABLE) +
  			global_page_state(NR_SLAB_UNRECLAIMABLE),
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
1295
  		global_page_state(NR_FILE_MAPPED),
df849a152   Christoph Lameter   [PATCH] zoned vm ...
1296
  		global_page_state(NR_PAGETABLE));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1297
1298
1299
  
  	for_each_zone(zone) {
  		int i;
c72419138   Jes Sorensen   [PATCH] Condense ...
1300
1301
  		if (!populated_zone(zone))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
  		show_node(zone);
  		printk("%s"
  			" free:%lukB"
  			" min:%lukB"
  			" low:%lukB"
  			" high:%lukB"
  			" active:%lukB"
  			" inactive:%lukB"
  			" present:%lukB"
  			" pages_scanned:%lu"
  			" all_unreclaimable? %s"
  			"
  ",
  			zone->name,
  			K(zone->free_pages),
  			K(zone->pages_min),
  			K(zone->pages_low),
  			K(zone->pages_high),
  			K(zone->nr_active),
  			K(zone->nr_inactive),
  			K(zone->present_pages),
  			zone->pages_scanned,
  			(zone->all_unreclaimable ? "yes" : "no")
  			);
  		printk("lowmem_reserve[]:");
  		for (i = 0; i < MAX_NR_ZONES; i++)
  			printk(" %lu", zone->lowmem_reserve[i]);
  		printk("
  ");
  	}
  
  	for_each_zone(zone) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
1334
   		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1335

c72419138   Jes Sorensen   [PATCH] Condense ...
1336
1337
  		if (!populated_zone(zone))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1338
1339
  		show_node(zone);
  		printk("%s: ", zone->name);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1340
1341
1342
  
  		spin_lock_irqsave(&zone->lock, flags);
  		for (order = 0; order < MAX_ORDER; order++) {
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
1343
1344
  			nr[order] = zone->free_area[order].nr_free;
  			total += nr[order] << order;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
1346
  		}
  		spin_unlock_irqrestore(&zone->lock, flags);
8f9de51a4   Kirill Korotaev   [PATCH] printk() ...
1347
1348
  		for (order = 0; order < MAX_ORDER; order++)
  			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1349
1350
1351
1352
1353
1354
1355
1356
1357
  		printk("= %lukB
  ", K(total));
  	}
  
  	show_swap_cache_info();
  }
  
  /*
   * Builds allocation fallback zone lists.
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
1358
1359
   *
   * Add all populated zones of a node to the zonelist.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1360
   */
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
1361
  static int __meminit build_zonelists_node(pg_data_t *pgdat,
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
1362
  			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1363
  {
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
1364
  	struct zone *zone;
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
1365
  	BUG_ON(zone_type >= MAX_NR_ZONES);
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
1366
  	zone_type++;
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
1367
1368
  
  	do {
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
1369
  		zone_type--;
070f80326   Christoph Lameter   [PATCH] build_zon...
1370
  		zone = pgdat->node_zones + zone_type;
1a93205bd   Christoph Lameter   [PATCH] mm: simpl...
1371
  		if (populated_zone(zone)) {
070f80326   Christoph Lameter   [PATCH] build_zon...
1372
1373
  			zonelist->zones[nr_zones++] = zone;
  			check_highest_zone(zone_type);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1374
  		}
02a68a5eb   Christoph Lameter   [PATCH] Fix zone ...
1375

2f6726e54   Christoph Lameter   [PATCH] Apply typ...
1376
  	} while (zone_type);
070f80326   Christoph Lameter   [PATCH] build_zon...
1377
  	return nr_zones;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1378
1379
1380
1381
  }
  
  #ifdef CONFIG_NUMA
  #define MAX_NODE_LOAD (num_online_nodes())
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
1382
  static int __meminitdata node_load[MAX_NUMNODES];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1383
  /**
4dc3b16ba   Pavel Pisa   [PATCH] DocBook: ...
1384
   * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
   * @node: node whose fallback list we're appending
   * @used_node_mask: nodemask_t of already used nodes
   *
   * We use a number of factors to determine which is the next node that should
   * appear on a given node's fallback list.  The node should not have appeared
   * already in @node's fallback list, and it should be the next closest node
   * according to the distance array (which contains arbitrary distance values
   * from each node to each node in the system), and should also prefer nodes
   * with no CPUs, since presumably they'll have very little allocation pressure
   * on them otherwise.
   * It returns -1 if no node is found.
   */
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
1397
  static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1398
  {
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
1399
  	int n, val;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1400
1401
  	int min_val = INT_MAX;
  	int best_node = -1;
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
1402
1403
1404
1405
1406
  	/* Use the local node if we haven't already */
  	if (!node_isset(node, *used_node_mask)) {
  		node_set(node, *used_node_mask);
  		return node;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1407

4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
1408
1409
  	for_each_online_node(n) {
  		cpumask_t tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1410
1411
1412
1413
  
  		/* Don't want a node to appear more than once */
  		if (node_isset(n, *used_node_mask))
  			continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1414
1415
  		/* Use the distance array to find the distance */
  		val = node_distance(node, n);
4cf808eb4   Linus Torvalds   [PATCH] Handle ho...
1416
1417
  		/* Penalize nodes under us ("prefer the next node") */
  		val += (n < node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
  		/* Give preference to headless and unused nodes */
  		tmp = node_to_cpumask(n);
  		if (!cpus_empty(tmp))
  			val += PENALTY_FOR_NODE_WITH_CPUS;
  
  		/* Slight preference for less loaded node */
  		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
  		val += node_load[n];
  
  		if (val < min_val) {
  			min_val = val;
  			best_node = n;
  		}
  	}
  
  	if (best_node >= 0)
  		node_set(best_node, *used_node_mask);
  
  	return best_node;
  }
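The score computed above combines NUMA distance, a one-point penalty for lower-numbered nodes (so fallback prefers the next node), a penalty for nodes that have CPUs, and the accumulated node_load; the smallest score wins. A userspace sketch of that ranking over a hypothetical three-node distance matrix (all constants here are invented, not the kernel's):

#include <stdio.h>
#include <limits.h>

#define NR_NODES               3
#define PENALTY_NODE_WITH_CPUS 1	/* invented value for illustration */
#define MAX_NODE_LOAD          NR_NODES

/* Hypothetical SLIT-style distance matrix and per-node properties. */
static int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 30 },
	{ 20, 10, 20 },
	{ 30, 20, 10 },
};
static int has_cpus[NR_NODES]  = { 1, 1, 0 };
static int node_load[NR_NODES] = { 0, 0, 0 };

/* Same ranking idea as find_next_best_node(): smallest composite score wins. */
static int next_best_node(int node, int used[NR_NODES])
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 0; n < NR_NODES; n++) {
		int val;

		if (used[n])
			continue;
		val = distance[node][n];
		val += (n < node);			/* prefer the next node */
		if (has_cpus[n])
			val += PENALTY_NODE_WITH_CPUS;	/* prefer headless nodes */
		val *= MAX_NODE_LOAD * NR_NODES;
		val += node_load[n];			/* prefer less loaded nodes */
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NR_NODES] = { 0 }, n;

	used[0] = 1;	/* the local node is always placed first */
	printf("fallback order for node 0: 0");
	while ((n = next_best_node(0, used)) >= 0)
		printf(" -> %d", n);
	printf("\n");
	return 0;
}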
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
1438
  static void __meminit build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1439
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
1440
1441
  	int j, node, local_node;
  	enum zone_type i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1442
1443
1444
1445
1446
  	int prev_node, load;
  	struct zonelist *zonelist;
  	nodemask_t used_mask;
  
  	/* initialize zonelists */
19655d348   Christoph Lameter   [PATCH] linearly ...
1447
  	for (i = 0; i < MAX_NR_ZONES; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
  		zonelist = pgdat->node_zonelists + i;
  		zonelist->zones[0] = NULL;
  	}
  
  	/* NUMA-aware ordering of nodes */
  	local_node = pgdat->node_id;
  	load = num_online_nodes();
  	prev_node = local_node;
  	nodes_clear(used_mask);
  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1458
1459
1460
1461
1462
1463
1464
1465
  		int distance = node_distance(local_node, node);
  
  		/*
  		 * If another node is sufficiently far away then it is better
  		 * to reclaim pages in a zone before going off node.
  		 */
  		if (distance > RECLAIM_DISTANCE)
  			zone_reclaim_mode = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1466
1467
1468
1469
1470
  		/*
  		 * We don't want to pressure a particular node.
  		 * So adding penalty to the first node in same
  		 * distance group to make it round-robin.
  		 */
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1471
1472
  
  		if (distance != node_distance(local_node, prev_node))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1473
1474
1475
  			node_load[node] += load;
  		prev_node = node;
  		load--;
19655d348   Christoph Lameter   [PATCH] linearly ...
1476
  		for (i = 0; i < MAX_NR_ZONES; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1477
1478
  			zonelist = pgdat->node_zonelists + i;
  			for (j = 0; zonelist->zones[j] != NULL; j++);
19655d348   Christoph Lameter   [PATCH] linearly ...
1479
  	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1480
1481
1482
1483
1484
1485
  			zonelist->zones[j] = NULL;
  		}
  	}
  }
  
  #else	/* CONFIG_NUMA */
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
1486
  static void __meminit build_zonelists(pg_data_t *pgdat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1487
  {
19655d348   Christoph Lameter   [PATCH] linearly ...
1488
1489
  	int node, local_node;
  	enum zone_type i,j;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1490
1491
  
  	local_node = pgdat->node_id;
19655d348   Christoph Lameter   [PATCH] linearly ...
1492
  	for (i = 0; i < MAX_NR_ZONES; i++) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1493
1494
1495
  		struct zonelist *zonelist;
  
  		zonelist = pgdat->node_zonelists + i;
19655d348   Christoph Lameter   [PATCH] linearly ...
1496
   		j = build_zonelists_node(pgdat, zonelist, 0, i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
   		/*
   		 * Now we build the zonelist so that it contains the zones
   		 * of all the other nodes.
   		 * We don't want to pressure a particular node, so when
   		 * building the zones for node N, we make sure that the
   		 * zones coming right after the local ones are those from
 		 * node N+1 (modulo the number of nodes)
   		 */
  		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
  			if (!node_online(node))
  				continue;
19655d348   Christoph Lameter   [PATCH] linearly ...
1508
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1509
1510
1511
1512
  		}
  		for (node = 0; node < local_node; node++) {
  			if (!node_online(node))
  				continue;
19655d348   Christoph Lameter   [PATCH] linearly ...
1513
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1514
1515
1516
1517
1518
1519
1520
  		}
  
  		zonelist->zones[j] = NULL;
  	}
  }
  
  #endif	/* CONFIG_NUMA */
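In the !CONFIG_NUMA variant above, node N's zonelist is followed by the remaining nodes in the order N+1, N+2, ..., wrapping around to 0, ..., N-1. A small sketch that prints just that ordering for an assumed four-node example (and, unlike the real code, pretends every node is online):

#include <stdio.h>

#define MAX_NUMNODES 4	/* example value only */

/* Node visit order used when appending remote zones for node N:
 * N itself first, then N+1 .. MAX_NUMNODES-1, then 0 .. N-1. */
static void print_fallback_order(int local_node)
{
	int node;

	printf("node %d zonelist node order: %d", local_node, local_node);
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		printf(" %d", node);
	for (node = 0; node < local_node; node++)
		printf(" %d", node);
	printf("\n");
}

int main(void)
{
	int n;

	for (n = 0; n < MAX_NUMNODES; n++)
		print_fallback_order(n);
	return 0;
}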
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
1521
1522
  /* return values int ....just for stop_machine_run() */
  static int __meminit __build_all_zonelists(void *dummy)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1523
  {
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
1524
1525
1526
1527
1528
1529
1530
1531
1532
  	int nid;
  	for_each_online_node(nid)
  		build_zonelists(NODE_DATA(nid));
  	return 0;
  }
  
  void __meminit build_all_zonelists(void)
  {
  	if (system_state == SYSTEM_BOOTING) {
423b41d77   Randy Dunlap   [PATCH] mm/page_a...
1533
  		__build_all_zonelists(NULL);
6811378e7   Yasunori Goto   [PATCH] wait_tabl...
1534
1535
1536
1537
1538
1539
1540
  		cpuset_init_current_mems_allowed();
  	} else {
		/* we have to stop all cpus to guarantee there is no user
  		   of zonelist */
  		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
  		/* cpuset refresh routine should be here */
  	}
bd1e22b8e   Andrew Morton   [PATCH] initialis...
1541
1542
1543
1544
  	vm_total_pages = nr_free_pagecache_pages();
  	printk("Built %i zonelists.  Total pages: %ld
  ",
  			num_online_nodes(), vm_total_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
  }
  
  /*
   * Helper functions to size the waitqueue hash table.
   * Essentially these want to choose hash table sizes sufficiently
   * large so that collisions trying to wait on pages are rare.
   * But in fact, the number of active page waitqueues on typical
   * systems is ridiculously low, less than 200. So this is even
   * conservative, even though it seems large.
   *
   * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
   * waitqueues, i.e. the size of the waitq table given the number of pages.
   */
  #define PAGES_PER_WAITQUEUE	256
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1559
  #ifndef CONFIG_MEMORY_HOTPLUG
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
1560
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
  {
  	unsigned long size = 1;
  
  	pages /= PAGES_PER_WAITQUEUE;
  
  	while (size < pages)
  		size <<= 1;
  
  	/*
  	 * Once we have dozens or even hundreds of threads sleeping
  	 * on IO we've got bigger problems than wait queue collision.
  	 * Limit the size of the wait table to a reasonable size.
  	 */
  	size = min(size, 4096UL);
  
  	return max(size, 4UL);
  }
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
  #else
  /*
   * A zone's size might be changed by hot-add, so it is not possible to determine
   * a suitable size for its wait_table.  So we use the maximum size now.
   *
   * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
   *
   *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
   *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
   *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
   *
   * The maximum entries are prepared when a zone's memory is (512K + 256) pages
   * or more by the traditional way. (See above).  It equals:
   *
   *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
   *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
   *    powerpc (64K page size)             : =  (32G +16M)byte.
   */
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
  	return 4096UL;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
  
  /*
   * This is an integer logarithm so that shifts can be used later
   * to extract the more random high bits from the multiplicative
   * hash function before the remainder is taken.
   */
  static inline unsigned long wait_table_bits(unsigned long size)
  {
  	return ffz(~size);
  }
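The helpers above size a zone's wait table at roughly one hashed waitqueue per PAGES_PER_WAITQUEUE (256) pages, rounded up to a power of two and clamped to between 4 and 4096 entries; wait_table_bits() then recovers log2 of that size via ffz(~size). A userspace sketch of the same arithmetic, with ffz open-coded since it is a kernel bitop:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;		/* round up to a power of two */
	if (size > 4096UL)
		size = 4096UL;		/* cap the table size */
	return size < 4UL ? 4UL : size;	/* but never fewer than 4 entries */
}

/* ffz(~size) is the index of the first zero bit of ~size, i.e. log2(size)
 * when size is a power of two. Open-coded for this sketch. */
static unsigned long toy_wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while ((size >> bits) > 1)
		bits++;
	return bits;
}

int main(void)
{
	unsigned long zones[] = { 1024, 131072, 1048576, 16777216 }; /* pages */
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long n = wait_table_entries(zones[i]);
		printf("%8lu pages -> %4lu entries (%lu bits)\n",
		       zones[i], n, toy_wait_table_bits(n));
	}
	return 0;
}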
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1613
1614
1615
1616
1617
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
   * done. Non-atomic initialization, single-pass.
   */
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
1618
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1619
1620
  		unsigned long start_pfn)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1621
  	struct page *page;
29751f699   Andy Whitcroft   [PATCH] sparsemem...
1622
1623
  	unsigned long end_pfn = start_pfn + size;
  	unsigned long pfn;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1624

cbe8dd4af   Greg Ungerer   [PATCH] memmap_in...
1625
  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
1626
1627
1628
1629
  		if (!early_pfn_valid(pfn))
  			continue;
  		page = pfn_to_page(pfn);
  		set_page_links(page, zone, nid, pfn);
7835e98b2   Nick Piggin   [PATCH] remove se...
1630
  		init_page_count(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1631
1632
1633
1634
1635
1636
  		reset_page_mapcount(page);
  		SetPageReserved(page);
  		INIT_LIST_HEAD(&page->lru);
  #ifdef WANT_PAGE_VIRTUAL
  		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
  		if (!is_highmem_idx(zone))
3212c6be2   Bob Picco   [PATCH] fix WANT_...
1637
  			set_page_address(page, __va(pfn << PAGE_SHIFT));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1638
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
  	}
  }
  
  void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
  				unsigned long size)
  {
  	int order;
  	for (order = 0; order < MAX_ORDER ; order++) {
  		INIT_LIST_HEAD(&zone->free_area[order].free_list);
  		zone->free_area[order].nr_free = 0;
  	}
  }
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
1651
  #define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
1652
1653
  void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
  		unsigned long pfn, unsigned long size)
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
  {
  	unsigned long snum = pfn_to_section_nr(pfn);
  	unsigned long end = pfn_to_section_nr(pfn + size);
  
  	if (FLAGS_HAS_NODE)
  		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
  	else
  		for (; snum <= end; snum++)
  			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1664
1665
1666
1667
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
  	memmap_init_zone((size), (nid), (zone), (start_pfn))
  #endif
6292d9aaf   Ashok Raj   [PATCH] __cpuinit...
1668
  static int __cpuinit zone_batchsize(struct zone *zone)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1669
1670
1671
1672
1673
  {
  	int batch;
  
  	/*
  	 * The per-cpu-pages pools are set to around 1000th of the
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
1674
  	 * size of the zone.  But no more than 1/2 of a meg.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1675
1676
1677
1678
  	 *
  	 * OK, so we don't know how big the cache is.  So guess.
  	 */
  	batch = zone->present_pages / 1024;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
1679
1680
  	if (batch * PAGE_SIZE > 512 * 1024)
  		batch = (512 * 1024) / PAGE_SIZE;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1681
1682
1683
1684
1685
  	batch /= 4;		/* We effectively *= 4 below */
  	if (batch < 1)
  		batch = 1;
  
  	/*
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
1686
1687
1688
  	 * Clamp the batch to a 2^n - 1 value. Having a power
  	 * of 2 value was found to be more likely to have
  	 * suboptimal cache aliasing properties in some cases.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1689
  	 *
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
1690
1691
1692
1693
  	 * For example if 2 tasks are alternately allocating
  	 * batches of pages, one task can end up with a lot
  	 * of pages of one half of the possible page colors
  	 * and the other with pages of the other colors.
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1694
  	 */
0ceaacc97   Nick Piggin   [PATCH] Fix up pe...
1695
  	batch = (1 << (fls(batch + batch/2)-1)) - 1;
ba56e91c9   Seth, Rohit   [PATCH] mm: page_...
1696

e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1697
1698
  	return batch;
  }
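zone_batchsize() aims at about 1/1024 of the zone, caps that at half a megabyte worth of pages, divides by four (the pcp high watermarks are multiples of batch), and rounds to a 2^n - 1 value to sidestep power-of-two cache aliasing. A minimal sketch of the same calculation, assuming 4 KiB pages and an open-coded fls():

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for this sketch */

/* Portable stand-in for the kernel's fls(): position of the highest set bit. */
static int fls_sketch(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int toy_zone_batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	/* Clamp to 2^n - 1 to avoid power-of-two cache aliasing effects. */
	return (1 << (fls_sketch(batch + batch / 2) - 1)) - 1;
}

int main(void)
{
	unsigned long sizes[] = { 65536, 262144, 1048576, 4194304 }; /* pages */
	int i;

	for (i = 0; i < 4; i++)
		printf("%8lu pages -> batch %d\n",
		       sizes[i], toy_zone_batchsize(sizes[i]));
	return 0;
}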
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1699
1700
1701
  inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  {
  	struct per_cpu_pages *pcp;
1c6fe9465   Magnus Damm   [PATCH] NUMA: bro...
1702
  	memset(p, 0, sizeof(*p));
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1703
1704
  	pcp = &p->pcp[0];		/* hot */
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1705
1706
1707
1708
1709
1710
  	pcp->high = 6 * batch;
  	pcp->batch = max(1UL, 1 * batch);
  	INIT_LIST_HEAD(&pcp->list);
  
  	pcp = &p->pcp[1];		/* cold*/
  	pcp->count = 0;
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1711
  	pcp->high = 2 * batch;
e46a5e28c   Seth, Rohit   [PATCH] mm: set p...
1712
  	pcp->batch = max(1UL, batch/2);
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1713
1714
  	INIT_LIST_HEAD(&pcp->list);
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
  /*
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
  
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  				unsigned long high)
  {
  	struct per_cpu_pages *pcp;
  
  	pcp = &p->pcp[0]; /* hot list */
  	pcp->high = high;
  	pcp->batch = max(1UL, high/4);
  	if ((high/4) > (PAGE_SHIFT * 8))
  		pcp->batch = PAGE_SHIFT * 8;
  }
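When the percpu_pagelist_fraction sysctl is non-zero, process_zones() below sets the hot list's high watermark to present_pages/fraction and setup_pagelist_highmark() derives batch as high/4, capped at PAGE_SHIFT * 8. A small sketch of that relationship for an invented zone size, assuming PAGE_SHIFT of 12:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */

struct toy_pcp {
	unsigned long high;
	unsigned long batch;
};

/* Mirrors setup_pagelist_highmark(): batch tracks high/4 but is bounded. */
static void toy_set_highmark(struct toy_pcp *pcp, unsigned long high)
{
	pcp->high = high;
	pcp->batch = high / 4 > 1 ? high / 4 : 1;
	if (high / 4 > PAGE_SHIFT * 8)
		pcp->batch = PAGE_SHIFT * 8;
}

int main(void)
{
	unsigned long present_pages = 16384;	/* example zone size in pages */
	unsigned long fractions[] = { 8, 128, 512 };
	int i;

	for (i = 0; i < 3; i++) {
		struct toy_pcp pcp;

		toy_set_highmark(&pcp, present_pages / fractions[i]);
		printf("fraction %3lu -> high %6lu, batch %lu\n",
		       fractions[i], pcp.high, pcp.batch);
	}
	return 0;
}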
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1731
1732
  #ifdef CONFIG_NUMA
  /*
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1733
1734
1735
1736
1737
1738
1739
   * Boot pageset table. One per cpu which is going to be used for all
   * zones and all nodes. The parameters will be set in such a way
   * that an item put on a list will immediately be handed over to
   * the buddy list. This is safe since pageset manipulation is done
   * with interrupts disabled.
   *
   * Some NUMA counter updates may also be caught by the boot pagesets.
b7c84c6ad   Christoph Lameter   [PATCH] boot_page...
1740
1741
1742
1743
1744
1745
1746
1747
   *
   * The boot_pagesets must be kept even after bootup is complete for
   * unused processors and/or zones. They do play a role for bootstrapping
   * hotplugged processors.
   *
   * zoneinfo_show() and maybe other functions do
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1748
   */
88a2a4ac6   Eric Dumazet   [PATCH] percpu da...
1749
  static struct per_cpu_pageset boot_pageset[NR_CPUS];
2caaad41e   Christoph Lameter   [PATCH] Reduce si...
1750
1751
1752
  
  /*
   * Dynamically allocate memory for the
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1753
1754
   * per cpu pageset array in struct zone.
   */
6292d9aaf   Ashok Raj   [PATCH] __cpuinit...
1755
  static int __cpuinit process_zones(int cpu)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1756
1757
  {
  	struct zone *zone, *dzone;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1758
1759
  
  	for_each_zone(zone) {
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1760

66a550308   Christoph Lameter   [PATCH] Do not al...
1761
1762
  		if (!populated_zone(zone))
  			continue;
23316bc86   Nick Piggin   [PATCH] mm: clean...
1763
  		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1764
  					 GFP_KERNEL, cpu_to_node(cpu));
23316bc86   Nick Piggin   [PATCH] mm: clean...
1765
  		if (!zone_pcp(zone, cpu))
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1766
  			goto bad;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1767

23316bc86   Nick Piggin   [PATCH] mm: clean...
1768
  		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
1769
1770
1771
1772
  
  		if (percpu_pagelist_fraction)
  			setup_pagelist_highmark(zone_pcp(zone, cpu),
  			 	(zone->present_pages / percpu_pagelist_fraction));
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1773
1774
1775
1776
1777
1778
1779
  	}
  
  	return 0;
  bad:
  	for_each_zone(dzone) {
  		if (dzone == zone)
  			break;
23316bc86   Nick Piggin   [PATCH] mm: clean...
1780
1781
  		kfree(zone_pcp(dzone, cpu));
  		zone_pcp(dzone, cpu) = NULL;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1782
1783
1784
1785
1786
1787
  	}
  	return -ENOMEM;
  }
  
  static inline void free_zone_pagesets(int cpu)
  {
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1788
1789
1790
1791
  	struct zone *zone;
  
  	for_each_zone(zone) {
  		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
f3ef9ead3   David Rientjes   [PATCH] do not fr...
1792
1793
1794
  		/* Free per_cpu_pageset if it is slab allocated */
  		if (pset != &boot_pageset[cpu])
  			kfree(pset);
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1795
  		zone_pcp(zone, cpu) = NULL;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1796
  	}
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1797
  }
9c7b216d2   Chandra Seetharaman   [PATCH] cpu hotpl...
1798
  static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
  		unsigned long action,
  		void *hcpu)
  {
  	int cpu = (long)hcpu;
  	int ret = NOTIFY_OK;
  
  	switch (action) {
  		case CPU_UP_PREPARE:
  			if (process_zones(cpu))
  				ret = NOTIFY_BAD;
  			break;
b0d416932   Andi Kleen   [PATCH] x86_64: W...
1810
  		case CPU_UP_CANCELED:
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1811
1812
1813
  		case CPU_DEAD:
  			free_zone_pagesets(cpu);
  			break;
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1814
1815
1816
1817
1818
  		default:
  			break;
  	}
  	return ret;
  }
74b85f379   Chandra Seetharaman   [PATCH] cpu hotpl...
1819
  static struct notifier_block __cpuinitdata pageset_notifier =
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1820
  	{ &pageset_cpuup_callback, NULL, 0 };
78d9955bb   Al Viro   [PATCH] missing p...
1821
  void __init setup_per_cpu_pageset(void)
e7c8d5c99   Christoph Lameter   [PATCH] node loca...
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
  {
  	int err;
  
  	/* Initialize per_cpu_pageset for cpu 0.
  	 * A cpuup callback will do this for every cpu
  	 * as it comes online
  	 */
  	err = process_zones(smp_processor_id());
  	BUG_ON(err);
  	register_cpu_notifier(&pageset_notifier);
  }
  
  #endif
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
1835
  static __meminit
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1836
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1837
1838
1839
  {
  	int i;
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1840
  	size_t alloc_size;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1841
1842
1843
1844
1845
  
  	/*
  	 * The per-page waitqueue mechanism uses hashed waitqueues
  	 * per zone.
  	 */
02b694dea   Yasunori Goto   [PATCH] wait_tabl...
1846
1847
1848
1849
  	zone->wait_table_hash_nr_entries =
  		 wait_table_hash_nr_entries(zone_size_pages);
  	zone->wait_table_bits =
  		wait_table_bits(zone->wait_table_hash_nr_entries);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
  	alloc_size = zone->wait_table_hash_nr_entries
  					* sizeof(wait_queue_head_t);
  
   	if (system_state == SYSTEM_BOOTING) {
  		zone->wait_table = (wait_queue_head_t *)
  			alloc_bootmem_node(pgdat, alloc_size);
  	} else {
  		/*
  		 * This case means that a zone whose size was 0 gets new memory
  		 * via memory hot-add.
  		 * But it may be the case that a new node was hot-added.  In
  		 * this case vmalloc() will not be able to use this new node's
  		 * memory - this wait_table must be initialized to use this new
  		 * node itself as well.
  		 * To use this new node's memory, further consideration will be
  		 * necessary.
  		 */
  		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
  	}
  	if (!zone->wait_table)
  		return -ENOMEM;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1871

02b694dea   Yasunori Goto   [PATCH] wait_tabl...
1872
  	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1873
  		init_waitqueue_head(zone->wait_table + i);
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1874
1875
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1876
  }
c09b42404   Matt Tolentino   [PATCH] x86_64: a...
1877
  static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1878
1879
1880
1881
1882
1883
1884
  {
  	int cpu;
  	unsigned long batch = zone_batchsize(zone);
  
  	for (cpu = 0; cpu < NR_CPUS; cpu++) {
  #ifdef CONFIG_NUMA
  		/* Early boot. Slab allocator not functional yet */
23316bc86   Nick Piggin   [PATCH] mm: clean...
1885
  		zone_pcp(zone, cpu) = &boot_pageset[cpu];
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1886
1887
1888
1889
1890
  		setup_pageset(&boot_pageset[cpu],0);
  #else
  		setup_pageset(zone_pcp(zone,cpu), batch);
  #endif
  	}
f5335c0f1   Anton Blanchard   [PATCH] quieten z...
1891
1892
1893
1894
  	if (zone->present_pages)
  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu
  ",
  			zone->name, zone->present_pages, batch);
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1895
  }
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
1896
1897
1898
  __meminit int init_currently_empty_zone(struct zone *zone,
  					unsigned long zone_start_pfn,
  					unsigned long size)
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1899
1900
  {
  	struct pglist_data *pgdat = zone->zone_pgdat;
cca448fe9   Yasunori Goto   [PATCH] wait_tabl...
1901
1902
1903
1904
  	int ret;
  	ret = zone_wait_table_init(zone, size);
  	if (ret)
  		return ret;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1905
  	pgdat->nr_zones = zone_idx(zone) + 1;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1906
1907
1908
1909
1910
  	zone->zone_start_pfn = zone_start_pfn;
  
  	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
  
  	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
1911
1912
  
  	return 0;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
1913
  }
c713216de   Mel Gorman   [PATCH] Introduce...
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /*
   * Basic iterator support. Return the first range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns first region regardless of node
   */
  static int __init first_active_region_index_in_nid(int nid)
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++)
  		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
  			return i;
  
  	return -1;
  }
  
  /*
   * Basic iterator support. Return the next active range of PFNs for a node
 * Note: nid == MAX_NUMNODES returns next region regardless of node
   */
  static int __init next_active_region_index_in_nid(int index, int nid)
  {
  	for (index = index + 1; index < nr_nodemap_entries; index++)
  		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
  			return index;
  
  	return -1;
  }
  
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   * Architectures may implement their own version but if add_active_range()
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
  int __init early_pfn_to_nid(unsigned long pfn)
  {
  	int i;
  
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		unsigned long start_pfn = early_node_map[i].start_pfn;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (start_pfn <= pfn && pfn < end_pfn)
  			return early_node_map[i].nid;
  	}
  
  	return 0;
  }
  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
  
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
  	for (i = first_active_region_index_in_nid(nid); i != -1; \
  				i = next_active_region_index_in_nid(i, nid))
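The iterator macro above simply walks early_node_map[] and skips entries belonging to other nodes (or matches everything when nid == MAX_NUMNODES). A userspace sketch with a hypothetical three-entry map for a two-node machine:

#include <stdio.h>

#define MAX_NUMNODES 64

struct toy_active_range {
	int nid;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

/* Hypothetical early_node_map[] contents for a two-node machine. */
static struct toy_active_range early_map[] = {
	{ 0, 0x000, 0x0a0 },
	{ 0, 0x100, 0x200 },	/* node 0 has a hole from 0x0a0 to 0x100 */
	{ 1, 0x200, 0x400 },
};
static int nr_entries = 3;

static int first_index_in_nid(int nid)
{
	int i;

	for (i = 0; i < nr_entries; i++)
		if (nid == MAX_NUMNODES || early_map[i].nid == nid)
			return i;
	return -1;
}

static int next_index_in_nid(int index, int nid)
{
	for (index = index + 1; index < nr_entries; index++)
		if (nid == MAX_NUMNODES || early_map[index].nid == nid)
			return index;
	return -1;
}

int main(void)
{
	int i;

	for (i = first_index_in_nid(0); i != -1; i = next_index_in_nid(i, 0))
		printf("node 0 range: %#lx-%#lx\n",
		       early_map[i].start_pfn, early_map[i].end_pfn);
	return 0;
}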
  
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
1973
1974
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
   * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
c713216de   Mel Gorman   [PATCH] Introduce...
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
   *
   * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed,
 * this function may be used instead of calling free_bootmem() manually.
   */
  void __init free_bootmem_with_active_regions(int nid,
  						unsigned long max_low_pfn)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		unsigned long size_pages = 0;
  		unsigned long end_pfn = early_node_map[i].end_pfn;
  
  		if (early_node_map[i].start_pfn >= max_low_pfn)
  			continue;
  
  		if (end_pfn > max_low_pfn)
  			end_pfn = max_low_pfn;
  
  		size_pages = end_pfn - early_node_map[i].start_pfn;
  		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
  				PFN_PHYS(early_node_map[i].start_pfn),
  				size_pages << PAGE_SHIFT);
  	}
  }
  
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2004
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
c713216de   Mel Gorman   [PATCH] Introduce...
2005
2006
2007
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2008
   * function may be used instead of calling memory_present() manually.
c713216de   Mel Gorman   [PATCH] Introduce...
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
  	int i;
  
  	for_each_active_range_index_in_nid(i, nid)
  		memory_present(early_node_map[i].nid,
  				early_node_map[i].start_pfn,
  				early_node_map[i].end_pfn);
  }
  
  /**
fb01439c5   Mel Gorman   [PATCH] Allow an ...
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
   * push_node_boundaries - Push node boundaries to at least the requested boundary
   * @nid: The nid of the node to push the boundary for
   * @start_pfn: The start pfn of the node
   * @end_pfn: The end pfn of the node
   *
   * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
   * time. Specifically, on x86_64, SRAT will report ranges that can potentially
   * be hotplugged even though no physical memory exists. This function allows
   * an arch to push out the node boundaries so mem_map is allocated that can
   * be used later.
   */
  #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
  void __init push_node_boundaries(unsigned int nid,
  		unsigned long start_pfn, unsigned long end_pfn)
  {
  	printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)
  ",
  			nid, start_pfn, end_pfn);
  
  	/* Initialise the boundary for this node if necessary */
  	if (node_boundary_end_pfn[nid] == 0)
  		node_boundary_start_pfn[nid] = -1UL;
  
  	/* Update the boundaries */
  	if (node_boundary_start_pfn[nid] > start_pfn)
  		node_boundary_start_pfn[nid] = start_pfn;
  	if (node_boundary_end_pfn[nid] < end_pfn)
  		node_boundary_end_pfn[nid] = end_pfn;
  }
  
  /* If necessary, push the node boundary out for reserve hotadd */
  static void __init account_node_boundary(unsigned int nid,
  		unsigned long *start_pfn, unsigned long *end_pfn)
  {
  	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)
  ",
  			nid, *start_pfn, *end_pfn);
  
  	/* Return if boundary information has not been provided */
  	if (node_boundary_end_pfn[nid] == 0)
  		return;
  
  	/* Check the boundaries and update if necessary */
  	if (node_boundary_start_pfn[nid] < *start_pfn)
  		*start_pfn = node_boundary_start_pfn[nid];
  	if (node_boundary_end_pfn[nid] > *end_pfn)
  		*end_pfn = node_boundary_end_pfn[nid];
  }
  #else
  void __init push_node_boundaries(unsigned int nid,
  		unsigned long start_pfn, unsigned long end_pfn) {}
  
  static void __init account_node_boundary(unsigned int nid,
  		unsigned long *start_pfn, unsigned long *end_pfn) {}
  #endif
  
  
  /**
c713216de   Mel Gorman   [PATCH] Introduce...
2079
   * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2080
2081
2082
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216de   Mel Gorman   [PATCH] Introduce...
2083
2084
2085
2086
   *
   * It returns the start and end page frame of a node based on information
   * provided by an arch calling add_active_range(). If called for a node
   * with no available memory, a warning is printed and the start and end
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2087
   * PFNs will be 0.
c713216de   Mel Gorman   [PATCH] Introduce...
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
   */
  void __init get_pfn_range_for_nid(unsigned int nid,
  			unsigned long *start_pfn, unsigned long *end_pfn)
  {
  	int i;
  	*start_pfn = -1UL;
  	*end_pfn = 0;
  
  	for_each_active_range_index_in_nid(i, nid) {
  		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
  		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
  	}
  
  	if (*start_pfn == -1UL) {
  		printk(KERN_WARNING "Node %u active with no memory
  ", nid);
  		*start_pfn = 0;
  	}
fb01439c5   Mel Gorman   [PATCH] Allow an ...
2106
2107
2108
  
  	/* Push the node boundaries out if requested */
  	account_node_boundary(nid, start_pfn, end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
  }
  
  /*
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
  unsigned long __init zone_spanned_pages_in_node(int nid,
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	/* Get the start and end of the node and zone */
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
  
  	/* Check that this node has pages within the zone's required range */
  	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
  		return 0;
  
  	/* Move the zone boundaries inside the node if necessary */
  	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
  	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
  
  	/* Return the spanned pages */
  	return zone_end_pfn - zone_start_pfn;
  }
  
  /*
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2141
   * then all holes in the requested range will be accounted for.
c713216de   Mel Gorman   [PATCH] Introduce...
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
   */
  unsigned long __init __absent_pages_in_range(int nid,
  				unsigned long range_start_pfn,
  				unsigned long range_end_pfn)
  {
  	int i = 0;
  	unsigned long prev_end_pfn = 0, hole_pages = 0;
  	unsigned long start_pfn;
  
  	/* Find the end_pfn of the first active range of pfns in the node */
  	i = first_active_region_index_in_nid(nid);
  	if (i == -1)
  		return 0;
9c7cd6877   Mel Gorman   [PATCH] Account f...
2155
2156
2157
  	/* Account for ranges before physical memory on this node */
  	if (early_node_map[i].start_pfn > range_start_pfn)
  		hole_pages = early_node_map[i].start_pfn - range_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
  	prev_end_pfn = early_node_map[i].start_pfn;
  
  	/* Find all holes for the zone within the node */
  	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
  
  		/* No need to continue if prev_end_pfn is outside the zone */
  		if (prev_end_pfn >= range_end_pfn)
  			break;
  
  		/* Make sure the end of the zone is not within the hole */
  		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
  		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
  
		/* Update the hole size count and move on */
  		if (start_pfn > range_start_pfn) {
  			BUG_ON(prev_end_pfn > start_pfn);
  			hole_pages += start_pfn - prev_end_pfn;
  		}
  		prev_end_pfn = early_node_map[i].end_pfn;
  	}
9c7cd6877   Mel Gorman   [PATCH] Account f...
2178
2179
2180
2181
  	/* Account for ranges past physical memory on this node */
  	if (range_end_pfn > prev_end_pfn)
  		hole_pages = range_end_pfn -
  				max(range_start_pfn, prev_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
2182
2183
2184
2185
2186
2187
2188
2189
  	return hole_pages;
  }
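A worked example of the hole accounting above: within [range_start, range_end) the leading hole before the first active range, the gaps between ranges, and the trailing hole after the last range all contribute. A compact userspace sketch over an invented, already-sorted range map:

#include <stdio.h>

struct toy_range {
	unsigned long start_pfn;
	unsigned long end_pfn;
};

/* Hypothetical, already-sorted active ranges for one node. */
static struct toy_range map[] = {
	{ 0x010, 0x040 },
	{ 0x080, 0x0c0 },
};
static int nr = 2;

/* Count pfns in [range_start, range_end) not covered by any active range.
 * Same idea as __absent_pages_in_range(): leading hole + gaps + trailing hole. */
static unsigned long toy_absent_pages(unsigned long range_start,
				      unsigned long range_end)
{
	unsigned long holes = 0, prev_end = range_start;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long s = map[i].start_pfn, e = map[i].end_pfn;

		if (s >= range_end)
			break;
		if (s > prev_end)
			holes += s - prev_end;		/* gap before this range */
		if (e > prev_end)
			prev_end = e;
	}
	if (range_end > prev_end)
		holes += range_end - prev_end;		/* trailing hole */
	return holes;
}

int main(void)
{
	/* Zone spans 0x000-0x100: expect 0x10 + 0x40 + 0x40 = 0x90 hole pages. */
	printf("hole pages: %#lx\n", toy_absent_pages(0x000, 0x100));
	return 0;
}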
  
  /**
   * absent_pages_in_range - Return number of page frames in holes within a range
   * @start_pfn: The start PFN to start searching for holes
   * @end_pfn: The end PFN to stop searching for holes
   *
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2190
 * It returns the number of page frames in memory holes within a range.
c713216de   Mel Gorman   [PATCH] Introduce...
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
   */
  unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  							unsigned long end_pfn)
  {
  	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
  }
  
  /* Return the number of page frames in holes in a zone on a node */
  unsigned long __init zone_absent_pages_in_node(int nid,
  					unsigned long zone_type,
  					unsigned long *ignored)
  {
9c7cd6877   Mel Gorman   [PATCH] Account f...
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
  	unsigned long node_start_pfn, node_end_pfn;
  	unsigned long zone_start_pfn, zone_end_pfn;
  
  	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
  	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
  							node_start_pfn);
  	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
  							node_end_pfn);
  
  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
c713216de   Mel Gorman   [PATCH] Introduce...
2213
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
2214

c713216de   Mel Gorman   [PATCH] Introduce...
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
  #else
  static inline unsigned long zone_spanned_pages_in_node(int nid,
  					unsigned long zone_type,
  					unsigned long *zones_size)
  {
  	return zones_size[zone_type];
  }
  
  static inline unsigned long zone_absent_pages_in_node(int nid,
  						unsigned long zone_type,
  						unsigned long *zholes_size)
  {
  	if (!zholes_size)
  		return 0;
  
  	return zholes_size[zone_type];
  }
0e0b864e0   Mel Gorman   [PATCH] Account f...
2232

c713216de   Mel Gorman   [PATCH] Introduce...
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
  #endif
  
  static void __init calculate_node_totalpages(struct pglist_data *pgdat,
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
  	unsigned long realtotalpages, totalpages = 0;
  	enum zone_type i;
  
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
  								zones_size);
  	pgdat->node_spanned_pages = totalpages;
  
  	realtotalpages = totalpages;
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		realtotalpages -=
  			zone_absent_pages_in_node(pgdat->node_id, i,
  								zholes_size);
  	pgdat->node_present_pages = realtotalpages;
  	printk(KERN_DEBUG "On node %d totalpages: %lu
  ", pgdat->node_id,
  							realtotalpages);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2256
2257
2258
2259
2260
2261
  /*
   * Set up the zone data structures:
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
   */
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
2262
  static void __meminit free_area_init_core(struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2263
2264
  		unsigned long *zones_size, unsigned long *zholes_size)
  {
2f1b62486   Christoph Lameter   [PATCH] reduce MA...
2265
  	enum zone_type j;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
2266
  	int nid = pgdat->node_id;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2267
  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
2268
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2269

208d54e55   Dave Hansen   [PATCH] memory ho...
2270
  	pgdat_resize_init(pgdat);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2271
2272
2273
2274
2275
2276
  	pgdat->nr_zones = 0;
  	init_waitqueue_head(&pgdat->kswapd_wait);
  	pgdat->kswapd_max_order = 0;
  	
  	for (j = 0; j < MAX_NR_ZONES; j++) {
  		struct zone *zone = pgdat->node_zones + j;
0e0b864e0   Mel Gorman   [PATCH] Account f...
2277
  		unsigned long size, realsize, memmap_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2278

c713216de   Mel Gorman   [PATCH] Introduce...
2279
2280
2281
  		size = zone_spanned_pages_in_node(nid, j, zones_size);
  		realsize = size - zone_absent_pages_in_node(nid, j,
  								zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2282

0e0b864e0   Mel Gorman   [PATCH] Account f...
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
  		/*
  		 * Adjust realsize so that it accounts for how much memory
  		 * is used by this zone for memmap. This affects the watermark
  		 * and per-cpu initialisations
  		 */
  		memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
  		if (realsize >= memmap_pages) {
  			realsize -= memmap_pages;
  			printk(KERN_DEBUG
  				"  %s zone: %lu pages used for memmap
  ",
  				zone_names[j], memmap_pages);
  		} else
  			printk(KERN_WARNING
  				"  %s zone: %lu pages exceeds realsize %lu
  ",
  				zone_names[j], memmap_pages, realsize);
  
  		/* Account for reserved DMA pages */
  		if (j == ZONE_DMA && realsize > dma_reserve) {
  			realsize -= dma_reserve;
  			printk(KERN_DEBUG "  DMA zone: %lu pages reserved
  ",
  								dma_reserve);
  		}
98d2b0ebd   Christoph Lameter   [PATCH] reduce MA...
2308
  		if (!is_highmem_idx(j))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2309
2310
2311
2312
2313
  			nr_kernel_pages += realsize;
  		nr_all_pages += realsize;
  
  		zone->spanned_pages = size;
  		zone->present_pages = realsize;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2314
  #ifdef CONFIG_NUMA
d5f541ed6   Christoph Lameter   [PATCH] Add node ...
2315
  		zone->node = nid;
8417bba4b   Christoph Lameter   [PATCH] Replace m...
2316
  		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2317
  						/ 100;
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
2318
  		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2319
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2320
2321
2322
  		zone->name = zone_names[j];
  		spin_lock_init(&zone->lock);
  		spin_lock_init(&zone->lru_lock);
bdc8cb984   Dave Hansen   [PATCH] memory ho...
2323
  		zone_seqlock_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2324
2325
2326
2327
  		zone->zone_pgdat = pgdat;
  		zone->free_pages = 0;
  
  		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
ed8ece2ec   Dave Hansen   [PATCH] memory ho...
2328
  		zone_pcp_init(zone);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2329
2330
2331
2332
2333
2334
  		INIT_LIST_HEAD(&zone->active_list);
  		INIT_LIST_HEAD(&zone->inactive_list);
  		zone->nr_scan_active = 0;
  		zone->nr_scan_inactive = 0;
  		zone->nr_active = 0;
  		zone->nr_inactive = 0;
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
2335
  		zap_zone_vm_stats(zone);
53e9a6159   Martin Hicks   [PATCH] VM: zone ...
2336
  		atomic_set(&zone->reclaim_in_progress, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2337
2338
  		if (!size)
  			continue;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
2339
  		zonetable_add(zone, nid, j, zone_start_pfn, size);
718127cc3   Yasunori Goto   [PATCH] wait_tabl...
2340
2341
  		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
  		BUG_ON(ret);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2342
  		zone_start_pfn += size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2343
2344
2345
2346
2347
  	}
  }
  
  static void __init alloc_node_mem_map(struct pglist_data *pgdat)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2348
2349
2350
  	/* Skip empty nodes */
  	if (!pgdat->node_spanned_pages)
  		return;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
2351
  #ifdef CONFIG_FLAT_NODE_MEM_MAP
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2352
2353
  	/* ia64 gets its own node_mem_map, before this, without bootmem */
  	if (!pgdat->node_mem_map) {
e984bb43f   Bob Picco   [PATCH] Align the...
2354
  		unsigned long size, start, end;
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
2355
  		struct page *map;
e984bb43f   Bob Picco   [PATCH] Align the...
2356
2357
2358
2359
2360
2361
2362
2363
2364
  		/*
  		 * The zone's endpoints aren't required to be MAX_ORDER
  		 * aligned but the node_mem_map endpoints must be in order
  		 * for the buddy allocator to function correctly.
  		 */
  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
  		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
  		end = ALIGN(end, MAX_ORDER_NR_PAGES);
  		size =  (end - start) * sizeof(struct page);
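		/*
		 * Example (assuming MAX_ORDER_NR_PAGES == 1024): a node
		 * starting at pfn 0x1234 and spanning 0x2000 pages gives
		 * start = 0x1000 and end = ALIGN(0x3234, 1024) = 0x3400,
		 * so the memmap covers slightly more than the node itself.
		 */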
6f167ec72   Dave Hansen   [PATCH] sparsemem...
2365
2366
2367
  		map = alloc_remap(pgdat->node_id, size);
  		if (!map)
  			map = alloc_bootmem_node(pgdat, size);
e984bb43f   Bob Picco   [PATCH] Align the...
2368
  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2369
  	}
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
2370
  #ifdef CONFIG_FLATMEM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2371
2372
2373
  	/*
  	 * With no DISCONTIG, the global mem_map is just set as node 0's
  	 */
c713216de   Mel Gorman   [PATCH] Introduce...
2374
  	if (pgdat == NODE_DATA(0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2375
  		mem_map = NODE_DATA(0)->node_mem_map;
c713216de   Mel Gorman   [PATCH] Introduce...
2376
2377
2378
2379
2380
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
  			mem_map -= pgdat->node_start_pfn;
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2381
  #endif
d41dee369   Andy Whitcroft   [PATCH] sparsemem...
2382
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2383
  }
86356ab14   Yasunori Goto   [PATCH] wait_tabl...
2384
  void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2385
2386
2387
2388
2389
  		unsigned long *zones_size, unsigned long node_start_pfn,
  		unsigned long *zholes_size)
  {
  	pgdat->node_id = nid;
  	pgdat->node_start_pfn = node_start_pfn;
c713216de   Mel Gorman   [PATCH] Introduce...
2390
  	calculate_node_totalpages(pgdat, zones_size, zholes_size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2391
2392
2393
2394
2395
  
  	alloc_node_mem_map(pgdat);
  
  	free_area_init_core(pgdat, zones_size, zholes_size);
  }
c713216de   Mel Gorman   [PATCH] Introduce...
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /**
   * add_active_range - Register a range of PFNs backed by physical memory
   * @nid: The node ID the range resides on
   * @start_pfn: The start PFN of the available physical memory
   * @end_pfn: The end PFN of the available physical memory
   *
   * These ranges are stored in an early_node_map[] and later used by
   * free_area_init_nodes() to calculate zone sizes and holes. If the
   * range spans a memory hole, it is up to the architecture to ensure
   * the memory is not freed by the bootmem allocator. If possible
   * the range being registered will be merged with existing ranges.
   */
  void __init add_active_range(unsigned int nid, unsigned long start_pfn,
  						unsigned long end_pfn)
  {
  	int i;
  
  	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
  			  "%d entries of %d used
  ",
  			  nid, start_pfn, end_pfn,
  			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
  
  	/* Merge with existing active regions if possible */
  	for (i = 0; i < nr_nodemap_entries; i++) {
  		if (early_node_map[i].nid != nid)
  			continue;
  
  		/* Skip if an existing region covers this new one */
  		if (start_pfn >= early_node_map[i].start_pfn &&
  				end_pfn <= early_node_map[i].end_pfn)
  			return;
  
  		/* Merge forward if suitable */
  		if (start_pfn <= early_node_map[i].end_pfn &&
  				end_pfn > early_node_map[i].end_pfn) {
  			early_node_map[i].end_pfn = end_pfn;
  			return;
  		}
  
  		/* Merge backward if suitable */
  		if (start_pfn < early_node_map[i].end_pfn &&
  				end_pfn >= early_node_map[i].start_pfn) {
  			early_node_map[i].start_pfn = start_pfn;
  			return;
  		}
  	}
  
  	/* Check that early_node_map is large enough */
  	if (i >= MAX_ACTIVE_REGIONS) {
  		printk(KERN_CRIT "More than %d memory regions, truncating
  ",
  							MAX_ACTIVE_REGIONS);
  		return;
  	}
  
  	early_node_map[i].nid = nid;
  	early_node_map[i].start_pfn = start_pfn;
  	early_node_map[i].end_pfn = end_pfn;
  	nr_nodemap_entries = i + 1;
  }
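/*
 * Illustrative use (pfn values are made up): an architecture registering
 * two adjacent ranges on node 0,
 *
 *	add_active_range(0, 0, 0x4000);
 *	add_active_range(0, 0x4000, 0x8000);
 *
 * ends up with a single early_node_map[] entry covering pfns 0-0x8000,
 * because the second call merges forward with the first.
 */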
  
  /**
   * shrink_active_range - Shrink an existing registered range of PFNs
   * @nid: The node id the range is on that should be shrunk
   * @old_end_pfn: The old end PFN of the range
 * @new_end_pfn: The new end PFN of the range
 *
 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
 * The map is kept near the end of the physical page range that has already been
   * registered with add_active_range(). This function allows an arch to shrink
   * an existing registered range.
   */
  void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
  						unsigned long new_end_pfn)
  {
  	int i;
  
  	/* Find the old active region end and shrink */
  	for_each_active_range_index_in_nid(i, nid)
  		if (early_node_map[i].end_pfn == old_end_pfn) {
  			early_node_map[i].end_pfn = new_end_pfn;
  			break;
  		}
  }
  
  /**
   * remove_all_active_ranges - Remove all currently registered regions
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2485
   *
c713216de   Mel Gorman   [PATCH] Introduce...
2486
2487
2488
2489
   * During discovery, it may be found that a table like SRAT is invalid
   * and an alternative discovery method must be used. This function removes
   * all currently registered regions.
   */
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2490
  void __init remove_all_active_ranges(void)
c713216de   Mel Gorman   [PATCH] Introduce...
2491
2492
2493
  {
  	memset(early_node_map, 0, sizeof(early_node_map));
  	nr_nodemap_entries = 0;
fb01439c5   Mel Gorman   [PATCH] Allow an ...
2494
2495
2496
2497
  #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
  	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
  	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
  #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
c713216de   Mel Gorman   [PATCH] Introduce...
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
  }
  
  /* Compare two active node_active_regions */
  static int __init cmp_node_active_region(const void *a, const void *b)
  {
  	struct node_active_region *arange = (struct node_active_region *)a;
  	struct node_active_region *brange = (struct node_active_region *)b;
  
  	/* Done this way to avoid overflows */
  	if (arange->start_pfn > brange->start_pfn)
  		return 1;
  	if (arange->start_pfn < brange->start_pfn)
  		return -1;
  
  	return 0;
  }
  
  /* sort the node_map by start_pfn */
  static void __init sort_node_map(void)
  {
  	sort(early_node_map, (size_t)nr_nodemap_entries,
  			sizeof(struct node_active_region),
  			cmp_node_active_region, NULL);
  }
  
  /* Find the lowest pfn for a node. This depends on a sorted early_node_map */
  unsigned long __init find_min_pfn_for_node(unsigned long nid)
  {
  	int i;
  
  	/* Assuming a sorted map, the first range found has the starting pfn */
  	for_each_active_range_index_in_nid(i, nid)
  		return early_node_map[i].start_pfn;
  
  	printk(KERN_WARNING "Could not find start_pfn for node %lu
  ", nid);
  	return 0;
  }
  
  /**
   * find_min_pfn_with_active_regions - Find the minimum PFN registered
   *
   * It returns the minimum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2541
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
   */
  unsigned long __init find_min_pfn_with_active_regions(void)
  {
  	return find_min_pfn_for_node(MAX_NUMNODES);
  }
  
  /**
   * find_max_pfn_with_active_regions - Find the maximum PFN registered
   *
   * It returns the maximum PFN based on information provided via
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2552
   * add_active_range().
c713216de   Mel Gorman   [PATCH] Introduce...
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
   */
  unsigned long __init find_max_pfn_with_active_regions(void)
  {
  	int i;
  	unsigned long max_pfn = 0;
  
  	for (i = 0; i < nr_nodemap_entries; i++)
  		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
  
  	return max_pfn;
  }
  
  /**
   * free_area_init_nodes - Initialise all pg_data_t and zone data
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2567
   * @max_zone_pfn: an array of max PFNs for each zone
c713216de   Mel Gorman   [PATCH] Introduce...
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
   *
   * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and the holes within it are calculated. If the maximum
 * PFNs of two adjacent zones match, the higher zone is assumed to be empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that ZONE_DMA32 has no pages. It is also assumed that a zone
   * starts where the previous one ended. For example, ZONE_DMA32 starts
   * at arch_max_dma_pfn.
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
  	unsigned long nid;
  	enum zone_type i;
  
  	/* Record where the zone boundaries are */
  	memset(arch_zone_lowest_possible_pfn, 0,
  				sizeof(arch_zone_lowest_possible_pfn));
  	memset(arch_zone_highest_possible_pfn, 0,
  				sizeof(arch_zone_highest_possible_pfn));
  	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
  	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
  	for (i = 1; i < MAX_NR_ZONES; i++) {
  		arch_zone_lowest_possible_pfn[i] =
  			arch_zone_highest_possible_pfn[i-1];
  		arch_zone_highest_possible_pfn[i] =
  			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
  	}
  
  	/* Regions in the early_node_map can be in any order */
  	sort_node_map();
  
  	/* Print out the zone ranges */
  	printk("Zone PFN ranges:
  ");
  	for (i = 0; i < MAX_NR_ZONES; i++)
  		printk("  %-8s %8lu -> %8lu
  ",
  				zone_names[i],
  				arch_zone_lowest_possible_pfn[i],
  				arch_zone_highest_possible_pfn[i]);
  
  	/* Print out the early_node_map[] */
  	printk("early_node_map[%d] active PFN ranges
  ", nr_nodemap_entries);
  	for (i = 0; i < nr_nodemap_entries; i++)
  		printk("  %3d: %8lu -> %8lu
  ", early_node_map[i].nid,
  						early_node_map[i].start_pfn,
  						early_node_map[i].end_pfn);
  
  	/* Initialise every node */
  	for_each_online_node(nid) {
  		pg_data_t *pgdat = NODE_DATA(nid);
  		free_area_init_node(nid, pgdat, NULL,
  				find_min_pfn_for_node(nid), NULL);
  	}
  }
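/*
 * Sketch of a caller (names such as MAX_DMA_PFN and end_pfn are placeholders
 * an architecture would provide): after registering its memory with
 * add_active_range(), the arch only needs to supply the zone limits, e.g.
 *
 *	unsigned long max_zone_pfns[MAX_NR_ZONES];
 *	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 *	max_zone_pfns[ZONE_NORMAL] = end_pfn;
 *	free_area_init_nodes(max_zone_pfns);
 *
 * Node and hole sizes are then derived from early_node_map[].
 */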
  #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
0e0b864e0   Mel Gorman   [PATCH] Account f...
2627
  /**
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2628
2629
   * set_dma_reserve - set the specified number of pages reserved in the first zone
   * @new_dma_reserve: The number of pages to mark reserved
0e0b864e0   Mel Gorman   [PATCH] Account f...
2630
2631
2632
2633
   *
   * The per-cpu batchsize and zone watermarks are determined by present_pages.
   * In the DMA zone, a significant percentage may be consumed by kernel image
   * and other unfreeable allocations which can skew the watermarks badly. This
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2634
2635
2636
   * function may optionally be used to account for unfreeable pages in the
   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
   * smaller per-cpu batchsize.
0e0b864e0   Mel Gorman   [PATCH] Account f...
2637
2638
2639
2640
2641
   */
  void __init set_dma_reserve(unsigned long new_dma_reserve)
  {
  	dma_reserve = new_dma_reserve;
  }
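/*
 * For instance (illustrative numbers): an architecture that knows 16MB of
 * ZONE_DMA is taken by the kernel image and other unfreeable allocations
 * on a 4KB-page system could call set_dma_reserve(4096), so those pages
 * are excluded from present_pages before the watermarks are computed.
 */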
93b7504e3   Dave Hansen   [PATCH] Introduce...
2642
  #ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2643
2644
2645
2646
  static bootmem_data_t contig_bootmem_data;
  struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
  
  EXPORT_SYMBOL(contig_page_data);
93b7504e3   Dave Hansen   [PATCH] Introduce...
2647
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2648
2649
2650
  
  void __init free_area_init(unsigned long *zones_size)
  {
93b7504e3   Dave Hansen   [PATCH] Introduce...
2651
  	free_area_init_node(0, NODE_DATA(0), zones_size,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2652
2653
  			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2654

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2655
2656
2657
2658
2659
  #ifdef CONFIG_HOTPLUG_CPU
  static int page_alloc_cpu_notify(struct notifier_block *self,
  				 unsigned long action, void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2660
2661
  
  	if (action == CPU_DEAD) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2662
2663
  		local_irq_disable();
  		__drain_pages(cpu);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
2664
  		vm_events_fold_cpu(cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2665
  		local_irq_enable();
2244b95a7   Christoph Lameter   [PATCH] zoned vm ...
2666
  		refresh_cpu_vm_stats(cpu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
  	}
  	return NOTIFY_OK;
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
  void __init page_alloc_init(void)
  {
  	hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
  /*
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
2678
2679
2680
2681
2682
2683
2684
 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
   *	or min_free_kbytes changes.
   */
  static void calculate_totalreserve_pages(void)
  {
  	struct pglist_data *pgdat;
  	unsigned long reserve_pages = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2685
  	enum zone_type i, j;
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
  
  	for_each_online_pgdat(pgdat) {
  		for (i = 0; i < MAX_NR_ZONES; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  			unsigned long max = 0;
  
  			/* Find valid and maximum lowmem_reserve in the zone */
  			for (j = i; j < MAX_NR_ZONES; j++) {
  				if (zone->lowmem_reserve[j] > max)
  					max = zone->lowmem_reserve[j];
  			}
  
  			/* we treat pages_high as reserved pages. */
  			max += zone->pages_high;
  
  			if (max > zone->present_pages)
  				max = zone->present_pages;
  			reserve_pages += max;
  		}
  	}
  	totalreserve_pages = reserve_pages;
  }
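/*
 * Example with made-up numbers: a zone whose largest lowmem_reserve[]
 * entry is 880 pages and whose pages_high is 768 contributes
 * min(880 + 768, present_pages) = 1648 pages to totalreserve_pages.
 */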
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2710
2711
2712
2713
2714
2715
2716
2717
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
   *	has a correct pages reserved value, so an adequate number of
   *	pages are left in the zone after a successful __alloc_pages().
   */
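/*
 * Worked example (ratio and sizes are illustrative): with
 * sysctl_lowmem_reserve_ratio[] == 256 for ZONE_DMA and a ZONE_NORMAL of
 * 225280 pages above it, ZONE_DMA ends up with
 * lowmem_reserve[ZONE_NORMAL] = 225280 / 256 = 880 pages that a
 * NORMAL-capable allocation may not dip into.
 */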
  static void setup_per_zone_lowmem_reserve(void)
  {
  	struct pglist_data *pgdat;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2718
  	enum zone_type j, idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2719

ec936fc56   KAMEZAWA Hiroyuki   [PATCH] for_each_...
2720
  	for_each_online_pgdat(pgdat) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2721
2722
2723
2724
2725
  		for (j = 0; j < MAX_NR_ZONES; j++) {
  			struct zone *zone = pgdat->node_zones + j;
  			unsigned long present_pages = zone->present_pages;
  
  			zone->lowmem_reserve[j] = 0;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2726
2727
  			idx = j;
  			while (idx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2728
  				struct zone *lower_zone;
2f6726e54   Christoph Lameter   [PATCH] Apply typ...
2729
  				idx--;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
  				if (sysctl_lowmem_reserve_ratio[idx] < 1)
  					sysctl_lowmem_reserve_ratio[idx] = 1;
  
  				lower_zone = pgdat->node_zones + idx;
  				lower_zone->lowmem_reserve[j] = present_pages /
  					sysctl_lowmem_reserve_ratio[idx];
  				present_pages += lower_zone->present_pages;
  			}
  		}
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
2740
2741
2742
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2743
  }
88ca3b94e   Randy Dunlap   [PATCH] page_allo...
2744
2745
2746
2747
2748
  /**
   * setup_per_zone_pages_min - called when min_free_kbytes changes.
   *
   * Ensures that the pages_{min,low,high} values for each zone are set correctly
   * with respect to min_free_kbytes.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2749
   */
3947be196   Dave Hansen   [PATCH] memory ho...
2750
  void setup_per_zone_pages_min(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
  {
  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
  	unsigned long lowmem_pages = 0;
  	struct zone *zone;
  	unsigned long flags;
  
  	/* Calculate total number of !ZONE_HIGHMEM pages */
  	for_each_zone(zone) {
  		if (!is_highmem(zone))
  			lowmem_pages += zone->present_pages;
  	}
  
  	for_each_zone(zone) {
ac924c603   Andrew Morton   [PATCH] setup_per...
2764
  		u64 tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2765
  		spin_lock_irqsave(&zone->lru_lock, flags);
ac924c603   Andrew Morton   [PATCH] setup_per...
2766
2767
  		tmp = (u64)pages_min * zone->present_pages;
  		do_div(tmp, lowmem_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2768
2769
  		if (is_highmem(zone)) {
  			/*
669ed1752   Nick Piggin   [PATCH] mm: highm...
2770
2771
2772
2773
2774
2775
2776
  			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
  			 * need highmem pages, so cap pages_min to a small
  			 * value here.
  			 *
  			 * The (pages_high-pages_low) and (pages_low-pages_min)
  			 * deltas controls asynch page reclaim, and so should
  			 * not be capped for highmem.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
  			 */
  			int min_pages;
  
  			min_pages = zone->present_pages / 1024;
  			if (min_pages < SWAP_CLUSTER_MAX)
  				min_pages = SWAP_CLUSTER_MAX;
  			if (min_pages > 128)
  				min_pages = 128;
  			zone->pages_min = min_pages;
  		} else {
669ed1752   Nick Piggin   [PATCH] mm: highm...
2787
2788
  			/*
  			 * If it's a lowmem zone, reserve a number of pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2789
2790
  			 * proportionate to the zone's size.
  			 */
669ed1752   Nick Piggin   [PATCH] mm: highm...
2791
  			zone->pages_min = tmp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2792
  		}
ac924c603   Andrew Morton   [PATCH] setup_per...
2793
2794
  		zone->pages_low   = zone->pages_min + (tmp >> 2);
  		zone->pages_high  = zone->pages_min + (tmp >> 1);
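		/*
		 * Illustrative numbers: if this lowmem zone holds half of all
		 * lowmem and min_free_kbytes works out to 1024 pages in total,
		 * tmp is 512, giving pages_min = 512, pages_low = 512 + 128 =
		 * 640 and pages_high = 512 + 256 = 768.
		 */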
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2795
2796
  		spin_unlock_irqrestore(&zone->lru_lock, flags);
  	}
cb45b0e96   Hideo AOKI   [PATCH] overcommi...
2797
2798
2799
  
  	/* update totalreserve_pages */
  	calculate_totalreserve_pages();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
  }
  
  /*
   * Initialise min_free_kbytes.
   *
   * For small machines we want it small (128k min).  For large machines
   * we want it large (64MB max).  But it is not linear, because network
   * bandwidth does not increase linearly with machine size.  We use
   *
   * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
   *
   * which yields
   *
   * 16MB:	512k
   * 32MB:	724k
   * 64MB:	1024k
   * 128MB:	1448k
   * 256MB:	2048k
   * 512MB:	2896k
   * 1024MB:	4096k
   * 2048MB:	5792k
   * 4096MB:	8192k
   * 8192MB:	11584k
   * 16384MB:	16384k
   */
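/*
 * Sanity check of the formula against one of the table rows: a machine with
 * 1GB of lowmem has lowmem_kbytes = 1048576, and
 * int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096k, matching the
 * 1024MB entry above.
 */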
  static int __init init_per_zone_pages_min(void)
  {
  	unsigned long lowmem_kbytes;
  
  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
  
  	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
  	if (min_free_kbytes < 128)
  		min_free_kbytes = 128;
  	if (min_free_kbytes > 65536)
  		min_free_kbytes = 65536;
  	setup_per_zone_pages_min();
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
  module_init(init_per_zone_pages_min)
  
  /*
   * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
   *	that we can call two helper functions whenever min_free_kbytes
   *	changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
  	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
  	proc_dointvec(table, write, file, buffer, length, ppos);
  	setup_per_zone_pages_min();
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
  	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	int rc;
  
  	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
8417bba4b   Christoph Lameter   [PATCH] Replace m...
2867
  		zone->min_unmapped_pages = (zone->present_pages *
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2868
2869
2870
  				sysctl_min_unmapped_ratio) / 100;
  	return 0;
  }
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
  	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	int rc;
  
  	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
  	if (rc)
  		return rc;
  
  	for_each_zone(zone)
  		zone->min_slab_pages = (zone->present_pages *
  				sysctl_min_slab_ratio) / 100;
  	return 0;
  }
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
2887
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
  /*
   * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
   *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
   *	whenever sysctl_lowmem_reserve_ratio changes.
   *
 * The reserve ratio has no relation to the pages_min watermarks; it is
 * only meaningful in relation to the boot-time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
  	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
  	setup_per_zone_lowmem_reserve();
  	return 0;
  }
8ad4b1fb8   Rohit Seth   [PATCH] Make high...
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
  /*
 * percpu_pagelist_fraction - changes pcp->high for each zone on each
 * cpu.  It is the fraction of a zone's total pages that a hot per-cpu
 * pagelist may hold before it is flushed back to the buddy allocator.
   */
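/*
 * For example (illustrative sizes): with percpu_pagelist_fraction set to
 * 1024, a zone of 262144 pages gives each per-cpu hot list a high mark of
 * 262144 / 1024 = 256 pages.
 */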
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
  	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
  	struct zone *zone;
  	unsigned int cpu;
  	int ret;
  
  	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
  	if (!write || (ret == -EINVAL))
  		return ret;
  	for_each_zone(zone) {
  		for_each_online_cpu(cpu) {
  			unsigned long  high;
  			high = zone->present_pages / percpu_pagelist_fraction;
  			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
  		}
  	}
  	return 0;
  }
f034b5d4e   David S. Miller   [XFRM]: Dynamic x...
2929
  int hashdist = HASHDIST_DEFAULT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
  
  #ifdef CONFIG_NUMA
  static int __init set_hashdist(char *str)
  {
  	if (!str)
  		return 0;
  	hashdist = simple_strtoul(str, &str, 0);
  	return 1;
  }
  __setup("hashdist=", set_hashdist);
  #endif
  
  /*
   * allocate a large system hash table from bootmem
   * - it is assumed that the hash table must contain an exact power-of-2
   *   quantity of entries
   * - limit is the number of hash buckets, not the total allocation size
   */
  void *__init alloc_large_system_hash(const char *tablename,
  				     unsigned long bucketsize,
  				     unsigned long numentries,
  				     int scale,
  				     int flags,
  				     unsigned int *_hash_shift,
  				     unsigned int *_hash_mask,
  				     unsigned long limit)
  {
  	unsigned long long max = limit;
  	unsigned long log2qty, size;
  	void *table = NULL;
  
  	/* allow the kernel cmdline to have a say */
  	if (!numentries) {
  		/* round applicable memory size up to nearest megabyte */
  		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
  		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
  		numentries >>= 20 - PAGE_SHIFT;
  		numentries <<= 20 - PAGE_SHIFT;
  
  		/* limit to 1 bucket per 2^scale bytes of low memory */
  		if (scale > PAGE_SHIFT)
  			numentries >>= (scale - PAGE_SHIFT);
  		else
  			numentries <<= (PAGE_SHIFT - scale);
  	}
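	/*
	 * Illustrative sizing, assuming 4KB pages: with 512MB of lowmem
	 * (131072 pages) and scale == 15, numentries becomes
	 * 131072 >> (15 - 12) = 16384, i.e. one bucket per 32KB of memory.
	 */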
6e692ed37   John Hawkes   [PATCH] fix alloc...
2975
  	numentries = roundup_pow_of_two(numentries);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
  
  	/* limit allocation size to 1/16 total memory by default */
  	if (max == 0) {
  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
  		do_div(max, bucketsize);
  	}
  
  	if (numentries > max)
  		numentries = max;
  
  	log2qty = long_log2(numentries);
  
  	do {
  		size = bucketsize << log2qty;
  		if (flags & HASH_EARLY)
  			table = alloc_bootmem(size);
  		else if (hashdist)
  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
  		else {
  			unsigned long order;
  			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
  				;
  			table = (void*) __get_free_pages(GFP_ATOMIC, order);
  		}
  	} while (!table && size > PAGE_SIZE && --log2qty);
  
  	if (!table)
  		panic("Failed to allocate %s hash table
  ", tablename);
  
  	printk("%s hash table entries: %d (order: %d, %lu bytes)
  ",
  	       tablename,
  	       (1U << log2qty),
  	       long_log2(size) - PAGE_SHIFT,
  	       size);
  
  	if (_hash_shift)
  		*_hash_shift = log2qty;
  	if (_hash_mask)
  		*_hash_mask = (1 << log2qty) - 1;
  
  	return table;
  }
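/*
 * Illustrative caller (the table name, scale and variable names are made
 * up): a subsystem sizing a boot-time hash with one bucket per 16KB of
 * memory might do
 *
 *	example_hash = alloc_large_system_hash("Example-cache",
 *					sizeof(struct hlist_head),
 *					0, 14, HASH_EARLY,
 *					&example_shift, &example_mask, 0);
 *
 * passing numentries == 0 so the size is derived from available memory,
 * and limit == 0 to accept the default 1/16-of-memory cap.
 */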
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
3020
3021
  
  #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
3022
3023
  struct page *pfn_to_page(unsigned long pfn)
  {
67de64821   Andy Whitcroft   [PATCH] squash du...
3024
  	return __pfn_to_page(pfn);
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
3025
3026
3027
  }
  unsigned long page_to_pfn(struct page *page)
  {
67de64821   Andy Whitcroft   [PATCH] squash du...
3028
  	return __page_to_pfn(page);
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
3029
  }
a117e66ed   KAMEZAWA Hiroyuki   [PATCH] unify pfn...
3030
3031
3032
  EXPORT_SYMBOL(pfn_to_page);
  EXPORT_SYMBOL(page_to_pfn);
  #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */