Commit 04903664325acb3f199dd8a4b8f1aa437e9fd6b2

Authored by Andrew Morton
Committed by Linus Torvalds
1 parent 93f210dd9e

[PATCH] remove HASH_HIGHMEM

It has no users and it's doubtful that we'll need it again.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 2 additions and 3 deletions

include/linux/bootmem.h
1 /* 1 /*
2 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 2 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
3 */ 3 */
4 #ifndef _LINUX_BOOTMEM_H 4 #ifndef _LINUX_BOOTMEM_H
5 #define _LINUX_BOOTMEM_H 5 #define _LINUX_BOOTMEM_H
6 6
7 #include <linux/mmzone.h> 7 #include <linux/mmzone.h>
8 #include <asm/dma.h> 8 #include <asm/dma.h>
9 9
10 /* 10 /*
11 * simple boot-time physical memory area allocator. 11 * simple boot-time physical memory area allocator.
12 */ 12 */
13 13
14 extern unsigned long max_low_pfn; 14 extern unsigned long max_low_pfn;
15 extern unsigned long min_low_pfn; 15 extern unsigned long min_low_pfn;
16 16
17 /* 17 /*
18 * highest page 18 * highest page
19 */ 19 */
20 extern unsigned long max_pfn; 20 extern unsigned long max_pfn;
21 21
22 #ifdef CONFIG_CRASH_DUMP 22 #ifdef CONFIG_CRASH_DUMP
23 extern unsigned long saved_max_pfn; 23 extern unsigned long saved_max_pfn;
24 #endif 24 #endif
25 25
26 /* 26 /*
27 * node_bootmem_map is a map pointer - the bits represent all physical 27 * node_bootmem_map is a map pointer - the bits represent all physical
28 * memory pages (including holes) on the node. 28 * memory pages (including holes) on the node.
29 */ 29 */
30 typedef struct bootmem_data { 30 typedef struct bootmem_data {
31 unsigned long node_boot_start; 31 unsigned long node_boot_start;
32 unsigned long node_low_pfn; 32 unsigned long node_low_pfn;
33 void *node_bootmem_map; 33 void *node_bootmem_map;
34 unsigned long last_offset; 34 unsigned long last_offset;
35 unsigned long last_pos; 35 unsigned long last_pos;
36 unsigned long last_success; /* Previous allocation point. To speed 36 unsigned long last_success; /* Previous allocation point. To speed
37 * up searching */ 37 * up searching */
38 struct list_head list; 38 struct list_head list;
39 } bootmem_data_t; 39 } bootmem_data_t;
40 40
41 extern unsigned long bootmem_bootmap_pages(unsigned long); 41 extern unsigned long bootmem_bootmap_pages(unsigned long);
42 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); 42 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
43 extern void free_bootmem(unsigned long addr, unsigned long size); 43 extern void free_bootmem(unsigned long addr, unsigned long size);
44 extern void *__alloc_bootmem(unsigned long size, 44 extern void *__alloc_bootmem(unsigned long size,
45 unsigned long align, 45 unsigned long align,
46 unsigned long goal); 46 unsigned long goal);
47 extern void *__alloc_bootmem_nopanic(unsigned long size, 47 extern void *__alloc_bootmem_nopanic(unsigned long size,
48 unsigned long align, 48 unsigned long align,
49 unsigned long goal); 49 unsigned long goal);
50 extern void *__alloc_bootmem_low(unsigned long size, 50 extern void *__alloc_bootmem_low(unsigned long size,
51 unsigned long align, 51 unsigned long align,
52 unsigned long goal); 52 unsigned long goal);
53 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, 53 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
54 unsigned long size, 54 unsigned long size,
55 unsigned long align, 55 unsigned long align,
56 unsigned long goal); 56 unsigned long goal);
57 extern void *__alloc_bootmem_core(struct bootmem_data *bdata, 57 extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
58 unsigned long size, 58 unsigned long size,
59 unsigned long align, 59 unsigned long align,
60 unsigned long goal, 60 unsigned long goal,
61 unsigned long limit); 61 unsigned long limit);
62 62
63 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 63 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
64 extern void reserve_bootmem(unsigned long addr, unsigned long size); 64 extern void reserve_bootmem(unsigned long addr, unsigned long size);
65 #define alloc_bootmem(x) \ 65 #define alloc_bootmem(x) \
66 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 66 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
67 #define alloc_bootmem_low(x) \ 67 #define alloc_bootmem_low(x) \
68 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) 68 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
69 #define alloc_bootmem_pages(x) \ 69 #define alloc_bootmem_pages(x) \
70 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 70 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
71 #define alloc_bootmem_low_pages(x) \ 71 #define alloc_bootmem_low_pages(x) \
72 __alloc_bootmem_low(x, PAGE_SIZE, 0) 72 __alloc_bootmem_low(x, PAGE_SIZE, 0)
73 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 73 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
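
As a point of reference for the boot-time allocator API declared above, here is a minimal, hedged sketch of a typical early-boot caller. The table and function names are hypothetical, not part of this commit:

#include <linux/bootmem.h>
#include <linux/init.h>

/* Hypothetical early-boot table; illustrative only, not from this commit. */
static unsigned long *example_table;

void __init example_table_init(unsigned long nentries)
{
	/*
	 * alloc_bootmem() expands to __alloc_bootmem() with SMP_CACHE_BYTES
	 * alignment (see the macro above); the memory it returns stays
	 * allocated after free_all_bootmem() hands the remaining boot memory
	 * over to the buddy allocator.
	 */
	example_table = alloc_bootmem(nentries * sizeof(unsigned long));
}
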
74 74
75 extern unsigned long free_all_bootmem(void); 75 extern unsigned long free_all_bootmem(void);
76 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); 76 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
77 extern void *__alloc_bootmem_node(pg_data_t *pgdat, 77 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
78 unsigned long size, 78 unsigned long size,
79 unsigned long align, 79 unsigned long align,
80 unsigned long goal); 80 unsigned long goal);
81 extern unsigned long init_bootmem_node(pg_data_t *pgdat, 81 extern unsigned long init_bootmem_node(pg_data_t *pgdat,
82 unsigned long freepfn, 82 unsigned long freepfn,
83 unsigned long startpfn, 83 unsigned long startpfn,
84 unsigned long endpfn); 84 unsigned long endpfn);
85 extern void reserve_bootmem_node(pg_data_t *pgdat, 85 extern void reserve_bootmem_node(pg_data_t *pgdat,
86 unsigned long physaddr, 86 unsigned long physaddr,
87 unsigned long size); 87 unsigned long size);
88 extern void free_bootmem_node(pg_data_t *pgdat, 88 extern void free_bootmem_node(pg_data_t *pgdat,
89 unsigned long addr, 89 unsigned long addr,
90 unsigned long size); 90 unsigned long size);
91 91
92 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 92 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
93 #define alloc_bootmem_node(pgdat, x) \ 93 #define alloc_bootmem_node(pgdat, x) \
94 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 94 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
95 #define alloc_bootmem_pages_node(pgdat, x) \ 95 #define alloc_bootmem_pages_node(pgdat, x) \
96 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 96 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
97 #define alloc_bootmem_low_pages_node(pgdat, x) \ 97 #define alloc_bootmem_low_pages_node(pgdat, x) \
98 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 98 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
99 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 99 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
100 100
101 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 101 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
102 extern void *alloc_remap(int nid, unsigned long size); 102 extern void *alloc_remap(int nid, unsigned long size);
103 #else 103 #else
104 static inline void *alloc_remap(int nid, unsigned long size) 104 static inline void *alloc_remap(int nid, unsigned long size)
105 { 105 {
106 return NULL; 106 return NULL;
107 } 107 }
108 #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ 108 #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
109 109
110 extern unsigned long __meminitdata nr_kernel_pages; 110 extern unsigned long __meminitdata nr_kernel_pages;
111 extern unsigned long nr_all_pages; 111 extern unsigned long nr_all_pages;
112 112
113 extern void *alloc_large_system_hash(const char *tablename, 113 extern void *alloc_large_system_hash(const char *tablename,
114 unsigned long bucketsize, 114 unsigned long bucketsize,
115 unsigned long numentries, 115 unsigned long numentries,
116 int scale, 116 int scale,
117 int flags, 117 int flags,
118 unsigned int *_hash_shift, 118 unsigned int *_hash_shift,
119 unsigned int *_hash_mask, 119 unsigned int *_hash_mask,
120 unsigned long limit); 120 unsigned long limit);
121 121
122 -#define HASH_HIGHMEM	0x00000001	/* Consider highmem? */
123 -#define HASH_EARLY	0x00000002	/* Allocating during early boot? */
122 +#define HASH_EARLY	0x00000001	/* Allocating during early boot? */
124 123
125 /* Only NUMA needs hash distribution. 124 /* Only NUMA needs hash distribution.
126 * IA64 is known to have sufficient vmalloc space. 125 * IA64 is known to have sufficient vmalloc space.
127 */ 126 */
128 #if defined(CONFIG_NUMA) && defined(CONFIG_IA64) 127 #if defined(CONFIG_NUMA) && defined(CONFIG_IA64)
129 #define HASHDIST_DEFAULT 1 128 #define HASHDIST_DEFAULT 1
130 #else 129 #else
131 #define HASHDIST_DEFAULT 0 130 #define HASHDIST_DEFAULT 0
132 #endif 131 #endif
133 extern int hashdist; /* Distribute hashes across NUMA nodes? */ 132 extern int hashdist; /* Distribute hashes across NUMA nodes? */
134 133
135 134
136 #endif /* _LINUX_BOOTMEM_H */ 135 #endif /* _LINUX_BOOTMEM_H */
137 136
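
For context on the flag being removed: the HASH_* flags are consumed by alloc_large_system_hash(), declared above, and per the commit message no caller passes HASH_HIGHMEM, so only HASH_EARLY survives. A hedged sketch of a typical caller follows; the names are hypothetical, loosely modeled on the early dcache/inode hash setup rather than taken from this commit:

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/list.h>

/* Hypothetical hash table; names are illustrative, not from this commit. */
static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;
static unsigned int example_hash_mask;

void __init example_hash_init(void)
{
	example_hashtable =
		alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0,		/* numentries: sized from system RAM */
					13,		/* scale */
					HASH_EARLY,	/* allocating during early boot */
					&example_hash_shift,
					&example_hash_mask,
					0);		/* no upper limit */
}
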
mm/page_alloc.c
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h> 22 #include <linux/bootmem.h>
23 #include <linux/compiler.h> 23 #include <linux/compiler.h>
24 #include <linux/kernel.h> 24 #include <linux/kernel.h>
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/suspend.h> 26 #include <linux/suspend.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/memory_hotplug.h> 35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/mempolicy.h> 38 #include <linux/mempolicy.h>
39 #include <linux/stop_machine.h> 39 #include <linux/stop_machine.h>
40 #include <linux/sort.h> 40 #include <linux/sort.h>
41 #include <linux/pfn.h> 41 #include <linux/pfn.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 43
44 #include <asm/tlbflush.h> 44 #include <asm/tlbflush.h>
45 #include <asm/div64.h> 45 #include <asm/div64.h>
46 #include "internal.h" 46 #include "internal.h"
47 47
48 /* 48 /*
49 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 49 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
50 * initializer cleaner 50 * initializer cleaner
51 */ 51 */
52 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 52 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
53 EXPORT_SYMBOL(node_online_map); 53 EXPORT_SYMBOL(node_online_map);
54 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 54 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
55 EXPORT_SYMBOL(node_possible_map); 55 EXPORT_SYMBOL(node_possible_map);
56 unsigned long totalram_pages __read_mostly; 56 unsigned long totalram_pages __read_mostly;
57 unsigned long totalreserve_pages __read_mostly; 57 unsigned long totalreserve_pages __read_mostly;
58 long nr_swap_pages; 58 long nr_swap_pages;
59 int percpu_pagelist_fraction; 59 int percpu_pagelist_fraction;
60 60
61 static void __free_pages_ok(struct page *page, unsigned int order); 61 static void __free_pages_ok(struct page *page, unsigned int order);
62 62
63 /* 63 /*
64 * results with 256, 32 in the lowmem_reserve sysctl: 64 * results with 256, 32 in the lowmem_reserve sysctl:
65 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 65 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
66 * 1G machine -> (16M dma, 784M normal, 224M high) 66 * 1G machine -> (16M dma, 784M normal, 224M high)
67 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 67 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
68 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 68 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
69 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 69 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
70 * 70 *
71 * TBD: should special case ZONE_DMA32 machines here - in those we normally 71 * TBD: should special case ZONE_DMA32 machines here - in those we normally
72 * don't need any ZONE_NORMAL reservation 72 * don't need any ZONE_NORMAL reservation
73 */ 73 */
74 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 74 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
75 256, 75 256,
76 #ifdef CONFIG_ZONE_DMA32 76 #ifdef CONFIG_ZONE_DMA32
77 256, 77 256,
78 #endif 78 #endif
79 #ifdef CONFIG_HIGHMEM 79 #ifdef CONFIG_HIGHMEM
80 32 80 32
81 #endif 81 #endif
82 }; 82 };
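
A small stand-alone illustration of the reserve arithmetic described in the lowmem_reserve comment above, using the 1G-machine example from that comment; this is plain userspace C for the worked numbers only, not kernel code:

#include <stdio.h>

int main(void)
{
	/* Zone sizes from the 1G example above, in MB. */
	unsigned long normal_mb = 784, highmem_mb = 224;
	/* Default ratios from sysctl_lowmem_reserve_ratio: 256 (DMA), 32 (NORMAL). */
	unsigned long dma_ratio = 256, normal_ratio = 32;

	/* A NORMAL allocation leaves 784M/256 of ZONE_DMA reserved. */
	printf("NORMAL  vs DMA:    %lu MB\n", normal_mb / dma_ratio);
	/* A HIGHMEM allocation leaves 224M/32 of ZONE_NORMAL reserved... */
	printf("HIGHMEM vs NORMAL: %lu MB\n", highmem_mb / normal_ratio);
	/* ...and (224M+784M)/256 of ZONE_DMA reserved. */
	printf("HIGHMEM vs DMA:    %lu MB\n", (highmem_mb + normal_mb) / dma_ratio);
	return 0;
}
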
83 83
84 EXPORT_SYMBOL(totalram_pages); 84 EXPORT_SYMBOL(totalram_pages);
85 85
86 static char *zone_names[MAX_NR_ZONES] = { 86 static char *zone_names[MAX_NR_ZONES] = {
87 "DMA", 87 "DMA",
88 #ifdef CONFIG_ZONE_DMA32 88 #ifdef CONFIG_ZONE_DMA32
89 "DMA32", 89 "DMA32",
90 #endif 90 #endif
91 "Normal", 91 "Normal",
92 #ifdef CONFIG_HIGHMEM 92 #ifdef CONFIG_HIGHMEM
93 "HighMem" 93 "HighMem"
94 #endif 94 #endif
95 }; 95 };
96 96
97 int min_free_kbytes = 1024; 97 int min_free_kbytes = 1024;
98 98
99 unsigned long __meminitdata nr_kernel_pages; 99 unsigned long __meminitdata nr_kernel_pages;
100 unsigned long __meminitdata nr_all_pages; 100 unsigned long __meminitdata nr_all_pages;
101 static unsigned long __initdata dma_reserve; 101 static unsigned long __initdata dma_reserve;
102 102
103 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 103 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
104 /* 104 /*
105 * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct 105 * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct
106 * ranges of memory (RAM) that may be registered with add_active_range(). 106 * ranges of memory (RAM) that may be registered with add_active_range().
107 * Ranges passed to add_active_range() will be merged if possible 107 * Ranges passed to add_active_range() will be merged if possible
108 * so the number of times add_active_range() can be called is 108 * so the number of times add_active_range() can be called is
109 * related to the number of nodes and the number of holes 109 * related to the number of nodes and the number of holes
110 */ 110 */
111 #ifdef CONFIG_MAX_ACTIVE_REGIONS 111 #ifdef CONFIG_MAX_ACTIVE_REGIONS
112 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 112 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
113 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 113 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
114 #else 114 #else
115 #if MAX_NUMNODES >= 32 115 #if MAX_NUMNODES >= 32
116 /* If there can be many nodes, allow up to 50 holes per node */ 116 /* If there can be many nodes, allow up to 50 holes per node */
117 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 117 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
118 #else 118 #else
119 /* By default, allow up to 256 distinct regions */ 119 /* By default, allow up to 256 distinct regions */
120 #define MAX_ACTIVE_REGIONS 256 120 #define MAX_ACTIVE_REGIONS 256
121 #endif 121 #endif
122 #endif 122 #endif
123 123
124 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; 124 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
125 int __initdata nr_nodemap_entries; 125 int __initdata nr_nodemap_entries;
126 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 126 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
127 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 127 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
128 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 128 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
129 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; 129 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
130 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; 130 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
131 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 131 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
132 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 132 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
133 133
134 #ifdef CONFIG_DEBUG_VM 134 #ifdef CONFIG_DEBUG_VM
135 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 135 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
136 { 136 {
137 int ret = 0; 137 int ret = 0;
138 unsigned seq; 138 unsigned seq;
139 unsigned long pfn = page_to_pfn(page); 139 unsigned long pfn = page_to_pfn(page);
140 140
141 do { 141 do {
142 seq = zone_span_seqbegin(zone); 142 seq = zone_span_seqbegin(zone);
143 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 143 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
144 ret = 1; 144 ret = 1;
145 else if (pfn < zone->zone_start_pfn) 145 else if (pfn < zone->zone_start_pfn)
146 ret = 1; 146 ret = 1;
147 } while (zone_span_seqretry(zone, seq)); 147 } while (zone_span_seqretry(zone, seq));
148 148
149 return ret; 149 return ret;
150 } 150 }
151 151
152 static int page_is_consistent(struct zone *zone, struct page *page) 152 static int page_is_consistent(struct zone *zone, struct page *page)
153 { 153 {
154 #ifdef CONFIG_HOLES_IN_ZONE 154 #ifdef CONFIG_HOLES_IN_ZONE
155 if (!pfn_valid(page_to_pfn(page))) 155 if (!pfn_valid(page_to_pfn(page)))
156 return 0; 156 return 0;
157 #endif 157 #endif
158 if (zone != page_zone(page)) 158 if (zone != page_zone(page))
159 return 0; 159 return 0;
160 160
161 return 1; 161 return 1;
162 } 162 }
163 /* 163 /*
164 * Temporary debugging check for pages not lying within a given zone. 164 * Temporary debugging check for pages not lying within a given zone.
165 */ 165 */
166 static int bad_range(struct zone *zone, struct page *page) 166 static int bad_range(struct zone *zone, struct page *page)
167 { 167 {
168 if (page_outside_zone_boundaries(zone, page)) 168 if (page_outside_zone_boundaries(zone, page))
169 return 1; 169 return 1;
170 if (!page_is_consistent(zone, page)) 170 if (!page_is_consistent(zone, page))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
174 } 174 }
175 #else 175 #else
176 static inline int bad_range(struct zone *zone, struct page *page) 176 static inline int bad_range(struct zone *zone, struct page *page)
177 { 177 {
178 return 0; 178 return 0;
179 } 179 }
180 #endif 180 #endif
181 181
182 static void bad_page(struct page *page) 182 static void bad_page(struct page *page)
183 { 183 {
184 printk(KERN_EMERG "Bad page state in process '%s'\n" 184 printk(KERN_EMERG "Bad page state in process '%s'\n"
185 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 185 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
186 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 186 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
187 KERN_EMERG "Backtrace:\n", 187 KERN_EMERG "Backtrace:\n",
188 current->comm, page, (int)(2*sizeof(unsigned long)), 188 current->comm, page, (int)(2*sizeof(unsigned long)),
189 (unsigned long)page->flags, page->mapping, 189 (unsigned long)page->flags, page->mapping,
190 page_mapcount(page), page_count(page)); 190 page_mapcount(page), page_count(page));
191 dump_stack(); 191 dump_stack();
192 page->flags &= ~(1 << PG_lru | 192 page->flags &= ~(1 << PG_lru |
193 1 << PG_private | 193 1 << PG_private |
194 1 << PG_locked | 194 1 << PG_locked |
195 1 << PG_active | 195 1 << PG_active |
196 1 << PG_dirty | 196 1 << PG_dirty |
197 1 << PG_reclaim | 197 1 << PG_reclaim |
198 1 << PG_slab | 198 1 << PG_slab |
199 1 << PG_swapcache | 199 1 << PG_swapcache |
200 1 << PG_writeback | 200 1 << PG_writeback |
201 1 << PG_buddy ); 201 1 << PG_buddy );
202 set_page_count(page, 0); 202 set_page_count(page, 0);
203 reset_page_mapcount(page); 203 reset_page_mapcount(page);
204 page->mapping = NULL; 204 page->mapping = NULL;
205 add_taint(TAINT_BAD_PAGE); 205 add_taint(TAINT_BAD_PAGE);
206 } 206 }
207 207
208 /* 208 /*
209 * Higher-order pages are called "compound pages". They are structured thusly: 209 * Higher-order pages are called "compound pages". They are structured thusly:
210 * 210 *
211 * The first PAGE_SIZE page is called the "head page". 211 * The first PAGE_SIZE page is called the "head page".
212 * 212 *
213 * The remaining PAGE_SIZE pages are called "tail pages". 213 * The remaining PAGE_SIZE pages are called "tail pages".
214 * 214 *
215 * All pages have PG_compound set. All pages have their ->private pointing at 215 * All pages have PG_compound set. All pages have their ->private pointing at
216 * the head page (even the head page has this). 216 * the head page (even the head page has this).
217 * 217 *
218 * The first tail page's ->lru.next holds the address of the compound page's 218 * The first tail page's ->lru.next holds the address of the compound page's
219 * put_page() function. Its ->lru.prev holds the order of allocation. 219 * put_page() function. Its ->lru.prev holds the order of allocation.
220 * This usage means that zero-order pages may not be compound. 220 * This usage means that zero-order pages may not be compound.
221 */ 221 */
222 222
223 static void free_compound_page(struct page *page) 223 static void free_compound_page(struct page *page)
224 { 224 {
225 __free_pages_ok(page, (unsigned long)page[1].lru.prev); 225 __free_pages_ok(page, (unsigned long)page[1].lru.prev);
226 } 226 }
227 227
228 static void prep_compound_page(struct page *page, unsigned long order) 228 static void prep_compound_page(struct page *page, unsigned long order)
229 { 229 {
230 int i; 230 int i;
231 int nr_pages = 1 << order; 231 int nr_pages = 1 << order;
232 232
233 set_compound_page_dtor(page, free_compound_page); 233 set_compound_page_dtor(page, free_compound_page);
234 page[1].lru.prev = (void *)order; 234 page[1].lru.prev = (void *)order;
235 for (i = 0; i < nr_pages; i++) { 235 for (i = 0; i < nr_pages; i++) {
236 struct page *p = page + i; 236 struct page *p = page + i;
237 237
238 __SetPageCompound(p); 238 __SetPageCompound(p);
239 set_page_private(p, (unsigned long)page); 239 set_page_private(p, (unsigned long)page);
240 } 240 }
241 } 241 }
242 242
243 static void destroy_compound_page(struct page *page, unsigned long order) 243 static void destroy_compound_page(struct page *page, unsigned long order)
244 { 244 {
245 int i; 245 int i;
246 int nr_pages = 1 << order; 246 int nr_pages = 1 << order;
247 247
248 if (unlikely((unsigned long)page[1].lru.prev != order)) 248 if (unlikely((unsigned long)page[1].lru.prev != order))
249 bad_page(page); 249 bad_page(page);
250 250
251 for (i = 0; i < nr_pages; i++) { 251 for (i = 0; i < nr_pages; i++) {
252 struct page *p = page + i; 252 struct page *p = page + i;
253 253
254 if (unlikely(!PageCompound(p) | 254 if (unlikely(!PageCompound(p) |
255 (page_private(p) != (unsigned long)page))) 255 (page_private(p) != (unsigned long)page)))
256 bad_page(page); 256 bad_page(page);
257 __ClearPageCompound(p); 257 __ClearPageCompound(p);
258 } 258 }
259 } 259 }
260 260
261 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 261 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
262 { 262 {
263 int i; 263 int i;
264 264
265 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 265 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
266 /* 266 /*
267 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 267 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
268 * and __GFP_HIGHMEM from hard or soft interrupt context. 268 * and __GFP_HIGHMEM from hard or soft interrupt context.
269 */ 269 */
270 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 270 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
271 for (i = 0; i < (1 << order); i++) 271 for (i = 0; i < (1 << order); i++)
272 clear_highpage(page + i); 272 clear_highpage(page + i);
273 } 273 }
274 274
275 /* 275 /*
276 * function for dealing with page's order in buddy system. 276 * function for dealing with page's order in buddy system.
277 * zone->lock is already acquired when we use these. 277 * zone->lock is already acquired when we use these.
278 * So, we don't need atomic page->flags operations here. 278 * So, we don't need atomic page->flags operations here.
279 */ 279 */
280 static inline unsigned long page_order(struct page *page) 280 static inline unsigned long page_order(struct page *page)
281 { 281 {
282 return page_private(page); 282 return page_private(page);
283 } 283 }
284 284
285 static inline void set_page_order(struct page *page, int order) 285 static inline void set_page_order(struct page *page, int order)
286 { 286 {
287 set_page_private(page, order); 287 set_page_private(page, order);
288 __SetPageBuddy(page); 288 __SetPageBuddy(page);
289 } 289 }
290 290
291 static inline void rmv_page_order(struct page *page) 291 static inline void rmv_page_order(struct page *page)
292 { 292 {
293 __ClearPageBuddy(page); 293 __ClearPageBuddy(page);
294 set_page_private(page, 0); 294 set_page_private(page, 0);
295 } 295 }
296 296
297 /* 297 /*
298 * Locate the struct page for both the matching buddy in our 298 * Locate the struct page for both the matching buddy in our
299 * pair (buddy1) and the combined O(n+1) page they form (page). 299 * pair (buddy1) and the combined O(n+1) page they form (page).
300 * 300 *
301 * 1) Any buddy B1 will have an order O twin B2 which satisfies 301 * 1) Any buddy B1 will have an order O twin B2 which satisfies
302 * the following equation: 302 * the following equation:
303 * B2 = B1 ^ (1 << O) 303 * B2 = B1 ^ (1 << O)
304 * For example, if the starting buddy (buddy2) is #8 its order 304 * For example, if the starting buddy (buddy2) is #8 its order
305 * 1 buddy is #10: 305 * 1 buddy is #10:
306 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 306 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
307 * 307 *
308 * 2) Any buddy B will have an order O+1 parent P which 308 * 2) Any buddy B will have an order O+1 parent P which
309 * satisfies the following equation: 309 * satisfies the following equation:
310 * P = B & ~(1 << O) 310 * P = B & ~(1 << O)
311 * 311 *
312 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 312 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
313 */ 313 */
314 static inline struct page * 314 static inline struct page *
315 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 315 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
316 { 316 {
317 unsigned long buddy_idx = page_idx ^ (1 << order); 317 unsigned long buddy_idx = page_idx ^ (1 << order);
318 318
319 return page + (buddy_idx - page_idx); 319 return page + (buddy_idx - page_idx);
320 } 320 }
321 321
322 static inline unsigned long 322 static inline unsigned long
323 __find_combined_index(unsigned long page_idx, unsigned int order) 323 __find_combined_index(unsigned long page_idx, unsigned int order)
324 { 324 {
325 return (page_idx & ~(1 << order)); 325 return (page_idx & ~(1 << order));
326 } 326 }
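
The two helpers above are pure bit arithmetic on page indices; here is a stand-alone check of the worked example from the comment (block #8 at order 1), again plain userspace C rather than kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 8, order = 1;

	/* __page_find_buddy: B2 = B1 ^ (1 << O)  ->  8 ^ 2 = 10 */
	unsigned long buddy_idx = page_idx ^ (1UL << order);
	/* __find_combined_index: P = B & ~(1 << O)  ->  8 & ~2 = 8 */
	unsigned long combined_idx = page_idx & ~(1UL << order);

	printf("buddy_idx=%lu combined_idx=%lu\n", buddy_idx, combined_idx);
	return 0;
}
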
327 327
328 /* 328 /*
329 * This function checks whether a page is free && is the buddy 329 * This function checks whether a page is free && is the buddy
330 * we can do coalesce a page and its buddy if 330 * we can do coalesce a page and its buddy if
331 * (a) the buddy is not in a hole && 331 * (a) the buddy is not in a hole &&
332 * (b) the buddy is in the buddy system && 332 * (b) the buddy is in the buddy system &&
333 * (c) a page and its buddy have the same order && 333 * (c) a page and its buddy have the same order &&
334 * (d) a page and its buddy are in the same zone. 334 * (d) a page and its buddy are in the same zone.
335 * 335 *
336 * For recording whether a page is in the buddy system, we use PG_buddy. 336 * For recording whether a page is in the buddy system, we use PG_buddy.
337 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 337 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
338 * 338 *
339 * For recording page's order, we use page_private(page). 339 * For recording page's order, we use page_private(page).
340 */ 340 */
341 static inline int page_is_buddy(struct page *page, struct page *buddy, 341 static inline int page_is_buddy(struct page *page, struct page *buddy,
342 int order) 342 int order)
343 { 343 {
344 #ifdef CONFIG_HOLES_IN_ZONE 344 #ifdef CONFIG_HOLES_IN_ZONE
345 if (!pfn_valid(page_to_pfn(buddy))) 345 if (!pfn_valid(page_to_pfn(buddy)))
346 return 0; 346 return 0;
347 #endif 347 #endif
348 348
349 if (page_zone_id(page) != page_zone_id(buddy)) 349 if (page_zone_id(page) != page_zone_id(buddy))
350 return 0; 350 return 0;
351 351
352 if (PageBuddy(buddy) && page_order(buddy) == order) { 352 if (PageBuddy(buddy) && page_order(buddy) == order) {
353 BUG_ON(page_count(buddy) != 0); 353 BUG_ON(page_count(buddy) != 0);
354 return 1; 354 return 1;
355 } 355 }
356 return 0; 356 return 0;
357 } 357 }
358 358
359 /* 359 /*
360 * Freeing function for a buddy system allocator. 360 * Freeing function for a buddy system allocator.
361 * 361 *
362 * The concept of a buddy system is to maintain direct-mapped table 362 * The concept of a buddy system is to maintain direct-mapped table
363 * (containing bit values) for memory blocks of various "orders". 363 * (containing bit values) for memory blocks of various "orders".
364 * The bottom level table contains the map for the smallest allocatable 364 * The bottom level table contains the map for the smallest allocatable
365 * units of memory (here, pages), and each level above it describes 365 * units of memory (here, pages), and each level above it describes
366 * pairs of units from the levels below, hence, "buddies". 366 * pairs of units from the levels below, hence, "buddies".
367 * At a high level, all that happens here is marking the table entry 367 * At a high level, all that happens here is marking the table entry
368 * at the bottom level available, and propagating the changes upward 368 * at the bottom level available, and propagating the changes upward
369 * as necessary, plus some accounting needed to play nicely with other 369 * as necessary, plus some accounting needed to play nicely with other
370 * parts of the VM system. 370 * parts of the VM system.
371 * At each level, we keep a list of pages, which are heads of continuous 371 * At each level, we keep a list of pages, which are heads of continuous
372 * free pages of length of (1 << order) and marked with PG_buddy. Page's 372 * free pages of length of (1 << order) and marked with PG_buddy. Page's
373 * order is recorded in page_private(page) field. 373 * order is recorded in page_private(page) field.
374 * So when we are allocating or freeing one, we can derive the state of the 374 * So when we are allocating or freeing one, we can derive the state of the
375 * other. That is, if we allocate a small block, and both were 375 * other. That is, if we allocate a small block, and both were
376 * free, the remainder of the region must be split into blocks. 376 * free, the remainder of the region must be split into blocks.
377 * If a block is freed, and its buddy is also free, then this 377 * If a block is freed, and its buddy is also free, then this
378 * triggers coalescing into a block of larger size. 378 * triggers coalescing into a block of larger size.
379 * 379 *
380 * -- wli 380 * -- wli
381 */ 381 */
382 382
383 static inline void __free_one_page(struct page *page, 383 static inline void __free_one_page(struct page *page,
384 struct zone *zone, unsigned int order) 384 struct zone *zone, unsigned int order)
385 { 385 {
386 unsigned long page_idx; 386 unsigned long page_idx;
387 int order_size = 1 << order; 387 int order_size = 1 << order;
388 388
389 if (unlikely(PageCompound(page))) 389 if (unlikely(PageCompound(page)))
390 destroy_compound_page(page, order); 390 destroy_compound_page(page, order);
391 391
392 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 392 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
393 393
394 VM_BUG_ON(page_idx & (order_size - 1)); 394 VM_BUG_ON(page_idx & (order_size - 1));
395 VM_BUG_ON(bad_range(zone, page)); 395 VM_BUG_ON(bad_range(zone, page));
396 396
397 zone->free_pages += order_size; 397 zone->free_pages += order_size;
398 while (order < MAX_ORDER-1) { 398 while (order < MAX_ORDER-1) {
399 unsigned long combined_idx; 399 unsigned long combined_idx;
400 struct free_area *area; 400 struct free_area *area;
401 struct page *buddy; 401 struct page *buddy;
402 402
403 buddy = __page_find_buddy(page, page_idx, order); 403 buddy = __page_find_buddy(page, page_idx, order);
404 if (!page_is_buddy(page, buddy, order)) 404 if (!page_is_buddy(page, buddy, order))
405 break; /* Move the buddy up one level. */ 405 break; /* Move the buddy up one level. */
406 406
407 list_del(&buddy->lru); 407 list_del(&buddy->lru);
408 area = zone->free_area + order; 408 area = zone->free_area + order;
409 area->nr_free--; 409 area->nr_free--;
410 rmv_page_order(buddy); 410 rmv_page_order(buddy);
411 combined_idx = __find_combined_index(page_idx, order); 411 combined_idx = __find_combined_index(page_idx, order);
412 page = page + (combined_idx - page_idx); 412 page = page + (combined_idx - page_idx);
413 page_idx = combined_idx; 413 page_idx = combined_idx;
414 order++; 414 order++;
415 } 415 }
416 set_page_order(page, order); 416 set_page_order(page, order);
417 list_add(&page->lru, &zone->free_area[order].free_list); 417 list_add(&page->lru, &zone->free_area[order].free_list);
418 zone->free_area[order].nr_free++; 418 zone->free_area[order].nr_free++;
419 } 419 }
420 420
421 static inline int free_pages_check(struct page *page) 421 static inline int free_pages_check(struct page *page)
422 { 422 {
423 if (unlikely(page_mapcount(page) | 423 if (unlikely(page_mapcount(page) |
424 (page->mapping != NULL) | 424 (page->mapping != NULL) |
425 (page_count(page) != 0) | 425 (page_count(page) != 0) |
426 (page->flags & ( 426 (page->flags & (
427 1 << PG_lru | 427 1 << PG_lru |
428 1 << PG_private | 428 1 << PG_private |
429 1 << PG_locked | 429 1 << PG_locked |
430 1 << PG_active | 430 1 << PG_active |
431 1 << PG_reclaim | 431 1 << PG_reclaim |
432 1 << PG_slab | 432 1 << PG_slab |
433 1 << PG_swapcache | 433 1 << PG_swapcache |
434 1 << PG_writeback | 434 1 << PG_writeback |
435 1 << PG_reserved | 435 1 << PG_reserved |
436 1 << PG_buddy )))) 436 1 << PG_buddy ))))
437 bad_page(page); 437 bad_page(page);
438 if (PageDirty(page)) 438 if (PageDirty(page))
439 __ClearPageDirty(page); 439 __ClearPageDirty(page);
440 /* 440 /*
441 * For now, we report if PG_reserved was found set, but do not 441 * For now, we report if PG_reserved was found set, but do not
442 * clear it, and do not free the page. But we shall soon need 442 * clear it, and do not free the page. But we shall soon need
443 * to do more, for when the ZERO_PAGE count wraps negative. 443 * to do more, for when the ZERO_PAGE count wraps negative.
444 */ 444 */
445 return PageReserved(page); 445 return PageReserved(page);
446 } 446 }
447 447
448 /* 448 /*
449 * Frees a list of pages. 449 * Frees a list of pages.
450 * Assumes all pages on list are in same zone, and of same order. 450 * Assumes all pages on list are in same zone, and of same order.
451 * count is the number of pages to free. 451 * count is the number of pages to free.
452 * 452 *
453 * If the zone was previously in an "all pages pinned" state then look to 453 * If the zone was previously in an "all pages pinned" state then look to
454 * see if this freeing clears that state. 454 * see if this freeing clears that state.
455 * 455 *
456 * And clear the zone's pages_scanned counter, to hold off the "all pages are 456 * And clear the zone's pages_scanned counter, to hold off the "all pages are
457 * pinned" detection logic. 457 * pinned" detection logic.
458 */ 458 */
459 static void free_pages_bulk(struct zone *zone, int count, 459 static void free_pages_bulk(struct zone *zone, int count,
460 struct list_head *list, int order) 460 struct list_head *list, int order)
461 { 461 {
462 spin_lock(&zone->lock); 462 spin_lock(&zone->lock);
463 zone->all_unreclaimable = 0; 463 zone->all_unreclaimable = 0;
464 zone->pages_scanned = 0; 464 zone->pages_scanned = 0;
465 while (count--) { 465 while (count--) {
466 struct page *page; 466 struct page *page;
467 467
468 VM_BUG_ON(list_empty(list)); 468 VM_BUG_ON(list_empty(list));
469 page = list_entry(list->prev, struct page, lru); 469 page = list_entry(list->prev, struct page, lru);
470 /* have to delete it as __free_one_page list manipulates */ 470 /* have to delete it as __free_one_page list manipulates */
471 list_del(&page->lru); 471 list_del(&page->lru);
472 __free_one_page(page, zone, order); 472 __free_one_page(page, zone, order);
473 } 473 }
474 spin_unlock(&zone->lock); 474 spin_unlock(&zone->lock);
475 } 475 }
476 476
477 static void free_one_page(struct zone *zone, struct page *page, int order) 477 static void free_one_page(struct zone *zone, struct page *page, int order)
478 { 478 {
479 spin_lock(&zone->lock); 479 spin_lock(&zone->lock);
480 zone->all_unreclaimable = 0; 480 zone->all_unreclaimable = 0;
481 zone->pages_scanned = 0; 481 zone->pages_scanned = 0;
482 __free_one_page(page, zone, order); 482 __free_one_page(page, zone, order);
483 spin_unlock(&zone->lock); 483 spin_unlock(&zone->lock);
484 } 484 }
485 485
486 static void __free_pages_ok(struct page *page, unsigned int order) 486 static void __free_pages_ok(struct page *page, unsigned int order)
487 { 487 {
488 unsigned long flags; 488 unsigned long flags;
489 int i; 489 int i;
490 int reserved = 0; 490 int reserved = 0;
491 491
492 for (i = 0 ; i < (1 << order) ; ++i) 492 for (i = 0 ; i < (1 << order) ; ++i)
493 reserved += free_pages_check(page + i); 493 reserved += free_pages_check(page + i);
494 if (reserved) 494 if (reserved)
495 return; 495 return;
496 496
497 if (!PageHighMem(page)) 497 if (!PageHighMem(page))
498 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 498 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
499 arch_free_page(page, order); 499 arch_free_page(page, order);
500 kernel_map_pages(page, 1 << order, 0); 500 kernel_map_pages(page, 1 << order, 0);
501 501
502 local_irq_save(flags); 502 local_irq_save(flags);
503 __count_vm_events(PGFREE, 1 << order); 503 __count_vm_events(PGFREE, 1 << order);
504 free_one_page(page_zone(page), page, order); 504 free_one_page(page_zone(page), page, order);
505 local_irq_restore(flags); 505 local_irq_restore(flags);
506 } 506 }
507 507
508 /* 508 /*
509 * permit the bootmem allocator to evade page validation on high-order frees 509 * permit the bootmem allocator to evade page validation on high-order frees
510 */ 510 */
511 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 511 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
512 { 512 {
513 if (order == 0) { 513 if (order == 0) {
514 __ClearPageReserved(page); 514 __ClearPageReserved(page);
515 set_page_count(page, 0); 515 set_page_count(page, 0);
516 set_page_refcounted(page); 516 set_page_refcounted(page);
517 __free_page(page); 517 __free_page(page);
518 } else { 518 } else {
519 int loop; 519 int loop;
520 520
521 prefetchw(page); 521 prefetchw(page);
522 for (loop = 0; loop < BITS_PER_LONG; loop++) { 522 for (loop = 0; loop < BITS_PER_LONG; loop++) {
523 struct page *p = &page[loop]; 523 struct page *p = &page[loop];
524 524
525 if (loop + 1 < BITS_PER_LONG) 525 if (loop + 1 < BITS_PER_LONG)
526 prefetchw(p + 1); 526 prefetchw(p + 1);
527 __ClearPageReserved(p); 527 __ClearPageReserved(p);
528 set_page_count(p, 0); 528 set_page_count(p, 0);
529 } 529 }
530 530
531 set_page_refcounted(page); 531 set_page_refcounted(page);
532 __free_pages(page, order); 532 __free_pages(page, order);
533 } 533 }
534 } 534 }
535 535
536 536
537 /* 537 /*
538 * The order of subdivision here is critical for the IO subsystem. 538 * The order of subdivision here is critical for the IO subsystem.
539 * Please do not alter this order without good reasons and regression 539 * Please do not alter this order without good reasons and regression
540 * testing. Specifically, as large blocks of memory are subdivided, 540 * testing. Specifically, as large blocks of memory are subdivided,
541 * the order in which smaller blocks are delivered depends on the order 541 * the order in which smaller blocks are delivered depends on the order
542 * they're subdivided in this function. This is the primary factor 542 * they're subdivided in this function. This is the primary factor
543 * influencing the order in which pages are delivered to the IO 543 * influencing the order in which pages are delivered to the IO
544 * subsystem according to empirical testing, and this is also justified 544 * subsystem according to empirical testing, and this is also justified
545 * by considering the behavior of a buddy system containing a single 545 * by considering the behavior of a buddy system containing a single
546 * large block of memory acted on by a series of small allocations. 546 * large block of memory acted on by a series of small allocations.
547 * This behavior is a critical factor in sglist merging's success. 547 * This behavior is a critical factor in sglist merging's success.
548 * 548 *
549 * -- wli 549 * -- wli
550 */ 550 */
551 static inline void expand(struct zone *zone, struct page *page, 551 static inline void expand(struct zone *zone, struct page *page,
552 int low, int high, struct free_area *area) 552 int low, int high, struct free_area *area)
553 { 553 {
554 unsigned long size = 1 << high; 554 unsigned long size = 1 << high;
555 555
556 while (high > low) { 556 while (high > low) {
557 area--; 557 area--;
558 high--; 558 high--;
559 size >>= 1; 559 size >>= 1;
560 VM_BUG_ON(bad_range(zone, &page[size])); 560 VM_BUG_ON(bad_range(zone, &page[size]));
561 list_add(&page[size].lru, &area->free_list); 561 list_add(&page[size].lru, &area->free_list);
562 area->nr_free++; 562 area->nr_free++;
563 set_page_order(&page[size], high); 563 set_page_order(&page[size], high);
564 } 564 }
565 } 565 }
566 566
567 /* 567 /*
568 * This page is about to be returned from the page allocator 568 * This page is about to be returned from the page allocator
569 */ 569 */
570 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 570 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
571 { 571 {
572 if (unlikely(page_mapcount(page) | 572 if (unlikely(page_mapcount(page) |
573 (page->mapping != NULL) | 573 (page->mapping != NULL) |
574 (page_count(page) != 0) | 574 (page_count(page) != 0) |
575 (page->flags & ( 575 (page->flags & (
576 1 << PG_lru | 576 1 << PG_lru |
577 1 << PG_private | 577 1 << PG_private |
578 1 << PG_locked | 578 1 << PG_locked |
579 1 << PG_active | 579 1 << PG_active |
580 1 << PG_dirty | 580 1 << PG_dirty |
581 1 << PG_reclaim | 581 1 << PG_reclaim |
582 1 << PG_slab | 582 1 << PG_slab |
583 1 << PG_swapcache | 583 1 << PG_swapcache |
584 1 << PG_writeback | 584 1 << PG_writeback |
585 1 << PG_reserved | 585 1 << PG_reserved |
586 1 << PG_buddy )))) 586 1 << PG_buddy ))))
587 bad_page(page); 587 bad_page(page);
588 588
589 /* 589 /*
590 * For now, we report if PG_reserved was found set, but do not 590 * For now, we report if PG_reserved was found set, but do not
591 * clear it, and do not allocate the page: as a safety net. 591 * clear it, and do not allocate the page: as a safety net.
592 */ 592 */
593 if (PageReserved(page)) 593 if (PageReserved(page))
594 return 1; 594 return 1;
595 595
596 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 596 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
597 1 << PG_referenced | 1 << PG_arch_1 | 597 1 << PG_referenced | 1 << PG_arch_1 |
598 1 << PG_checked | 1 << PG_mappedtodisk); 598 1 << PG_checked | 1 << PG_mappedtodisk);
599 set_page_private(page, 0); 599 set_page_private(page, 0);
600 set_page_refcounted(page); 600 set_page_refcounted(page);
601 601
602 arch_alloc_page(page, order); 602 arch_alloc_page(page, order);
603 kernel_map_pages(page, 1 << order, 1); 603 kernel_map_pages(page, 1 << order, 1);
604 604
605 if (gfp_flags & __GFP_ZERO) 605 if (gfp_flags & __GFP_ZERO)
606 prep_zero_page(page, order, gfp_flags); 606 prep_zero_page(page, order, gfp_flags);
607 607
608 if (order && (gfp_flags & __GFP_COMP)) 608 if (order && (gfp_flags & __GFP_COMP))
609 prep_compound_page(page, order); 609 prep_compound_page(page, order);
610 610
611 return 0; 611 return 0;
612 } 612 }
613 613
614 /* 614 /*
615 * Do the hard work of removing an element from the buddy allocator. 615 * Do the hard work of removing an element from the buddy allocator.
616 * Call me with the zone->lock already held. 616 * Call me with the zone->lock already held.
617 */ 617 */
618 static struct page *__rmqueue(struct zone *zone, unsigned int order) 618 static struct page *__rmqueue(struct zone *zone, unsigned int order)
619 { 619 {
620 struct free_area * area; 620 struct free_area * area;
621 unsigned int current_order; 621 unsigned int current_order;
622 struct page *page; 622 struct page *page;
623 623
624 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 624 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
625 area = zone->free_area + current_order; 625 area = zone->free_area + current_order;
626 if (list_empty(&area->free_list)) 626 if (list_empty(&area->free_list))
627 continue; 627 continue;
628 628
629 page = list_entry(area->free_list.next, struct page, lru); 629 page = list_entry(area->free_list.next, struct page, lru);
630 list_del(&page->lru); 630 list_del(&page->lru);
631 rmv_page_order(page); 631 rmv_page_order(page);
632 area->nr_free--; 632 area->nr_free--;
633 zone->free_pages -= 1UL << order; 633 zone->free_pages -= 1UL << order;
634 expand(zone, page, order, current_order, area); 634 expand(zone, page, order, current_order, area);
635 return page; 635 return page;
636 } 636 }
637 637
638 return NULL; 638 return NULL;
639 } 639 }
640 640
641 /* 641 /*
642 * Obtain a specified number of elements from the buddy allocator, all under 642 * Obtain a specified number of elements from the buddy allocator, all under
643 * a single hold of the lock, for efficiency. Add them to the supplied list. 643 * a single hold of the lock, for efficiency. Add them to the supplied list.
644 * Returns the number of new pages which were placed at *list. 644 * Returns the number of new pages which were placed at *list.
645 */ 645 */
646 static int rmqueue_bulk(struct zone *zone, unsigned int order, 646 static int rmqueue_bulk(struct zone *zone, unsigned int order,
647 unsigned long count, struct list_head *list) 647 unsigned long count, struct list_head *list)
648 { 648 {
649 int i; 649 int i;
650 650
651 spin_lock(&zone->lock); 651 spin_lock(&zone->lock);
652 for (i = 0; i < count; ++i) { 652 for (i = 0; i < count; ++i) {
653 struct page *page = __rmqueue(zone, order); 653 struct page *page = __rmqueue(zone, order);
654 if (unlikely(page == NULL)) 654 if (unlikely(page == NULL))
655 break; 655 break;
656 list_add_tail(&page->lru, list); 656 list_add_tail(&page->lru, list);
657 } 657 }
658 spin_unlock(&zone->lock); 658 spin_unlock(&zone->lock);
659 return i; 659 return i;
660 } 660 }
661 661
662 #ifdef CONFIG_NUMA 662 #ifdef CONFIG_NUMA
663 /* 663 /*
664 * Called from the slab reaper to drain pagesets on a particular node that 664 * Called from the slab reaper to drain pagesets on a particular node that
665 * belongs to the currently executing processor. 665 * belongs to the currently executing processor.
666 * Note that this function must be called with the thread pinned to 666 * Note that this function must be called with the thread pinned to
667 * a single processor. 667 * a single processor.
668 */ 668 */
669 void drain_node_pages(int nodeid) 669 void drain_node_pages(int nodeid)
670 { 670 {
671 int i; 671 int i;
672 enum zone_type z; 672 enum zone_type z;
673 unsigned long flags; 673 unsigned long flags;
674 674
675 for (z = 0; z < MAX_NR_ZONES; z++) { 675 for (z = 0; z < MAX_NR_ZONES; z++) {
676 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 676 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
677 struct per_cpu_pageset *pset; 677 struct per_cpu_pageset *pset;
678 678
679 if (!populated_zone(zone)) 679 if (!populated_zone(zone))
680 continue; 680 continue;
681 681
682 pset = zone_pcp(zone, smp_processor_id()); 682 pset = zone_pcp(zone, smp_processor_id());
683 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 683 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
684 struct per_cpu_pages *pcp; 684 struct per_cpu_pages *pcp;
685 685
686 pcp = &pset->pcp[i]; 686 pcp = &pset->pcp[i];
687 if (pcp->count) { 687 if (pcp->count) {
688 int to_drain; 688 int to_drain;
689 689
690 local_irq_save(flags); 690 local_irq_save(flags);
691 if (pcp->count >= pcp->batch) 691 if (pcp->count >= pcp->batch)
692 to_drain = pcp->batch; 692 to_drain = pcp->batch;
693 else 693 else
694 to_drain = pcp->count; 694 to_drain = pcp->count;
695 free_pages_bulk(zone, to_drain, &pcp->list, 0); 695 free_pages_bulk(zone, to_drain, &pcp->list, 0);
696 pcp->count -= to_drain; 696 pcp->count -= to_drain;
697 local_irq_restore(flags); 697 local_irq_restore(flags);
698 } 698 }
699 } 699 }
700 } 700 }
701 } 701 }
702 #endif 702 #endif
703 703
704 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 704 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
705 static void __drain_pages(unsigned int cpu) 705 static void __drain_pages(unsigned int cpu)
706 { 706 {
707 unsigned long flags; 707 unsigned long flags;
708 struct zone *zone; 708 struct zone *zone;
709 int i; 709 int i;
710 710
711 for_each_zone(zone) { 711 for_each_zone(zone) {
712 struct per_cpu_pageset *pset; 712 struct per_cpu_pageset *pset;
713 713
714 pset = zone_pcp(zone, cpu); 714 pset = zone_pcp(zone, cpu);
715 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 715 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
716 struct per_cpu_pages *pcp; 716 struct per_cpu_pages *pcp;
717 717
718 pcp = &pset->pcp[i]; 718 pcp = &pset->pcp[i];
719 local_irq_save(flags); 719 local_irq_save(flags);
720 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 720 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
721 pcp->count = 0; 721 pcp->count = 0;
722 local_irq_restore(flags); 722 local_irq_restore(flags);
723 } 723 }
724 } 724 }
725 } 725 }
726 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 726 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
727 727
728 #ifdef CONFIG_PM 728 #ifdef CONFIG_PM
729 729
730 void mark_free_pages(struct zone *zone) 730 void mark_free_pages(struct zone *zone)
731 { 731 {
732 unsigned long pfn, max_zone_pfn; 732 unsigned long pfn, max_zone_pfn;
733 unsigned long flags; 733 unsigned long flags;
734 int order; 734 int order;
735 struct list_head *curr; 735 struct list_head *curr;
736 736
737 if (!zone->spanned_pages) 737 if (!zone->spanned_pages)
738 return; 738 return;
739 739
740 spin_lock_irqsave(&zone->lock, flags); 740 spin_lock_irqsave(&zone->lock, flags);
741 741
742 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 742 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
743 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 743 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
744 if (pfn_valid(pfn)) { 744 if (pfn_valid(pfn)) {
745 struct page *page = pfn_to_page(pfn); 745 struct page *page = pfn_to_page(pfn);
746 746
747 if (!PageNosave(page)) 747 if (!PageNosave(page))
748 ClearPageNosaveFree(page); 748 ClearPageNosaveFree(page);
749 } 749 }
750 750
751 for (order = MAX_ORDER - 1; order >= 0; --order) 751 for (order = MAX_ORDER - 1; order >= 0; --order)
752 list_for_each(curr, &zone->free_area[order].free_list) { 752 list_for_each(curr, &zone->free_area[order].free_list) {
753 unsigned long i; 753 unsigned long i;
754 754
755 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 755 pfn = page_to_pfn(list_entry(curr, struct page, lru));
756 for (i = 0; i < (1UL << order); i++) 756 for (i = 0; i < (1UL << order); i++)
757 SetPageNosaveFree(pfn_to_page(pfn + i)); 757 SetPageNosaveFree(pfn_to_page(pfn + i));
758 } 758 }
759 759
760 spin_unlock_irqrestore(&zone->lock, flags); 760 spin_unlock_irqrestore(&zone->lock, flags);
761 } 761 }
762 762
763 /* 763 /*
764 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 764 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
765 */ 765 */
766 void drain_local_pages(void) 766 void drain_local_pages(void)
767 { 767 {
768 unsigned long flags; 768 unsigned long flags;
769 769
770 local_irq_save(flags); 770 local_irq_save(flags);
771 __drain_pages(smp_processor_id()); 771 __drain_pages(smp_processor_id());
772 local_irq_restore(flags); 772 local_irq_restore(flags);
773 } 773 }
774 #endif /* CONFIG_PM */ 774 #endif /* CONFIG_PM */
775 775
776 /* 776 /*
777 * Free a 0-order page 777 * Free a 0-order page
778 */ 778 */
779 static void fastcall free_hot_cold_page(struct page *page, int cold) 779 static void fastcall free_hot_cold_page(struct page *page, int cold)
780 { 780 {
781 struct zone *zone = page_zone(page); 781 struct zone *zone = page_zone(page);
782 struct per_cpu_pages *pcp; 782 struct per_cpu_pages *pcp;
783 unsigned long flags; 783 unsigned long flags;
784 784
785 if (PageAnon(page)) 785 if (PageAnon(page))
786 page->mapping = NULL; 786 page->mapping = NULL;
787 if (free_pages_check(page)) 787 if (free_pages_check(page))
788 return; 788 return;
789 789
790 if (!PageHighMem(page)) 790 if (!PageHighMem(page))
791 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 791 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
792 arch_free_page(page, 0); 792 arch_free_page(page, 0);
793 kernel_map_pages(page, 1, 0); 793 kernel_map_pages(page, 1, 0);
794 794
795 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 795 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
796 local_irq_save(flags); 796 local_irq_save(flags);
797 __count_vm_event(PGFREE); 797 __count_vm_event(PGFREE);
798 list_add(&page->lru, &pcp->list); 798 list_add(&page->lru, &pcp->list);
799 pcp->count++; 799 pcp->count++;
800 if (pcp->count >= pcp->high) { 800 if (pcp->count >= pcp->high) {
801 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 801 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
802 pcp->count -= pcp->batch; 802 pcp->count -= pcp->batch;
803 } 803 }
804 local_irq_restore(flags); 804 local_irq_restore(flags);
805 put_cpu(); 805 put_cpu();
806 } 806 }
807 807
808 void fastcall free_hot_page(struct page *page) 808 void fastcall free_hot_page(struct page *page)
809 { 809 {
810 free_hot_cold_page(page, 0); 810 free_hot_cold_page(page, 0);
811 } 811 }
812 812
813 void fastcall free_cold_page(struct page *page) 813 void fastcall free_cold_page(struct page *page)
814 { 814 {
815 free_hot_cold_page(page, 1); 815 free_hot_cold_page(page, 1);
816 } 816 }
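As a hedged aside (not part of the kernel source shown here): the hot/cold argument simply selects between the two per-cpu lists, pcp[0] and pcp[1]. A minimal sketch, with invented page variables:

/* Sketch only -- the page variables are invented for illustration.
 * A page whose data was just touched goes back cache-hot (pcp[0]); a page
 * that was never read, e.g. from torn-down readahead, goes back cold
 * (pcp[1]), which allocations passing __GFP_COLD will prefer. */
free_hot_page(just_written_page);
free_cold_page(never_touched_page);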
817 817
818 /* 818 /*
819 * split_page takes a non-compound higher-order page, and splits it into 819 * split_page takes a non-compound higher-order page, and splits it into
820 * n (1<<order) sub-pages: page[0..n-1] 820 * n (1<<order) sub-pages: page[0..n-1]
821 * Each sub-page must be freed individually. 821 * Each sub-page must be freed individually.
822 * 822 *
823 * Note: this is probably too low level an operation for use in drivers. 823 * Note: this is probably too low level an operation for use in drivers.
824 * Please consult with lkml before using this in your driver. 824 * Please consult with lkml before using this in your driver.
825 */ 825 */
826 void split_page(struct page *page, unsigned int order) 826 void split_page(struct page *page, unsigned int order)
827 { 827 {
828 int i; 828 int i;
829 829
830 VM_BUG_ON(PageCompound(page)); 830 VM_BUG_ON(PageCompound(page));
831 VM_BUG_ON(!page_count(page)); 831 VM_BUG_ON(!page_count(page));
832 for (i = 1; i < (1 << order); i++) 832 for (i = 1; i < (1 << order); i++)
833 set_page_refcounted(page + i); 833 set_page_refcounted(page + i);
834 } 834 }
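A hedged usage sketch of split_page(); the order-2 allocation and the error handling are invented for illustration:

/* Sketch only: allocate an order-2 block, split it, then free each
 * 0-order sub-page individually, as required by the comment above. */
struct page *page = alloc_pages(GFP_KERNEL, 2);

if (page) {
        int i;

        split_page(page, 2);            /* page[0..3] each get a refcount */
        for (i = 0; i < (1 << 2); i++)
                __free_page(page + i);  /* must be freed one by one */
}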
835 835
836 /* 836 /*
837 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 837 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
838 * we cheat by calling it from here, in the order > 0 path. Saves a branch 838 * we cheat by calling it from here, in the order > 0 path. Saves a branch
839 * or two. 839 * or two.
840 */ 840 */
841 static struct page *buffered_rmqueue(struct zonelist *zonelist, 841 static struct page *buffered_rmqueue(struct zonelist *zonelist,
842 struct zone *zone, int order, gfp_t gfp_flags) 842 struct zone *zone, int order, gfp_t gfp_flags)
843 { 843 {
844 unsigned long flags; 844 unsigned long flags;
845 struct page *page; 845 struct page *page;
846 int cold = !!(gfp_flags & __GFP_COLD); 846 int cold = !!(gfp_flags & __GFP_COLD);
847 int cpu; 847 int cpu;
848 848
849 again: 849 again:
850 cpu = get_cpu(); 850 cpu = get_cpu();
851 if (likely(order == 0)) { 851 if (likely(order == 0)) {
852 struct per_cpu_pages *pcp; 852 struct per_cpu_pages *pcp;
853 853
854 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 854 pcp = &zone_pcp(zone, cpu)->pcp[cold];
855 local_irq_save(flags); 855 local_irq_save(flags);
856 if (!pcp->count) { 856 if (!pcp->count) {
857 pcp->count = rmqueue_bulk(zone, 0, 857 pcp->count = rmqueue_bulk(zone, 0,
858 pcp->batch, &pcp->list); 858 pcp->batch, &pcp->list);
859 if (unlikely(!pcp->count)) 859 if (unlikely(!pcp->count))
860 goto failed; 860 goto failed;
861 } 861 }
862 page = list_entry(pcp->list.next, struct page, lru); 862 page = list_entry(pcp->list.next, struct page, lru);
863 list_del(&page->lru); 863 list_del(&page->lru);
864 pcp->count--; 864 pcp->count--;
865 } else { 865 } else {
866 spin_lock_irqsave(&zone->lock, flags); 866 spin_lock_irqsave(&zone->lock, flags);
867 page = __rmqueue(zone, order); 867 page = __rmqueue(zone, order);
868 spin_unlock(&zone->lock); 868 spin_unlock(&zone->lock);
869 if (!page) 869 if (!page)
870 goto failed; 870 goto failed;
871 } 871 }
872 872
873 __count_zone_vm_events(PGALLOC, zone, 1 << order); 873 __count_zone_vm_events(PGALLOC, zone, 1 << order);
874 zone_statistics(zonelist, zone); 874 zone_statistics(zonelist, zone);
875 local_irq_restore(flags); 875 local_irq_restore(flags);
876 put_cpu(); 876 put_cpu();
877 877
878 VM_BUG_ON(bad_range(zone, page)); 878 VM_BUG_ON(bad_range(zone, page));
879 if (prep_new_page(page, order, gfp_flags)) 879 if (prep_new_page(page, order, gfp_flags))
880 goto again; 880 goto again;
881 return page; 881 return page;
882 882
883 failed: 883 failed:
884 local_irq_restore(flags); 884 local_irq_restore(flags);
885 put_cpu(); 885 put_cpu();
886 return NULL; 886 return NULL;
887 } 887 }
888 888
889 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 889 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
890 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 890 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
891 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 891 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
892 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 892 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
893 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 893 #define ALLOC_HARDER 0x10 /* try to alloc harder */
894 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 894 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
895 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 895 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
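For orientation, a short example of how these flags combine in practice; this is only a restatement of the flag-setting logic in __alloc_pages() further down, not new behaviour:

/* Example combinations (see __alloc_pages() below):
 *   first, optimistic pass:        ALLOC_WMARK_LOW | ALLOC_CPUSET
 *   ordinary GFP_KERNEL request:   ALLOC_WMARK_MIN | ALLOC_CPUSET
 *   GFP_ATOMIC (!wait, __GFP_HIGH): ALLOC_WMARK_MIN | ALLOC_HARDER | ALLOC_HIGH
 *   PF_MEMALLOC / TIF_MEMDIE last resort: ALLOC_NO_WATERMARKS            */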
896 896
897 /* 897 /*
898 * Return 1 if free pages are above 'mark'. This takes into account the order 898 * Return 1 if free pages are above 'mark'. This takes into account the order
899 * of the allocation. 899 * of the allocation.
900 */ 900 */
901 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 901 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
902 int classzone_idx, int alloc_flags) 902 int classzone_idx, int alloc_flags)
903 { 903 {
904 /* free_pages may go negative - that's OK */ 904 /* free_pages may go negative - that's OK */
905 unsigned long min = mark; 905 unsigned long min = mark;
906 long free_pages = z->free_pages - (1 << order) + 1; 906 long free_pages = z->free_pages - (1 << order) + 1;
907 int o; 907 int o;
908 908
909 if (alloc_flags & ALLOC_HIGH) 909 if (alloc_flags & ALLOC_HIGH)
910 min -= min / 2; 910 min -= min / 2;
911 if (alloc_flags & ALLOC_HARDER) 911 if (alloc_flags & ALLOC_HARDER)
912 min -= min / 4; 912 min -= min / 4;
913 913
914 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 914 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
915 return 0; 915 return 0;
916 for (o = 0; o < order; o++) { 916 for (o = 0; o < order; o++) {
917 /* At the next order, this order's pages become unavailable */ 917 /* At the next order, this order's pages become unavailable */
918 free_pages -= z->free_area[o].nr_free << o; 918 free_pages -= z->free_area[o].nr_free << o;
919 919
920 /* Require fewer higher order pages to be free */ 920 /* Require fewer higher order pages to be free */
921 min >>= 1; 921 min >>= 1;
922 922
923 if (free_pages <= min) 923 if (free_pages <= min)
924 return 0; 924 return 0;
925 } 925 }
926 return 1; 926 return 1;
927 } 927 }
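To make the order loop above concrete, a worked example with invented numbers:

/* Worked example (numbers invented): mark = 128, order = 2, no ALLOC_HIGH
 * or ALLOC_HARDER, lowmem_reserve = 0, z->free_pages = 200, of which 100
 * pages are free at order 0 and 20 pages (10 blocks) at order 1.
 *
 *   free_pages = 200 - (1 << 2) + 1 = 197 > 128              -> keep going
 *   o = 0: free_pages -= 100 << 0 -> 97,  min = 64, 97 > 64  -> keep going
 *   o = 1: free_pages -= 10  << 1 -> 77,  min = 32, 77 > 32  -> keep going
 *
 * The loop ends at o == order, so the function returns 1: enough memory
 * remains in blocks of order >= 2 to satisfy the request. */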
928 928
929 #ifdef CONFIG_NUMA 929 #ifdef CONFIG_NUMA
930 /* 930 /*
931 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 931 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
932 * skip over zones that are not allowed by the cpuset, or that have 932 * skip over zones that are not allowed by the cpuset, or that have
933 * been recently (in last second) found to be nearly full. See further 933 * been recently (in last second) found to be nearly full. See further
934 * comments in mmzone.h. Reduces cache footprint of zonelist scans 934 * comments in mmzone.h. Reduces cache footprint of zonelist scans
935 * that have to skip over a lot of full or unallowed zones. 935 * that have to skip over a lot of full or unallowed zones.
936 * 936 *
937 * If the zonelist cache is present in the passed in zonelist, then 937 * If the zonelist cache is present in the passed in zonelist, then
938 * returns a pointer to the allowed node mask (either the current 938 * returns a pointer to the allowed node mask (either the current
939 * task's mems_allowed, or node_online_map.) 939 * task's mems_allowed, or node_online_map.)
940 * 940 *
941 * If the zonelist cache is not available for this zonelist, does 941 * If the zonelist cache is not available for this zonelist, does
942 * nothing and returns NULL. 942 * nothing and returns NULL.
943 * 943 *
944 * If the fullzones BITMAP in the zonelist cache is stale (more than 944 * If the fullzones BITMAP in the zonelist cache is stale (more than
945 * a second since last zap'd) then we zap it out (clear its bits.) 945 * a second since last zap'd) then we zap it out (clear its bits.)
946 * 946 *
947 * We hold off even calling zlc_setup, until after we've checked the 947 * We hold off even calling zlc_setup, until after we've checked the
948 * first zone in the zonelist, on the theory that most allocations will 948 * first zone in the zonelist, on the theory that most allocations will
949 * be satisfied from that first zone, so best to examine that zone as 949 * be satisfied from that first zone, so best to examine that zone as
950 * quickly as we can. 950 * quickly as we can.
951 */ 951 */
952 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 952 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
953 { 953 {
954 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 954 struct zonelist_cache *zlc; /* cached zonelist speedup info */
955 nodemask_t *allowednodes; /* zonelist_cache approximation */ 955 nodemask_t *allowednodes; /* zonelist_cache approximation */
956 956
957 zlc = zonelist->zlcache_ptr; 957 zlc = zonelist->zlcache_ptr;
958 if (!zlc) 958 if (!zlc)
959 return NULL; 959 return NULL;
960 960
961 if (jiffies - zlc->last_full_zap > 1 * HZ) { 961 if (jiffies - zlc->last_full_zap > 1 * HZ) {
962 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 962 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
963 zlc->last_full_zap = jiffies; 963 zlc->last_full_zap = jiffies;
964 } 964 }
965 965
966 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 966 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
967 &cpuset_current_mems_allowed : 967 &cpuset_current_mems_allowed :
968 &node_online_map; 968 &node_online_map;
969 return allowednodes; 969 return allowednodes;
970 } 970 }
971 971
972 /* 972 /*
973 * Given 'z' scanning a zonelist, run a couple of quick checks to see 973 * Given 'z' scanning a zonelist, run a couple of quick checks to see
974 * if it is worth looking at further for free memory: 974 * if it is worth looking at further for free memory:
975 * 1) Check that the zone isn't thought to be full (doesn't have its 975 * 1) Check that the zone isn't thought to be full (doesn't have its
976 * bit set in the zonelist_cache fullzones BITMAP). 976 * bit set in the zonelist_cache fullzones BITMAP).
977 * 2) Check that the zone's node (obtained from the zonelist_cache 977 * 2) Check that the zone's node (obtained from the zonelist_cache
978 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 978 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
979 * Return true (non-zero) if zone is worth looking at further, or 979 * Return true (non-zero) if zone is worth looking at further, or
980 * else return false (zero) if it is not. 980 * else return false (zero) if it is not.
981 * 981 *
982 * This check -ignores- the distinction between various watermarks, 982 * This check -ignores- the distinction between various watermarks,
983 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 983 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
984 * found to be full for any variation of these watermarks, it will 984 * found to be full for any variation of these watermarks, it will
985 * be considered full for up to one second by all requests, unless 985 * be considered full for up to one second by all requests, unless
986 * we are so low on memory on all allowed nodes that we are forced 986 * we are so low on memory on all allowed nodes that we are forced
987 * into the second scan of the zonelist. 987 * into the second scan of the zonelist.
988 * 988 *
989 * In the second scan we ignore this zonelist cache and exactly 989 * In the second scan we ignore this zonelist cache and exactly
990 * apply the watermarks to all zones, even though it is slower to do so. 990 * apply the watermarks to all zones, even though it is slower to do so.
991 * We are low on memory in the second scan, and should leave no stone 991 * We are low on memory in the second scan, and should leave no stone
992 * unturned looking for a free page. 992 * unturned looking for a free page.
993 */ 993 */
994 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 994 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
995 nodemask_t *allowednodes) 995 nodemask_t *allowednodes)
996 { 996 {
997 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 997 struct zonelist_cache *zlc; /* cached zonelist speedup info */
998 int i; /* index of *z in zonelist zones */ 998 int i; /* index of *z in zonelist zones */
999 int n; /* node that zone *z is on */ 999 int n; /* node that zone *z is on */
1000 1000
1001 zlc = zonelist->zlcache_ptr; 1001 zlc = zonelist->zlcache_ptr;
1002 if (!zlc) 1002 if (!zlc)
1003 return 1; 1003 return 1;
1004 1004
1005 i = z - zonelist->zones; 1005 i = z - zonelist->zones;
1006 n = zlc->z_to_n[i]; 1006 n = zlc->z_to_n[i];
1007 1007
1008 /* This zone is worth trying if it is allowed but not full */ 1008 /* This zone is worth trying if it is allowed but not full */
1009 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1009 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Given 'z' scanning a zonelist, set the corresponding bit in 1013 * Given 'z' scanning a zonelist, set the corresponding bit in
1014 * zlc->fullzones, so that subsequent attempts to allocate a page 1014 * zlc->fullzones, so that subsequent attempts to allocate a page
1015 * from that zone don't waste time re-examining it. 1015 * from that zone don't waste time re-examining it.
1016 */ 1016 */
1017 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1017 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1018 { 1018 {
1019 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1019 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1020 int i; /* index of *z in zonelist zones */ 1020 int i; /* index of *z in zonelist zones */
1021 1021
1022 zlc = zonelist->zlcache_ptr; 1022 zlc = zonelist->zlcache_ptr;
1023 if (!zlc) 1023 if (!zlc)
1024 return; 1024 return;
1025 1025
1026 i = z - zonelist->zones; 1026 i = z - zonelist->zones;
1027 1027
1028 set_bit(i, zlc->fullzones); 1028 set_bit(i, zlc->fullzones);
1029 } 1029 }
1030 1030
1031 #else /* CONFIG_NUMA */ 1031 #else /* CONFIG_NUMA */
1032 1032
1033 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1033 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1034 { 1034 {
1035 return NULL; 1035 return NULL;
1036 } 1036 }
1037 1037
1038 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1038 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1039 nodemask_t *allowednodes) 1039 nodemask_t *allowednodes)
1040 { 1040 {
1041 return 1; 1041 return 1;
1042 } 1042 }
1043 1043
1044 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1044 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1045 { 1045 {
1046 } 1046 }
1047 #endif /* CONFIG_NUMA */ 1047 #endif /* CONFIG_NUMA */
1048 1048
1049 /* 1049 /*
1050 * get_page_from_freelist goes through the zonelist trying to allocate 1050 * get_page_from_freelist goes through the zonelist trying to allocate
1051 * a page. 1051 * a page.
1052 */ 1052 */
1053 static struct page * 1053 static struct page *
1054 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1054 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1055 struct zonelist *zonelist, int alloc_flags) 1055 struct zonelist *zonelist, int alloc_flags)
1056 { 1056 {
1057 struct zone **z; 1057 struct zone **z;
1058 struct page *page = NULL; 1058 struct page *page = NULL;
1059 int classzone_idx = zone_idx(zonelist->zones[0]); 1059 int classzone_idx = zone_idx(zonelist->zones[0]);
1060 struct zone *zone; 1060 struct zone *zone;
1061 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1061 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1062 int zlc_active = 0; /* set if using zonelist_cache */ 1062 int zlc_active = 0; /* set if using zonelist_cache */
1063 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1063 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1064 1064
1065 zonelist_scan: 1065 zonelist_scan:
1066 /* 1066 /*
1067 * Scan zonelist, looking for a zone with enough free. 1067 * Scan zonelist, looking for a zone with enough free.
1068 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1068 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1069 */ 1069 */
1070 z = zonelist->zones; 1070 z = zonelist->zones;
1071 1071
1072 do { 1072 do {
1073 if (NUMA_BUILD && zlc_active && 1073 if (NUMA_BUILD && zlc_active &&
1074 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1074 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1075 continue; 1075 continue;
1076 zone = *z; 1076 zone = *z;
1077 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1077 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1078 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1078 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1079 break; 1079 break;
1080 if ((alloc_flags & ALLOC_CPUSET) && 1080 if ((alloc_flags & ALLOC_CPUSET) &&
1081 !cpuset_zone_allowed(zone, gfp_mask)) 1081 !cpuset_zone_allowed(zone, gfp_mask))
1082 goto try_next_zone; 1082 goto try_next_zone;
1083 1083
1084 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1084 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1085 unsigned long mark; 1085 unsigned long mark;
1086 if (alloc_flags & ALLOC_WMARK_MIN) 1086 if (alloc_flags & ALLOC_WMARK_MIN)
1087 mark = zone->pages_min; 1087 mark = zone->pages_min;
1088 else if (alloc_flags & ALLOC_WMARK_LOW) 1088 else if (alloc_flags & ALLOC_WMARK_LOW)
1089 mark = zone->pages_low; 1089 mark = zone->pages_low;
1090 else 1090 else
1091 mark = zone->pages_high; 1091 mark = zone->pages_high;
1092 if (!zone_watermark_ok(zone, order, mark, 1092 if (!zone_watermark_ok(zone, order, mark,
1093 classzone_idx, alloc_flags)) { 1093 classzone_idx, alloc_flags)) {
1094 if (!zone_reclaim_mode || 1094 if (!zone_reclaim_mode ||
1095 !zone_reclaim(zone, gfp_mask, order)) 1095 !zone_reclaim(zone, gfp_mask, order))
1096 goto this_zone_full; 1096 goto this_zone_full;
1097 } 1097 }
1098 } 1098 }
1099 1099
1100 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1100 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1101 if (page) 1101 if (page)
1102 break; 1102 break;
1103 this_zone_full: 1103 this_zone_full:
1104 if (NUMA_BUILD) 1104 if (NUMA_BUILD)
1105 zlc_mark_zone_full(zonelist, z); 1105 zlc_mark_zone_full(zonelist, z);
1106 try_next_zone: 1106 try_next_zone:
1107 if (NUMA_BUILD && !did_zlc_setup) { 1107 if (NUMA_BUILD && !did_zlc_setup) {
1108 /* we do zlc_setup after the first zone is tried */ 1108 /* we do zlc_setup after the first zone is tried */
1109 allowednodes = zlc_setup(zonelist, alloc_flags); 1109 allowednodes = zlc_setup(zonelist, alloc_flags);
1110 zlc_active = 1; 1110 zlc_active = 1;
1111 did_zlc_setup = 1; 1111 did_zlc_setup = 1;
1112 } 1112 }
1113 } while (*(++z) != NULL); 1113 } while (*(++z) != NULL);
1114 1114
1115 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1115 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1116 /* Disable zlc cache for second zonelist scan */ 1116 /* Disable zlc cache for second zonelist scan */
1117 zlc_active = 0; 1117 zlc_active = 0;
1118 goto zonelist_scan; 1118 goto zonelist_scan;
1119 } 1119 }
1120 return page; 1120 return page;
1121 } 1121 }
1122 1122
1123 /* 1123 /*
1124 * This is the 'heart' of the zoned buddy allocator. 1124 * This is the 'heart' of the zoned buddy allocator.
1125 */ 1125 */
1126 struct page * fastcall 1126 struct page * fastcall
1127 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1127 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1128 struct zonelist *zonelist) 1128 struct zonelist *zonelist)
1129 { 1129 {
1130 const gfp_t wait = gfp_mask & __GFP_WAIT; 1130 const gfp_t wait = gfp_mask & __GFP_WAIT;
1131 struct zone **z; 1131 struct zone **z;
1132 struct page *page; 1132 struct page *page;
1133 struct reclaim_state reclaim_state; 1133 struct reclaim_state reclaim_state;
1134 struct task_struct *p = current; 1134 struct task_struct *p = current;
1135 int do_retry; 1135 int do_retry;
1136 int alloc_flags; 1136 int alloc_flags;
1137 int did_some_progress; 1137 int did_some_progress;
1138 1138
1139 might_sleep_if(wait); 1139 might_sleep_if(wait);
1140 1140
1141 restart: 1141 restart:
1142 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1142 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1143 1143
1144 if (unlikely(*z == NULL)) { 1144 if (unlikely(*z == NULL)) {
1145 /* Should this ever happen?? */ 1145 /* Should this ever happen?? */
1146 return NULL; 1146 return NULL;
1147 } 1147 }
1148 1148
1149 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1149 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1150 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1150 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1151 if (page) 1151 if (page)
1152 goto got_pg; 1152 goto got_pg;
1153 1153
1154 /* 1154 /*
1155 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1155 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1156 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1156 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1157 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 1157 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1158 * using a larger set of nodes after it has established that the 1158 * using a larger set of nodes after it has established that the
1159 * allowed per node queues are empty and that nodes are 1159 * allowed per node queues are empty and that nodes are
1160 * over allocated. 1160 * over allocated.
1161 */ 1161 */
1162 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1162 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1163 goto nopage; 1163 goto nopage;
1164 1164
1165 for (z = zonelist->zones; *z; z++) 1165 for (z = zonelist->zones; *z; z++)
1166 wakeup_kswapd(*z, order); 1166 wakeup_kswapd(*z, order);
1167 1167
1168 /* 1168 /*
1169 * OK, we're below the kswapd watermark and have kicked background 1169 * OK, we're below the kswapd watermark and have kicked background
1170 * reclaim. Now things get more complex, so set up alloc_flags according 1170 * reclaim. Now things get more complex, so set up alloc_flags according
1171 * to how we want to proceed. 1171 * to how we want to proceed.
1172 * 1172 *
1173 * The caller may dip into page reserves a bit more if the caller 1173 * The caller may dip into page reserves a bit more if the caller
1174 * cannot run direct reclaim, or if the caller has realtime scheduling 1174 * cannot run direct reclaim, or if the caller has realtime scheduling
1175 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1175 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1176 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1176 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1177 */ 1177 */
1178 alloc_flags = ALLOC_WMARK_MIN; 1178 alloc_flags = ALLOC_WMARK_MIN;
1179 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1179 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1180 alloc_flags |= ALLOC_HARDER; 1180 alloc_flags |= ALLOC_HARDER;
1181 if (gfp_mask & __GFP_HIGH) 1181 if (gfp_mask & __GFP_HIGH)
1182 alloc_flags |= ALLOC_HIGH; 1182 alloc_flags |= ALLOC_HIGH;
1183 if (wait) 1183 if (wait)
1184 alloc_flags |= ALLOC_CPUSET; 1184 alloc_flags |= ALLOC_CPUSET;
1185 1185
1186 /* 1186 /*
1187 * Go through the zonelist again. Let __GFP_HIGH and allocations 1187 * Go through the zonelist again. Let __GFP_HIGH and allocations
1188 * coming from realtime tasks go deeper into reserves. 1188 * coming from realtime tasks go deeper into reserves.
1189 * 1189 *
1190 * This is the last chance, in general, before the goto nopage. 1190 * This is the last chance, in general, before the goto nopage.
1191 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1191 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1192 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1192 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1193 */ 1193 */
1194 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1194 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1195 if (page) 1195 if (page)
1196 goto got_pg; 1196 goto got_pg;
1197 1197
1198 /* This allocation should allow future memory freeing. */ 1198 /* This allocation should allow future memory freeing. */
1199 1199
1200 rebalance: 1200 rebalance:
1201 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1201 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1202 && !in_interrupt()) { 1202 && !in_interrupt()) {
1203 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1203 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1204 nofail_alloc: 1204 nofail_alloc:
1205 /* go through the zonelist yet again, ignoring mins */ 1205 /* go through the zonelist yet again, ignoring mins */
1206 page = get_page_from_freelist(gfp_mask, order, 1206 page = get_page_from_freelist(gfp_mask, order,
1207 zonelist, ALLOC_NO_WATERMARKS); 1207 zonelist, ALLOC_NO_WATERMARKS);
1208 if (page) 1208 if (page)
1209 goto got_pg; 1209 goto got_pg;
1210 if (gfp_mask & __GFP_NOFAIL) { 1210 if (gfp_mask & __GFP_NOFAIL) {
1211 congestion_wait(WRITE, HZ/50); 1211 congestion_wait(WRITE, HZ/50);
1212 goto nofail_alloc; 1212 goto nofail_alloc;
1213 } 1213 }
1214 } 1214 }
1215 goto nopage; 1215 goto nopage;
1216 } 1216 }
1217 1217
1218 /* Atomic allocations - we can't balance anything */ 1218 /* Atomic allocations - we can't balance anything */
1219 if (!wait) 1219 if (!wait)
1220 goto nopage; 1220 goto nopage;
1221 1221
1222 cond_resched(); 1222 cond_resched();
1223 1223
1224 /* We now go into synchronous reclaim */ 1224 /* We now go into synchronous reclaim */
1225 cpuset_memory_pressure_bump(); 1225 cpuset_memory_pressure_bump();
1226 p->flags |= PF_MEMALLOC; 1226 p->flags |= PF_MEMALLOC;
1227 reclaim_state.reclaimed_slab = 0; 1227 reclaim_state.reclaimed_slab = 0;
1228 p->reclaim_state = &reclaim_state; 1228 p->reclaim_state = &reclaim_state;
1229 1229
1230 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); 1230 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1231 1231
1232 p->reclaim_state = NULL; 1232 p->reclaim_state = NULL;
1233 p->flags &= ~PF_MEMALLOC; 1233 p->flags &= ~PF_MEMALLOC;
1234 1234
1235 cond_resched(); 1235 cond_resched();
1236 1236
1237 if (likely(did_some_progress)) { 1237 if (likely(did_some_progress)) {
1238 page = get_page_from_freelist(gfp_mask, order, 1238 page = get_page_from_freelist(gfp_mask, order,
1239 zonelist, alloc_flags); 1239 zonelist, alloc_flags);
1240 if (page) 1240 if (page)
1241 goto got_pg; 1241 goto got_pg;
1242 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1242 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1243 /* 1243 /*
1244 * Go through the zonelist yet one more time, keep 1244 * Go through the zonelist yet one more time, keep
1245 * very high watermark here, this is only to catch 1245 * very high watermark here, this is only to catch
1246 * a parallel oom killing, we must fail if we're still 1246 * a parallel oom killing, we must fail if we're still
1247 * under heavy pressure. 1247 * under heavy pressure.
1248 */ 1248 */
1249 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1249 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1250 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1250 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1251 if (page) 1251 if (page)
1252 goto got_pg; 1252 goto got_pg;
1253 1253
1254 out_of_memory(zonelist, gfp_mask, order); 1254 out_of_memory(zonelist, gfp_mask, order);
1255 goto restart; 1255 goto restart;
1256 } 1256 }
1257 1257
1258 /* 1258 /*
1259 * Don't let big-order allocations loop unless the caller explicitly 1259 * Don't let big-order allocations loop unless the caller explicitly
1260 * requests that. Wait for some write requests to complete then retry. 1260 * requests that. Wait for some write requests to complete then retry.
1261 * 1261 *
1262 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 1262 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1263 * <= 3, but that may not be true in other implementations. 1263 * <= 3, but that may not be true in other implementations.
1264 */ 1264 */
1265 do_retry = 0; 1265 do_retry = 0;
1266 if (!(gfp_mask & __GFP_NORETRY)) { 1266 if (!(gfp_mask & __GFP_NORETRY)) {
1267 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 1267 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1268 do_retry = 1; 1268 do_retry = 1;
1269 if (gfp_mask & __GFP_NOFAIL) 1269 if (gfp_mask & __GFP_NOFAIL)
1270 do_retry = 1; 1270 do_retry = 1;
1271 } 1271 }
1272 if (do_retry) { 1272 if (do_retry) {
1273 congestion_wait(WRITE, HZ/50); 1273 congestion_wait(WRITE, HZ/50);
1274 goto rebalance; 1274 goto rebalance;
1275 } 1275 }
1276 1276
1277 nopage: 1277 nopage:
1278 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1278 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1279 printk(KERN_WARNING "%s: page allocation failure." 1279 printk(KERN_WARNING "%s: page allocation failure."
1280 " order:%d, mode:0x%x\n", 1280 " order:%d, mode:0x%x\n",
1281 p->comm, order, gfp_mask); 1281 p->comm, order, gfp_mask);
1282 dump_stack(); 1282 dump_stack();
1283 show_mem(); 1283 show_mem();
1284 } 1284 }
1285 got_pg: 1285 got_pg:
1286 return page; 1286 return page;
1287 } 1287 }
1288 1288
1289 EXPORT_SYMBOL(__alloc_pages); 1289 EXPORT_SYMBOL(__alloc_pages);
1290 1290
1291 /* 1291 /*
1292 * Common helper functions. 1292 * Common helper functions.
1293 */ 1293 */
1294 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1294 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1295 { 1295 {
1296 struct page * page; 1296 struct page * page;
1297 page = alloc_pages(gfp_mask, order); 1297 page = alloc_pages(gfp_mask, order);
1298 if (!page) 1298 if (!page)
1299 return 0; 1299 return 0;
1300 return (unsigned long) page_address(page); 1300 return (unsigned long) page_address(page);
1301 } 1301 }
1302 1302
1303 EXPORT_SYMBOL(__get_free_pages); 1303 EXPORT_SYMBOL(__get_free_pages);
1304 1304
1305 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1305 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1306 { 1306 {
1307 struct page * page; 1307 struct page * page;
1308 1308
1309 /* 1309 /*
1310 * get_zeroed_page() returns a 32-bit address, which cannot represent 1310 * get_zeroed_page() returns a 32-bit address, which cannot represent
1311 * a highmem page 1311 * a highmem page
1312 */ 1312 */
1313 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1313 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1314 1314
1315 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1315 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1316 if (page) 1316 if (page)
1317 return (unsigned long) page_address(page); 1317 return (unsigned long) page_address(page);
1318 return 0; 1318 return 0;
1319 } 1319 }
1320 1320
1321 EXPORT_SYMBOL(get_zeroed_page); 1321 EXPORT_SYMBOL(get_zeroed_page);
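A minimal usage sketch of these helpers (the buffer name is invented); free_page()/free_pages() are defined just below:

/* Sketch only: one zeroed lowmem page, later returned to the allocator. */
unsigned long buf = get_zeroed_page(GFP_KERNEL);

if (buf) {
        /* ... use the PAGE_SIZE bytes at (void *)buf ... */
        free_page(buf);         /* equivalent to free_pages(buf, 0) */
}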
1322 1322
1323 void __pagevec_free(struct pagevec *pvec) 1323 void __pagevec_free(struct pagevec *pvec)
1324 { 1324 {
1325 int i = pagevec_count(pvec); 1325 int i = pagevec_count(pvec);
1326 1326
1327 while (--i >= 0) 1327 while (--i >= 0)
1328 free_hot_cold_page(pvec->pages[i], pvec->cold); 1328 free_hot_cold_page(pvec->pages[i], pvec->cold);
1329 } 1329 }
1330 1330
1331 fastcall void __free_pages(struct page *page, unsigned int order) 1331 fastcall void __free_pages(struct page *page, unsigned int order)
1332 { 1332 {
1333 if (put_page_testzero(page)) { 1333 if (put_page_testzero(page)) {
1334 if (order == 0) 1334 if (order == 0)
1335 free_hot_page(page); 1335 free_hot_page(page);
1336 else 1336 else
1337 __free_pages_ok(page, order); 1337 __free_pages_ok(page, order);
1338 } 1338 }
1339 } 1339 }
1340 1340
1341 EXPORT_SYMBOL(__free_pages); 1341 EXPORT_SYMBOL(__free_pages);
1342 1342
1343 fastcall void free_pages(unsigned long addr, unsigned int order) 1343 fastcall void free_pages(unsigned long addr, unsigned int order)
1344 { 1344 {
1345 if (addr != 0) { 1345 if (addr != 0) {
1346 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1346 VM_BUG_ON(!virt_addr_valid((void *)addr));
1347 __free_pages(virt_to_page((void *)addr), order); 1347 __free_pages(virt_to_page((void *)addr), order);
1348 } 1348 }
1349 } 1349 }
1350 1350
1351 EXPORT_SYMBOL(free_pages); 1351 EXPORT_SYMBOL(free_pages);
1352 1352
1353 /* 1353 /*
1354 * Total amount of free (allocatable) RAM: 1354 * Total amount of free (allocatable) RAM:
1355 */ 1355 */
1356 unsigned int nr_free_pages(void) 1356 unsigned int nr_free_pages(void)
1357 { 1357 {
1358 unsigned int sum = 0; 1358 unsigned int sum = 0;
1359 struct zone *zone; 1359 struct zone *zone;
1360 1360
1361 for_each_zone(zone) 1361 for_each_zone(zone)
1362 sum += zone->free_pages; 1362 sum += zone->free_pages;
1363 1363
1364 return sum; 1364 return sum;
1365 } 1365 }
1366 1366
1367 EXPORT_SYMBOL(nr_free_pages); 1367 EXPORT_SYMBOL(nr_free_pages);
1368 1368
1369 #ifdef CONFIG_NUMA 1369 #ifdef CONFIG_NUMA
1370 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1370 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1371 { 1371 {
1372 unsigned int sum = 0; 1372 unsigned int sum = 0;
1373 enum zone_type i; 1373 enum zone_type i;
1374 1374
1375 for (i = 0; i < MAX_NR_ZONES; i++) 1375 for (i = 0; i < MAX_NR_ZONES; i++)
1376 sum += pgdat->node_zones[i].free_pages; 1376 sum += pgdat->node_zones[i].free_pages;
1377 1377
1378 return sum; 1378 return sum;
1379 } 1379 }
1380 #endif 1380 #endif
1381 1381
1382 static unsigned int nr_free_zone_pages(int offset) 1382 static unsigned int nr_free_zone_pages(int offset)
1383 { 1383 {
1384 /* Just pick one node, since fallback list is circular */ 1384 /* Just pick one node, since fallback list is circular */
1385 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1385 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1386 unsigned int sum = 0; 1386 unsigned int sum = 0;
1387 1387
1388 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1388 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1389 struct zone **zonep = zonelist->zones; 1389 struct zone **zonep = zonelist->zones;
1390 struct zone *zone; 1390 struct zone *zone;
1391 1391
1392 for (zone = *zonep++; zone; zone = *zonep++) { 1392 for (zone = *zonep++; zone; zone = *zonep++) {
1393 unsigned long size = zone->present_pages; 1393 unsigned long size = zone->present_pages;
1394 unsigned long high = zone->pages_high; 1394 unsigned long high = zone->pages_high;
1395 if (size > high) 1395 if (size > high)
1396 sum += size - high; 1396 sum += size - high;
1397 } 1397 }
1398 1398
1399 return sum; 1399 return sum;
1400 } 1400 }
1401 1401
1402 /* 1402 /*
1403 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1403 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1404 */ 1404 */
1405 unsigned int nr_free_buffer_pages(void) 1405 unsigned int nr_free_buffer_pages(void)
1406 { 1406 {
1407 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1407 return nr_free_zone_pages(gfp_zone(GFP_USER));
1408 } 1408 }
1409 1409
1410 /* 1410 /*
1411 * Amount of free RAM allocatable within all zones 1411 * Amount of free RAM allocatable within all zones
1412 */ 1412 */
1413 unsigned int nr_free_pagecache_pages(void) 1413 unsigned int nr_free_pagecache_pages(void)
1414 { 1414 {
1415 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1415 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1416 } 1416 }
1417 1417
1418 static inline void show_node(struct zone *zone) 1418 static inline void show_node(struct zone *zone)
1419 { 1419 {
1420 if (NUMA_BUILD) 1420 if (NUMA_BUILD)
1421 printk("Node %d ", zone_to_nid(zone)); 1421 printk("Node %d ", zone_to_nid(zone));
1422 } 1422 }
1423 1423
1424 void si_meminfo(struct sysinfo *val) 1424 void si_meminfo(struct sysinfo *val)
1425 { 1425 {
1426 val->totalram = totalram_pages; 1426 val->totalram = totalram_pages;
1427 val->sharedram = 0; 1427 val->sharedram = 0;
1428 val->freeram = nr_free_pages(); 1428 val->freeram = nr_free_pages();
1429 val->bufferram = nr_blockdev_pages(); 1429 val->bufferram = nr_blockdev_pages();
1430 val->totalhigh = totalhigh_pages; 1430 val->totalhigh = totalhigh_pages;
1431 val->freehigh = nr_free_highpages(); 1431 val->freehigh = nr_free_highpages();
1432 val->mem_unit = PAGE_SIZE; 1432 val->mem_unit = PAGE_SIZE;
1433 } 1433 }
1434 1434
1435 EXPORT_SYMBOL(si_meminfo); 1435 EXPORT_SYMBOL(si_meminfo);
1436 1436
1437 #ifdef CONFIG_NUMA 1437 #ifdef CONFIG_NUMA
1438 void si_meminfo_node(struct sysinfo *val, int nid) 1438 void si_meminfo_node(struct sysinfo *val, int nid)
1439 { 1439 {
1440 pg_data_t *pgdat = NODE_DATA(nid); 1440 pg_data_t *pgdat = NODE_DATA(nid);
1441 1441
1442 val->totalram = pgdat->node_present_pages; 1442 val->totalram = pgdat->node_present_pages;
1443 val->freeram = nr_free_pages_pgdat(pgdat); 1443 val->freeram = nr_free_pages_pgdat(pgdat);
1444 #ifdef CONFIG_HIGHMEM 1444 #ifdef CONFIG_HIGHMEM
1445 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1445 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1446 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1446 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1447 #else 1447 #else
1448 val->totalhigh = 0; 1448 val->totalhigh = 0;
1449 val->freehigh = 0; 1449 val->freehigh = 0;
1450 #endif 1450 #endif
1451 val->mem_unit = PAGE_SIZE; 1451 val->mem_unit = PAGE_SIZE;
1452 } 1452 }
1453 #endif 1453 #endif
1454 1454
1455 #define K(x) ((x) << (PAGE_SHIFT-10)) 1455 #define K(x) ((x) << (PAGE_SHIFT-10))
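A quick worked example of K(), assuming 4 kB pages (PAGE_SHIFT == 12):

/* K() converts a page count into kilobytes:
 *   K(256) == 256 << (12 - 10) == 256 * 4 == 1024 kB  (i.e. 1 MB) */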
1456 1456
1457 /* 1457 /*
1458 * Show free area list (used inside shift_scroll-lock stuff) 1458 * Show free area list (used inside shift_scroll-lock stuff)
1459 * We also calculate the percentage fragmentation. We do this by counting the 1459 * We also calculate the percentage fragmentation. We do this by counting the
1460 * memory on each free list with the exception of the first item on the list. 1460 * memory on each free list with the exception of the first item on the list.
1461 */ 1461 */
1462 void show_free_areas(void) 1462 void show_free_areas(void)
1463 { 1463 {
1464 int cpu; 1464 int cpu;
1465 unsigned long active; 1465 unsigned long active;
1466 unsigned long inactive; 1466 unsigned long inactive;
1467 unsigned long free; 1467 unsigned long free;
1468 struct zone *zone; 1468 struct zone *zone;
1469 1469
1470 for_each_zone(zone) { 1470 for_each_zone(zone) {
1471 if (!populated_zone(zone)) 1471 if (!populated_zone(zone))
1472 continue; 1472 continue;
1473 1473
1474 show_node(zone); 1474 show_node(zone);
1475 printk("%s per-cpu:\n", zone->name); 1475 printk("%s per-cpu:\n", zone->name);
1476 1476
1477 for_each_online_cpu(cpu) { 1477 for_each_online_cpu(cpu) {
1478 struct per_cpu_pageset *pageset; 1478 struct per_cpu_pageset *pageset;
1479 1479
1480 pageset = zone_pcp(zone, cpu); 1480 pageset = zone_pcp(zone, cpu);
1481 1481
1482 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1482 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1483 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1483 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1484 cpu, pageset->pcp[0].high, 1484 cpu, pageset->pcp[0].high,
1485 pageset->pcp[0].batch, pageset->pcp[0].count, 1485 pageset->pcp[0].batch, pageset->pcp[0].count,
1486 pageset->pcp[1].high, pageset->pcp[1].batch, 1486 pageset->pcp[1].high, pageset->pcp[1].batch,
1487 pageset->pcp[1].count); 1487 pageset->pcp[1].count);
1488 } 1488 }
1489 } 1489 }
1490 1490
1491 get_zone_counts(&active, &inactive, &free); 1491 get_zone_counts(&active, &inactive, &free);
1492 1492
1493 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1493 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1494 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1494 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1495 active, 1495 active,
1496 inactive, 1496 inactive,
1497 global_page_state(NR_FILE_DIRTY), 1497 global_page_state(NR_FILE_DIRTY),
1498 global_page_state(NR_WRITEBACK), 1498 global_page_state(NR_WRITEBACK),
1499 global_page_state(NR_UNSTABLE_NFS), 1499 global_page_state(NR_UNSTABLE_NFS),
1500 nr_free_pages(), 1500 nr_free_pages(),
1501 global_page_state(NR_SLAB_RECLAIMABLE) + 1501 global_page_state(NR_SLAB_RECLAIMABLE) +
1502 global_page_state(NR_SLAB_UNRECLAIMABLE), 1502 global_page_state(NR_SLAB_UNRECLAIMABLE),
1503 global_page_state(NR_FILE_MAPPED), 1503 global_page_state(NR_FILE_MAPPED),
1504 global_page_state(NR_PAGETABLE)); 1504 global_page_state(NR_PAGETABLE));
1505 1505
1506 for_each_zone(zone) { 1506 for_each_zone(zone) {
1507 int i; 1507 int i;
1508 1508
1509 if (!populated_zone(zone)) 1509 if (!populated_zone(zone))
1510 continue; 1510 continue;
1511 1511
1512 show_node(zone); 1512 show_node(zone);
1513 printk("%s" 1513 printk("%s"
1514 " free:%lukB" 1514 " free:%lukB"
1515 " min:%lukB" 1515 " min:%lukB"
1516 " low:%lukB" 1516 " low:%lukB"
1517 " high:%lukB" 1517 " high:%lukB"
1518 " active:%lukB" 1518 " active:%lukB"
1519 " inactive:%lukB" 1519 " inactive:%lukB"
1520 " present:%lukB" 1520 " present:%lukB"
1521 " pages_scanned:%lu" 1521 " pages_scanned:%lu"
1522 " all_unreclaimable? %s" 1522 " all_unreclaimable? %s"
1523 "\n", 1523 "\n",
1524 zone->name, 1524 zone->name,
1525 K(zone->free_pages), 1525 K(zone->free_pages),
1526 K(zone->pages_min), 1526 K(zone->pages_min),
1527 K(zone->pages_low), 1527 K(zone->pages_low),
1528 K(zone->pages_high), 1528 K(zone->pages_high),
1529 K(zone->nr_active), 1529 K(zone->nr_active),
1530 K(zone->nr_inactive), 1530 K(zone->nr_inactive),
1531 K(zone->present_pages), 1531 K(zone->present_pages),
1532 zone->pages_scanned, 1532 zone->pages_scanned,
1533 (zone->all_unreclaimable ? "yes" : "no") 1533 (zone->all_unreclaimable ? "yes" : "no")
1534 ); 1534 );
1535 printk("lowmem_reserve[]:"); 1535 printk("lowmem_reserve[]:");
1536 for (i = 0; i < MAX_NR_ZONES; i++) 1536 for (i = 0; i < MAX_NR_ZONES; i++)
1537 printk(" %lu", zone->lowmem_reserve[i]); 1537 printk(" %lu", zone->lowmem_reserve[i]);
1538 printk("\n"); 1538 printk("\n");
1539 } 1539 }
1540 1540
1541 for_each_zone(zone) { 1541 for_each_zone(zone) {
1542 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1542 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1543 1543
1544 if (!populated_zone(zone)) 1544 if (!populated_zone(zone))
1545 continue; 1545 continue;
1546 1546
1547 show_node(zone); 1547 show_node(zone);
1548 printk("%s: ", zone->name); 1548 printk("%s: ", zone->name);
1549 1549
1550 spin_lock_irqsave(&zone->lock, flags); 1550 spin_lock_irqsave(&zone->lock, flags);
1551 for (order = 0; order < MAX_ORDER; order++) { 1551 for (order = 0; order < MAX_ORDER; order++) {
1552 nr[order] = zone->free_area[order].nr_free; 1552 nr[order] = zone->free_area[order].nr_free;
1553 total += nr[order] << order; 1553 total += nr[order] << order;
1554 } 1554 }
1555 spin_unlock_irqrestore(&zone->lock, flags); 1555 spin_unlock_irqrestore(&zone->lock, flags);
1556 for (order = 0; order < MAX_ORDER; order++) 1556 for (order = 0; order < MAX_ORDER; order++)
1557 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1557 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1558 printk("= %lukB\n", K(total)); 1558 printk("= %lukB\n", K(total));
1559 } 1559 }
1560 1560
1561 show_swap_cache_info(); 1561 show_swap_cache_info();
1562 } 1562 }
1563 1563
1564 /* 1564 /*
1565 * Builds allocation fallback zone lists. 1565 * Builds allocation fallback zone lists.
1566 * 1566 *
1567 * Add all populated zones of a node to the zonelist. 1567 * Add all populated zones of a node to the zonelist.
1568 */ 1568 */
1569 static int __meminit build_zonelists_node(pg_data_t *pgdat, 1569 static int __meminit build_zonelists_node(pg_data_t *pgdat,
1570 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1570 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1571 { 1571 {
1572 struct zone *zone; 1572 struct zone *zone;
1573 1573
1574 BUG_ON(zone_type >= MAX_NR_ZONES); 1574 BUG_ON(zone_type >= MAX_NR_ZONES);
1575 zone_type++; 1575 zone_type++;
1576 1576
1577 do { 1577 do {
1578 zone_type--; 1578 zone_type--;
1579 zone = pgdat->node_zones + zone_type; 1579 zone = pgdat->node_zones + zone_type;
1580 if (populated_zone(zone)) { 1580 if (populated_zone(zone)) {
1581 zonelist->zones[nr_zones++] = zone; 1581 zonelist->zones[nr_zones++] = zone;
1582 check_highest_zone(zone_type); 1582 check_highest_zone(zone_type);
1583 } 1583 }
1584 1584
1585 } while (zone_type); 1585 } while (zone_type);
1586 return nr_zones; 1586 return nr_zones;
1587 } 1587 }
1588 1588
1589 #ifdef CONFIG_NUMA 1589 #ifdef CONFIG_NUMA
1590 #define MAX_NODE_LOAD (num_online_nodes()) 1590 #define MAX_NODE_LOAD (num_online_nodes())
1591 static int __meminitdata node_load[MAX_NUMNODES]; 1591 static int __meminitdata node_load[MAX_NUMNODES];
1592 /** 1592 /**
1593 * find_next_best_node - find the next node that should appear in a given node's fallback list 1593 * find_next_best_node - find the next node that should appear in a given node's fallback list
1594 * @node: node whose fallback list we're appending 1594 * @node: node whose fallback list we're appending
1595 * @used_node_mask: nodemask_t of already used nodes 1595 * @used_node_mask: nodemask_t of already used nodes
1596 * 1596 *
1597 * We use a number of factors to determine which is the next node that should 1597 * We use a number of factors to determine which is the next node that should
1598 * appear on a given node's fallback list. The node should not have appeared 1598 * appear on a given node's fallback list. The node should not have appeared
1599 * already in @node's fallback list, and it should be the next closest node 1599 * already in @node's fallback list, and it should be the next closest node
1600 * according to the distance array (which contains arbitrary distance values 1600 * according to the distance array (which contains arbitrary distance values
1601 * from each node to each node in the system), and should also prefer nodes 1601 * from each node to each node in the system), and should also prefer nodes
1602 * with no CPUs, since presumably they'll have very little allocation pressure 1602 * with no CPUs, since presumably they'll have very little allocation pressure
1603 * on them otherwise. 1603 * on them otherwise.
1604 * It returns -1 if no node is found. 1604 * It returns -1 if no node is found.
1605 */ 1605 */
1606 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1606 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1607 { 1607 {
1608 int n, val; 1608 int n, val;
1609 int min_val = INT_MAX; 1609 int min_val = INT_MAX;
1610 int best_node = -1; 1610 int best_node = -1;
1611 1611
1612 /* Use the local node if we haven't already */ 1612 /* Use the local node if we haven't already */
1613 if (!node_isset(node, *used_node_mask)) { 1613 if (!node_isset(node, *used_node_mask)) {
1614 node_set(node, *used_node_mask); 1614 node_set(node, *used_node_mask);
1615 return node; 1615 return node;
1616 } 1616 }
1617 1617
1618 for_each_online_node(n) { 1618 for_each_online_node(n) {
1619 cpumask_t tmp; 1619 cpumask_t tmp;
1620 1620
1621 /* Don't want a node to appear more than once */ 1621 /* Don't want a node to appear more than once */
1622 if (node_isset(n, *used_node_mask)) 1622 if (node_isset(n, *used_node_mask))
1623 continue; 1623 continue;
1624 1624
1625 /* Use the distance array to find the distance */ 1625 /* Use the distance array to find the distance */
1626 val = node_distance(node, n); 1626 val = node_distance(node, n);
1627 1627
1628 /* Penalize nodes under us ("prefer the next node") */ 1628 /* Penalize nodes under us ("prefer the next node") */
1629 val += (n < node); 1629 val += (n < node);
1630 1630
1631 /* Give preference to headless and unused nodes */ 1631 /* Give preference to headless and unused nodes */
1632 tmp = node_to_cpumask(n); 1632 tmp = node_to_cpumask(n);
1633 if (!cpus_empty(tmp)) 1633 if (!cpus_empty(tmp))
1634 val += PENALTY_FOR_NODE_WITH_CPUS; 1634 val += PENALTY_FOR_NODE_WITH_CPUS;
1635 1635
1636 /* Slight preference for less loaded node */ 1636 /* Slight preference for less loaded node */
1637 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1637 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1638 val += node_load[n]; 1638 val += node_load[n];
1639 1639
1640 if (val < min_val) { 1640 if (val < min_val) {
1641 min_val = val; 1641 min_val = val;
1642 best_node = n; 1642 best_node = n;
1643 } 1643 }
1644 } 1644 }
1645 1645
1646 if (best_node >= 0) 1646 if (best_node >= 0)
1647 node_set(best_node, *used_node_mask); 1647 node_set(best_node, *used_node_mask);
1648 1648
1649 return best_node; 1649 return best_node;
1650 } 1650 }
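To illustrate the scoring, a hedged numeric example; the distances, CPU layout and PENALTY_FOR_NODE_WITH_CPUS == 1 are assumptions made for the sketch:

/* Worked example: picking the next fallback for node 0 with candidates
 * 1 and 2, MAX_NUMNODES == 4 and num_online_nodes() == 3, so the scale
 * factor MAX_NODE_LOAD * MAX_NUMNODES == 12; node_load[] is still zero.
 *
 *   node 1 (no CPUs):  val = 20 + 0      -> 20 * 12 + 0 = 240
 *   node 2 (has CPUs): val = 20 + 0 + 1  -> 21 * 12 + 0 = 252
 *
 * Node 1 scores lower, so it is returned first and set in *used_node_mask;
 * the headless node is preferred, as the comment above intends. */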
1651 1651
1652 static void __meminit build_zonelists(pg_data_t *pgdat) 1652 static void __meminit build_zonelists(pg_data_t *pgdat)
1653 { 1653 {
1654 int j, node, local_node; 1654 int j, node, local_node;
1655 enum zone_type i; 1655 enum zone_type i;
1656 int prev_node, load; 1656 int prev_node, load;
1657 struct zonelist *zonelist; 1657 struct zonelist *zonelist;
1658 nodemask_t used_mask; 1658 nodemask_t used_mask;
1659 1659
1660 /* initialize zonelists */ 1660 /* initialize zonelists */
1661 for (i = 0; i < MAX_NR_ZONES; i++) { 1661 for (i = 0; i < MAX_NR_ZONES; i++) {
1662 zonelist = pgdat->node_zonelists + i; 1662 zonelist = pgdat->node_zonelists + i;
1663 zonelist->zones[0] = NULL; 1663 zonelist->zones[0] = NULL;
1664 } 1664 }
1665 1665
1666 /* NUMA-aware ordering of nodes */ 1666 /* NUMA-aware ordering of nodes */
1667 local_node = pgdat->node_id; 1667 local_node = pgdat->node_id;
1668 load = num_online_nodes(); 1668 load = num_online_nodes();
1669 prev_node = local_node; 1669 prev_node = local_node;
1670 nodes_clear(used_mask); 1670 nodes_clear(used_mask);
1671 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1671 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1672 int distance = node_distance(local_node, node); 1672 int distance = node_distance(local_node, node);
1673 1673
1674 /* 1674 /*
1675 * If another node is sufficiently far away then it is better 1675 * If another node is sufficiently far away then it is better
1676 * to reclaim pages in a zone before going off node. 1676 * to reclaim pages in a zone before going off node.
1677 */ 1677 */
1678 if (distance > RECLAIM_DISTANCE) 1678 if (distance > RECLAIM_DISTANCE)
1679 zone_reclaim_mode = 1; 1679 zone_reclaim_mode = 1;
1680 1680
1681 /* 1681 /*
1682 * We don't want to pressure a particular node. 1682 * We don't want to pressure a particular node.
1683 * So we add a penalty to the first node in the same 1683 * So we add a penalty to the first node in the same
1684 * distance group, to make the selection round-robin. 1684 * distance group, to make the selection round-robin.
1685 */ 1685 */
1686 1686
1687 if (distance != node_distance(local_node, prev_node)) 1687 if (distance != node_distance(local_node, prev_node))
1688 node_load[node] += load; 1688 node_load[node] += load;
1689 prev_node = node; 1689 prev_node = node;
1690 load--; 1690 load--;
1691 for (i = 0; i < MAX_NR_ZONES; i++) { 1691 for (i = 0; i < MAX_NR_ZONES; i++) {
1692 zonelist = pgdat->node_zonelists + i; 1692 zonelist = pgdat->node_zonelists + i;
1693 for (j = 0; zonelist->zones[j] != NULL; j++); 1693 for (j = 0; zonelist->zones[j] != NULL; j++);
1694 1694
1695 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1695 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1696 zonelist->zones[j] = NULL; 1696 zonelist->zones[j] = NULL;
1697 } 1697 }
1698 } 1698 }
1699 } 1699 }
1700 1700
1701 /* Construct the zonelist performance cache - see further mmzone.h */ 1701 /* Construct the zonelist performance cache - see further mmzone.h */
1702 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1702 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1703 { 1703 {
1704 int i; 1704 int i;
1705 1705
1706 for (i = 0; i < MAX_NR_ZONES; i++) { 1706 for (i = 0; i < MAX_NR_ZONES; i++) {
1707 struct zonelist *zonelist; 1707 struct zonelist *zonelist;
1708 struct zonelist_cache *zlc; 1708 struct zonelist_cache *zlc;
1709 struct zone **z; 1709 struct zone **z;
1710 1710
1711 zonelist = pgdat->node_zonelists + i; 1711 zonelist = pgdat->node_zonelists + i;
1712 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 1712 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1713 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1713 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1714 for (z = zonelist->zones; *z; z++) 1714 for (z = zonelist->zones; *z; z++)
1715 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 1715 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1716 } 1716 }
1717 } 1717 }
1718 1718
1719 #else /* CONFIG_NUMA */ 1719 #else /* CONFIG_NUMA */
1720 1720
1721 static void __meminit build_zonelists(pg_data_t *pgdat) 1721 static void __meminit build_zonelists(pg_data_t *pgdat)
1722 { 1722 {
1723 int node, local_node; 1723 int node, local_node;
1724 enum zone_type i,j; 1724 enum zone_type i,j;
1725 1725
1726 local_node = pgdat->node_id; 1726 local_node = pgdat->node_id;
1727 for (i = 0; i < MAX_NR_ZONES; i++) { 1727 for (i = 0; i < MAX_NR_ZONES; i++) {
1728 struct zonelist *zonelist; 1728 struct zonelist *zonelist;
1729 1729
1730 zonelist = pgdat->node_zonelists + i; 1730 zonelist = pgdat->node_zonelists + i;
1731 1731
1732 j = build_zonelists_node(pgdat, zonelist, 0, i); 1732 j = build_zonelists_node(pgdat, zonelist, 0, i);
1733 /* 1733 /*
1734 * Now we build the zonelist so that it contains the zones 1734 * Now we build the zonelist so that it contains the zones
1735 * of all the other nodes. 1735 * of all the other nodes.
1736 * We don't want to pressure a particular node, so when 1736 * We don't want to pressure a particular node, so when
1737 * building the zones for node N, we make sure that the 1737 * building the zones for node N, we make sure that the
1738 * zones coming right after the local ones are those from 1738 * zones coming right after the local ones are those from
1739 * node N+1 (modulo N) 1739 * node N+1 (modulo N)
1740 */ 1740 */
1741 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1741 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1742 if (!node_online(node)) 1742 if (!node_online(node))
1743 continue; 1743 continue;
1744 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1744 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1745 } 1745 }
1746 for (node = 0; node < local_node; node++) { 1746 for (node = 0; node < local_node; node++) {
1747 if (!node_online(node)) 1747 if (!node_online(node))
1748 continue; 1748 continue;
1749 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1749 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1750 } 1750 }
1751 1751
1752 zonelist->zones[j] = NULL; 1752 zonelist->zones[j] = NULL;
1753 } 1753 }
1754 } 1754 }
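A small worked example of the ordering described in the comment above (a three-node layout is assumed for illustration):

/* Worked example: nodes 0, 1 and 2 online. The zonelist built for node 1
 * lists zones in the order
 *   node 1, node 2, node 0
 * so fallback pressure is spread round-robin instead of every node
 * overflowing onto node 0 first. */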
1755 1755
1756 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 1756 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1757 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1757 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1758 { 1758 {
1759 int i; 1759 int i;
1760 1760
1761 for (i = 0; i < MAX_NR_ZONES; i++) 1761 for (i = 0; i < MAX_NR_ZONES; i++)
1762 pgdat->node_zonelists[i].zlcache_ptr = NULL; 1762 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1763 } 1763 }
1764 1764
1765 #endif /* CONFIG_NUMA */ 1765 #endif /* CONFIG_NUMA */
1766 1766
1767 /* the int return type exists only to match stop_machine_run()'s callback signature */ 1767 /* the int return type exists only to match stop_machine_run()'s callback signature */
1768 static int __meminit __build_all_zonelists(void *dummy) 1768 static int __meminit __build_all_zonelists(void *dummy)
1769 { 1769 {
1770 int nid; 1770 int nid;
1771 1771
1772 for_each_online_node(nid) { 1772 for_each_online_node(nid) {
1773 build_zonelists(NODE_DATA(nid)); 1773 build_zonelists(NODE_DATA(nid));
1774 build_zonelist_cache(NODE_DATA(nid)); 1774 build_zonelist_cache(NODE_DATA(nid));
1775 } 1775 }
1776 return 0; 1776 return 0;
1777 } 1777 }
1778 1778
1779 void __meminit build_all_zonelists(void) 1779 void __meminit build_all_zonelists(void)
1780 { 1780 {
1781 if (system_state == SYSTEM_BOOTING) { 1781 if (system_state == SYSTEM_BOOTING) {
1782 __build_all_zonelists(NULL); 1782 __build_all_zonelists(NULL);
1783 cpuset_init_current_mems_allowed(); 1783 cpuset_init_current_mems_allowed();
1784 } else { 1784 } else {
1785 /* we have to stop all cpus to guarantee there is no user 1785 /* we have to stop all cpus to guarantee there is no user
1786 of zonelist */ 1786 of zonelist */
1787 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 1787 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1788 /* cpuset refresh routine should be here */ 1788 /* cpuset refresh routine should be here */
1789 } 1789 }
1790 vm_total_pages = nr_free_pagecache_pages(); 1790 vm_total_pages = nr_free_pagecache_pages();
1791 printk("Built %i zonelists. Total pages: %ld\n", 1791 printk("Built %i zonelists. Total pages: %ld\n",
1792 num_online_nodes(), vm_total_pages); 1792 num_online_nodes(), vm_total_pages);
1793 } 1793 }
1794 1794
1795 /* 1795 /*
1796 * Helper functions to size the waitqueue hash table. 1796 * Helper functions to size the waitqueue hash table.
1797 * Essentially these want to choose hash table sizes sufficiently 1797 * Essentially these want to choose hash table sizes sufficiently
1798 * large so that collisions trying to wait on pages are rare. 1798 * large so that collisions trying to wait on pages are rare.
1799 * But in fact, the number of active page waitqueues on typical 1799 * But in fact, the number of active page waitqueues on typical
1800 * systems is ridiculously low, less than 200. So this is even 1800 * systems is ridiculously low, less than 200. So this is even
1801 * conservative, even though it seems large. 1801 * conservative, even though it seems large.
1802 * 1802 *
1803 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1803 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1804 * waitqueues, i.e. the size of the waitq table given the number of pages. 1804 * waitqueues, i.e. the size of the waitq table given the number of pages.
1805 */ 1805 */
1806 #define PAGES_PER_WAITQUEUE 256 1806 #define PAGES_PER_WAITQUEUE 256
1807 1807
1808 #ifndef CONFIG_MEMORY_HOTPLUG 1808 #ifndef CONFIG_MEMORY_HOTPLUG
1809 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1809 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1810 { 1810 {
1811 unsigned long size = 1; 1811 unsigned long size = 1;
1812 1812
1813 pages /= PAGES_PER_WAITQUEUE; 1813 pages /= PAGES_PER_WAITQUEUE;
1814 1814
1815 while (size < pages) 1815 while (size < pages)
1816 size <<= 1; 1816 size <<= 1;
1817 1817
1818 /* 1818 /*
1819 * Once we have dozens or even hundreds of threads sleeping 1819 * Once we have dozens or even hundreds of threads sleeping
1820 * on IO we've got bigger problems than wait queue collision. 1820 * on IO we've got bigger problems than wait queue collision.
1821 * Limit the size of the wait table to a reasonable size. 1821 * Limit the size of the wait table to a reasonable size.
1822 */ 1822 */
1823 size = min(size, 4096UL); 1823 size = min(size, 4096UL);
1824 1824
1825 return max(size, 4UL); 1825 return max(size, 4UL);
1826 } 1826 }
1827 #else 1827 #else
1828 /* 1828 /*
1829 * A zone's size might be changed by hot-add, so it is not possible to determine 1829 * A zone's size might be changed by hot-add, so it is not possible to determine
1830 * a suitable size for its wait_table. So we use the maximum size now. 1830 * a suitable size for its wait_table. So we use the maximum size now.
1831 * 1831 *
1832 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 1832 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1833 * 1833 *
1834 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 1834 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1835 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 1835 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1836 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 1836 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1837 * 1837 *
1838 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 1838 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1839 * or more by the traditional way. (See above). It equals: 1839 * or more by the traditional way. (See above). It equals:
1840 * 1840 *
1841 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 1841 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1842 * ia64(16K page size) : = ( 8G + 4M)byte. 1842 * ia64(16K page size) : = ( 8G + 4M)byte.
1843 * powerpc (64K page size) : = (32G +16M)byte. 1843 * powerpc (64K page size) : = (32G +16M)byte.
1844 */ 1844 */
1845 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1845 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1846 { 1846 {
1847 return 4096UL; 1847 return 4096UL;
1848 } 1848 }
1849 #endif 1849 #endif
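
As a rough check of the non-hotplug sizing above: a one-million-page zone divided by PAGES_PER_WAITQUEUE gives 4096, which is already a power of two and also happens to be the clamp value, while small zones bottom out at 4 entries. A userspace sketch of the same arithmetic (the example page counts are arbitrary):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* power-of-two table size for a zone of the given page count */
static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

int main(void)
{
	/* 512 pages -> 4 (floor), 262144 pages (1GB of 4K pages) -> 1024, 1M pages -> 4096 */
	printf("%lu %lu %lu\n", wait_table_entries(512),
	       wait_table_entries(262144), wait_table_entries(1UL << 20));
	return 0;
}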
1850 1850
1851 /* 1851 /*
1852 * This is an integer logarithm so that shifts can be used later 1852 * This is an integer logarithm so that shifts can be used later
1853 * to extract the more random high bits from the multiplicative 1853 * to extract the more random high bits from the multiplicative
1854 * hash function before the remainder is taken. 1854 * hash function before the remainder is taken.
1855 */ 1855 */
1856 static inline unsigned long wait_table_bits(unsigned long size) 1856 static inline unsigned long wait_table_bits(unsigned long size)
1857 { 1857 {
1858 return ffz(~size); 1858 return ffz(~size);
1859 } 1859 }
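
Because the table size is always a power of two, ffz(~size) is simply log2(size): for a 4096-entry table the first zero bit of ~size is bit 12, so 12 is returned and the hash can later be reduced with a shift. A userspace sketch using a plain loop in place of the kernel's ffz() helper:

#include <stdio.h>

/* log2 of a power-of-two size, mirroring what ffz(~size) computes */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	printf("%lu %lu\n", table_bits(4096UL), table_bits(4UL));	/* prints: 12 2 */
	return 0;
}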
1860 1860
1861 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1861 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1862 1862
1863 /* 1863 /*
1864 * Initially all pages are reserved - free ones are freed 1864 * Initially all pages are reserved - free ones are freed
1865 * up by free_all_bootmem() once the early boot process is 1865 * up by free_all_bootmem() once the early boot process is
1866 * done. Non-atomic initialization, single-pass. 1866 * done. Non-atomic initialization, single-pass.
1867 */ 1867 */
1868 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1868 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1869 unsigned long start_pfn) 1869 unsigned long start_pfn)
1870 { 1870 {
1871 struct page *page; 1871 struct page *page;
1872 unsigned long end_pfn = start_pfn + size; 1872 unsigned long end_pfn = start_pfn + size;
1873 unsigned long pfn; 1873 unsigned long pfn;
1874 1874
1875 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1875 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1876 if (!early_pfn_valid(pfn)) 1876 if (!early_pfn_valid(pfn))
1877 continue; 1877 continue;
1878 if (!early_pfn_in_nid(pfn, nid)) 1878 if (!early_pfn_in_nid(pfn, nid))
1879 continue; 1879 continue;
1880 page = pfn_to_page(pfn); 1880 page = pfn_to_page(pfn);
1881 set_page_links(page, zone, nid, pfn); 1881 set_page_links(page, zone, nid, pfn);
1882 init_page_count(page); 1882 init_page_count(page);
1883 reset_page_mapcount(page); 1883 reset_page_mapcount(page);
1884 SetPageReserved(page); 1884 SetPageReserved(page);
1885 INIT_LIST_HEAD(&page->lru); 1885 INIT_LIST_HEAD(&page->lru);
1886 #ifdef WANT_PAGE_VIRTUAL 1886 #ifdef WANT_PAGE_VIRTUAL
1887 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1887 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1888 if (!is_highmem_idx(zone)) 1888 if (!is_highmem_idx(zone))
1889 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1889 set_page_address(page, __va(pfn << PAGE_SHIFT));
1890 #endif 1890 #endif
1891 } 1891 }
1892 } 1892 }
1893 1893
1894 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1894 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1895 unsigned long size) 1895 unsigned long size)
1896 { 1896 {
1897 int order; 1897 int order;
1898 for (order = 0; order < MAX_ORDER ; order++) { 1898 for (order = 0; order < MAX_ORDER ; order++) {
1899 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1899 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1900 zone->free_area[order].nr_free = 0; 1900 zone->free_area[order].nr_free = 0;
1901 } 1901 }
1902 } 1902 }
1903 1903
1904 #ifndef __HAVE_ARCH_MEMMAP_INIT 1904 #ifndef __HAVE_ARCH_MEMMAP_INIT
1905 #define memmap_init(size, nid, zone, start_pfn) \ 1905 #define memmap_init(size, nid, zone, start_pfn) \
1906 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1906 memmap_init_zone((size), (nid), (zone), (start_pfn))
1907 #endif 1907 #endif
1908 1908
1909 static int __cpuinit zone_batchsize(struct zone *zone) 1909 static int __cpuinit zone_batchsize(struct zone *zone)
1910 { 1910 {
1911 int batch; 1911 int batch;
1912 1912
1913 /* 1913 /*
1914 * The per-cpu-pages pools are set to around 1000th of the 1914 * The per-cpu-pages pools are set to around 1000th of the
1915 * size of the zone. But no more than 1/2 of a meg. 1915 * size of the zone. But no more than 1/2 of a meg.
1916 * 1916 *
1917 * OK, so we don't know how big the cache is. So guess. 1917 * OK, so we don't know how big the cache is. So guess.
1918 */ 1918 */
1919 batch = zone->present_pages / 1024; 1919 batch = zone->present_pages / 1024;
1920 if (batch * PAGE_SIZE > 512 * 1024) 1920 if (batch * PAGE_SIZE > 512 * 1024)
1921 batch = (512 * 1024) / PAGE_SIZE; 1921 batch = (512 * 1024) / PAGE_SIZE;
1922 batch /= 4; /* We effectively *= 4 below */ 1922 batch /= 4; /* We effectively *= 4 below */
1923 if (batch < 1) 1923 if (batch < 1)
1924 batch = 1; 1924 batch = 1;
1925 1925
1926 /* 1926 /*
1927 * Clamp the batch to a 2^n - 1 value. Having a power 1927 * Clamp the batch to a 2^n - 1 value. Having a power
1928 * of 2 value was found to be more likely to have 1928 * of 2 value was found to be more likely to have
1929 * suboptimal cache aliasing properties in some cases. 1929 * suboptimal cache aliasing properties in some cases.
1930 * 1930 *
1931 * For example if 2 tasks are alternately allocating 1931 * For example if 2 tasks are alternately allocating
1932 * batches of pages, one task can end up with a lot 1932 * batches of pages, one task can end up with a lot
1933 * of pages of one half of the possible page colors 1933 * of pages of one half of the possible page colors
1934 * and the other with pages of the other colors. 1934 * and the other with pages of the other colors.
1935 */ 1935 */
1936 batch = (1 << (fls(batch + batch/2)-1)) - 1; 1936 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1937 1937
1938 return batch; 1938 return batch;
1939 } 1939 }
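
Plugging a hypothetical 1 GB zone of 4 KB pages (262144 present pages) into the code above: 262144/1024 = 256, which exceeds the 512 KB cap and drops to 128, then to 32 after the division by four, and the final rounding yields 31, one less than a power of two. A userspace sketch of the same steps; the page size and zone size are example values, and fls() is re-implemented with a loop:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* example page size */

/* highest set bit, 1-based, like the kernel's fls() */
static int fls_long(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	/* clamp to 2^n - 1, as in zone_batchsize() */
	return (1 << (fls_long(batch + batch / 2) - 1)) - 1;
}

int main(void)
{
	printf("%d\n", batchsize(262144));	/* prints: 31 */
	return 0;
}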
1940 1940
1941 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1941 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1942 { 1942 {
1943 struct per_cpu_pages *pcp; 1943 struct per_cpu_pages *pcp;
1944 1944
1945 memset(p, 0, sizeof(*p)); 1945 memset(p, 0, sizeof(*p));
1946 1946
1947 pcp = &p->pcp[0]; /* hot */ 1947 pcp = &p->pcp[0]; /* hot */
1948 pcp->count = 0; 1948 pcp->count = 0;
1949 pcp->high = 6 * batch; 1949 pcp->high = 6 * batch;
1950 pcp->batch = max(1UL, 1 * batch); 1950 pcp->batch = max(1UL, 1 * batch);
1951 INIT_LIST_HEAD(&pcp->list); 1951 INIT_LIST_HEAD(&pcp->list);
1952 1952
1953 pcp = &p->pcp[1]; /* cold*/ 1953 pcp = &p->pcp[1]; /* cold*/
1954 pcp->count = 0; 1954 pcp->count = 0;
1955 pcp->high = 2 * batch; 1955 pcp->high = 2 * batch;
1956 pcp->batch = max(1UL, batch/2); 1956 pcp->batch = max(1UL, batch/2);
1957 INIT_LIST_HEAD(&pcp->list); 1957 INIT_LIST_HEAD(&pcp->list);
1958 } 1958 }
1959 1959
1960 /* 1960 /*
1961 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 1961 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1962 * to the value high for the pageset p. 1962 * to the value high for the pageset p.
1963 */ 1963 */
1964 1964
1965 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 1965 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1966 unsigned long high) 1966 unsigned long high)
1967 { 1967 {
1968 struct per_cpu_pages *pcp; 1968 struct per_cpu_pages *pcp;
1969 1969
1970 pcp = &p->pcp[0]; /* hot list */ 1970 pcp = &p->pcp[0]; /* hot list */
1971 pcp->high = high; 1971 pcp->high = high;
1972 pcp->batch = max(1UL, high/4); 1972 pcp->batch = max(1UL, high/4);
1973 if ((high/4) > (PAGE_SHIFT * 8)) 1973 if ((high/4) > (PAGE_SHIFT * 8))
1974 pcp->batch = PAGE_SHIFT * 8; 1974 pcp->batch = PAGE_SHIFT * 8;
1975 } 1975 }
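
With 4 KB pages (PAGE_SHIFT = 12) the cap above is 96 pages, so a requested high watermark of 1000 ends up with a batch of 96 rather than high/4 = 250. A userspace sketch of that clamping (the PAGE_SHIFT and high values are examples):

#include <stdio.h>

#define PAGE_SHIFT 12	/* example: 4 KB pages */

int main(void)
{
	unsigned long high = 1000;	/* hypothetical per-cpu high watermark */
	unsigned long batch = high / 4 > 1 ? high / 4 : 1;

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;
	printf("high=%lu batch=%lu\n", high, batch);	/* prints: high=1000 batch=96 */
	return 0;
}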
1976 1976
1977 1977
1978 #ifdef CONFIG_NUMA 1978 #ifdef CONFIG_NUMA
1979 /* 1979 /*
1980 * Boot pageset table. One per cpu which is going to be used for all 1980 * Boot pageset table. One per cpu which is going to be used for all
1981 * zones and all nodes. The parameters will be set in such a way 1981 * zones and all nodes. The parameters will be set in such a way
1982 * that an item put on a list will immediately be handed over to 1982 * that an item put on a list will immediately be handed over to
1983 * the buddy list. This is safe since pageset manipulation is done 1983 * the buddy list. This is safe since pageset manipulation is done
1984 * with interrupts disabled. 1984 * with interrupts disabled.
1985 * 1985 *
1986 * Some NUMA counter updates may also be caught by the boot pagesets. 1986 * Some NUMA counter updates may also be caught by the boot pagesets.
1987 * 1987 *
1988 * The boot_pagesets must be kept even after bootup is complete for 1988 * The boot_pagesets must be kept even after bootup is complete for
1989 * unused processors and/or zones. They do play a role for bootstrapping 1989 * unused processors and/or zones. They do play a role for bootstrapping
1990 * hotplugged processors. 1990 * hotplugged processors.
1991 * 1991 *
1992 * zoneinfo_show() and maybe other functions do 1992 * zoneinfo_show() and maybe other functions do
1993 * not check if the processor is online before following the pageset pointer. 1993 * not check if the processor is online before following the pageset pointer.
1994 * Other parts of the kernel may not check if the zone is available. 1994 * Other parts of the kernel may not check if the zone is available.
1995 */ 1995 */
1996 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 1996 static struct per_cpu_pageset boot_pageset[NR_CPUS];
1997 1997
1998 /* 1998 /*
1999 * Dynamically allocate memory for the 1999 * Dynamically allocate memory for the
2000 * per cpu pageset array in struct zone. 2000 * per cpu pageset array in struct zone.
2001 */ 2001 */
2002 static int __cpuinit process_zones(int cpu) 2002 static int __cpuinit process_zones(int cpu)
2003 { 2003 {
2004 struct zone *zone, *dzone; 2004 struct zone *zone, *dzone;
2005 2005
2006 for_each_zone(zone) { 2006 for_each_zone(zone) {
2007 2007
2008 if (!populated_zone(zone)) 2008 if (!populated_zone(zone))
2009 continue; 2009 continue;
2010 2010
2011 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2011 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2012 GFP_KERNEL, cpu_to_node(cpu)); 2012 GFP_KERNEL, cpu_to_node(cpu));
2013 if (!zone_pcp(zone, cpu)) 2013 if (!zone_pcp(zone, cpu))
2014 goto bad; 2014 goto bad;
2015 2015
2016 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2016 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2017 2017
2018 if (percpu_pagelist_fraction) 2018 if (percpu_pagelist_fraction)
2019 setup_pagelist_highmark(zone_pcp(zone, cpu), 2019 setup_pagelist_highmark(zone_pcp(zone, cpu),
2020 (zone->present_pages / percpu_pagelist_fraction)); 2020 (zone->present_pages / percpu_pagelist_fraction));
2021 } 2021 }
2022 2022
2023 return 0; 2023 return 0;
2024 bad: 2024 bad:
2025 for_each_zone(dzone) { 2025 for_each_zone(dzone) {
2026 if (dzone == zone) 2026 if (dzone == zone)
2027 break; 2027 break;
2028 kfree(zone_pcp(dzone, cpu)); 2028 kfree(zone_pcp(dzone, cpu));
2029 zone_pcp(dzone, cpu) = NULL; 2029 zone_pcp(dzone, cpu) = NULL;
2030 } 2030 }
2031 return -ENOMEM; 2031 return -ENOMEM;
2032 } 2032 }
2033 2033
2034 static inline void free_zone_pagesets(int cpu) 2034 static inline void free_zone_pagesets(int cpu)
2035 { 2035 {
2036 struct zone *zone; 2036 struct zone *zone;
2037 2037
2038 for_each_zone(zone) { 2038 for_each_zone(zone) {
2039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2040 2040
2041 /* Free per_cpu_pageset if it is slab allocated */ 2041 /* Free per_cpu_pageset if it is slab allocated */
2042 if (pset != &boot_pageset[cpu]) 2042 if (pset != &boot_pageset[cpu])
2043 kfree(pset); 2043 kfree(pset);
2044 zone_pcp(zone, cpu) = NULL; 2044 zone_pcp(zone, cpu) = NULL;
2045 } 2045 }
2046 } 2046 }
2047 2047
2048 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2048 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2049 unsigned long action, 2049 unsigned long action,
2050 void *hcpu) 2050 void *hcpu)
2051 { 2051 {
2052 int cpu = (long)hcpu; 2052 int cpu = (long)hcpu;
2053 int ret = NOTIFY_OK; 2053 int ret = NOTIFY_OK;
2054 2054
2055 switch (action) { 2055 switch (action) {
2056 case CPU_UP_PREPARE: 2056 case CPU_UP_PREPARE:
2057 if (process_zones(cpu)) 2057 if (process_zones(cpu))
2058 ret = NOTIFY_BAD; 2058 ret = NOTIFY_BAD;
2059 break; 2059 break;
2060 case CPU_UP_CANCELED: 2060 case CPU_UP_CANCELED:
2061 case CPU_DEAD: 2061 case CPU_DEAD:
2062 free_zone_pagesets(cpu); 2062 free_zone_pagesets(cpu);
2063 break; 2063 break;
2064 default: 2064 default:
2065 break; 2065 break;
2066 } 2066 }
2067 return ret; 2067 return ret;
2068 } 2068 }
2069 2069
2070 static struct notifier_block __cpuinitdata pageset_notifier = 2070 static struct notifier_block __cpuinitdata pageset_notifier =
2071 { &pageset_cpuup_callback, NULL, 0 }; 2071 { &pageset_cpuup_callback, NULL, 0 };
2072 2072
2073 void __init setup_per_cpu_pageset(void) 2073 void __init setup_per_cpu_pageset(void)
2074 { 2074 {
2075 int err; 2075 int err;
2076 2076
2077 /* Initialize per_cpu_pageset for cpu 0. 2077 /* Initialize per_cpu_pageset for cpu 0.
2078 * A cpuup callback will do this for every cpu 2078 * A cpuup callback will do this for every cpu
2079 * as it comes online 2079 * as it comes online
2080 */ 2080 */
2081 err = process_zones(smp_processor_id()); 2081 err = process_zones(smp_processor_id());
2082 BUG_ON(err); 2082 BUG_ON(err);
2083 register_cpu_notifier(&pageset_notifier); 2083 register_cpu_notifier(&pageset_notifier);
2084 } 2084 }
2085 2085
2086 #endif 2086 #endif
2087 2087
2088 static __meminit 2088 static __meminit
2089 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2089 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2090 { 2090 {
2091 int i; 2091 int i;
2092 struct pglist_data *pgdat = zone->zone_pgdat; 2092 struct pglist_data *pgdat = zone->zone_pgdat;
2093 size_t alloc_size; 2093 size_t alloc_size;
2094 2094
2095 /* 2095 /*
2096 * The per-page waitqueue mechanism uses hashed waitqueues 2096 * The per-page waitqueue mechanism uses hashed waitqueues
2097 * per zone. 2097 * per zone.
2098 */ 2098 */
2099 zone->wait_table_hash_nr_entries = 2099 zone->wait_table_hash_nr_entries =
2100 wait_table_hash_nr_entries(zone_size_pages); 2100 wait_table_hash_nr_entries(zone_size_pages);
2101 zone->wait_table_bits = 2101 zone->wait_table_bits =
2102 wait_table_bits(zone->wait_table_hash_nr_entries); 2102 wait_table_bits(zone->wait_table_hash_nr_entries);
2103 alloc_size = zone->wait_table_hash_nr_entries 2103 alloc_size = zone->wait_table_hash_nr_entries
2104 * sizeof(wait_queue_head_t); 2104 * sizeof(wait_queue_head_t);
2105 2105
2106 if (system_state == SYSTEM_BOOTING) { 2106 if (system_state == SYSTEM_BOOTING) {
2107 zone->wait_table = (wait_queue_head_t *) 2107 zone->wait_table = (wait_queue_head_t *)
2108 alloc_bootmem_node(pgdat, alloc_size); 2108 alloc_bootmem_node(pgdat, alloc_size);
2109 } else { 2109 } else {
2110 /* 2110 /*
2111 * This case means that a zone whose size was 0 gets new memory 2111 * This case means that a zone whose size was 0 gets new memory
2112 * via memory hot-add. 2112 * via memory hot-add.
2113 * But it may be the case that a new node was hot-added. In 2113 * But it may be the case that a new node was hot-added. In
2114 * this case vmalloc() will not be able to use this new node's 2114 * this case vmalloc() will not be able to use this new node's
2115 * memory - this wait_table must be initialized to use this new 2115 * memory - this wait_table must be initialized to use this new
2116 * node itself as well. 2116 * node itself as well.
2117 * To use this new node's memory, further consideration will be 2117 * To use this new node's memory, further consideration will be
2118 * necessary. 2118 * necessary.
2119 */ 2119 */
2120 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2120 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2121 } 2121 }
2122 if (!zone->wait_table) 2122 if (!zone->wait_table)
2123 return -ENOMEM; 2123 return -ENOMEM;
2124 2124
2125 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2125 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2126 init_waitqueue_head(zone->wait_table + i); 2126 init_waitqueue_head(zone->wait_table + i);
2127 2127
2128 return 0; 2128 return 0;
2129 } 2129 }
2130 2130
2131 static __meminit void zone_pcp_init(struct zone *zone) 2131 static __meminit void zone_pcp_init(struct zone *zone)
2132 { 2132 {
2133 int cpu; 2133 int cpu;
2134 unsigned long batch = zone_batchsize(zone); 2134 unsigned long batch = zone_batchsize(zone);
2135 2135
2136 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2136 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2137 #ifdef CONFIG_NUMA 2137 #ifdef CONFIG_NUMA
2138 /* Early boot. Slab allocator not functional yet */ 2138 /* Early boot. Slab allocator not functional yet */
2139 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2139 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2140 setup_pageset(&boot_pageset[cpu],0); 2140 setup_pageset(&boot_pageset[cpu],0);
2141 #else 2141 #else
2142 setup_pageset(zone_pcp(zone,cpu), batch); 2142 setup_pageset(zone_pcp(zone,cpu), batch);
2143 #endif 2143 #endif
2144 } 2144 }
2145 if (zone->present_pages) 2145 if (zone->present_pages)
2146 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2146 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2147 zone->name, zone->present_pages, batch); 2147 zone->name, zone->present_pages, batch);
2148 } 2148 }
2149 2149
2150 __meminit int init_currently_empty_zone(struct zone *zone, 2150 __meminit int init_currently_empty_zone(struct zone *zone,
2151 unsigned long zone_start_pfn, 2151 unsigned long zone_start_pfn,
2152 unsigned long size) 2152 unsigned long size)
2153 { 2153 {
2154 struct pglist_data *pgdat = zone->zone_pgdat; 2154 struct pglist_data *pgdat = zone->zone_pgdat;
2155 int ret; 2155 int ret;
2156 ret = zone_wait_table_init(zone, size); 2156 ret = zone_wait_table_init(zone, size);
2157 if (ret) 2157 if (ret)
2158 return ret; 2158 return ret;
2159 pgdat->nr_zones = zone_idx(zone) + 1; 2159 pgdat->nr_zones = zone_idx(zone) + 1;
2160 2160
2161 zone->zone_start_pfn = zone_start_pfn; 2161 zone->zone_start_pfn = zone_start_pfn;
2162 2162
2163 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2163 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2164 2164
2165 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2165 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2166 2166
2167 return 0; 2167 return 0;
2168 } 2168 }
2169 2169
2170 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2170 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2171 /* 2171 /*
2172 * Basic iterator support. Return the first range of PFNs for a node 2172 * Basic iterator support. Return the first range of PFNs for a node
2173 * Note: nid == MAX_NUMNODES returns first region regardless of node 2173 * Note: nid == MAX_NUMNODES returns first region regardless of node
2174 */ 2174 */
2175 static int __init first_active_region_index_in_nid(int nid) 2175 static int __init first_active_region_index_in_nid(int nid)
2176 { 2176 {
2177 int i; 2177 int i;
2178 2178
2179 for (i = 0; i < nr_nodemap_entries; i++) 2179 for (i = 0; i < nr_nodemap_entries; i++)
2180 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2180 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2181 return i; 2181 return i;
2182 2182
2183 return -1; 2183 return -1;
2184 } 2184 }
2185 2185
2186 /* 2186 /*
2187 * Basic iterator support. Return the next active range of PFNs for a node 2187 * Basic iterator support. Return the next active range of PFNs for a node
2188 * Note: nid == MAX_NUMNODES returns next region regardless of node 2188 * Note: nid == MAX_NUMNODES returns next region regardless of node
2189 */ 2189 */
2190 static int __init next_active_region_index_in_nid(int index, int nid) 2190 static int __init next_active_region_index_in_nid(int index, int nid)
2191 { 2191 {
2192 for (index = index + 1; index < nr_nodemap_entries; index++) 2192 for (index = index + 1; index < nr_nodemap_entries; index++)
2193 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2193 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2194 return index; 2194 return index;
2195 2195
2196 return -1; 2196 return -1;
2197 } 2197 }
2198 2198
2199 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2199 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2200 /* 2200 /*
2201 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2201 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2202 * Architectures may implement their own version but if add_active_range() 2202 * Architectures may implement their own version but if add_active_range()
2203 * was used and there are no special requirements, this is a convenient 2203 * was used and there are no special requirements, this is a convenient
2204 * alternative 2204 * alternative
2205 */ 2205 */
2206 int __init early_pfn_to_nid(unsigned long pfn) 2206 int __init early_pfn_to_nid(unsigned long pfn)
2207 { 2207 {
2208 int i; 2208 int i;
2209 2209
2210 for (i = 0; i < nr_nodemap_entries; i++) { 2210 for (i = 0; i < nr_nodemap_entries; i++) {
2211 unsigned long start_pfn = early_node_map[i].start_pfn; 2211 unsigned long start_pfn = early_node_map[i].start_pfn;
2212 unsigned long end_pfn = early_node_map[i].end_pfn; 2212 unsigned long end_pfn = early_node_map[i].end_pfn;
2213 2213
2214 if (start_pfn <= pfn && pfn < end_pfn) 2214 if (start_pfn <= pfn && pfn < end_pfn)
2215 return early_node_map[i].nid; 2215 return early_node_map[i].nid;
2216 } 2216 }
2217 2217
2218 return 0; 2218 return 0;
2219 } 2219 }
2220 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2220 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2221 2221
2222 /* Basic iterator support to walk early_node_map[] */ 2222 /* Basic iterator support to walk early_node_map[] */
2223 #define for_each_active_range_index_in_nid(i, nid) \ 2223 #define for_each_active_range_index_in_nid(i, nid) \
2224 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2224 for (i = first_active_region_index_in_nid(nid); i != -1; \
2225 i = next_active_region_index_in_nid(i, nid)) 2225 i = next_active_region_index_in_nid(i, nid))
2226 2226
2227 /** 2227 /**
2228 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2228 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2229 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2229 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2230 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2230 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2231 * 2231 *
2232 * If an architecture guarantees that all ranges registered with 2232 * If an architecture guarantees that all ranges registered with
2233 * add_active_ranges() contain no holes and may be freed, this 2233 * add_active_ranges() contain no holes and may be freed, this
2234 * function may be used instead of calling free_bootmem() manually. 2234 * function may be used instead of calling free_bootmem() manually.
2235 */ 2235 */
2236 void __init free_bootmem_with_active_regions(int nid, 2236 void __init free_bootmem_with_active_regions(int nid,
2237 unsigned long max_low_pfn) 2237 unsigned long max_low_pfn)
2238 { 2238 {
2239 int i; 2239 int i;
2240 2240
2241 for_each_active_range_index_in_nid(i, nid) { 2241 for_each_active_range_index_in_nid(i, nid) {
2242 unsigned long size_pages = 0; 2242 unsigned long size_pages = 0;
2243 unsigned long end_pfn = early_node_map[i].end_pfn; 2243 unsigned long end_pfn = early_node_map[i].end_pfn;
2244 2244
2245 if (early_node_map[i].start_pfn >= max_low_pfn) 2245 if (early_node_map[i].start_pfn >= max_low_pfn)
2246 continue; 2246 continue;
2247 2247
2248 if (end_pfn > max_low_pfn) 2248 if (end_pfn > max_low_pfn)
2249 end_pfn = max_low_pfn; 2249 end_pfn = max_low_pfn;
2250 2250
2251 size_pages = end_pfn - early_node_map[i].start_pfn; 2251 size_pages = end_pfn - early_node_map[i].start_pfn;
2252 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2252 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2253 PFN_PHYS(early_node_map[i].start_pfn), 2253 PFN_PHYS(early_node_map[i].start_pfn),
2254 size_pages << PAGE_SHIFT); 2254 size_pages << PAGE_SHIFT);
2255 } 2255 }
2256 } 2256 }
2257 2257
2258 /** 2258 /**
2259 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2259 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2260 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2260 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2261 * 2261 *
2262 * If an architecture guarantees that all ranges registered with 2262 * If an architecture guarantees that all ranges registered with
2263 * add_active_ranges() contain no holes and may be freed, this 2263 * add_active_ranges() contain no holes and may be freed, this
2264 * function may be used instead of calling memory_present() manually. 2264 * function may be used instead of calling memory_present() manually.
2265 */ 2265 */
2266 void __init sparse_memory_present_with_active_regions(int nid) 2266 void __init sparse_memory_present_with_active_regions(int nid)
2267 { 2267 {
2268 int i; 2268 int i;
2269 2269
2270 for_each_active_range_index_in_nid(i, nid) 2270 for_each_active_range_index_in_nid(i, nid)
2271 memory_present(early_node_map[i].nid, 2271 memory_present(early_node_map[i].nid,
2272 early_node_map[i].start_pfn, 2272 early_node_map[i].start_pfn,
2273 early_node_map[i].end_pfn); 2273 early_node_map[i].end_pfn);
2274 } 2274 }
2275 2275
2276 /** 2276 /**
2277 * push_node_boundaries - Push node boundaries to at least the requested boundary 2277 * push_node_boundaries - Push node boundaries to at least the requested boundary
2278 * @nid: The nid of the node to push the boundary for 2278 * @nid: The nid of the node to push the boundary for
2279 * @start_pfn: The start pfn of the node 2279 * @start_pfn: The start pfn of the node
2280 * @end_pfn: The end pfn of the node 2280 * @end_pfn: The end pfn of the node
2281 * 2281 *
2282 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd 2282 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd
2283 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2283 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2284 * be hotplugged even though no physical memory exists. This function allows 2284 * be hotplugged even though no physical memory exists. This function allows
2285 * an arch to push out the node boundaries so mem_map is allocated that can 2285 * an arch to push out the node boundaries so mem_map is allocated that can
2286 * be used later. 2286 * be used later.
2287 */ 2287 */
2288 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2288 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2289 void __init push_node_boundaries(unsigned int nid, 2289 void __init push_node_boundaries(unsigned int nid,
2290 unsigned long start_pfn, unsigned long end_pfn) 2290 unsigned long start_pfn, unsigned long end_pfn)
2291 { 2291 {
2292 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2292 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2293 nid, start_pfn, end_pfn); 2293 nid, start_pfn, end_pfn);
2294 2294
2295 /* Initialise the boundary for this node if necessary */ 2295 /* Initialise the boundary for this node if necessary */
2296 if (node_boundary_end_pfn[nid] == 0) 2296 if (node_boundary_end_pfn[nid] == 0)
2297 node_boundary_start_pfn[nid] = -1UL; 2297 node_boundary_start_pfn[nid] = -1UL;
2298 2298
2299 /* Update the boundaries */ 2299 /* Update the boundaries */
2300 if (node_boundary_start_pfn[nid] > start_pfn) 2300 if (node_boundary_start_pfn[nid] > start_pfn)
2301 node_boundary_start_pfn[nid] = start_pfn; 2301 node_boundary_start_pfn[nid] = start_pfn;
2302 if (node_boundary_end_pfn[nid] < end_pfn) 2302 if (node_boundary_end_pfn[nid] < end_pfn)
2303 node_boundary_end_pfn[nid] = end_pfn; 2303 node_boundary_end_pfn[nid] = end_pfn;
2304 } 2304 }
2305 2305
2306 /* If necessary, push the node boundary out for reserve hotadd */ 2306 /* If necessary, push the node boundary out for reserve hotadd */
2307 static void __init account_node_boundary(unsigned int nid, 2307 static void __init account_node_boundary(unsigned int nid,
2308 unsigned long *start_pfn, unsigned long *end_pfn) 2308 unsigned long *start_pfn, unsigned long *end_pfn)
2309 { 2309 {
2310 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2310 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2311 nid, *start_pfn, *end_pfn); 2311 nid, *start_pfn, *end_pfn);
2312 2312
2313 /* Return if boundary information has not been provided */ 2313 /* Return if boundary information has not been provided */
2314 if (node_boundary_end_pfn[nid] == 0) 2314 if (node_boundary_end_pfn[nid] == 0)
2315 return; 2315 return;
2316 2316
2317 /* Check the boundaries and update if necessary */ 2317 /* Check the boundaries and update if necessary */
2318 if (node_boundary_start_pfn[nid] < *start_pfn) 2318 if (node_boundary_start_pfn[nid] < *start_pfn)
2319 *start_pfn = node_boundary_start_pfn[nid]; 2319 *start_pfn = node_boundary_start_pfn[nid];
2320 if (node_boundary_end_pfn[nid] > *end_pfn) 2320 if (node_boundary_end_pfn[nid] > *end_pfn)
2321 *end_pfn = node_boundary_end_pfn[nid]; 2321 *end_pfn = node_boundary_end_pfn[nid];
2322 } 2322 }
2323 #else 2323 #else
2324 void __init push_node_boundaries(unsigned int nid, 2324 void __init push_node_boundaries(unsigned int nid,
2325 unsigned long start_pfn, unsigned long end_pfn) {} 2325 unsigned long start_pfn, unsigned long end_pfn) {}
2326 2326
2327 static void __init account_node_boundary(unsigned int nid, 2327 static void __init account_node_boundary(unsigned int nid,
2328 unsigned long *start_pfn, unsigned long *end_pfn) {} 2328 unsigned long *start_pfn, unsigned long *end_pfn) {}
2329 #endif 2329 #endif
2330 2330
2331 2331
2332 /** 2332 /**
2333 * get_pfn_range_for_nid - Return the start and end page frames for a node 2333 * get_pfn_range_for_nid - Return the start and end page frames for a node
2334 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2334 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2335 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2335 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2336 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2336 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2337 * 2337 *
2338 * It returns the start and end page frame of a node based on information 2338 * It returns the start and end page frame of a node based on information
2339 * provided by an arch calling add_active_range(). If called for a node 2339 * provided by an arch calling add_active_range(). If called for a node
2340 * with no available memory, a warning is printed and the start and end 2340 * with no available memory, a warning is printed and the start and end
2341 * PFNs will be 0. 2341 * PFNs will be 0.
2342 */ 2342 */
2343 void __init get_pfn_range_for_nid(unsigned int nid, 2343 void __init get_pfn_range_for_nid(unsigned int nid,
2344 unsigned long *start_pfn, unsigned long *end_pfn) 2344 unsigned long *start_pfn, unsigned long *end_pfn)
2345 { 2345 {
2346 int i; 2346 int i;
2347 *start_pfn = -1UL; 2347 *start_pfn = -1UL;
2348 *end_pfn = 0; 2348 *end_pfn = 0;
2349 2349
2350 for_each_active_range_index_in_nid(i, nid) { 2350 for_each_active_range_index_in_nid(i, nid) {
2351 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 2351 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2352 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 2352 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2353 } 2353 }
2354 2354
2355 if (*start_pfn == -1UL) { 2355 if (*start_pfn == -1UL) {
2356 printk(KERN_WARNING "Node %u active with no memory\n", nid); 2356 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2357 *start_pfn = 0; 2357 *start_pfn = 0;
2358 } 2358 }
2359 2359
2360 /* Push the node boundaries out if requested */ 2360 /* Push the node boundaries out if requested */
2361 account_node_boundary(nid, start_pfn, end_pfn); 2361 account_node_boundary(nid, start_pfn, end_pfn);
2362 } 2362 }
2363 2363
2364 /* 2364 /*
2365 * Return the number of pages a zone spans in a node, including holes 2365 * Return the number of pages a zone spans in a node, including holes
2366 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2366 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2367 */ 2367 */
2368 unsigned long __init zone_spanned_pages_in_node(int nid, 2368 unsigned long __init zone_spanned_pages_in_node(int nid,
2369 unsigned long zone_type, 2369 unsigned long zone_type,
2370 unsigned long *ignored) 2370 unsigned long *ignored)
2371 { 2371 {
2372 unsigned long node_start_pfn, node_end_pfn; 2372 unsigned long node_start_pfn, node_end_pfn;
2373 unsigned long zone_start_pfn, zone_end_pfn; 2373 unsigned long zone_start_pfn, zone_end_pfn;
2374 2374
2375 /* Get the start and end of the node and zone */ 2375 /* Get the start and end of the node and zone */
2376 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2376 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2377 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 2377 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2378 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 2378 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2379 2379
2380 /* Check that this node has pages within the zone's required range */ 2380 /* Check that this node has pages within the zone's required range */
2381 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 2381 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2382 return 0; 2382 return 0;
2383 2383
2384 /* Move the zone boundaries inside the node if necessary */ 2384 /* Move the zone boundaries inside the node if necessary */
2385 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 2385 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2386 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 2386 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2387 2387
2388 /* Return the spanned pages */ 2388 /* Return the spanned pages */
2389 return zone_end_pfn - zone_start_pfn; 2389 return zone_end_pfn - zone_start_pfn;
2390 } 2390 }
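
As a concrete case of the clamping above, a node spanning PFNs [1000, 5000) intersected with a zone whose architectural limits are [0, 4096) spans 4096 - 1000 = 3096 pages. A userspace sketch of the same intersection (the PFN ranges are invented for the example):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* pages a zone spans within a node, including holes */
static unsigned long spanned(unsigned long node_start, unsigned long node_end,
			     unsigned long zone_start, unsigned long zone_end)
{
	if (zone_end < node_start || zone_start > node_end)
		return 0;
	zone_end = min_ul(zone_end, node_end);
	zone_start = max_ul(zone_start, node_start);
	return zone_end - zone_start;
}

int main(void)
{
	printf("%lu\n", spanned(1000, 5000, 0, 4096));	/* prints: 3096 */
	return 0;
}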
2391 2391
2392 /* 2392 /*
2393 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2393 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2394 * then all holes in the requested range will be accounted for. 2394 * then all holes in the requested range will be accounted for.
2395 */ 2395 */
2396 unsigned long __init __absent_pages_in_range(int nid, 2396 unsigned long __init __absent_pages_in_range(int nid,
2397 unsigned long range_start_pfn, 2397 unsigned long range_start_pfn,
2398 unsigned long range_end_pfn) 2398 unsigned long range_end_pfn)
2399 { 2399 {
2400 int i = 0; 2400 int i = 0;
2401 unsigned long prev_end_pfn = 0, hole_pages = 0; 2401 unsigned long prev_end_pfn = 0, hole_pages = 0;
2402 unsigned long start_pfn; 2402 unsigned long start_pfn;
2403 2403
2404 /* Find the end_pfn of the first active range of pfns in the node */ 2404 /* Find the end_pfn of the first active range of pfns in the node */
2405 i = first_active_region_index_in_nid(nid); 2405 i = first_active_region_index_in_nid(nid);
2406 if (i == -1) 2406 if (i == -1)
2407 return 0; 2407 return 0;
2408 2408
2409 /* Account for ranges before physical memory on this node */ 2409 /* Account for ranges before physical memory on this node */
2410 if (early_node_map[i].start_pfn > range_start_pfn) 2410 if (early_node_map[i].start_pfn > range_start_pfn)
2411 hole_pages = early_node_map[i].start_pfn - range_start_pfn; 2411 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2412 2412
2413 prev_end_pfn = early_node_map[i].start_pfn; 2413 prev_end_pfn = early_node_map[i].start_pfn;
2414 2414
2415 /* Find all holes for the zone within the node */ 2415 /* Find all holes for the zone within the node */
2416 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 2416 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2417 2417
2418 /* No need to continue if prev_end_pfn is outside the zone */ 2418 /* No need to continue if prev_end_pfn is outside the zone */
2419 if (prev_end_pfn >= range_end_pfn) 2419 if (prev_end_pfn >= range_end_pfn)
2420 break; 2420 break;
2421 2421
2422 /* Make sure the end of the zone is not within the hole */ 2422 /* Make sure the end of the zone is not within the hole */
2423 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 2423 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2424 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 2424 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2425 2425
2426 /* Update the hole size count and move on */ 2426 /* Update the hole size count and move on */
2427 if (start_pfn > range_start_pfn) { 2427 if (start_pfn > range_start_pfn) {
2428 BUG_ON(prev_end_pfn > start_pfn); 2428 BUG_ON(prev_end_pfn > start_pfn);
2429 hole_pages += start_pfn - prev_end_pfn; 2429 hole_pages += start_pfn - prev_end_pfn;
2430 } 2430 }
2431 prev_end_pfn = early_node_map[i].end_pfn; 2431 prev_end_pfn = early_node_map[i].end_pfn;
2432 } 2432 }
2433 2433
2434 /* Account for ranges past physical memory on this node */ 2434 /* Account for ranges past physical memory on this node */
2435 if (range_end_pfn > prev_end_pfn) 2435 if (range_end_pfn > prev_end_pfn)
2436 hole_pages += range_end_pfn - 2436 hole_pages += range_end_pfn -
2437 max(range_start_pfn, prev_end_pfn); 2437 max(range_start_pfn, prev_end_pfn);
2438 2438
2439 return hole_pages; 2439 return hole_pages;
2440 } 2440 }
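
For instance, with active memory at PFNs [100, 200) and [300, 400), the range [0, 500) contains 100 + 100 + 100 = 300 absent pages: before, between, and after the active ranges. The userspace sketch below restates the same hole-counting walk in simplified form; it assumes the ranges are already sorted and non-overlapping, and the example ranges are invented.

#include <stdio.h>

struct range { unsigned long start, end; };

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* count PFNs in [range_start, range_end) not covered by any active range */
static unsigned long absent_pages(const struct range *map, int n,
				  unsigned long range_start, unsigned long range_end)
{
	unsigned long prev_end = range_start;
	unsigned long holes = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long start = max_ul(map[i].start, range_start);
		unsigned long end = min_ul(map[i].end, range_end);

		if (start >= range_end || end <= range_start)
			continue;
		if (start > prev_end)
			holes += start - prev_end;
		prev_end = max_ul(prev_end, end);
	}
	if (range_end > prev_end)
		holes += range_end - prev_end;
	return holes;
}

int main(void)
{
	/* active memory at [100,200) and [300,400); query [0,500) -> 300 holes */
	struct range map[] = { { 100, 200 }, { 300, 400 } };

	printf("%lu\n", absent_pages(map, 2, 0, 500));
	return 0;
}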
2441 2441
2442 /** 2442 /**
2443 * absent_pages_in_range - Return number of page frames in holes within a range 2443 * absent_pages_in_range - Return number of page frames in holes within a range
2444 * @start_pfn: The start PFN to start searching for holes 2444 * @start_pfn: The start PFN to start searching for holes
2445 * @end_pfn: The end PFN to stop searching for holes 2445 * @end_pfn: The end PFN to stop searching for holes
2446 * 2446 *
2447 * It returns the number of page frames in memory holes within a range. 2447 * It returns the number of page frames in memory holes within a range.
2448 */ 2448 */
2449 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 2449 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2450 unsigned long end_pfn) 2450 unsigned long end_pfn)
2451 { 2451 {
2452 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 2452 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2453 } 2453 }
2454 2454
2455 /* Return the number of page frames in holes in a zone on a node */ 2455 /* Return the number of page frames in holes in a zone on a node */
2456 unsigned long __init zone_absent_pages_in_node(int nid, 2456 unsigned long __init zone_absent_pages_in_node(int nid,
2457 unsigned long zone_type, 2457 unsigned long zone_type,
2458 unsigned long *ignored) 2458 unsigned long *ignored)
2459 { 2459 {
2460 unsigned long node_start_pfn, node_end_pfn; 2460 unsigned long node_start_pfn, node_end_pfn;
2461 unsigned long zone_start_pfn, zone_end_pfn; 2461 unsigned long zone_start_pfn, zone_end_pfn;
2462 2462
2463 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2463 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2464 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 2464 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2465 node_start_pfn); 2465 node_start_pfn);
2466 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 2466 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2467 node_end_pfn); 2467 node_end_pfn);
2468 2468
2469 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2469 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2470 } 2470 }
2471 2471
2472 #else 2472 #else
2473 static inline unsigned long zone_spanned_pages_in_node(int nid, 2473 static inline unsigned long zone_spanned_pages_in_node(int nid,
2474 unsigned long zone_type, 2474 unsigned long zone_type,
2475 unsigned long *zones_size) 2475 unsigned long *zones_size)
2476 { 2476 {
2477 return zones_size[zone_type]; 2477 return zones_size[zone_type];
2478 } 2478 }
2479 2479
2480 static inline unsigned long zone_absent_pages_in_node(int nid, 2480 static inline unsigned long zone_absent_pages_in_node(int nid,
2481 unsigned long zone_type, 2481 unsigned long zone_type,
2482 unsigned long *zholes_size) 2482 unsigned long *zholes_size)
2483 { 2483 {
2484 if (!zholes_size) 2484 if (!zholes_size)
2485 return 0; 2485 return 0;
2486 2486
2487 return zholes_size[zone_type]; 2487 return zholes_size[zone_type];
2488 } 2488 }
2489 2489
2490 #endif 2490 #endif
2491 2491
2492 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2492 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2493 unsigned long *zones_size, unsigned long *zholes_size) 2493 unsigned long *zones_size, unsigned long *zholes_size)
2494 { 2494 {
2495 unsigned long realtotalpages, totalpages = 0; 2495 unsigned long realtotalpages, totalpages = 0;
2496 enum zone_type i; 2496 enum zone_type i;
2497 2497
2498 for (i = 0; i < MAX_NR_ZONES; i++) 2498 for (i = 0; i < MAX_NR_ZONES; i++)
2499 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 2499 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2500 zones_size); 2500 zones_size);
2501 pgdat->node_spanned_pages = totalpages; 2501 pgdat->node_spanned_pages = totalpages;
2502 2502
2503 realtotalpages = totalpages; 2503 realtotalpages = totalpages;
2504 for (i = 0; i < MAX_NR_ZONES; i++) 2504 for (i = 0; i < MAX_NR_ZONES; i++)
2505 realtotalpages -= 2505 realtotalpages -=
2506 zone_absent_pages_in_node(pgdat->node_id, i, 2506 zone_absent_pages_in_node(pgdat->node_id, i,
2507 zholes_size); 2507 zholes_size);
2508 pgdat->node_present_pages = realtotalpages; 2508 pgdat->node_present_pages = realtotalpages;
2509 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 2509 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2510 realtotalpages); 2510 realtotalpages);
2511 } 2511 }
2512 2512
2513 /* 2513 /*
2514 * Set up the zone data structures: 2514 * Set up the zone data structures:
2515 * - mark all pages reserved 2515 * - mark all pages reserved
2516 * - mark all memory queues empty 2516 * - mark all memory queues empty
2517 * - clear the memory bitmaps 2517 * - clear the memory bitmaps
2518 */ 2518 */
2519 static void __meminit free_area_init_core(struct pglist_data *pgdat, 2519 static void __meminit free_area_init_core(struct pglist_data *pgdat,
2520 unsigned long *zones_size, unsigned long *zholes_size) 2520 unsigned long *zones_size, unsigned long *zholes_size)
2521 { 2521 {
2522 enum zone_type j; 2522 enum zone_type j;
2523 int nid = pgdat->node_id; 2523 int nid = pgdat->node_id;
2524 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2524 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2525 int ret; 2525 int ret;
2526 2526
2527 pgdat_resize_init(pgdat); 2527 pgdat_resize_init(pgdat);
2528 pgdat->nr_zones = 0; 2528 pgdat->nr_zones = 0;
2529 init_waitqueue_head(&pgdat->kswapd_wait); 2529 init_waitqueue_head(&pgdat->kswapd_wait);
2530 pgdat->kswapd_max_order = 0; 2530 pgdat->kswapd_max_order = 0;
2531 2531
2532 for (j = 0; j < MAX_NR_ZONES; j++) { 2532 for (j = 0; j < MAX_NR_ZONES; j++) {
2533 struct zone *zone = pgdat->node_zones + j; 2533 struct zone *zone = pgdat->node_zones + j;
2534 unsigned long size, realsize, memmap_pages; 2534 unsigned long size, realsize, memmap_pages;
2535 2535
2536 size = zone_spanned_pages_in_node(nid, j, zones_size); 2536 size = zone_spanned_pages_in_node(nid, j, zones_size);
2537 realsize = size - zone_absent_pages_in_node(nid, j, 2537 realsize = size - zone_absent_pages_in_node(nid, j,
2538 zholes_size); 2538 zholes_size);
2539 2539
2540 /* 2540 /*
2541 * Adjust realsize so that it accounts for how much memory 2541 * Adjust realsize so that it accounts for how much memory
2542 * is used by this zone for memmap. This affects the watermark 2542 * is used by this zone for memmap. This affects the watermark
2543 * and per-cpu initialisations 2543 * and per-cpu initialisations
2544 */ 2544 */
2545 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 2545 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2546 if (realsize >= memmap_pages) { 2546 if (realsize >= memmap_pages) {
2547 realsize -= memmap_pages; 2547 realsize -= memmap_pages;
2548 printk(KERN_DEBUG 2548 printk(KERN_DEBUG
2549 " %s zone: %lu pages used for memmap\n", 2549 " %s zone: %lu pages used for memmap\n",
2550 zone_names[j], memmap_pages); 2550 zone_names[j], memmap_pages);
2551 } else 2551 } else
2552 printk(KERN_WARNING 2552 printk(KERN_WARNING
2553 " %s zone: %lu pages exceeds realsize %lu\n", 2553 " %s zone: %lu pages exceeds realsize %lu\n",
2554 zone_names[j], memmap_pages, realsize); 2554 zone_names[j], memmap_pages, realsize);
2555 2555
2556 /* Account for reserved DMA pages */ 2556 /* Account for reserved DMA pages */
2557 if (j == ZONE_DMA && realsize > dma_reserve) { 2557 if (j == ZONE_DMA && realsize > dma_reserve) {
2558 realsize -= dma_reserve; 2558 realsize -= dma_reserve;
2559 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", 2559 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2560 dma_reserve); 2560 dma_reserve);
2561 } 2561 }
2562 2562
2563 if (!is_highmem_idx(j)) 2563 if (!is_highmem_idx(j))
2564 nr_kernel_pages += realsize; 2564 nr_kernel_pages += realsize;
2565 nr_all_pages += realsize; 2565 nr_all_pages += realsize;
2566 2566
2567 zone->spanned_pages = size; 2567 zone->spanned_pages = size;
2568 zone->present_pages = realsize; 2568 zone->present_pages = realsize;
2569 #ifdef CONFIG_NUMA 2569 #ifdef CONFIG_NUMA
2570 zone->node = nid; 2570 zone->node = nid;
2571 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2571 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2572 / 100; 2572 / 100;
2573 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2573 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2574 #endif 2574 #endif
2575 zone->name = zone_names[j]; 2575 zone->name = zone_names[j];
2576 spin_lock_init(&zone->lock); 2576 spin_lock_init(&zone->lock);
2577 spin_lock_init(&zone->lru_lock); 2577 spin_lock_init(&zone->lru_lock);
2578 zone_seqlock_init(zone); 2578 zone_seqlock_init(zone);
2579 zone->zone_pgdat = pgdat; 2579 zone->zone_pgdat = pgdat;
2580 zone->free_pages = 0; 2580 zone->free_pages = 0;
2581 2581
2582 zone->prev_priority = DEF_PRIORITY; 2582 zone->prev_priority = DEF_PRIORITY;
2583 2583
2584 zone_pcp_init(zone); 2584 zone_pcp_init(zone);
2585 INIT_LIST_HEAD(&zone->active_list); 2585 INIT_LIST_HEAD(&zone->active_list);
2586 INIT_LIST_HEAD(&zone->inactive_list); 2586 INIT_LIST_HEAD(&zone->inactive_list);
2587 zone->nr_scan_active = 0; 2587 zone->nr_scan_active = 0;
2588 zone->nr_scan_inactive = 0; 2588 zone->nr_scan_inactive = 0;
2589 zone->nr_active = 0; 2589 zone->nr_active = 0;
2590 zone->nr_inactive = 0; 2590 zone->nr_inactive = 0;
2591 zap_zone_vm_stats(zone); 2591 zap_zone_vm_stats(zone);
2592 atomic_set(&zone->reclaim_in_progress, 0); 2592 atomic_set(&zone->reclaim_in_progress, 0);
2593 if (!size) 2593 if (!size)
2594 continue; 2594 continue;
2595 2595
2596 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 2596 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2597 BUG_ON(ret); 2597 BUG_ON(ret);
2598 zone_start_pfn += size; 2598 zone_start_pfn += size;
2599 } 2599 }
2600 } 2600 }
2601 2601
2602 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2602 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2603 { 2603 {
2604 /* Skip empty nodes */ 2604 /* Skip empty nodes */
2605 if (!pgdat->node_spanned_pages) 2605 if (!pgdat->node_spanned_pages)
2606 return; 2606 return;
2607 2607
2608 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2608 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2609 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2609 /* ia64 gets its own node_mem_map, before this, without bootmem */
2610 if (!pgdat->node_mem_map) { 2610 if (!pgdat->node_mem_map) {
2611 unsigned long size, start, end; 2611 unsigned long size, start, end;
2612 struct page *map; 2612 struct page *map;
2613 2613
2614 /* 2614 /*
2615 * The zone's endpoints aren't required to be MAX_ORDER 2615 * The zone's endpoints aren't required to be MAX_ORDER
2616 * aligned but the node_mem_map endpoints must be in order 2616 * aligned but the node_mem_map endpoints must be in order
2617 * for the buddy allocator to function correctly. 2617 * for the buddy allocator to function correctly.
2618 */ 2618 */
2619 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 2619 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2620 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 2620 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2621 end = ALIGN(end, MAX_ORDER_NR_PAGES); 2621 end = ALIGN(end, MAX_ORDER_NR_PAGES);
2622 size = (end - start) * sizeof(struct page); 2622 size = (end - start) * sizeof(struct page);
2623 map = alloc_remap(pgdat->node_id, size); 2623 map = alloc_remap(pgdat->node_id, size);
2624 if (!map) 2624 if (!map)
2625 map = alloc_bootmem_node(pgdat, size); 2625 map = alloc_bootmem_node(pgdat, size);
2626 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 2626 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2627 } 2627 }
2628 #ifdef CONFIG_FLATMEM 2628 #ifdef CONFIG_FLATMEM
2629 /* 2629 /*
2630 * With no DISCONTIG, the global mem_map is just set as node 0's 2630 * With no DISCONTIG, the global mem_map is just set as node 0's
2631 */ 2631 */
2632 if (pgdat == NODE_DATA(0)) { 2632 if (pgdat == NODE_DATA(0)) {
2633 mem_map = NODE_DATA(0)->node_mem_map; 2633 mem_map = NODE_DATA(0)->node_mem_map;
2634 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2634 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2635 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 2635 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2636 mem_map -= pgdat->node_start_pfn; 2636 mem_map -= pgdat->node_start_pfn;
2637 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2637 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2638 } 2638 }
2639 #endif 2639 #endif
2640 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2640 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2641 } 2641 }
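
To put numbers on the alignment above: with MAX_ORDER_NR_PAGES = 1024, a node starting at PFN 1000 and spanning 5000 pages gets its mem_map rounded out to PFNs [0, 6144), so page structs are allocated for 6144 frames even though only [1000, 6000) belongs to the node, and pgdat->node_mem_map is then offset back to the node's first PFN. A userspace sketch of the rounding (the constants are example values):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* example value */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long node_start_pfn = 1000, node_spanned_pages = 5000;
	unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	unsigned long end = ALIGN_UP(node_start_pfn + node_spanned_pages,
				     MAX_ORDER_NR_PAGES);

	printf("map covers PFNs [%lu, %lu), %lu entries\n",
	       start, end, end - start);	/* prints: [0, 6144), 6144 entries */
	return 0;
}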
2642 2642
2643 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 2643 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2644 unsigned long *zones_size, unsigned long node_start_pfn, 2644 unsigned long *zones_size, unsigned long node_start_pfn,
2645 unsigned long *zholes_size) 2645 unsigned long *zholes_size)
2646 { 2646 {
2647 pgdat->node_id = nid; 2647 pgdat->node_id = nid;
2648 pgdat->node_start_pfn = node_start_pfn; 2648 pgdat->node_start_pfn = node_start_pfn;
2649 calculate_node_totalpages(pgdat, zones_size, zholes_size); 2649 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2650 2650
2651 alloc_node_mem_map(pgdat); 2651 alloc_node_mem_map(pgdat);
2652 2652
2653 free_area_init_core(pgdat, zones_size, zholes_size); 2653 free_area_init_core(pgdat, zones_size, zholes_size);
2654 } 2654 }
2655 2655
2656 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2656 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2657 /** 2657 /**
2658 * add_active_range - Register a range of PFNs backed by physical memory 2658 * add_active_range - Register a range of PFNs backed by physical memory
2659 * @nid: The node ID the range resides on 2659 * @nid: The node ID the range resides on
2660 * @start_pfn: The start PFN of the available physical memory 2660 * @start_pfn: The start PFN of the available physical memory
2661 * @end_pfn: The end PFN of the available physical memory 2661 * @end_pfn: The end PFN of the available physical memory
2662 * 2662 *
2663 * These ranges are stored in an early_node_map[] and later used by 2663 * These ranges are stored in an early_node_map[] and later used by
2664 * free_area_init_nodes() to calculate zone sizes and holes. If the 2664 * free_area_init_nodes() to calculate zone sizes and holes. If the
2665 * range spans a memory hole, it is up to the architecture to ensure 2665 * range spans a memory hole, it is up to the architecture to ensure
2666 * the memory is not freed by the bootmem allocator. If possible 2666 * the memory is not freed by the bootmem allocator. If possible
2667 * the range being registered will be merged with existing ranges. 2667 * the range being registered will be merged with existing ranges.
2668 */ 2668 */
2669 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 2669 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2670 unsigned long end_pfn) 2670 unsigned long end_pfn)
2671 { 2671 {
2672 int i; 2672 int i;
2673 2673
2674 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 2674 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2675 "%d entries of %d used\n", 2675 "%d entries of %d used\n",
2676 nid, start_pfn, end_pfn, 2676 nid, start_pfn, end_pfn,
2677 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 2677 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2678 2678
2679 /* Merge with existing active regions if possible */ 2679 /* Merge with existing active regions if possible */
2680 for (i = 0; i < nr_nodemap_entries; i++) { 2680 for (i = 0; i < nr_nodemap_entries; i++) {
2681 if (early_node_map[i].nid != nid) 2681 if (early_node_map[i].nid != nid)
2682 continue; 2682 continue;
2683 2683
2684 /* Skip if an existing region covers this new one */ 2684 /* Skip if an existing region covers this new one */
2685 if (start_pfn >= early_node_map[i].start_pfn && 2685 if (start_pfn >= early_node_map[i].start_pfn &&
2686 end_pfn <= early_node_map[i].end_pfn) 2686 end_pfn <= early_node_map[i].end_pfn)
2687 return; 2687 return;
2688 2688
2689 /* Merge forward if suitable */ 2689 /* Merge forward if suitable */
2690 if (start_pfn <= early_node_map[i].end_pfn && 2690 if (start_pfn <= early_node_map[i].end_pfn &&
2691 end_pfn > early_node_map[i].end_pfn) { 2691 end_pfn > early_node_map[i].end_pfn) {
2692 early_node_map[i].end_pfn = end_pfn; 2692 early_node_map[i].end_pfn = end_pfn;
2693 return; 2693 return;
2694 } 2694 }
2695 2695
2696 /* Merge backward if suitable */ 2696 /* Merge backward if suitable */
2697 if (start_pfn < early_node_map[i].end_pfn && 2697 if (start_pfn < early_node_map[i].end_pfn &&
2698 end_pfn >= early_node_map[i].start_pfn) { 2698 end_pfn >= early_node_map[i].start_pfn) {
2699 early_node_map[i].start_pfn = start_pfn; 2699 early_node_map[i].start_pfn = start_pfn;
2700 return; 2700 return;
2701 } 2701 }
2702 } 2702 }
2703 2703
2704 /* Check that early_node_map is large enough */ 2704 /* Check that early_node_map is large enough */
2705 if (i >= MAX_ACTIVE_REGIONS) { 2705 if (i >= MAX_ACTIVE_REGIONS) {
2706 printk(KERN_CRIT "More than %d memory regions, truncating\n", 2706 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2707 MAX_ACTIVE_REGIONS); 2707 MAX_ACTIVE_REGIONS);
2708 return; 2708 return;
2709 } 2709 }
2710 2710
2711 early_node_map[i].nid = nid; 2711 early_node_map[i].nid = nid;
2712 early_node_map[i].start_pfn = start_pfn; 2712 early_node_map[i].start_pfn = start_pfn;
2713 early_node_map[i].end_pfn = end_pfn; 2713 early_node_map[i].end_pfn = end_pfn;
2714 nr_nodemap_entries = i + 1; 2714 nr_nodemap_entries = i + 1;
2715 } 2715 }
2716 2716
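As a usage sketch (the PFN values below are invented), an architecture's early boot code registers each usable RAM range per node and relies on the merge cases above when a later range abuts or overlaps an earlier one:

        add_active_range(0, 0x000, 0x0a0);      /* node 0: 0 - 640K */
        add_active_range(0, 0x100, 0x40000);    /* node 0: 1M - 1G  */
        /* Overlaps the second entry, so it is merged forward: that entry's
         * end_pfn simply becomes 0x48000 and no new slot is consumed.     */
        add_active_range(0, 0x3f000, 0x48000);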
2717 /** 2717 /**
2718 * shrink_active_range - Shrink an existing registered range of PFNs 2718 * shrink_active_range - Shrink an existing registered range of PFNs
2719 * @nid: The node id the range is on that should be shrunk 2719 * @nid: The node id the range is on that should be shrunk
2720 * @old_end_pfn: The old end PFN of the range 2720 * @old_end_pfn: The old end PFN of the range
2721 * @new_end_pfn: The new end PFN of the range 2721 * @new_end_pfn: The new end PFN of the range
2722 * 2722 *
2723 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 2723
2724 * The map is kept at the end of the physical page range that has already been 2724 * The map is kept at the end of the physical page range that has already been
2725 * registered with add_active_range(). This function allows an arch to shrink 2725 * registered with add_active_range(). This function allows an arch to shrink
2726 * an existing registered range. 2726 * an existing registered range.
2727 */ 2727 */
2728 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 2728 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2729 unsigned long new_end_pfn) 2729 unsigned long new_end_pfn)
2730 { 2730 {
2731 int i; 2731 int i;
2732 2732
2733 /* Find the old active region end and shrink */ 2733 /* Find the old active region end and shrink */
2734 for_each_active_range_index_in_nid(i, nid) 2734 for_each_active_range_index_in_nid(i, nid)
2735 if (early_node_map[i].end_pfn == old_end_pfn) { 2735 if (early_node_map[i].end_pfn == old_end_pfn) {
2736 early_node_map[i].end_pfn = new_end_pfn; 2736 early_node_map[i].end_pfn = new_end_pfn;
2737 break; 2737 break;
2738 } 2738 }
2739 } 2739 }
2740 2740
2741 /** 2741 /**
2742 * remove_all_active_ranges - Remove all currently registered regions 2742 * remove_all_active_ranges - Remove all currently registered regions
2743 * 2743 *
2744 * During discovery, it may be found that a table like SRAT is invalid 2744 * During discovery, it may be found that a table like SRAT is invalid
2745 * and an alternative discovery method must be used. This function removes 2745 * and an alternative discovery method must be used. This function removes
2746 * all currently registered regions. 2746 * all currently registered regions.
2747 */ 2747 */
2748 void __init remove_all_active_ranges(void) 2748 void __init remove_all_active_ranges(void)
2749 { 2749 {
2750 memset(early_node_map, 0, sizeof(early_node_map)); 2750 memset(early_node_map, 0, sizeof(early_node_map));
2751 nr_nodemap_entries = 0; 2751 nr_nodemap_entries = 0;
2752 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2752 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2753 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 2753 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2754 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 2754 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2755 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 2755 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2756 } 2756 }
2757 2757
2758 /* Compare two active node_active_regions */ 2758 /* Compare two active node_active_regions */
2759 static int __init cmp_node_active_region(const void *a, const void *b) 2759 static int __init cmp_node_active_region(const void *a, const void *b)
2760 { 2760 {
2761 struct node_active_region *arange = (struct node_active_region *)a; 2761 struct node_active_region *arange = (struct node_active_region *)a;
2762 struct node_active_region *brange = (struct node_active_region *)b; 2762 struct node_active_region *brange = (struct node_active_region *)b;
2763 2763
2764 /* Done this way to avoid overflows */ 2764 /* Done this way to avoid overflows */
2765 if (arange->start_pfn > brange->start_pfn) 2765 if (arange->start_pfn > brange->start_pfn)
2766 return 1; 2766 return 1;
2767 if (arange->start_pfn < brange->start_pfn) 2767 if (arange->start_pfn < brange->start_pfn)
2768 return -1; 2768 return -1;
2769 2769
2770 return 0; 2770 return 0;
2771 } 2771 }
2772 2772
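The "avoid overflows" comment above refers to the usual shortcut of returning the difference of the two keys from a comparator. A sketch of why that breaks for PFNs, assuming 64-bit unsigned long and 32-bit int (the common LP64 case):

        unsigned long a = 0x100000000UL, b = 0;
        int bogus = a - b;      /* truncates to 0 on LP64, so the sort would
                                 * treat a and b as equal even though a > b */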
2773 /* sort the node_map by start_pfn */ 2773 /* sort the node_map by start_pfn */
2774 static void __init sort_node_map(void) 2774 static void __init sort_node_map(void)
2775 { 2775 {
2776 sort(early_node_map, (size_t)nr_nodemap_entries, 2776 sort(early_node_map, (size_t)nr_nodemap_entries,
2777 sizeof(struct node_active_region), 2777 sizeof(struct node_active_region),
2778 cmp_node_active_region, NULL); 2778 cmp_node_active_region, NULL);
2779 } 2779 }
2780 2780
2781 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ 2781 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2782 unsigned long __init find_min_pfn_for_node(unsigned long nid) 2782 unsigned long __init find_min_pfn_for_node(unsigned long nid)
2783 { 2783 {
2784 int i; 2784 int i;
2785 2785
2786 /* Regions in the early_node_map can be in any order */ 2786 /* Regions in the early_node_map can be in any order */
2787 sort_node_map(); 2787 sort_node_map();
2788 2788
2789 /* Assuming a sorted map, the first range found has the starting pfn */ 2789 /* Assuming a sorted map, the first range found has the starting pfn */
2790 for_each_active_range_index_in_nid(i, nid) 2790 for_each_active_range_index_in_nid(i, nid)
2791 return early_node_map[i].start_pfn; 2791 return early_node_map[i].start_pfn;
2792 2792
2793 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); 2793 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2794 return 0; 2794 return 0;
2795 } 2795 }
2796 2796
2797 /** 2797 /**
2798 * find_min_pfn_with_active_regions - Find the minimum PFN registered 2798 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2799 * 2799 *
2800 * It returns the minimum PFN based on information provided via 2800 * It returns the minimum PFN based on information provided via
2801 * add_active_range(). 2801 * add_active_range().
2802 */ 2802 */
2803 unsigned long __init find_min_pfn_with_active_regions(void) 2803 unsigned long __init find_min_pfn_with_active_regions(void)
2804 { 2804 {
2805 return find_min_pfn_for_node(MAX_NUMNODES); 2805 return find_min_pfn_for_node(MAX_NUMNODES);
2806 } 2806 }
2807 2807
2808 /** 2808 /**
2809 * find_max_pfn_with_active_regions - Find the maximum PFN registered 2809 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2810 * 2810 *
2811 * It returns the maximum PFN based on information provided via 2811 * It returns the maximum PFN based on information provided via
2812 * add_active_range(). 2812 * add_active_range().
2813 */ 2813 */
2814 unsigned long __init find_max_pfn_with_active_regions(void) 2814 unsigned long __init find_max_pfn_with_active_regions(void)
2815 { 2815 {
2816 int i; 2816 int i;
2817 unsigned long max_pfn = 0; 2817 unsigned long max_pfn = 0;
2818 2818
2819 for (i = 0; i < nr_nodemap_entries; i++) 2819 for (i = 0; i < nr_nodemap_entries; i++)
2820 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 2820 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2821 2821
2822 return max_pfn; 2822 return max_pfn;
2823 } 2823 }
2824 2824
2825 /** 2825 /**
2826 * free_area_init_nodes - Initialise all pg_data_t and zone data 2826 * free_area_init_nodes - Initialise all pg_data_t and zone data
2827 * @max_zone_pfn: an array of max PFNs for each zone 2827 * @max_zone_pfn: an array of max PFNs for each zone
2828 * 2828 *
2829 * This will call free_area_init_node() for each active node in the system. 2829 * This will call free_area_init_node() for each active node in the system.
2830 * Using the page ranges provided by add_active_range(), the size of each 2830 * Using the page ranges provided by add_active_range(), the size of each
2831 * zone in each node and their holes are calculated. If the maximum PFNs 2831 * zone in each node and their holes are calculated. If the maximum PFNs
2832 * of two adjacent zones match, the higher zone is assumed to be empty. 2832 * of two adjacent zones match, the higher zone is assumed to be empty.
2833 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 2833 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2834 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 2834 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2835 * starts where the previous one ended. For example, ZONE_DMA32 starts 2835 * starts where the previous one ended. For example, ZONE_DMA32 starts
2836 * at arch_max_dma_pfn. 2836 * at arch_max_dma_pfn.
2837 */ 2837 */
2838 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 2838 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2839 { 2839 {
2840 unsigned long nid; 2840 unsigned long nid;
2841 enum zone_type i; 2841 enum zone_type i;
2842 2842
2843 /* Record where the zone boundaries are */ 2843 /* Record where the zone boundaries are */
2844 memset(arch_zone_lowest_possible_pfn, 0, 2844 memset(arch_zone_lowest_possible_pfn, 0,
2845 sizeof(arch_zone_lowest_possible_pfn)); 2845 sizeof(arch_zone_lowest_possible_pfn));
2846 memset(arch_zone_highest_possible_pfn, 0, 2846 memset(arch_zone_highest_possible_pfn, 0,
2847 sizeof(arch_zone_highest_possible_pfn)); 2847 sizeof(arch_zone_highest_possible_pfn));
2848 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 2848 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2849 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 2849 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2850 for (i = 1; i < MAX_NR_ZONES; i++) { 2850 for (i = 1; i < MAX_NR_ZONES; i++) {
2851 arch_zone_lowest_possible_pfn[i] = 2851 arch_zone_lowest_possible_pfn[i] =
2852 arch_zone_highest_possible_pfn[i-1]; 2852 arch_zone_highest_possible_pfn[i-1];
2853 arch_zone_highest_possible_pfn[i] = 2853 arch_zone_highest_possible_pfn[i] =
2854 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 2854 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2855 } 2855 }
2856 2856
2857 /* Print out the zone ranges */ 2857 /* Print out the zone ranges */
2858 printk("Zone PFN ranges:\n"); 2858 printk("Zone PFN ranges:\n");
2859 for (i = 0; i < MAX_NR_ZONES; i++) 2859 for (i = 0; i < MAX_NR_ZONES; i++)
2860 printk(" %-8s %8lu -> %8lu\n", 2860 printk(" %-8s %8lu -> %8lu\n",
2861 zone_names[i], 2861 zone_names[i],
2862 arch_zone_lowest_possible_pfn[i], 2862 arch_zone_lowest_possible_pfn[i],
2863 arch_zone_highest_possible_pfn[i]); 2863 arch_zone_highest_possible_pfn[i]);
2864 2864
2865 /* Print out the early_node_map[] */ 2865 /* Print out the early_node_map[] */
2866 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 2866 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2867 for (i = 0; i < nr_nodemap_entries; i++) 2867 for (i = 0; i < nr_nodemap_entries; i++)
2868 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 2868 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2869 early_node_map[i].start_pfn, 2869 early_node_map[i].start_pfn,
2870 early_node_map[i].end_pfn); 2870 early_node_map[i].end_pfn);
2871 2871
2872 /* Initialise every node */ 2872 /* Initialise every node */
2873 for_each_online_node(nid) { 2873 for_each_online_node(nid) {
2874 pg_data_t *pgdat = NODE_DATA(nid); 2874 pg_data_t *pgdat = NODE_DATA(nid);
2875 free_area_init_node(nid, pgdat, NULL, 2875 free_area_init_node(nid, pgdat, NULL,
2876 find_min_pfn_for_node(nid), NULL); 2876 find_min_pfn_for_node(nid), NULL);
2877 } 2877 }
2878 } 2878 }
2879 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2879 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
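For context, this is roughly how an architecture of this era (x86_64's paging_init(), for instance) hands its zone limits in; end_pfn, MAX_DMA_PFN and MAX_DMA32_PFN are that architecture's own symbols and the snippet is only a sketch:

        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;       /* first 16MB     */
        max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;     /* first 4GB      */
        max_zone_pfns[ZONE_NORMAL] = end_pfn;           /* everything else */
        free_area_init_nodes(max_zone_pfns);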
2880 2880
2881 /** 2881 /**
2882 * set_dma_reserve - set the specified number of pages reserved in the first zone 2882 * set_dma_reserve - set the specified number of pages reserved in the first zone
2883 * @new_dma_reserve: The number of pages to mark reserved 2883 * @new_dma_reserve: The number of pages to mark reserved
2884 * 2884 *
2885 * The per-cpu batchsize and zone watermarks are determined by present_pages. 2885 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2886 * In the DMA zone, a significant percentage may be consumed by the kernel image 2886 * In the DMA zone, a significant percentage may be consumed by the kernel image
2887 * and other unfreeable allocations which can skew the watermarks badly. This 2887 * and other unfreeable allocations which can skew the watermarks badly. This
2888 * function may optionally be used to account for unfreeable pages in the 2888 * function may optionally be used to account for unfreeable pages in the
2889 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 2889 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2890 * smaller per-cpu batchsize. 2890 * smaller per-cpu batchsize.
2891 */ 2891 */
2892 void __init set_dma_reserve(unsigned long new_dma_reserve) 2892 void __init set_dma_reserve(unsigned long new_dma_reserve)
2893 { 2893 {
2894 dma_reserve = new_dma_reserve; 2894 dma_reserve = new_dma_reserve;
2895 } 2895 }
2896 2896
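A hypothetical caller, with made-up numbers: if an arch knows that about 2MB of ZONE_DMA is pinned by the kernel image and bootmem bitmaps, it can report those pages so the watermark and per-cpu batch math is based on what is actually freeable:

        /* 2MB of unfreeable ZONE_DMA pages, assuming 4K pages (512 pages). */
        set_dma_reserve(2UL << (20 - PAGE_SHIFT));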
2897 #ifndef CONFIG_NEED_MULTIPLE_NODES 2897 #ifndef CONFIG_NEED_MULTIPLE_NODES
2898 static bootmem_data_t contig_bootmem_data; 2898 static bootmem_data_t contig_bootmem_data;
2899 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2899 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2900 2900
2901 EXPORT_SYMBOL(contig_page_data); 2901 EXPORT_SYMBOL(contig_page_data);
2902 #endif 2902 #endif
2903 2903
2904 void __init free_area_init(unsigned long *zones_size) 2904 void __init free_area_init(unsigned long *zones_size)
2905 { 2905 {
2906 free_area_init_node(0, NODE_DATA(0), zones_size, 2906 free_area_init_node(0, NODE_DATA(0), zones_size,
2907 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2907 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2908 } 2908 }
2909 2909
2910 #ifdef CONFIG_HOTPLUG_CPU 2910 #ifdef CONFIG_HOTPLUG_CPU
2911 static int page_alloc_cpu_notify(struct notifier_block *self, 2911 static int page_alloc_cpu_notify(struct notifier_block *self,
2912 unsigned long action, void *hcpu) 2912 unsigned long action, void *hcpu)
2913 { 2913 {
2914 int cpu = (unsigned long)hcpu; 2914 int cpu = (unsigned long)hcpu;
2915 2915
2916 if (action == CPU_DEAD) { 2916 if (action == CPU_DEAD) {
2917 local_irq_disable(); 2917 local_irq_disable();
2918 __drain_pages(cpu); 2918 __drain_pages(cpu);
2919 vm_events_fold_cpu(cpu); 2919 vm_events_fold_cpu(cpu);
2920 local_irq_enable(); 2920 local_irq_enable();
2921 refresh_cpu_vm_stats(cpu); 2921 refresh_cpu_vm_stats(cpu);
2922 } 2922 }
2923 return NOTIFY_OK; 2923 return NOTIFY_OK;
2924 } 2924 }
2925 #endif /* CONFIG_HOTPLUG_CPU */ 2925 #endif /* CONFIG_HOTPLUG_CPU */
2926 2926
2927 void __init page_alloc_init(void) 2927 void __init page_alloc_init(void)
2928 { 2928 {
2929 hotcpu_notifier(page_alloc_cpu_notify, 0); 2929 hotcpu_notifier(page_alloc_cpu_notify, 0);
2930 } 2930 }
2931 2931
2932 /* 2932 /*
2933 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 2933 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
2934 * or min_free_kbytes changes. 2934 * or min_free_kbytes changes.
2935 */ 2935 */
2936 static void calculate_totalreserve_pages(void) 2936 static void calculate_totalreserve_pages(void)
2937 { 2937 {
2938 struct pglist_data *pgdat; 2938 struct pglist_data *pgdat;
2939 unsigned long reserve_pages = 0; 2939 unsigned long reserve_pages = 0;
2940 enum zone_type i, j; 2940 enum zone_type i, j;
2941 2941
2942 for_each_online_pgdat(pgdat) { 2942 for_each_online_pgdat(pgdat) {
2943 for (i = 0; i < MAX_NR_ZONES; i++) { 2943 for (i = 0; i < MAX_NR_ZONES; i++) {
2944 struct zone *zone = pgdat->node_zones + i; 2944 struct zone *zone = pgdat->node_zones + i;
2945 unsigned long max = 0; 2945 unsigned long max = 0;
2946 2946
2947 /* Find valid and maximum lowmem_reserve in the zone */ 2947 /* Find valid and maximum lowmem_reserve in the zone */
2948 for (j = i; j < MAX_NR_ZONES; j++) { 2948 for (j = i; j < MAX_NR_ZONES; j++) {
2949 if (zone->lowmem_reserve[j] > max) 2949 if (zone->lowmem_reserve[j] > max)
2950 max = zone->lowmem_reserve[j]; 2950 max = zone->lowmem_reserve[j];
2951 } 2951 }
2952 2952
2953 /* we treat pages_high as reserved pages. */ 2953 /* we treat pages_high as reserved pages. */
2954 max += zone->pages_high; 2954 max += zone->pages_high;
2955 2955
2956 if (max > zone->present_pages) 2956 if (max > zone->present_pages)
2957 max = zone->present_pages; 2957 max = zone->present_pages;
2958 reserve_pages += max; 2958 reserve_pages += max;
2959 } 2959 }
2960 } 2960 }
2961 totalreserve_pages = reserve_pages; 2961 totalreserve_pages = reserve_pages;
2962 } 2962 }
2963 2963
2964 /* 2964 /*
2965 * setup_per_zone_lowmem_reserve - called whenever 2965 * setup_per_zone_lowmem_reserve - called whenever
2966 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 2966 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
2967 * has a correct lowmem_reserve[] value, so an adequate number of 2967 * has a correct lowmem_reserve[] value, so an adequate number of
2968 * pages are left in the zone after a successful __alloc_pages(). 2968 * pages are left in the zone after a successful __alloc_pages().
2969 */ 2969 */
2970 static void setup_per_zone_lowmem_reserve(void) 2970 static void setup_per_zone_lowmem_reserve(void)
2971 { 2971 {
2972 struct pglist_data *pgdat; 2972 struct pglist_data *pgdat;
2973 enum zone_type j, idx; 2973 enum zone_type j, idx;
2974 2974
2975 for_each_online_pgdat(pgdat) { 2975 for_each_online_pgdat(pgdat) {
2976 for (j = 0; j < MAX_NR_ZONES; j++) { 2976 for (j = 0; j < MAX_NR_ZONES; j++) {
2977 struct zone *zone = pgdat->node_zones + j; 2977 struct zone *zone = pgdat->node_zones + j;
2978 unsigned long present_pages = zone->present_pages; 2978 unsigned long present_pages = zone->present_pages;
2979 2979
2980 zone->lowmem_reserve[j] = 0; 2980 zone->lowmem_reserve[j] = 0;
2981 2981
2982 idx = j; 2982 idx = j;
2983 while (idx) { 2983 while (idx) {
2984 struct zone *lower_zone; 2984 struct zone *lower_zone;
2985 2985
2986 idx--; 2986 idx--;
2987 2987
2988 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2988 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2989 sysctl_lowmem_reserve_ratio[idx] = 1; 2989 sysctl_lowmem_reserve_ratio[idx] = 1;
2990 2990
2991 lower_zone = pgdat->node_zones + idx; 2991 lower_zone = pgdat->node_zones + idx;
2992 lower_zone->lowmem_reserve[j] = present_pages / 2992 lower_zone->lowmem_reserve[j] = present_pages /
2993 sysctl_lowmem_reserve_ratio[idx]; 2993 sysctl_lowmem_reserve_ratio[idx];
2994 present_pages += lower_zone->present_pages; 2994 present_pages += lower_zone->present_pages;
2995 } 2995 }
2996 } 2996 }
2997 } 2997 }
2998 2998
2999 /* update totalreserve_pages */ 2999 /* update totalreserve_pages */
3000 calculate_totalreserve_pages(); 3000 calculate_totalreserve_pages();
3001 } 3001 }
3002 3002
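A worked example of the loop above, using assumed zone sizes and the default reserve ratio of 256 for ZONE_DMA:

        /*
         * ZONE_DMA: 4096 present pages, ZONE_NORMAL: 225280 present pages.
         * When j == ZONE_NORMAL and idx has dropped to ZONE_DMA:
         *
         *     DMA->lowmem_reserve[ZONE_NORMAL] = 225280 / 256 = 880 pages
         *
         * so a ZONE_NORMAL-class allocation may only fall back into ZONE_DMA
         * while DMA keeps roughly 880 pages (~3.4MB with 4K pages) free above
         * its watermark for DMA-only users.
         */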
3003 /** 3003 /**
3004 * setup_per_zone_pages_min - called when min_free_kbytes changes. 3004 * setup_per_zone_pages_min - called when min_free_kbytes changes.
3005 * 3005 *
3006 * Ensures that the pages_{min,low,high} values for each zone are set correctly 3006 * Ensures that the pages_{min,low,high} values for each zone are set correctly
3007 * with respect to min_free_kbytes. 3007 * with respect to min_free_kbytes.
3008 */ 3008 */
3009 void setup_per_zone_pages_min(void) 3009 void setup_per_zone_pages_min(void)
3010 { 3010 {
3011 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 3011 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
3012 unsigned long lowmem_pages = 0; 3012 unsigned long lowmem_pages = 0;
3013 struct zone *zone; 3013 struct zone *zone;
3014 unsigned long flags; 3014 unsigned long flags;
3015 3015
3016 /* Calculate total number of !ZONE_HIGHMEM pages */ 3016 /* Calculate total number of !ZONE_HIGHMEM pages */
3017 for_each_zone(zone) { 3017 for_each_zone(zone) {
3018 if (!is_highmem(zone)) 3018 if (!is_highmem(zone))
3019 lowmem_pages += zone->present_pages; 3019 lowmem_pages += zone->present_pages;
3020 } 3020 }
3021 3021
3022 for_each_zone(zone) { 3022 for_each_zone(zone) {
3023 u64 tmp; 3023 u64 tmp;
3024 3024
3025 spin_lock_irqsave(&zone->lru_lock, flags); 3025 spin_lock_irqsave(&zone->lru_lock, flags);
3026 tmp = (u64)pages_min * zone->present_pages; 3026 tmp = (u64)pages_min * zone->present_pages;
3027 do_div(tmp, lowmem_pages); 3027 do_div(tmp, lowmem_pages);
3028 if (is_highmem(zone)) { 3028 if (is_highmem(zone)) {
3029 /* 3029 /*
3030 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 3030 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
3031 * need highmem pages, so cap pages_min to a small 3031 * need highmem pages, so cap pages_min to a small
3032 * value here. 3032 * value here.
3033 * 3033 *
3034 * The (pages_high-pages_low) and (pages_low-pages_min) 3034 * The (pages_high-pages_low) and (pages_low-pages_min)
3035 * deltas control async page reclaim, and so should 3035 * deltas control async page reclaim, and so should
3036 * not be capped for highmem. 3036 * not be capped for highmem.
3037 */ 3037 */
3038 int min_pages; 3038 int min_pages;
3039 3039
3040 min_pages = zone->present_pages / 1024; 3040 min_pages = zone->present_pages / 1024;
3041 if (min_pages < SWAP_CLUSTER_MAX) 3041 if (min_pages < SWAP_CLUSTER_MAX)
3042 min_pages = SWAP_CLUSTER_MAX; 3042 min_pages = SWAP_CLUSTER_MAX;
3043 if (min_pages > 128) 3043 if (min_pages > 128)
3044 min_pages = 128; 3044 min_pages = 128;
3045 zone->pages_min = min_pages; 3045 zone->pages_min = min_pages;
3046 } else { 3046 } else {
3047 /* 3047 /*
3048 * If it's a lowmem zone, reserve a number of pages 3048 * If it's a lowmem zone, reserve a number of pages
3049 * proportionate to the zone's size. 3049 * proportionate to the zone's size.
3050 */ 3050 */
3051 zone->pages_min = tmp; 3051 zone->pages_min = tmp;
3052 } 3052 }
3053 3053
3054 zone->pages_low = zone->pages_min + (tmp >> 2); 3054 zone->pages_low = zone->pages_min + (tmp >> 2);
3055 zone->pages_high = zone->pages_min + (tmp >> 1); 3055 zone->pages_high = zone->pages_min + (tmp >> 1);
3056 spin_unlock_irqrestore(&zone->lru_lock, flags); 3056 spin_unlock_irqrestore(&zone->lru_lock, flags);
3057 } 3057 }
3058 3058
3059 /* update totalreserve_pages */ 3059 /* update totalreserve_pages */
3060 calculate_totalreserve_pages(); 3060 calculate_totalreserve_pages();
3061 } 3061 }
3062 3062
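To make the watermark arithmetic concrete, a worked example with assumed values (min_free_kbytes = 4096, 4K pages, and a lowmem zone holding half of lowmem_pages):

        /*
         * pages_min (global) = 4096 >> (12 - 10) = 1024 pages
         * tmp for this zone  = 1024 * present / lowmem_pages = 512
         *
         *     zone->pages_min  = 512
         *     zone->pages_low  = 512 + 512/4 = 640
         *     zone->pages_high = 512 + 512/2 = 768
         *
         * A highmem zone ignores tmp for pages_min and clamps it to the
         * [SWAP_CLUSTER_MAX, 128] range instead, but still derives
         * pages_low/pages_high from tmp.
         */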
3063 /* 3063 /*
3064 * Initialise min_free_kbytes. 3064 * Initialise min_free_kbytes.
3065 * 3065 *
3066 * For small machines we want it small (128k min). For large machines 3066 * For small machines we want it small (128k min). For large machines
3067 * we want it large (64MB max). But it is not linear, because network 3067 * we want it large (64MB max). But it is not linear, because network
3068 * bandwidth does not increase linearly with machine size. We use 3068 * bandwidth does not increase linearly with machine size. We use
3069 * 3069 *
3070 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 3070 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
3071 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 3071 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
3072 * 3072 *
3073 * which yields 3073 * which yields
3074 * 3074 *
3075 * 16MB: 512k 3075 * 16MB: 512k
3076 * 32MB: 724k 3076 * 32MB: 724k
3077 * 64MB: 1024k 3077 * 64MB: 1024k
3078 * 128MB: 1448k 3078 * 128MB: 1448k
3079 * 256MB: 2048k 3079 * 256MB: 2048k
3080 * 512MB: 2896k 3080 * 512MB: 2896k
3081 * 1024MB: 4096k 3081 * 1024MB: 4096k
3082 * 2048MB: 5792k 3082 * 2048MB: 5792k
3083 * 4096MB: 8192k 3083 * 4096MB: 8192k
3084 * 8192MB: 11584k 3084 * 8192MB: 11584k
3085 * 16384MB: 16384k 3085 * 16384MB: 16384k
3086 */ 3086 */
3087 static int __init init_per_zone_pages_min(void) 3087 static int __init init_per_zone_pages_min(void)
3088 { 3088 {
3089 unsigned long lowmem_kbytes; 3089 unsigned long lowmem_kbytes;
3090 3090
3091 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 3091 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
3092 3092
3093 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 3093 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
3094 if (min_free_kbytes < 128) 3094 if (min_free_kbytes < 128)
3095 min_free_kbytes = 128; 3095 min_free_kbytes = 128;
3096 if (min_free_kbytes > 65536) 3096 if (min_free_kbytes > 65536)
3097 min_free_kbytes = 65536; 3097 min_free_kbytes = 65536;
3098 setup_per_zone_pages_min(); 3098 setup_per_zone_pages_min();
3099 setup_per_zone_lowmem_reserve(); 3099 setup_per_zone_lowmem_reserve();
3100 return 0; 3100 return 0;
3101 } 3101 }
3102 module_init(init_per_zone_pages_min) 3102 module_init(init_per_zone_pages_min)
3103 3103
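Two rows of the table above, checked against the int_sqrt() form used by init_per_zone_pages_min() (4K pages, lowmem only, values shown before the 128k/65536k clamp):

        /*
         *   16MB lowmem: lowmem_kbytes = 16384
         *                int_sqrt(16384 * 16)    = int_sqrt(262144)    = 512k
         *   16GB lowmem: lowmem_kbytes = 16777216
         *                int_sqrt(16777216 * 16) = int_sqrt(268435456) = 16384k
         */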
3104 /* 3104 /*
3105 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 3105 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
3106 * that we can call two helper functions whenever min_free_kbytes 3106 * that we can call two helper functions whenever min_free_kbytes
3107 * changes. 3107 * changes.
3108 */ 3108 */
3109 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 3109 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
3110 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3110 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3111 { 3111 {
3112 proc_dointvec(table, write, file, buffer, length, ppos); 3112 proc_dointvec(table, write, file, buffer, length, ppos);
3113 setup_per_zone_pages_min(); 3113 setup_per_zone_pages_min();
3114 return 0; 3114 return 0;
3115 } 3115 }
3116 3116
3117 #ifdef CONFIG_NUMA 3117 #ifdef CONFIG_NUMA
3118 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 3118 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
3119 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3119 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3120 { 3120 {
3121 struct zone *zone; 3121 struct zone *zone;
3122 int rc; 3122 int rc;
3123 3123
3124 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3124 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3125 if (rc) 3125 if (rc)
3126 return rc; 3126 return rc;
3127 3127
3128 for_each_zone(zone) 3128 for_each_zone(zone)
3129 zone->min_unmapped_pages = (zone->present_pages * 3129 zone->min_unmapped_pages = (zone->present_pages *
3130 sysctl_min_unmapped_ratio) / 100; 3130 sysctl_min_unmapped_ratio) / 100;
3131 return 0; 3131 return 0;
3132 } 3132 }
3133 3133
3134 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 3134 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
3135 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3135 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3136 { 3136 {
3137 struct zone *zone; 3137 struct zone *zone;
3138 int rc; 3138 int rc;
3139 3139
3140 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3140 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3141 if (rc) 3141 if (rc)
3142 return rc; 3142 return rc;
3143 3143
3144 for_each_zone(zone) 3144 for_each_zone(zone)
3145 zone->min_slab_pages = (zone->present_pages * 3145 zone->min_slab_pages = (zone->present_pages *
3146 sysctl_min_slab_ratio) / 100; 3146 sysctl_min_slab_ratio) / 100;
3147 return 0; 3147 return 0;
3148 } 3148 }
3149 #endif 3149 #endif
3150 3150
3151 /* 3151 /*
3152 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 3152 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
3153 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 3153 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
3154 * whenever sysctl_lowmem_reserve_ratio changes. 3154 * whenever sysctl_lowmem_reserve_ratio changes.
3155 * 3155 *
3156 * The reserve ratio has no relation to the 3156 * The reserve ratio has no relation to the
3157 * pages_min watermarks. The lowmem reserve ratio is only meaningful 3157 * pages_min watermarks. The lowmem reserve ratio is only meaningful
3158 * as a function of the boot-time zone sizes. 3158 * as a function of the boot-time zone sizes.
3159 */ 3159 */
3160 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 3160 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
3161 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3161 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3162 { 3162 {
3163 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3163 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3164 setup_per_zone_lowmem_reserve(); 3164 setup_per_zone_lowmem_reserve();
3165 return 0; 3165 return 0;
3166 } 3166 }
3167 3167
3168 /* 3168 /*
3169 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 3169 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
3170 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 3170 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
3171 * can hold before it gets flushed back to the buddy allocator. 3171 * can hold before it gets flushed back to the buddy allocator.
3172 */ 3172 */
3173 3173
3174 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 3174 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
3175 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3175 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3176 { 3176 {
3177 struct zone *zone; 3177 struct zone *zone;
3178 unsigned int cpu; 3178 unsigned int cpu;
3179 int ret; 3179 int ret;
3180 3180
3181 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3181 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3182 if (!write || (ret == -EINVAL)) 3182 if (!write || (ret == -EINVAL))
3183 return ret; 3183 return ret;
3184 for_each_zone(zone) { 3184 for_each_zone(zone) {
3185 for_each_online_cpu(cpu) { 3185 for_each_online_cpu(cpu) {
3186 unsigned long high; 3186 unsigned long high;
3187 high = zone->present_pages / percpu_pagelist_fraction; 3187 high = zone->present_pages / percpu_pagelist_fraction;
3188 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 3188 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
3189 } 3189 }
3190 } 3190 }
3191 return 0; 3191 return 0;
3192 } 3192 }
3193 3193
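A worked example of the handler above, with assumed sizes: writing 8 to /proc/sys/vm/percpu_pagelist_fraction on a machine whose ZONE_NORMAL has 262144 present pages (1GB of 4K pages) gives each CPU's hot pagelist for that zone a high mark of:

        /* high = zone->present_pages / percpu_pagelist_fraction
         *      = 262144 / 8 = 32768 pages per CPU, applied through
         *        setup_pagelist_highmark(zone_pcp(zone, cpu), high)  */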
3194 int hashdist = HASHDIST_DEFAULT; 3194 int hashdist = HASHDIST_DEFAULT;
3195 3195
3196 #ifdef CONFIG_NUMA 3196 #ifdef CONFIG_NUMA
3197 static int __init set_hashdist(char *str) 3197 static int __init set_hashdist(char *str)
3198 { 3198 {
3199 if (!str) 3199 if (!str)
3200 return 0; 3200 return 0;
3201 hashdist = simple_strtoul(str, &str, 0); 3201 hashdist = simple_strtoul(str, &str, 0);
3202 return 1; 3202 return 1;
3203 } 3203 }
3204 __setup("hashdist=", set_hashdist); 3204 __setup("hashdist=", set_hashdist);
3205 #endif 3205 #endif
3206 3206
3207 /* 3207 /*
3208 * allocate a large system hash table from bootmem 3208 * allocate a large system hash table from bootmem
3209 * - it is assumed that the hash table must contain an exact power-of-2 3209 * - it is assumed that the hash table must contain an exact power-of-2
3210 * quantity of entries 3210 * quantity of entries
3211 * - limit is the number of hash buckets, not the total allocation size 3211 * - limit is the number of hash buckets, not the total allocation size
3212 */ 3212 */
3213 void *__init alloc_large_system_hash(const char *tablename, 3213 void *__init alloc_large_system_hash(const char *tablename,
3214 unsigned long bucketsize, 3214 unsigned long bucketsize,
3215 unsigned long numentries, 3215 unsigned long numentries,
3216 int scale, 3216 int scale,
3217 int flags, 3217 int flags,
3218 unsigned int *_hash_shift, 3218 unsigned int *_hash_shift,
3219 unsigned int *_hash_mask, 3219 unsigned int *_hash_mask,
3220 unsigned long limit) 3220 unsigned long limit)
3221 { 3221 {
3222 unsigned long long max = limit; 3222 unsigned long long max = limit;
3223 unsigned long log2qty, size; 3223 unsigned long log2qty, size;
3224 void *table = NULL; 3224 void *table = NULL;
3225 3225
3226 /* allow the kernel cmdline to have a say */ 3226 /* allow the kernel cmdline to have a say */
3227 if (!numentries) { 3227 if (!numentries) {
3228 /* round applicable memory size up to nearest megabyte */ 3228 /* round applicable memory size up to nearest megabyte */
3229 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 3229 numentries = nr_kernel_pages;
3230 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3230 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3231 numentries >>= 20 - PAGE_SHIFT; 3231 numentries >>= 20 - PAGE_SHIFT;
3232 numentries <<= 20 - PAGE_SHIFT; 3232 numentries <<= 20 - PAGE_SHIFT;
3233 3233
3234 /* limit to 1 bucket per 2^scale bytes of low memory */ 3234 /* limit to 1 bucket per 2^scale bytes of low memory */
3235 if (scale > PAGE_SHIFT) 3235 if (scale > PAGE_SHIFT)
3236 numentries >>= (scale - PAGE_SHIFT); 3236 numentries >>= (scale - PAGE_SHIFT);
3237 else 3237 else
3238 numentries <<= (PAGE_SHIFT - scale); 3238 numentries <<= (PAGE_SHIFT - scale);
3239 } 3239 }
3240 numentries = roundup_pow_of_two(numentries); 3240 numentries = roundup_pow_of_two(numentries);
3241 3241
3242 /* limit allocation size to 1/16 total memory by default */ 3242 /* limit allocation size to 1/16 total memory by default */
3243 if (max == 0) { 3243 if (max == 0) {
3244 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 3244 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
3245 do_div(max, bucketsize); 3245 do_div(max, bucketsize);
3246 } 3246 }
3247 3247
3248 if (numentries > max) 3248 if (numentries > max)
3249 numentries = max; 3249 numentries = max;
3250 3250
3251 log2qty = long_log2(numentries); 3251 log2qty = long_log2(numentries);
3252 3252
3253 do { 3253 do {
3254 size = bucketsize << log2qty; 3254 size = bucketsize << log2qty;
3255 if (flags & HASH_EARLY) 3255 if (flags & HASH_EARLY)
3256 table = alloc_bootmem(size); 3256 table = alloc_bootmem(size);
3257 else if (hashdist) 3257 else if (hashdist)
3258 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 3258 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
3259 else { 3259 else {
3260 unsigned long order; 3260 unsigned long order;
3261 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 3261 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
3262 ; 3262 ;
3263 table = (void*) __get_free_pages(GFP_ATOMIC, order); 3263 table = (void*) __get_free_pages(GFP_ATOMIC, order);
3264 } 3264 }
3265 } while (!table && size > PAGE_SIZE && --log2qty); 3265 } while (!table && size > PAGE_SIZE && --log2qty);
3266 3266
3267 if (!table) 3267 if (!table)
3268 panic("Failed to allocate %s hash table\n", tablename); 3268 panic("Failed to allocate %s hash table\n", tablename);
3269 3269
3270 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3270 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3271 tablename, 3271 tablename,
3272 (1U << log2qty), 3272 (1U << log2qty),
3273 long_log2(size) - PAGE_SHIFT, 3273 long_log2(size) - PAGE_SHIFT,
3274 size); 3274 size);
3275 3275
3276 if (_hash_shift) 3276 if (_hash_shift)
3277 *_hash_shift = log2qty; 3277 *_hash_shift = log2qty;
3278 if (_hash_mask) 3278 if (_hash_mask)
3279 *_hash_mask = (1 << log2qty) - 1; 3279 *_hash_mask = (1 << log2qty) - 1;
3280 3280
3281 return table; 3281 return table;
3282 } 3282 }
3283 3283
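For scale, this is roughly how fs/dcache.c of this era sizes the dentry hash with alloc_large_system_hash() (shown only as a sketch; dhash_entries, d_hash_shift and d_hash_mask are dcache's own variables). A scale of 13 limits the table to about one bucket per 8KB of low memory, and a limit of 0 falls back to the 1/16-of-memory cap above:

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_head),
                                        dhash_entries,  /* 0 unless dhash_entries= given */
                                        13,             /* 1 bucket per 2^13 bytes of lowmem */
                                        HASH_EARLY,
                                        &d_hash_shift,
                                        &d_hash_mask,
                                        0);             /* no explicit bucket limit */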
3284 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 3284 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
3285 struct page *pfn_to_page(unsigned long pfn) 3285 struct page *pfn_to_page(unsigned long pfn)
3286 { 3286 {
3287 return __pfn_to_page(pfn); 3287 return __pfn_to_page(pfn);
3288 } 3288 }
3289 unsigned long page_to_pfn(struct page *page) 3289 unsigned long page_to_pfn(struct page *page)
3290 { 3290 {
3291 return __page_to_pfn(page); 3291 return __page_to_pfn(page);
3292 } 3292 }
3293 EXPORT_SYMBOL(pfn_to_page); 3293 EXPORT_SYMBOL(pfn_to_page);
3294 EXPORT_SYMBOL(page_to_pfn); 3294 EXPORT_SYMBOL(page_to_pfn);
3295 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3295 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3296 3296
3297 #if MAX_NUMNODES > 1 3297 #if MAX_NUMNODES > 1
3298 /* 3298 /*
3299 * Find the highest possible node id. 3299 * Find the highest possible node id.
3300 */ 3300 */
3301 int highest_possible_node_id(void) 3301 int highest_possible_node_id(void)
3302 { 3302 {
3303 unsigned int node; 3303 unsigned int node;
3304 unsigned int highest = 0; 3304 unsigned int highest = 0;
3305 3305
3306 for_each_node_mask(node, node_possible_map) 3306 for_each_node_mask(node, node_possible_map)
3307 highest = node; 3307 highest = node;
3308 return highest; 3308 return highest;
3309 } 3309 }
3310 EXPORT_SYMBOL(highest_possible_node_id); 3310 EXPORT_SYMBOL(highest_possible_node_id);
3311 #endif 3311 #endif
3312 3312