Commit 208d54e5513c0c02d85af0990901354c74364d5c

Authored by Dave Hansen
Committed by Linus Torvalds
1 parent c6a57e19e4

[PATCH] memory hotplug locking: node_size_lock

pgdat->node_size_lock is basically only needed in one place in the normal
code: show_mem(), which is the arch-specific sysrq-m printing function.
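
The pgdat_resize_lock()/pgdat_resize_unlock() helpers used throughout the diff
below are defined in the generic memory-hotplug header, which is not shown in
this excerpt.  Presumably they are thin interrupt-safe wrappers around the new
pgdat->node_size_lock, roughly along these lines (a sketch, not the actual
definitions):

	/* Assumed shape of the helpers; the real definitions live in the
	 * generic header, alongside the node_size_lock spinlock that this
	 * patch series adds to struct pglist_data. */
	static inline void pgdat_resize_lock(struct pglist_data *pgdat,
					     unsigned long *flags)
	{
		spin_lock_irqsave(&pgdat->node_size_lock, *flags);
	}

	static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
					       unsigned long *flags)
	{
		spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
	}

The irqsave variant matters because show_mem() can be invoked from sysrq, i.e.
from interrupt context.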

Strictly speaking, the architectures not doing memory hotplug do not need this
locking in show_mem().  However, the locking is added to all of them for
completeness.  This should also make any future consolidation of the
implementations a little more straightforward.

This lock is also held in the sparsemem code during a memory removal, as
sections are invalidated.  This is the place where pfn_valid() is made false
for a memory area that is being removed.  The lock is only required when doing
pfn_valid() operations on memory for which the caller does not already hold a
reference on the page, such as in show_mem().

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 9 changed files with 76 additions and 2 deletions

arch/alpha/mm/numa.c
1 /* 1 /*
2 * linux/arch/alpha/mm/numa.c 2 * linux/arch/alpha/mm/numa.c
3 * 3 *
4 * DISCONTIGMEM NUMA alpha support. 4 * DISCONTIGMEM NUMA alpha support.
5 * 5 *
6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
7 */ 7 */
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/types.h> 10 #include <linux/types.h>
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/bootmem.h> 13 #include <linux/bootmem.h>
14 #include <linux/swap.h> 14 #include <linux/swap.h>
15 #include <linux/initrd.h> 15 #include <linux/initrd.h>
16 16
17 #include <asm/hwrpb.h> 17 #include <asm/hwrpb.h>
18 #include <asm/pgalloc.h> 18 #include <asm/pgalloc.h>
19 19
20 pg_data_t node_data[MAX_NUMNODES]; 20 pg_data_t node_data[MAX_NUMNODES];
21 bootmem_data_t node_bdata[MAX_NUMNODES]; 21 bootmem_data_t node_bdata[MAX_NUMNODES];
22 22
23 #undef DEBUG_DISCONTIG 23 #undef DEBUG_DISCONTIG
24 #ifdef DEBUG_DISCONTIG 24 #ifdef DEBUG_DISCONTIG
25 #define DBGDCONT(args...) printk(args) 25 #define DBGDCONT(args...) printk(args)
26 #else 26 #else
27 #define DBGDCONT(args...) 27 #define DBGDCONT(args...)
28 #endif 28 #endif
29 29
30 #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) 30 #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
31 #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) 31 #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
32 #define PFN_PHYS(x) ((x) << PAGE_SHIFT) 32 #define PFN_PHYS(x) ((x) << PAGE_SHIFT)
33 #define for_each_mem_cluster(memdesc, cluster, i) \ 33 #define for_each_mem_cluster(memdesc, cluster, i) \
34 for ((cluster) = (memdesc)->cluster, (i) = 0; \ 34 for ((cluster) = (memdesc)->cluster, (i) = 0; \
35 (i) < (memdesc)->numclusters; (i)++, (cluster)++) 35 (i) < (memdesc)->numclusters; (i)++, (cluster)++)
36 36
37 static void __init show_mem_layout(void) 37 static void __init show_mem_layout(void)
38 { 38 {
39 struct memclust_struct * cluster; 39 struct memclust_struct * cluster;
40 struct memdesc_struct * memdesc; 40 struct memdesc_struct * memdesc;
41 int i; 41 int i;
42 42
43 /* Find free clusters, and init and free the bootmem accordingly. */ 43 /* Find free clusters, and init and free the bootmem accordingly. */
44 memdesc = (struct memdesc_struct *) 44 memdesc = (struct memdesc_struct *)
45 (hwrpb->mddt_offset + (unsigned long) hwrpb); 45 (hwrpb->mddt_offset + (unsigned long) hwrpb);
46 46
47 printk("Raw memory layout:\n"); 47 printk("Raw memory layout:\n");
48 for_each_mem_cluster(memdesc, cluster, i) { 48 for_each_mem_cluster(memdesc, cluster, i) {
49 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", 49 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
50 i, cluster->usage, cluster->start_pfn, 50 i, cluster->usage, cluster->start_pfn,
51 cluster->start_pfn + cluster->numpages); 51 cluster->start_pfn + cluster->numpages);
52 } 52 }
53 } 53 }
54 54
55 static void __init 55 static void __init
56 setup_memory_node(int nid, void *kernel_end) 56 setup_memory_node(int nid, void *kernel_end)
57 { 57 {
58 extern unsigned long mem_size_limit; 58 extern unsigned long mem_size_limit;
59 struct memclust_struct * cluster; 59 struct memclust_struct * cluster;
60 struct memdesc_struct * memdesc; 60 struct memdesc_struct * memdesc;
61 unsigned long start_kernel_pfn, end_kernel_pfn; 61 unsigned long start_kernel_pfn, end_kernel_pfn;
62 unsigned long bootmap_size, bootmap_pages, bootmap_start; 62 unsigned long bootmap_size, bootmap_pages, bootmap_start;
63 unsigned long start, end; 63 unsigned long start, end;
64 unsigned long node_pfn_start, node_pfn_end; 64 unsigned long node_pfn_start, node_pfn_end;
65 unsigned long node_min_pfn, node_max_pfn; 65 unsigned long node_min_pfn, node_max_pfn;
66 int i; 66 int i;
67 unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); 67 unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
68 int show_init = 0; 68 int show_init = 0;
69 69
70 /* Find the bounds of current node */ 70 /* Find the bounds of current node */
71 node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT; 71 node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT;
72 node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT); 72 node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT);
73 73
74 /* Find free clusters, and init and free the bootmem accordingly. */ 74 /* Find free clusters, and init and free the bootmem accordingly. */
75 memdesc = (struct memdesc_struct *) 75 memdesc = (struct memdesc_struct *)
76 (hwrpb->mddt_offset + (unsigned long) hwrpb); 76 (hwrpb->mddt_offset + (unsigned long) hwrpb);
77 77
78 /* find the bounds of this node (node_min_pfn/node_max_pfn) */ 78 /* find the bounds of this node (node_min_pfn/node_max_pfn) */
79 node_min_pfn = ~0UL; 79 node_min_pfn = ~0UL;
80 node_max_pfn = 0UL; 80 node_max_pfn = 0UL;
81 for_each_mem_cluster(memdesc, cluster, i) { 81 for_each_mem_cluster(memdesc, cluster, i) {
82 /* Bit 0 is console/PALcode reserved. Bit 1 is 82 /* Bit 0 is console/PALcode reserved. Bit 1 is
83 non-volatile memory -- we might want to mark 83 non-volatile memory -- we might want to mark
84 this for later. */ 84 this for later. */
85 if (cluster->usage & 3) 85 if (cluster->usage & 3)
86 continue; 86 continue;
87 87
88 start = cluster->start_pfn; 88 start = cluster->start_pfn;
89 end = start + cluster->numpages; 89 end = start + cluster->numpages;
90 90
91 if (start >= node_pfn_end || end <= node_pfn_start) 91 if (start >= node_pfn_end || end <= node_pfn_start)
92 continue; 92 continue;
93 93
94 if (!show_init) { 94 if (!show_init) {
95 show_init = 1; 95 show_init = 1;
96 printk("Initializing bootmem allocator on Node ID %d\n", nid); 96 printk("Initializing bootmem allocator on Node ID %d\n", nid);
97 } 97 }
98 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", 98 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
99 i, cluster->usage, cluster->start_pfn, 99 i, cluster->usage, cluster->start_pfn,
100 cluster->start_pfn + cluster->numpages); 100 cluster->start_pfn + cluster->numpages);
101 101
102 if (start < node_pfn_start) 102 if (start < node_pfn_start)
103 start = node_pfn_start; 103 start = node_pfn_start;
104 if (end > node_pfn_end) 104 if (end > node_pfn_end)
105 end = node_pfn_end; 105 end = node_pfn_end;
106 106
107 if (start < node_min_pfn) 107 if (start < node_min_pfn)
108 node_min_pfn = start; 108 node_min_pfn = start;
109 if (end > node_max_pfn) 109 if (end > node_max_pfn)
110 node_max_pfn = end; 110 node_max_pfn = end;
111 } 111 }
112 112
113 if (mem_size_limit && node_max_pfn > mem_size_limit) { 113 if (mem_size_limit && node_max_pfn > mem_size_limit) {
114 static int msg_shown = 0; 114 static int msg_shown = 0;
115 if (!msg_shown) { 115 if (!msg_shown) {
116 msg_shown = 1; 116 msg_shown = 1;
117 printk("setup: forcing memory size to %ldK (from %ldK).\n", 117 printk("setup: forcing memory size to %ldK (from %ldK).\n",
118 mem_size_limit << (PAGE_SHIFT - 10), 118 mem_size_limit << (PAGE_SHIFT - 10),
119 node_max_pfn << (PAGE_SHIFT - 10)); 119 node_max_pfn << (PAGE_SHIFT - 10));
120 } 120 }
121 node_max_pfn = mem_size_limit; 121 node_max_pfn = mem_size_limit;
122 } 122 }
123 123
124 if (node_min_pfn >= node_max_pfn) 124 if (node_min_pfn >= node_max_pfn)
125 return; 125 return;
126 126
127 /* Update global {min,max}_low_pfn from node information. */ 127 /* Update global {min,max}_low_pfn from node information. */
128 if (node_min_pfn < min_low_pfn) 128 if (node_min_pfn < min_low_pfn)
129 min_low_pfn = node_min_pfn; 129 min_low_pfn = node_min_pfn;
130 if (node_max_pfn > max_low_pfn) 130 if (node_max_pfn > max_low_pfn)
131 max_pfn = max_low_pfn = node_max_pfn; 131 max_pfn = max_low_pfn = node_max_pfn;
132 132
133 num_physpages += node_max_pfn - node_min_pfn; 133 num_physpages += node_max_pfn - node_min_pfn;
134 134
135 #if 0 /* we'll try this one again in a little while */ 135 #if 0 /* we'll try this one again in a little while */
136 /* Cute trick to make sure our local node data is on local memory */ 136 /* Cute trick to make sure our local node data is on local memory */
137 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); 137 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
138 #endif 138 #endif
139 /* Quasi-mark the pg_data_t as in-use */ 139 /* Quasi-mark the pg_data_t as in-use */
140 node_min_pfn += node_datasz; 140 node_min_pfn += node_datasz;
141 if (node_min_pfn >= node_max_pfn) { 141 if (node_min_pfn >= node_max_pfn) {
142 printk(" not enough mem to reserve NODE_DATA"); 142 printk(" not enough mem to reserve NODE_DATA");
143 return; 143 return;
144 } 144 }
145 NODE_DATA(nid)->bdata = &node_bdata[nid]; 145 NODE_DATA(nid)->bdata = &node_bdata[nid];
146 146
147 printk(" Detected node memory: start %8lu, end %8lu\n", 147 printk(" Detected node memory: start %8lu, end %8lu\n",
148 node_min_pfn, node_max_pfn); 148 node_min_pfn, node_max_pfn);
149 149
150 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); 150 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
151 DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); 151 DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
152 152
153 /* Find the bounds of kernel memory. */ 153 /* Find the bounds of kernel memory. */
154 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); 154 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
155 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); 155 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
156 bootmap_start = -1; 156 bootmap_start = -1;
157 157
158 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) 158 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
159 panic("kernel loaded out of ram"); 159 panic("kernel loaded out of ram");
160 160
161 /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. 161 /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
162 Note that we round this down, not up - node memory 162 Note that we round this down, not up - node memory
163 has much larger alignment than 8Mb, so it's safe. */ 163 has much larger alignment than 8Mb, so it's safe. */
164 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); 164 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
165 165
166 /* We need to know how many physically contiguous pages 166 /* We need to know how many physically contiguous pages
167 we'll need for the bootmap. */ 167 we'll need for the bootmap. */
168 bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); 168 bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
169 169
170 /* Now find a good region where to allocate the bootmap. */ 170 /* Now find a good region where to allocate the bootmap. */
171 for_each_mem_cluster(memdesc, cluster, i) { 171 for_each_mem_cluster(memdesc, cluster, i) {
172 if (cluster->usage & 3) 172 if (cluster->usage & 3)
173 continue; 173 continue;
174 174
175 start = cluster->start_pfn; 175 start = cluster->start_pfn;
176 end = start + cluster->numpages; 176 end = start + cluster->numpages;
177 177
178 if (start >= node_max_pfn || end <= node_min_pfn) 178 if (start >= node_max_pfn || end <= node_min_pfn)
179 continue; 179 continue;
180 180
181 if (end > node_max_pfn) 181 if (end > node_max_pfn)
182 end = node_max_pfn; 182 end = node_max_pfn;
183 if (start < node_min_pfn) 183 if (start < node_min_pfn)
184 start = node_min_pfn; 184 start = node_min_pfn;
185 185
186 if (start < start_kernel_pfn) { 186 if (start < start_kernel_pfn) {
187 if (end > end_kernel_pfn 187 if (end > end_kernel_pfn
188 && end - end_kernel_pfn >= bootmap_pages) { 188 && end - end_kernel_pfn >= bootmap_pages) {
189 bootmap_start = end_kernel_pfn; 189 bootmap_start = end_kernel_pfn;
190 break; 190 break;
191 } else if (end > start_kernel_pfn) 191 } else if (end > start_kernel_pfn)
192 end = start_kernel_pfn; 192 end = start_kernel_pfn;
193 } else if (start < end_kernel_pfn) 193 } else if (start < end_kernel_pfn)
194 start = end_kernel_pfn; 194 start = end_kernel_pfn;
195 if (end - start >= bootmap_pages) { 195 if (end - start >= bootmap_pages) {
196 bootmap_start = start; 196 bootmap_start = start;
197 break; 197 break;
198 } 198 }
199 } 199 }
200 200
201 if (bootmap_start == -1) 201 if (bootmap_start == -1)
202 panic("couldn't find a contigous place for the bootmap"); 202 panic("couldn't find a contigous place for the bootmap");
203 203
204 /* Allocate the bootmap and mark the whole MM as reserved. */ 204 /* Allocate the bootmap and mark the whole MM as reserved. */
205 bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, 205 bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
206 node_min_pfn, node_max_pfn); 206 node_min_pfn, node_max_pfn);
207 DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", 207 DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
208 bootmap_start, bootmap_size, bootmap_pages); 208 bootmap_start, bootmap_size, bootmap_pages);
209 209
210 /* Mark the free regions. */ 210 /* Mark the free regions. */
211 for_each_mem_cluster(memdesc, cluster, i) { 211 for_each_mem_cluster(memdesc, cluster, i) {
212 if (cluster->usage & 3) 212 if (cluster->usage & 3)
213 continue; 213 continue;
214 214
215 start = cluster->start_pfn; 215 start = cluster->start_pfn;
216 end = cluster->start_pfn + cluster->numpages; 216 end = cluster->start_pfn + cluster->numpages;
217 217
218 if (start >= node_max_pfn || end <= node_min_pfn) 218 if (start >= node_max_pfn || end <= node_min_pfn)
219 continue; 219 continue;
220 220
221 if (end > node_max_pfn) 221 if (end > node_max_pfn)
222 end = node_max_pfn; 222 end = node_max_pfn;
223 if (start < node_min_pfn) 223 if (start < node_min_pfn)
224 start = node_min_pfn; 224 start = node_min_pfn;
225 225
226 if (start < start_kernel_pfn) { 226 if (start < start_kernel_pfn) {
227 if (end > end_kernel_pfn) { 227 if (end > end_kernel_pfn) {
228 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), 228 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
229 (PFN_PHYS(start_kernel_pfn) 229 (PFN_PHYS(start_kernel_pfn)
230 - PFN_PHYS(start))); 230 - PFN_PHYS(start)));
231 printk(" freeing pages %ld:%ld\n", 231 printk(" freeing pages %ld:%ld\n",
232 start, start_kernel_pfn); 232 start, start_kernel_pfn);
233 start = end_kernel_pfn; 233 start = end_kernel_pfn;
234 } else if (end > start_kernel_pfn) 234 } else if (end > start_kernel_pfn)
235 end = start_kernel_pfn; 235 end = start_kernel_pfn;
236 } else if (start < end_kernel_pfn) 236 } else if (start < end_kernel_pfn)
237 start = end_kernel_pfn; 237 start = end_kernel_pfn;
238 if (start >= end) 238 if (start >= end)
239 continue; 239 continue;
240 240
241 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); 241 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
242 printk(" freeing pages %ld:%ld\n", start, end); 242 printk(" freeing pages %ld:%ld\n", start, end);
243 } 243 }
244 244
245 /* Reserve the bootmap memory. */ 245 /* Reserve the bootmap memory. */
246 reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), bootmap_size); 246 reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), bootmap_size);
247 printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); 247 printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
248 248
249 node_set_online(nid); 249 node_set_online(nid);
250 } 250 }
251 251
252 void __init 252 void __init
253 setup_memory(void *kernel_end) 253 setup_memory(void *kernel_end)
254 { 254 {
255 int nid; 255 int nid;
256 256
257 show_mem_layout(); 257 show_mem_layout();
258 258
259 nodes_clear(node_online_map); 259 nodes_clear(node_online_map);
260 260
261 min_low_pfn = ~0UL; 261 min_low_pfn = ~0UL;
262 max_low_pfn = 0UL; 262 max_low_pfn = 0UL;
263 for (nid = 0; nid < MAX_NUMNODES; nid++) 263 for (nid = 0; nid < MAX_NUMNODES; nid++)
264 setup_memory_node(nid, kernel_end); 264 setup_memory_node(nid, kernel_end);
265 265
266 #ifdef CONFIG_BLK_DEV_INITRD 266 #ifdef CONFIG_BLK_DEV_INITRD
267 initrd_start = INITRD_START; 267 initrd_start = INITRD_START;
268 if (initrd_start) { 268 if (initrd_start) {
269 extern void *move_initrd(unsigned long); 269 extern void *move_initrd(unsigned long);
270 270
271 initrd_end = initrd_start+INITRD_SIZE; 271 initrd_end = initrd_start+INITRD_SIZE;
272 printk("Initial ramdisk at: 0x%p (%lu bytes)\n", 272 printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
273 (void *) initrd_start, INITRD_SIZE); 273 (void *) initrd_start, INITRD_SIZE);
274 274
275 if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) { 275 if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
276 if (!move_initrd(PFN_PHYS(max_low_pfn))) 276 if (!move_initrd(PFN_PHYS(max_low_pfn)))
277 printk("initrd extends beyond end of memory " 277 printk("initrd extends beyond end of memory "
278 "(0x%08lx > 0x%p)\ndisabling initrd\n", 278 "(0x%08lx > 0x%p)\ndisabling initrd\n",
279 initrd_end, 279 initrd_end,
280 phys_to_virt(PFN_PHYS(max_low_pfn))); 280 phys_to_virt(PFN_PHYS(max_low_pfn)));
281 } else { 281 } else {
282 nid = kvaddr_to_nid(initrd_start); 282 nid = kvaddr_to_nid(initrd_start);
283 reserve_bootmem_node(NODE_DATA(nid), 283 reserve_bootmem_node(NODE_DATA(nid),
284 virt_to_phys((void *)initrd_start), 284 virt_to_phys((void *)initrd_start),
285 INITRD_SIZE); 285 INITRD_SIZE);
286 } 286 }
287 } 287 }
288 #endif /* CONFIG_BLK_DEV_INITRD */ 288 #endif /* CONFIG_BLK_DEV_INITRD */
289 } 289 }
290 290
291 void __init paging_init(void) 291 void __init paging_init(void)
292 { 292 {
293 unsigned int nid; 293 unsigned int nid;
294 unsigned long zones_size[MAX_NR_ZONES] = {0, }; 294 unsigned long zones_size[MAX_NR_ZONES] = {0, };
295 unsigned long dma_local_pfn; 295 unsigned long dma_local_pfn;
296 296
297 /* 297 /*
298 * The old global MAX_DMA_ADDRESS per-arch API doesn't fit 298 * The old global MAX_DMA_ADDRESS per-arch API doesn't fit
299 * in the NUMA model, for now we convert it to a pfn and 299 * in the NUMA model, for now we convert it to a pfn and
300 * we interpret this pfn as a local per-node information. 300 * we interpret this pfn as a local per-node information.
301 * This issue isn't very important since none of these machines 301 * This issue isn't very important since none of these machines
302 * have legacy ISA slots anyways. 302 * have legacy ISA slots anyways.
303 */ 303 */
304 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 304 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
305 305
306 for_each_online_node(nid) { 306 for_each_online_node(nid) {
307 unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT; 307 unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT;
308 unsigned long end_pfn = node_bdata[nid].node_low_pfn; 308 unsigned long end_pfn = node_bdata[nid].node_low_pfn;
309 309
310 if (dma_local_pfn >= end_pfn - start_pfn) 310 if (dma_local_pfn >= end_pfn - start_pfn)
311 zones_size[ZONE_DMA] = end_pfn - start_pfn; 311 zones_size[ZONE_DMA] = end_pfn - start_pfn;
312 else { 312 else {
313 zones_size[ZONE_DMA] = dma_local_pfn; 313 zones_size[ZONE_DMA] = dma_local_pfn;
314 zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; 314 zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
315 } 315 }
316 free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL); 316 free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL);
317 } 317 }
318 318
319 /* Initialize the kernel's ZERO_PGE. */ 319 /* Initialize the kernel's ZERO_PGE. */
320 memset((void *)ZERO_PGE, 0, PAGE_SIZE); 320 memset((void *)ZERO_PGE, 0, PAGE_SIZE);
321 } 321 }
322 322
323 void __init mem_init(void) 323 void __init mem_init(void)
324 { 324 {
325 unsigned long codesize, reservedpages, datasize, initsize, pfn; 325 unsigned long codesize, reservedpages, datasize, initsize, pfn;
326 extern int page_is_ram(unsigned long) __init; 326 extern int page_is_ram(unsigned long) __init;
327 extern char _text, _etext, _data, _edata; 327 extern char _text, _etext, _data, _edata;
328 extern char __init_begin, __init_end; 328 extern char __init_begin, __init_end;
329 unsigned long nid, i; 329 unsigned long nid, i;
330 high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); 330 high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
331 331
332 reservedpages = 0; 332 reservedpages = 0;
333 for_each_online_node(nid) { 333 for_each_online_node(nid) {
334 /* 334 /*
335 * This will free up the bootmem, ie, slot 0 memory 335 * This will free up the bootmem, ie, slot 0 memory
336 */ 336 */
337 totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); 337 totalram_pages += free_all_bootmem_node(NODE_DATA(nid));
338 338
339 pfn = NODE_DATA(nid)->node_start_pfn; 339 pfn = NODE_DATA(nid)->node_start_pfn;
340 for (i = 0; i < node_spanned_pages(nid); i++, pfn++) 340 for (i = 0; i < node_spanned_pages(nid); i++, pfn++)
341 if (page_is_ram(pfn) && 341 if (page_is_ram(pfn) &&
342 PageReserved(nid_page_nr(nid, i))) 342 PageReserved(nid_page_nr(nid, i)))
343 reservedpages++; 343 reservedpages++;
344 } 344 }
345 345
346 codesize = (unsigned long) &_etext - (unsigned long) &_text; 346 codesize = (unsigned long) &_etext - (unsigned long) &_text;
347 datasize = (unsigned long) &_edata - (unsigned long) &_data; 347 datasize = (unsigned long) &_edata - (unsigned long) &_data;
348 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 348 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
349 349
350 printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, " 350 printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, "
351 "%luk data, %luk init)\n", 351 "%luk data, %luk init)\n",
352 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), 352 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
353 num_physpages << (PAGE_SHIFT-10), 353 num_physpages << (PAGE_SHIFT-10),
354 codesize >> 10, 354 codesize >> 10,
355 reservedpages << (PAGE_SHIFT-10), 355 reservedpages << (PAGE_SHIFT-10),
356 datasize >> 10, 356 datasize >> 10,
357 initsize >> 10); 357 initsize >> 10);
358 #if 0 358 #if 0
359 mem_stress(); 359 mem_stress();
360 #endif 360 #endif
361 } 361 }
362 362
363 void 363 void
364 show_mem(void) 364 show_mem(void)
365 { 365 {
366 long i,free = 0,total = 0,reserved = 0; 366 long i,free = 0,total = 0,reserved = 0;
367 long shared = 0, cached = 0; 367 long shared = 0, cached = 0;
368 int nid; 368 int nid;
369 369
370 printk("\nMem-info:\n"); 370 printk("\nMem-info:\n");
371 show_free_areas(); 371 show_free_areas();
372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
373 for_each_online_node(nid) { 373 for_each_online_node(nid) {
374 unsigned long flags;
375 pgdat_resize_lock(NODE_DATA(nid), &flags);
374 i = node_spanned_pages(nid); 376 i = node_spanned_pages(nid);
375 while (i-- > 0) { 377 while (i-- > 0) {
376 struct page *page = nid_page_nr(nid, i); 378 struct page *page = nid_page_nr(nid, i);
377 total++; 379 total++;
378 if (PageReserved(page)) 380 if (PageReserved(page))
379 reserved++; 381 reserved++;
380 else if (PageSwapCache(page)) 382 else if (PageSwapCache(page))
381 cached++; 383 cached++;
382 else if (!page_count(page)) 384 else if (!page_count(page))
383 free++; 385 free++;
384 else 386 else
385 shared += page_count(page) - 1; 387 shared += page_count(page) - 1;
386 } 388 }
389 pgdat_resize_unlock(NODE_DATA(nid), &flags);
387 } 390 }
388 printk("%ld pages of RAM\n",total); 391 printk("%ld pages of RAM\n",total);
389 printk("%ld free pages\n",free); 392 printk("%ld free pages\n",free);
390 printk("%ld reserved pages\n",reserved); 393 printk("%ld reserved pages\n",reserved);
391 printk("%ld pages shared\n",shared); 394 printk("%ld pages shared\n",shared);
392 printk("%ld pages swap cached\n",cached); 395 printk("%ld pages swap cached\n",cached);
393 } 396 }
394 397
arch/i386/mm/pgtable.c
1 /* 1 /*
2 * linux/arch/i386/mm/pgtable.c 2 * linux/arch/i386/mm/pgtable.c
3 */ 3 */
4 4
5 #include <linux/config.h> 5 #include <linux/config.h>
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/kernel.h> 7 #include <linux/kernel.h>
8 #include <linux/errno.h> 8 #include <linux/errno.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/smp.h> 11 #include <linux/smp.h>
12 #include <linux/highmem.h> 12 #include <linux/highmem.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/spinlock.h> 15 #include <linux/spinlock.h>
16 16
17 #include <asm/system.h> 17 #include <asm/system.h>
18 #include <asm/pgtable.h> 18 #include <asm/pgtable.h>
19 #include <asm/pgalloc.h> 19 #include <asm/pgalloc.h>
20 #include <asm/fixmap.h> 20 #include <asm/fixmap.h>
21 #include <asm/e820.h> 21 #include <asm/e820.h>
22 #include <asm/tlb.h> 22 #include <asm/tlb.h>
23 #include <asm/tlbflush.h> 23 #include <asm/tlbflush.h>
24 24
25 void show_mem(void) 25 void show_mem(void)
26 { 26 {
27 int total = 0, reserved = 0; 27 int total = 0, reserved = 0;
28 int shared = 0, cached = 0; 28 int shared = 0, cached = 0;
29 int highmem = 0; 29 int highmem = 0;
30 struct page *page; 30 struct page *page;
31 pg_data_t *pgdat; 31 pg_data_t *pgdat;
32 unsigned long i; 32 unsigned long i;
33 struct page_state ps; 33 struct page_state ps;
34 unsigned long flags;
34 35
35 printk(KERN_INFO "Mem-info:\n"); 36 printk(KERN_INFO "Mem-info:\n");
36 show_free_areas(); 37 show_free_areas();
37 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
38 for_each_pgdat(pgdat) { 39 for_each_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
39 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
40 page = pgdat_page_nr(pgdat, i); 42 page = pgdat_page_nr(pgdat, i);
41 total++; 43 total++;
42 if (PageHighMem(page)) 44 if (PageHighMem(page))
43 highmem++; 45 highmem++;
44 if (PageReserved(page)) 46 if (PageReserved(page))
45 reserved++; 47 reserved++;
46 else if (PageSwapCache(page)) 48 else if (PageSwapCache(page))
47 cached++; 49 cached++;
48 else if (page_count(page)) 50 else if (page_count(page))
49 shared += page_count(page) - 1; 51 shared += page_count(page) - 1;
50 } 52 }
53 pgdat_resize_unlock(pgdat, &flags);
51 } 54 }
52 printk(KERN_INFO "%d pages of RAM\n", total); 55 printk(KERN_INFO "%d pages of RAM\n", total);
53 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); 56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
54 printk(KERN_INFO "%d reserved pages\n", reserved); 57 printk(KERN_INFO "%d reserved pages\n", reserved);
55 printk(KERN_INFO "%d pages shared\n", shared); 58 printk(KERN_INFO "%d pages shared\n", shared);
56 printk(KERN_INFO "%d pages swap cached\n", cached); 59 printk(KERN_INFO "%d pages swap cached\n", cached);
57 60
58 get_page_state(&ps); 61 get_page_state(&ps);
59 printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); 62 printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
60 printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); 63 printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
61 printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); 64 printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
62 printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); 65 printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
63 printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); 66 printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
64 } 67 }
65 68
66 /* 69 /*
67 * Associate a virtual page frame with a given physical page frame 70 * Associate a virtual page frame with a given physical page frame
68 * and protection flags for that frame. 71 * and protection flags for that frame.
69 */ 72 */
70 static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 73 static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
71 { 74 {
72 pgd_t *pgd; 75 pgd_t *pgd;
73 pud_t *pud; 76 pud_t *pud;
74 pmd_t *pmd; 77 pmd_t *pmd;
75 pte_t *pte; 78 pte_t *pte;
76 79
77 pgd = swapper_pg_dir + pgd_index(vaddr); 80 pgd = swapper_pg_dir + pgd_index(vaddr);
78 if (pgd_none(*pgd)) { 81 if (pgd_none(*pgd)) {
79 BUG(); 82 BUG();
80 return; 83 return;
81 } 84 }
82 pud = pud_offset(pgd, vaddr); 85 pud = pud_offset(pgd, vaddr);
83 if (pud_none(*pud)) { 86 if (pud_none(*pud)) {
84 BUG(); 87 BUG();
85 return; 88 return;
86 } 89 }
87 pmd = pmd_offset(pud, vaddr); 90 pmd = pmd_offset(pud, vaddr);
88 if (pmd_none(*pmd)) { 91 if (pmd_none(*pmd)) {
89 BUG(); 92 BUG();
90 return; 93 return;
91 } 94 }
92 pte = pte_offset_kernel(pmd, vaddr); 95 pte = pte_offset_kernel(pmd, vaddr);
93 /* <pfn,flags> stored as-is, to permit clearing entries */ 96 /* <pfn,flags> stored as-is, to permit clearing entries */
94 set_pte(pte, pfn_pte(pfn, flags)); 97 set_pte(pte, pfn_pte(pfn, flags));
95 98
96 /* 99 /*
97 * It's enough to flush this one mapping. 100 * It's enough to flush this one mapping.
98 * (PGE mappings get flushed as well) 101 * (PGE mappings get flushed as well)
99 */ 102 */
100 __flush_tlb_one(vaddr); 103 __flush_tlb_one(vaddr);
101 } 104 }
102 105
103 /* 106 /*
104 * Associate a large virtual page frame with a given physical page frame 107 * Associate a large virtual page frame with a given physical page frame
105 * and protection flags for that frame. pfn is for the base of the page, 108 * and protection flags for that frame. pfn is for the base of the page,
106 * vaddr is what the page gets mapped to - both must be properly aligned. 109 * vaddr is what the page gets mapped to - both must be properly aligned.
107 * The pmd must already be instantiated. Assumes PAE mode. 110 * The pmd must already be instantiated. Assumes PAE mode.
108 */ 111 */
109 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 112 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
110 { 113 {
111 pgd_t *pgd; 114 pgd_t *pgd;
112 pud_t *pud; 115 pud_t *pud;
113 pmd_t *pmd; 116 pmd_t *pmd;
114 117
115 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ 118 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
116 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); 119 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
117 return; /* BUG(); */ 120 return; /* BUG(); */
118 } 121 }
119 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ 122 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
120 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); 123 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
121 return; /* BUG(); */ 124 return; /* BUG(); */
122 } 125 }
123 pgd = swapper_pg_dir + pgd_index(vaddr); 126 pgd = swapper_pg_dir + pgd_index(vaddr);
124 if (pgd_none(*pgd)) { 127 if (pgd_none(*pgd)) {
125 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); 128 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
126 return; /* BUG(); */ 129 return; /* BUG(); */
127 } 130 }
128 pud = pud_offset(pgd, vaddr); 131 pud = pud_offset(pgd, vaddr);
129 pmd = pmd_offset(pud, vaddr); 132 pmd = pmd_offset(pud, vaddr);
130 set_pmd(pmd, pfn_pmd(pfn, flags)); 133 set_pmd(pmd, pfn_pmd(pfn, flags));
131 /* 134 /*
132 * It's enough to flush this one mapping. 135 * It's enough to flush this one mapping.
133 * (PGE mappings get flushed as well) 136 * (PGE mappings get flushed as well)
134 */ 137 */
135 __flush_tlb_one(vaddr); 138 __flush_tlb_one(vaddr);
136 } 139 }
137 140
138 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) 141 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
139 { 142 {
140 unsigned long address = __fix_to_virt(idx); 143 unsigned long address = __fix_to_virt(idx);
141 144
142 if (idx >= __end_of_fixed_addresses) { 145 if (idx >= __end_of_fixed_addresses) {
143 BUG(); 146 BUG();
144 return; 147 return;
145 } 148 }
146 set_pte_pfn(address, phys >> PAGE_SHIFT, flags); 149 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
147 } 150 }
148 151
149 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 152 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
150 { 153 {
151 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 154 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
152 } 155 }
153 156
154 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) 157 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
155 { 158 {
156 struct page *pte; 159 struct page *pte;
157 160
158 #ifdef CONFIG_HIGHPTE 161 #ifdef CONFIG_HIGHPTE
159 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 162 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
160 #else 163 #else
161 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 164 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
162 #endif 165 #endif
163 return pte; 166 return pte;
164 } 167 }
165 168
166 void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) 169 void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
167 { 170 {
168 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); 171 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
169 } 172 }
170 173
171 /* 174 /*
172 * List of all pgd's needed for non-PAE so it can invalidate entries 175 * List of all pgd's needed for non-PAE so it can invalidate entries
173 * in both cached and uncached pgd's; not needed for PAE since the 176 * in both cached and uncached pgd's; not needed for PAE since the
174 * kernel pmd is shared. If PAE were not to share the pmd a similar 177 * kernel pmd is shared. If PAE were not to share the pmd a similar
175 * tactic would be needed. This is essentially codepath-based locking 178 * tactic would be needed. This is essentially codepath-based locking
176 * against pageattr.c; it is the unique case in which a valid change 179 * against pageattr.c; it is the unique case in which a valid change
177 * of kernel pagetables can't be lazily synchronized by vmalloc faults. 180 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
178 * vmalloc faults work because attached pagetables are never freed. 181 * vmalloc faults work because attached pagetables are never freed.
179 * The locking scheme was chosen on the basis of manfred's 182 * The locking scheme was chosen on the basis of manfred's
180 * recommendations and having no core impact whatsoever. 183 * recommendations and having no core impact whatsoever.
181 * -- wli 184 * -- wli
182 */ 185 */
183 DEFINE_SPINLOCK(pgd_lock); 186 DEFINE_SPINLOCK(pgd_lock);
184 struct page *pgd_list; 187 struct page *pgd_list;
185 188
186 static inline void pgd_list_add(pgd_t *pgd) 189 static inline void pgd_list_add(pgd_t *pgd)
187 { 190 {
188 struct page *page = virt_to_page(pgd); 191 struct page *page = virt_to_page(pgd);
189 page->index = (unsigned long)pgd_list; 192 page->index = (unsigned long)pgd_list;
190 if (pgd_list) 193 if (pgd_list)
191 set_page_private(pgd_list, (unsigned long)&page->index); 194 set_page_private(pgd_list, (unsigned long)&page->index);
192 pgd_list = page; 195 pgd_list = page;
193 set_page_private(page, (unsigned long)&pgd_list); 196 set_page_private(page, (unsigned long)&pgd_list);
194 } 197 }
195 198
196 static inline void pgd_list_del(pgd_t *pgd) 199 static inline void pgd_list_del(pgd_t *pgd)
197 { 200 {
198 struct page *next, **pprev, *page = virt_to_page(pgd); 201 struct page *next, **pprev, *page = virt_to_page(pgd);
199 next = (struct page *)page->index; 202 next = (struct page *)page->index;
200 pprev = (struct page **)page_private(page); 203 pprev = (struct page **)page_private(page);
201 *pprev = next; 204 *pprev = next;
202 if (next) 205 if (next)
203 set_page_private(next, (unsigned long)pprev); 206 set_page_private(next, (unsigned long)pprev);
204 } 207 }
205 208
206 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) 209 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
207 { 210 {
208 unsigned long flags; 211 unsigned long flags;
209 212
210 if (PTRS_PER_PMD == 1) { 213 if (PTRS_PER_PMD == 1) {
211 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 214 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
212 spin_lock_irqsave(&pgd_lock, flags); 215 spin_lock_irqsave(&pgd_lock, flags);
213 } 216 }
214 217
215 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 218 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
216 swapper_pg_dir + USER_PTRS_PER_PGD, 219 swapper_pg_dir + USER_PTRS_PER_PGD,
217 KERNEL_PGD_PTRS); 220 KERNEL_PGD_PTRS);
218 if (PTRS_PER_PMD > 1) 221 if (PTRS_PER_PMD > 1)
219 return; 222 return;
220 223
221 pgd_list_add(pgd); 224 pgd_list_add(pgd);
222 spin_unlock_irqrestore(&pgd_lock, flags); 225 spin_unlock_irqrestore(&pgd_lock, flags);
223 } 226 }
224 227
225 /* never called when PTRS_PER_PMD > 1 */ 228 /* never called when PTRS_PER_PMD > 1 */
226 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) 229 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
227 { 230 {
228 unsigned long flags; /* can be called from interrupt context */ 231 unsigned long flags; /* can be called from interrupt context */
229 232
230 spin_lock_irqsave(&pgd_lock, flags); 233 spin_lock_irqsave(&pgd_lock, flags);
231 pgd_list_del(pgd); 234 pgd_list_del(pgd);
232 spin_unlock_irqrestore(&pgd_lock, flags); 235 spin_unlock_irqrestore(&pgd_lock, flags);
233 } 236 }
234 237
235 pgd_t *pgd_alloc(struct mm_struct *mm) 238 pgd_t *pgd_alloc(struct mm_struct *mm)
236 { 239 {
237 int i; 240 int i;
238 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); 241 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
239 242
240 if (PTRS_PER_PMD == 1 || !pgd) 243 if (PTRS_PER_PMD == 1 || !pgd)
241 return pgd; 244 return pgd;
242 245
243 for (i = 0; i < USER_PTRS_PER_PGD; ++i) { 246 for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
244 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); 247 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
245 if (!pmd) 248 if (!pmd)
246 goto out_oom; 249 goto out_oom;
247 set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); 250 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
248 } 251 }
249 return pgd; 252 return pgd;
250 253
251 out_oom: 254 out_oom:
252 for (i--; i >= 0; i--) 255 for (i--; i >= 0; i--)
253 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 256 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
254 kmem_cache_free(pgd_cache, pgd); 257 kmem_cache_free(pgd_cache, pgd);
255 return NULL; 258 return NULL;
256 } 259 }
257 260
258 void pgd_free(pgd_t *pgd) 261 void pgd_free(pgd_t *pgd)
259 { 262 {
260 int i; 263 int i;
261 264
262 /* in the PAE case user pgd entries are overwritten before usage */ 265 /* in the PAE case user pgd entries are overwritten before usage */
263 if (PTRS_PER_PMD > 1) 266 if (PTRS_PER_PMD > 1)
264 for (i = 0; i < USER_PTRS_PER_PGD; ++i) 267 for (i = 0; i < USER_PTRS_PER_PGD; ++i)
265 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 268 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
266 /* in the non-PAE case, free_pgtables() clears user pgd entries */ 269 /* in the non-PAE case, free_pgtables() clears user pgd entries */
267 kmem_cache_free(pgd_cache, pgd); 270 kmem_cache_free(pgd_cache, pgd);
268 } 271 }
269 272
arch/ia64/mm/discontig.c
1 /* 1 /*
2 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved. 2 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
3 * Copyright (c) 2001 Intel Corp. 3 * Copyright (c) 2001 Intel Corp.
4 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com> 4 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
5 * Copyright (c) 2002 NEC Corp. 5 * Copyright (c) 2002 NEC Corp.
6 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com> 6 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
7 * Copyright (c) 2004 Silicon Graphics, Inc 7 * Copyright (c) 2004 Silicon Graphics, Inc
8 * Russ Anderson <rja@sgi.com> 8 * Russ Anderson <rja@sgi.com>
9 * Jesse Barnes <jbarnes@sgi.com> 9 * Jesse Barnes <jbarnes@sgi.com>
10 * Jack Steiner <steiner@sgi.com> 10 * Jack Steiner <steiner@sgi.com>
11 */ 11 */
12 12
13 /* 13 /*
14 * Platform initialization for Discontig Memory 14 * Platform initialization for Discontig Memory
15 */ 15 */
16 16
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/bootmem.h> 20 #include <linux/bootmem.h>
21 #include <linux/acpi.h> 21 #include <linux/acpi.h>
22 #include <linux/efi.h> 22 #include <linux/efi.h>
23 #include <linux/nodemask.h> 23 #include <linux/nodemask.h>
24 #include <asm/pgalloc.h> 24 #include <asm/pgalloc.h>
25 #include <asm/tlb.h> 25 #include <asm/tlb.h>
26 #include <asm/meminit.h> 26 #include <asm/meminit.h>
27 #include <asm/numa.h> 27 #include <asm/numa.h>
28 #include <asm/sections.h> 28 #include <asm/sections.h>
29 29
30 /* 30 /*
31 * Track per-node information needed to setup the boot memory allocator, the 31 * Track per-node information needed to setup the boot memory allocator, the
32 * per-node areas, and the real VM. 32 * per-node areas, and the real VM.
33 */ 33 */
34 struct early_node_data { 34 struct early_node_data {
35 struct ia64_node_data *node_data; 35 struct ia64_node_data *node_data;
36 pg_data_t *pgdat; 36 pg_data_t *pgdat;
37 unsigned long pernode_addr; 37 unsigned long pernode_addr;
38 unsigned long pernode_size; 38 unsigned long pernode_size;
39 struct bootmem_data bootmem_data; 39 struct bootmem_data bootmem_data;
40 unsigned long num_physpages; 40 unsigned long num_physpages;
41 unsigned long num_dma_physpages; 41 unsigned long num_dma_physpages;
42 unsigned long min_pfn; 42 unsigned long min_pfn;
43 unsigned long max_pfn; 43 unsigned long max_pfn;
44 }; 44 };
45 45
46 static struct early_node_data mem_data[MAX_NUMNODES] __initdata; 46 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
47 static nodemask_t memory_less_mask __initdata; 47 static nodemask_t memory_less_mask __initdata;
48 48
49 /* 49 /*
50 * To prevent cache aliasing effects, align per-node structures so that they 50 * To prevent cache aliasing effects, align per-node structures so that they
51 * start at addresses that are strided by node number. 51 * start at addresses that are strided by node number.
52 */ 52 */
53 #define NODEDATA_ALIGN(addr, node) \ 53 #define NODEDATA_ALIGN(addr, node) \
54 ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE) 54 ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
55 55
56 /** 56 /**
57 * build_node_maps - callback to setup bootmem structs for each node 57 * build_node_maps - callback to setup bootmem structs for each node
58 * @start: physical start of range 58 * @start: physical start of range
59 * @len: length of range 59 * @len: length of range
60 * @node: node where this range resides 60 * @node: node where this range resides
61 * 61 *
62 * We allocate a struct bootmem_data for each piece of memory that we wish to 62 * We allocate a struct bootmem_data for each piece of memory that we wish to
63 * treat as a virtually contiguous block (i.e. each node). Each such block 63 * treat as a virtually contiguous block (i.e. each node). Each such block
64 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down 64 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
65 * if necessary. Any non-existent pages will simply be part of the virtual 65 * if necessary. Any non-existent pages will simply be part of the virtual
66 * memmap. We also update min_low_pfn and max_low_pfn here as we receive 66 * memmap. We also update min_low_pfn and max_low_pfn here as we receive
67 * memory ranges from the caller. 67 * memory ranges from the caller.
68 */ 68 */
69 static int __init build_node_maps(unsigned long start, unsigned long len, 69 static int __init build_node_maps(unsigned long start, unsigned long len,
70 int node) 70 int node)
71 { 71 {
72 unsigned long cstart, epfn, end = start + len; 72 unsigned long cstart, epfn, end = start + len;
73 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 73 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
74 74
75 epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT; 75 epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
76 cstart = GRANULEROUNDDOWN(start); 76 cstart = GRANULEROUNDDOWN(start);
77 77
78 if (!bdp->node_low_pfn) { 78 if (!bdp->node_low_pfn) {
79 bdp->node_boot_start = cstart; 79 bdp->node_boot_start = cstart;
80 bdp->node_low_pfn = epfn; 80 bdp->node_low_pfn = epfn;
81 } else { 81 } else {
82 bdp->node_boot_start = min(cstart, bdp->node_boot_start); 82 bdp->node_boot_start = min(cstart, bdp->node_boot_start);
83 bdp->node_low_pfn = max(epfn, bdp->node_low_pfn); 83 bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
84 } 84 }
85 85
86 min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT); 86 min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
87 max_low_pfn = max(max_low_pfn, bdp->node_low_pfn); 87 max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
88 88
89 return 0; 89 return 0;
90 } 90 }
91 91
92 /** 92 /**
93 * early_nr_cpus_node - return number of cpus on a given node 93 * early_nr_cpus_node - return number of cpus on a given node
94 * @node: node to check 94 * @node: node to check
95 * 95 *
96 * Count the number of cpus on @node. We can't use nr_cpus_node() yet because 96 * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
97 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been 97 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
98 * called yet. Note that node 0 will also count all non-existent cpus. 98 * called yet. Note that node 0 will also count all non-existent cpus.
99 */ 99 */
100 static int __init early_nr_cpus_node(int node) 100 static int __init early_nr_cpus_node(int node)
101 { 101 {
102 int cpu, n = 0; 102 int cpu, n = 0;
103 103
104 for (cpu = 0; cpu < NR_CPUS; cpu++) 104 for (cpu = 0; cpu < NR_CPUS; cpu++)
105 if (node == node_cpuid[cpu].nid) 105 if (node == node_cpuid[cpu].nid)
106 n++; 106 n++;
107 107
108 return n; 108 return n;
109 } 109 }
110 110
111 /** 111 /**
112 * compute_pernodesize - compute size of pernode data 112 * compute_pernodesize - compute size of pernode data
113 * @node: the node id. 113 * @node: the node id.
114 */ 114 */
115 static unsigned long __init compute_pernodesize(int node) 115 static unsigned long __init compute_pernodesize(int node)
116 { 116 {
117 unsigned long pernodesize = 0, cpus; 117 unsigned long pernodesize = 0, cpus;
118 118
119 cpus = early_nr_cpus_node(node); 119 cpus = early_nr_cpus_node(node);
120 pernodesize += PERCPU_PAGE_SIZE * cpus; 120 pernodesize += PERCPU_PAGE_SIZE * cpus;
121 pernodesize += node * L1_CACHE_BYTES; 121 pernodesize += node * L1_CACHE_BYTES;
122 pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); 122 pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
123 pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); 123 pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
124 pernodesize = PAGE_ALIGN(pernodesize); 124 pernodesize = PAGE_ALIGN(pernodesize);
125 return pernodesize; 125 return pernodesize;
126 } 126 }
127 127
128 /** 128 /**
129 * per_cpu_node_setup - setup per-cpu areas on each node 129 * per_cpu_node_setup - setup per-cpu areas on each node
130 * @cpu_data: per-cpu area on this node 130 * @cpu_data: per-cpu area on this node
131 * @node: node to setup 131 * @node: node to setup
132 * 132 *
133 * Copy the static per-cpu data into the region we just set aside and then 133 * Copy the static per-cpu data into the region we just set aside and then
134 * setup __per_cpu_offset for each CPU on this node. Return a pointer to 134 * setup __per_cpu_offset for each CPU on this node. Return a pointer to
135 * the end of the area. 135 * the end of the area.
136 */ 136 */
137 static void *per_cpu_node_setup(void *cpu_data, int node) 137 static void *per_cpu_node_setup(void *cpu_data, int node)
138 { 138 {
139 #ifdef CONFIG_SMP 139 #ifdef CONFIG_SMP
140 int cpu; 140 int cpu;
141 141
142 for (cpu = 0; cpu < NR_CPUS; cpu++) { 142 for (cpu = 0; cpu < NR_CPUS; cpu++) {
143 if (node == node_cpuid[cpu].nid) { 143 if (node == node_cpuid[cpu].nid) {
144 memcpy(__va(cpu_data), __phys_per_cpu_start, 144 memcpy(__va(cpu_data), __phys_per_cpu_start,
145 __per_cpu_end - __per_cpu_start); 145 __per_cpu_end - __per_cpu_start);
146 __per_cpu_offset[cpu] = (char*)__va(cpu_data) - 146 __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
147 __per_cpu_start; 147 __per_cpu_start;
148 cpu_data += PERCPU_PAGE_SIZE; 148 cpu_data += PERCPU_PAGE_SIZE;
149 } 149 }
150 } 150 }
151 #endif 151 #endif
152 return cpu_data; 152 return cpu_data;
153 } 153 }
154 154
155 /** 155 /**
156 * fill_pernode - initialize pernode data. 156 * fill_pernode - initialize pernode data.
157 * @node: the node id. 157 * @node: the node id.
158 * @pernode: physical address of pernode data 158 * @pernode: physical address of pernode data
159 * @pernodesize: size of the pernode data 159 * @pernodesize: size of the pernode data
160 */ 160 */
161 static void __init fill_pernode(int node, unsigned long pernode, 161 static void __init fill_pernode(int node, unsigned long pernode,
162 unsigned long pernodesize) 162 unsigned long pernodesize)
163 { 163 {
164 void *cpu_data; 164 void *cpu_data;
165 int cpus = early_nr_cpus_node(node); 165 int cpus = early_nr_cpus_node(node);
166 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 166 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
167 167
168 mem_data[node].pernode_addr = pernode; 168 mem_data[node].pernode_addr = pernode;
169 mem_data[node].pernode_size = pernodesize; 169 mem_data[node].pernode_size = pernodesize;
170 memset(__va(pernode), 0, pernodesize); 170 memset(__va(pernode), 0, pernodesize);
171 171
172 cpu_data = (void *)pernode; 172 cpu_data = (void *)pernode;
173 pernode += PERCPU_PAGE_SIZE * cpus; 173 pernode += PERCPU_PAGE_SIZE * cpus;
174 pernode += node * L1_CACHE_BYTES; 174 pernode += node * L1_CACHE_BYTES;
175 175
176 mem_data[node].pgdat = __va(pernode); 176 mem_data[node].pgdat = __va(pernode);
177 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); 177 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
178 178
179 mem_data[node].node_data = __va(pernode); 179 mem_data[node].node_data = __va(pernode);
180 pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); 180 pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
181 181
182 mem_data[node].pgdat->bdata = bdp; 182 mem_data[node].pgdat->bdata = bdp;
183 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); 183 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
184 184
185 cpu_data = per_cpu_node_setup(cpu_data, node); 185 cpu_data = per_cpu_node_setup(cpu_data, node);
186 186
187 return; 187 return;
188 } 188 }
189 189
190 /** 190 /**
191 * find_pernode_space - allocate memory for memory map and per-node structures 191 * find_pernode_space - allocate memory for memory map and per-node structures
192 * @start: physical start of range 192 * @start: physical start of range
193 * @len: length of range 193 * @len: length of range
194 * @node: node where this range resides 194 * @node: node where this range resides
195 * 195 *
196 * This routine reserves space for the per-cpu data struct, the list of 196 * This routine reserves space for the per-cpu data struct, the list of
197 * pg_data_ts and the per-node data struct. Each node will have something like 197 * pg_data_ts and the per-node data struct. Each node will have something like
198 * the following in the first chunk of addr. space large enough to hold it. 198 * the following in the first chunk of addr. space large enough to hold it.
199 * 199 *
200 * ________________________ 200 * ________________________
201 * | | 201 * | |
202 * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first 202 * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
203 * | PERCPU_PAGE_SIZE * | start and length big enough 203 * | PERCPU_PAGE_SIZE * | start and length big enough
204 * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus. 204 * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus.
205 * |------------------------| 205 * |------------------------|
206 * | local pg_data_t * | 206 * | local pg_data_t * |
207 * |------------------------| 207 * |------------------------|
208 * | local ia64_node_data | 208 * | local ia64_node_data |
209 * |------------------------| 209 * |------------------------|
210 * | ??? | 210 * | ??? |
211 * |________________________| 211 * |________________________|
212 * 212 *
213 * Once this space has been set aside, the bootmem maps are initialized. We 213 * Once this space has been set aside, the bootmem maps are initialized. We
214 * could probably move the allocation of the per-cpu and ia64_node_data space 214 * could probably move the allocation of the per-cpu and ia64_node_data space
215 * outside of this function and use alloc_bootmem_node(), but doing it here 215 * outside of this function and use alloc_bootmem_node(), but doing it here
216 * is straightforward and we get the alignments we want so... 216 * is straightforward and we get the alignments we want so...
217 */ 217 */
218 static int __init find_pernode_space(unsigned long start, unsigned long len, 218 static int __init find_pernode_space(unsigned long start, unsigned long len,
219 int node) 219 int node)
220 { 220 {
221 unsigned long epfn; 221 unsigned long epfn;
222 unsigned long pernodesize = 0, pernode, pages, mapsize; 222 unsigned long pernodesize = 0, pernode, pages, mapsize;
223 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 223 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
224 224
225 epfn = (start + len) >> PAGE_SHIFT; 225 epfn = (start + len) >> PAGE_SHIFT;
226 226
227 pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT); 227 pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
228 mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; 228 mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
229 229
230 /* 230 /*
231 * Make sure this memory falls within this node's usable memory 231 * Make sure this memory falls within this node's usable memory
232 * since we may have thrown some away in build_maps(). 232 * since we may have thrown some away in build_maps().
233 */ 233 */
234 if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn) 234 if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
235 return 0; 235 return 0;
236 236
237 /* Don't setup this node's local space twice... */ 237 /* Don't setup this node's local space twice... */
238 if (mem_data[node].pernode_addr) 238 if (mem_data[node].pernode_addr)
239 return 0; 239 return 0;
240 240
241 /* 241 /*
242 * Calculate total size needed, incl. what's necessary 242 * Calculate total size needed, incl. what's necessary
243 * for good alignment and alias prevention. 243 * for good alignment and alias prevention.
244 */ 244 */
245 pernodesize = compute_pernodesize(node); 245 pernodesize = compute_pernodesize(node);
246 pernode = NODEDATA_ALIGN(start, node); 246 pernode = NODEDATA_ALIGN(start, node);
247 247
248 /* Is this range big enough for what we want to store here? */ 248 /* Is this range big enough for what we want to store here? */
249 if (start + len > (pernode + pernodesize + mapsize)) 249 if (start + len > (pernode + pernodesize + mapsize))
250 fill_pernode(node, pernode, pernodesize); 250 fill_pernode(node, pernode, pernodesize);
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 /** 255 /**
256 * free_node_bootmem - free bootmem allocator memory for use 256 * free_node_bootmem - free bootmem allocator memory for use
257 * @start: physical start of range 257 * @start: physical start of range
258 * @len: length of range 258 * @len: length of range
259 * @node: node where this range resides 259 * @node: node where this range resides
260 * 260 *
261 * Simply calls the bootmem allocator to free the specified range from 261 * Simply calls the bootmem allocator to free the specified range from
262 * the given pg_data_t's bdata struct. After this function has been called 262 * the given pg_data_t's bdata struct. After this function has been called
263 * for all the entries in the EFI memory map, the bootmem allocator will 263 * for all the entries in the EFI memory map, the bootmem allocator will
264 * be ready to service allocation requests. 264 * be ready to service allocation requests.
265 */ 265 */
266 static int __init free_node_bootmem(unsigned long start, unsigned long len, 266 static int __init free_node_bootmem(unsigned long start, unsigned long len,
267 int node) 267 int node)
268 { 268 {
269 free_bootmem_node(mem_data[node].pgdat, start, len); 269 free_bootmem_node(mem_data[node].pgdat, start, len);
270 270
271 return 0; 271 return 0;
272 } 272 }
273 273
274 /** 274 /**
275 * reserve_pernode_space - reserve memory for per-node space 275 * reserve_pernode_space - reserve memory for per-node space
276 * 276 *
277 * Reserve the space used by the bootmem maps & per-node space in the boot 277 * Reserve the space used by the bootmem maps & per-node space in the boot
278 * allocator so that when we actually create the real mem maps we don't 278 * allocator so that when we actually create the real mem maps we don't
279 * use their memory. 279 * use their memory.
280 */ 280 */
281 static void __init reserve_pernode_space(void) 281 static void __init reserve_pernode_space(void)
282 { 282 {
283 unsigned long base, size, pages; 283 unsigned long base, size, pages;
284 struct bootmem_data *bdp; 284 struct bootmem_data *bdp;
285 int node; 285 int node;
286 286
287 for_each_online_node(node) { 287 for_each_online_node(node) {
288 pg_data_t *pdp = mem_data[node].pgdat; 288 pg_data_t *pdp = mem_data[node].pgdat;
289 289
290 if (node_isset(node, memory_less_mask)) 290 if (node_isset(node, memory_less_mask))
291 continue; 291 continue;
292 292
293 bdp = pdp->bdata; 293 bdp = pdp->bdata;
294 294
295 /* First the bootmem_map itself */ 295 /* First the bootmem_map itself */
296 pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); 296 pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
297 size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; 297 size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
298 base = __pa(bdp->node_bootmem_map); 298 base = __pa(bdp->node_bootmem_map);
299 reserve_bootmem_node(pdp, base, size); 299 reserve_bootmem_node(pdp, base, size);
300 300
301 /* Now the per-node space */ 301 /* Now the per-node space */
302 size = mem_data[node].pernode_size; 302 size = mem_data[node].pernode_size;
303 base = __pa(mem_data[node].pernode_addr); 303 base = __pa(mem_data[node].pernode_addr);
304 reserve_bootmem_node(pdp, base, size); 304 reserve_bootmem_node(pdp, base, size);
305 } 305 }
306 } 306 }
307 307
308 /** 308 /**
309 * initialize_pernode_data - fixup per-cpu & per-node pointers 309 * initialize_pernode_data - fixup per-cpu & per-node pointers
310 * 310 *
311 * Each node's per-node area has a copy of the global pg_data_t list, so 311 * Each node's per-node area has a copy of the global pg_data_t list, so
312 * we copy that to each node here, as well as setting the per-cpu pointer 312 * we copy that to each node here, as well as setting the per-cpu pointer
313 * to the local node data structure. The active_cpus field of the per-node 313 * to the local node data structure. The active_cpus field of the per-node
314 * structure gets set up by the platform_cpu_init() function later. 314 * structure gets set up by the platform_cpu_init() function later.
315 */ 315 */
316 static void __init initialize_pernode_data(void) 316 static void __init initialize_pernode_data(void)
317 { 317 {
318 pg_data_t *pgdat_list[MAX_NUMNODES]; 318 pg_data_t *pgdat_list[MAX_NUMNODES];
319 int cpu, node; 319 int cpu, node;
320 320
321 for_each_online_node(node) 321 for_each_online_node(node)
322 pgdat_list[node] = mem_data[node].pgdat; 322 pgdat_list[node] = mem_data[node].pgdat;
323 323
324 /* Copy the pg_data_t list to each node and init the node field */ 324 /* Copy the pg_data_t list to each node and init the node field */
325 for_each_online_node(node) { 325 for_each_online_node(node) {
326 memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, 326 memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
327 sizeof(pgdat_list)); 327 sizeof(pgdat_list));
328 } 328 }
329 #ifdef CONFIG_SMP 329 #ifdef CONFIG_SMP
330 /* Set the node_data pointer for each per-cpu struct */ 330 /* Set the node_data pointer for each per-cpu struct */
331 for (cpu = 0; cpu < NR_CPUS; cpu++) { 331 for (cpu = 0; cpu < NR_CPUS; cpu++) {
332 node = node_cpuid[cpu].nid; 332 node = node_cpuid[cpu].nid;
333 per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; 333 per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
334 } 334 }
335 #else 335 #else
336 { 336 {
337 struct cpuinfo_ia64 *cpu0_cpu_info; 337 struct cpuinfo_ia64 *cpu0_cpu_info;
338 cpu = 0; 338 cpu = 0;
339 node = node_cpuid[cpu].nid; 339 node = node_cpuid[cpu].nid;
340 cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start + 340 cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
341 ((char *)&per_cpu__cpu_info - __per_cpu_start)); 341 ((char *)&per_cpu__cpu_info - __per_cpu_start));
342 cpu0_cpu_info->node_data = mem_data[node].node_data; 342 cpu0_cpu_info->node_data = mem_data[node].node_data;
343 } 343 }
344 #endif /* CONFIG_SMP */ 344 #endif /* CONFIG_SMP */
345 } 345 }
346 346
347 /** 347 /**
348 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT 348 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT
349 * node, but fall back to any other node when __alloc_bootmem_node() fails 349 * node, but fall back to any other node when __alloc_bootmem_node() fails
350 * for the best node. 350 * for the best node.
351 * @nid: node id 351 * @nid: node id
352 * @pernodesize: size of this node's pernode data 352 * @pernodesize: size of this node's pernode data
353 * @align: alignment to use for this node's pernode data 353 * @align: alignment to use for this node's pernode data
354 */ 354 */
355 static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, 355 static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
356 unsigned long align) 356 unsigned long align)
357 { 357 {
358 void *ptr = NULL; 358 void *ptr = NULL;
359 u8 best = 0xff; 359 u8 best = 0xff;
360 int bestnode = -1, node; 360 int bestnode = -1, node;
361 361
362 for_each_online_node(node) { 362 for_each_online_node(node) {
363 if (node_isset(node, memory_less_mask)) 363 if (node_isset(node, memory_less_mask))
364 continue; 364 continue;
365 else if (node_distance(nid, node) < best) { 365 else if (node_distance(nid, node) < best) {
366 best = node_distance(nid, node); 366 best = node_distance(nid, node);
367 bestnode = node; 367 bestnode = node;
368 } 368 }
369 } 369 }
370 370
371 ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, 371 ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
372 pernodesize, align, __pa(MAX_DMA_ADDRESS)); 372 pernodesize, align, __pa(MAX_DMA_ADDRESS));
373 373
374 if (!ptr) 374 if (!ptr)
375 panic("NO memory for memory less node\n"); 375 panic("NO memory for memory less node\n");
376 return ptr; 376 return ptr;
377 } 377 }
378 378
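memory_less_node_alloc() above boils down to one decision: among the online nodes that actually have memory, pick the one with the smallest SLIT distance to @nid and place the pernode area there. A minimal user-space sketch of that selection, with a made-up distance matrix and a has_memory[] array standing in for node_online_map/memory_less_mask (nothing below is kernel API):

    #include <stdio.h>

    #define NR_NODES 4

    /* Hypothetical SLIT-style distances; dist[i][i] is always the smallest. */
    static const unsigned char dist[NR_NODES][NR_NODES] = {
    	{ 10, 20, 40, 40 },
    	{ 20, 10, 40, 40 },
    	{ 40, 40, 10, 20 },
    	{ 40, 40, 20, 10 },
    };
    static const int has_memory[NR_NODES] = { 1, 1, 0, 1 };

    /* Mirror of the loop in memory_less_node_alloc(): nearest node with memory. */
    static int best_node_for(int nid)
    {
    	unsigned char best = 0xff;
    	int bestnode = -1, node;

    	for (node = 0; node < NR_NODES; node++) {
    		if (!has_memory[node])
    			continue;
    		if (dist[nid][node] < best) {
    			best = dist[nid][node];
    			bestnode = node;
    		}
    	}
    	return bestnode;
    }

    int main(void)
    {
    	printf("memoryless node 2 allocates from node %d\n", best_node_for(2));
    	return 0;
    }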
379 /** 379 /**
380 * pgdat_insert - insert the pgdat into global pgdat_list 380 * pgdat_insert - insert the pgdat into global pgdat_list
381 * @pgdat: the pgdat for a node. 381 * @pgdat: the pgdat for a node.
382 */ 382 */
383 static void __init pgdat_insert(pg_data_t *pgdat) 383 static void __init pgdat_insert(pg_data_t *pgdat)
384 { 384 {
385 pg_data_t *prev = NULL, *next; 385 pg_data_t *prev = NULL, *next;
386 386
387 for_each_pgdat(next) 387 for_each_pgdat(next)
388 if (pgdat->node_id < next->node_id) 388 if (pgdat->node_id < next->node_id)
389 break; 389 break;
390 else 390 else
391 prev = next; 391 prev = next;
392 392
393 if (prev) { 393 if (prev) {
394 prev->pgdat_next = pgdat; 394 prev->pgdat_next = pgdat;
395 pgdat->pgdat_next = next; 395 pgdat->pgdat_next = next;
396 } else { 396 } else {
397 pgdat->pgdat_next = pgdat_list; 397 pgdat->pgdat_next = pgdat_list;
398 pgdat_list = pgdat; 398 pgdat_list = pgdat;
399 } 399 }
400 400
401 return; 401 return;
402 } 402 }
403 403
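pgdat_insert() is a plain ordered insert into a singly linked list keyed on node_id, so memoryless nodes end up in pgdat_list in node order. A self-contained sketch of the same pattern using an ordinary struct in place of pg_data_t (illustrative only):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
    	int node_id;
    	struct node *next;
    };

    static struct node *node_list;	/* stand-in for the global pgdat_list */

    /* Insert so the list stays sorted by node_id, as pgdat_insert() does. */
    static void node_insert(struct node *n)
    {
    	struct node *prev = NULL, *next;

    	for (next = node_list; next; next = next->next) {
    		if (n->node_id < next->node_id)
    			break;
    		prev = next;
    	}

    	if (prev) {
    		prev->next = n;
    		n->next = next;
    	} else {
    		n->next = node_list;
    		node_list = n;
    	}
    }

    int main(void)
    {
    	int ids[] = { 2, 0, 3, 1 };
    	for (unsigned i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
    		struct node *n = calloc(1, sizeof(*n));
    		n->node_id = ids[i];
    		node_insert(n);
    	}
    	for (struct node *n = node_list; n; n = n->next)
    		printf("%d ", n->node_id);	/* prints: 0 1 2 3 */
    	printf("\n");
    	return 0;
    }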
404 /** 404 /**
405 * memory_less_nodes - allocate and initialize pernode data for CPU-only 405 * memory_less_nodes - allocate and initialize pernode data for CPU-only
406 * (memoryless) nodes. 406 * (memoryless) nodes.
407 */ 407 */
408 static void __init memory_less_nodes(void) 408 static void __init memory_less_nodes(void)
409 { 409 {
410 unsigned long pernodesize; 410 unsigned long pernodesize;
411 void *pernode; 411 void *pernode;
412 int node; 412 int node;
413 413
414 for_each_node_mask(node, memory_less_mask) { 414 for_each_node_mask(node, memory_less_mask) {
415 pernodesize = compute_pernodesize(node); 415 pernodesize = compute_pernodesize(node);
416 pernode = memory_less_node_alloc(node, pernodesize, 416 pernode = memory_less_node_alloc(node, pernodesize,
417 (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024)); 417 (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
418 fill_pernode(node, __pa(pernode), pernodesize); 418 fill_pernode(node, __pa(pernode), pernodesize);
419 } 419 }
420 420
421 return; 421 return;
422 } 422 }
423 423
424 #ifdef CONFIG_SPARSEMEM 424 #ifdef CONFIG_SPARSEMEM
425 /** 425 /**
426 * register_sparse_mem - notify SPARSEMEM that this memory range exists. 426 * register_sparse_mem - notify SPARSEMEM that this memory range exists.
427 * @start: physical start of range 427 * @start: physical start of range
428 * @end: physical end of range 428 * @end: physical end of range
429 * @arg: unused 429 * @arg: unused
430 * 430 *
431 * Simply calls SPARSEMEM to register memory section(s). 431 * Simply calls SPARSEMEM to register memory section(s).
432 */ 432 */
433 static int __init register_sparse_mem(unsigned long start, unsigned long end, 433 static int __init register_sparse_mem(unsigned long start, unsigned long end,
434 void *arg) 434 void *arg)
435 { 435 {
436 int nid; 436 int nid;
437 437
438 start = __pa(start) >> PAGE_SHIFT; 438 start = __pa(start) >> PAGE_SHIFT;
439 end = __pa(end) >> PAGE_SHIFT; 439 end = __pa(end) >> PAGE_SHIFT;
440 nid = early_pfn_to_nid(start); 440 nid = early_pfn_to_nid(start);
441 memory_present(nid, start, end); 441 memory_present(nid, start, end);
442 442
443 return 0; 443 return 0;
444 } 444 }
445 445
446 static void __init arch_sparse_init(void) 446 static void __init arch_sparse_init(void)
447 { 447 {
448 efi_memmap_walk(register_sparse_mem, NULL); 448 efi_memmap_walk(register_sparse_mem, NULL);
449 sparse_init(); 449 sparse_init();
450 } 450 }
451 #else 451 #else
452 #define arch_sparse_init() do {} while (0) 452 #define arch_sparse_init() do {} while (0)
453 #endif 453 #endif
454 454
455 /** 455 /**
456 * find_memory - walk the EFI memory map and setup the bootmem allocator 456 * find_memory - walk the EFI memory map and setup the bootmem allocator
457 * 457 *
458 * Called early in boot to set up the bootmem allocator, and to 458 * Called early in boot to set up the bootmem allocator, and to
459 * allocate the per-cpu and per-node structures. 459 * allocate the per-cpu and per-node structures.
460 */ 460 */
461 void __init find_memory(void) 461 void __init find_memory(void)
462 { 462 {
463 int node; 463 int node;
464 464
465 reserve_memory(); 465 reserve_memory();
466 466
467 if (num_online_nodes() == 0) { 467 if (num_online_nodes() == 0) {
468 printk(KERN_ERR "node info missing!\n"); 468 printk(KERN_ERR "node info missing!\n");
469 node_set_online(0); 469 node_set_online(0);
470 } 470 }
471 471
472 nodes_or(memory_less_mask, memory_less_mask, node_online_map); 472 nodes_or(memory_less_mask, memory_less_mask, node_online_map);
473 min_low_pfn = -1; 473 min_low_pfn = -1;
474 max_low_pfn = 0; 474 max_low_pfn = 0;
475 475
476 /* These actually end up getting called by call_pernode_memory() */ 476 /* These actually end up getting called by call_pernode_memory() */
477 efi_memmap_walk(filter_rsvd_memory, build_node_maps); 477 efi_memmap_walk(filter_rsvd_memory, build_node_maps);
478 efi_memmap_walk(filter_rsvd_memory, find_pernode_space); 478 efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
479 479
480 for_each_online_node(node) 480 for_each_online_node(node)
481 if (mem_data[node].bootmem_data.node_low_pfn) { 481 if (mem_data[node].bootmem_data.node_low_pfn) {
482 node_clear(node, memory_less_mask); 482 node_clear(node, memory_less_mask);
483 mem_data[node].min_pfn = ~0UL; 483 mem_data[node].min_pfn = ~0UL;
484 } 484 }
485 /* 485 /*
486 * Initialize the boot memory maps in reverse order since that's 486 * Initialize the boot memory maps in reverse order since that's
487 * what the bootmem allocator expects 487 * what the bootmem allocator expects
488 */ 488 */
489 for (node = MAX_NUMNODES - 1; node >= 0; node--) { 489 for (node = MAX_NUMNODES - 1; node >= 0; node--) {
490 unsigned long pernode, pernodesize, map; 490 unsigned long pernode, pernodesize, map;
491 struct bootmem_data *bdp; 491 struct bootmem_data *bdp;
492 492
493 if (!node_online(node)) 493 if (!node_online(node))
494 continue; 494 continue;
495 else if (node_isset(node, memory_less_mask)) 495 else if (node_isset(node, memory_less_mask))
496 continue; 496 continue;
497 497
498 bdp = &mem_data[node].bootmem_data; 498 bdp = &mem_data[node].bootmem_data;
499 pernode = mem_data[node].pernode_addr; 499 pernode = mem_data[node].pernode_addr;
500 pernodesize = mem_data[node].pernode_size; 500 pernodesize = mem_data[node].pernode_size;
501 map = pernode + pernodesize; 501 map = pernode + pernodesize;
502 502
503 init_bootmem_node(mem_data[node].pgdat, 503 init_bootmem_node(mem_data[node].pgdat,
504 map>>PAGE_SHIFT, 504 map>>PAGE_SHIFT,
505 bdp->node_boot_start>>PAGE_SHIFT, 505 bdp->node_boot_start>>PAGE_SHIFT,
506 bdp->node_low_pfn); 506 bdp->node_low_pfn);
507 } 507 }
508 508
509 efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); 509 efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
510 510
511 reserve_pernode_space(); 511 reserve_pernode_space();
512 memory_less_nodes(); 512 memory_less_nodes();
513 initialize_pernode_data(); 513 initialize_pernode_data();
514 514
515 max_pfn = max_low_pfn; 515 max_pfn = max_low_pfn;
516 516
517 find_initrd(); 517 find_initrd();
518 } 518 }
519 519
520 #ifdef CONFIG_SMP 520 #ifdef CONFIG_SMP
521 /** 521 /**
522 * per_cpu_init - setup per-cpu variables 522 * per_cpu_init - setup per-cpu variables
523 * 523 *
524 * find_pernode_space() does most of this already; we just need to set 524 * find_pernode_space() does most of this already; we just need to set
525 * local_per_cpu_offset 525 * local_per_cpu_offset
526 */ 526 */
527 void *per_cpu_init(void) 527 void *per_cpu_init(void)
528 { 528 {
529 int cpu; 529 int cpu;
530 530
531 if (smp_processor_id() != 0) 531 if (smp_processor_id() != 0)
532 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 532 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
533 533
534 for (cpu = 0; cpu < NR_CPUS; cpu++) 534 for (cpu = 0; cpu < NR_CPUS; cpu++)
535 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; 535 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
536 536
537 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 537 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
538 } 538 }
539 #endif /* CONFIG_SMP */ 539 #endif /* CONFIG_SMP */
540 540
541 /** 541 /**
542 * show_mem - give short summary of memory stats 542 * show_mem - give short summary of memory stats
543 * 543 *
544 * Shows a simple page count of reserved and used pages in the system. 544 * Shows a simple page count of reserved and used pages in the system.
545 * For discontig machines, it does this on a per-pgdat basis. 545 * For discontig machines, it does this on a per-pgdat basis.
546 */ 546 */
547 void show_mem(void) 547 void show_mem(void)
548 { 548 {
549 int i, total_reserved = 0; 549 int i, total_reserved = 0;
550 int total_shared = 0, total_cached = 0; 550 int total_shared = 0, total_cached = 0;
551 unsigned long total_present = 0; 551 unsigned long total_present = 0;
552 pg_data_t *pgdat; 552 pg_data_t *pgdat;
553 553
554 printk("Mem-info:\n"); 554 printk("Mem-info:\n");
555 show_free_areas(); 555 show_free_areas();
556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
557 for_each_pgdat(pgdat) { 557 for_each_pgdat(pgdat) {
558 unsigned long present = pgdat->node_present_pages; 558 unsigned long present;
559 unsigned long flags;
559 int shared = 0, cached = 0, reserved = 0; 560 int shared = 0, cached = 0, reserved = 0;
561
560 printk("Node ID: %d\n", pgdat->node_id); 562 printk("Node ID: %d\n", pgdat->node_id);
563 pgdat_resize_lock(pgdat, &flags);
564 present = pgdat->node_present_pages;
561 for(i = 0; i < pgdat->node_spanned_pages; i++) { 565 for(i = 0; i < pgdat->node_spanned_pages; i++) {
562 struct page *page; 566 struct page *page;
563 if (pfn_valid(pgdat->node_start_pfn + i)) 567 if (pfn_valid(pgdat->node_start_pfn + i))
564 page = pfn_to_page(pgdat->node_start_pfn + i); 568 page = pfn_to_page(pgdat->node_start_pfn + i);
565 else 569 else
566 continue; 570 continue;
567 if (PageReserved(page)) 571 if (PageReserved(page))
568 reserved++; 572 reserved++;
569 else if (PageSwapCache(page)) 573 else if (PageSwapCache(page))
570 cached++; 574 cached++;
571 else if (page_count(page)) 575 else if (page_count(page))
572 shared += page_count(page)-1; 576 shared += page_count(page)-1;
573 } 577 }
578 pgdat_resize_unlock(pgdat, &flags);
574 total_present += present; 579 total_present += present;
575 total_reserved += reserved; 580 total_reserved += reserved;
576 total_cached += cached; 581 total_cached += cached;
577 total_shared += shared; 582 total_shared += shared;
578 printk("\t%ld pages of RAM\n", present); 583 printk("\t%ld pages of RAM\n", present);
579 printk("\t%d reserved pages\n", reserved); 584 printk("\t%d reserved pages\n", reserved);
580 printk("\t%d pages shared\n", shared); 585 printk("\t%d pages shared\n", shared);
581 printk("\t%d pages swap cached\n", cached); 586 printk("\t%d pages swap cached\n", cached);
582 } 587 }
583 printk("%ld pages of RAM\n", total_present); 588 printk("%ld pages of RAM\n", total_present);
584 printk("%d reserved pages\n", total_reserved); 589 printk("%d reserved pages\n", total_reserved);
585 printk("%d pages shared\n", total_shared); 590 printk("%d pages shared\n", total_shared);
586 printk("%d pages swap cached\n", total_cached); 591 printk("%d pages swap cached\n", total_cached);
587 printk("Total of %ld pages in page table cache\n", 592 printk("Total of %ld pages in page table cache\n",
588 pgtable_quicklist_total_size()); 593 pgtable_quicklist_total_size());
589 printk("%d free buffer pages\n", nr_free_buffer_pages()); 594 printk("%d free buffer pages\n", nr_free_buffer_pages());
590 } 595 }
591 596
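The show_mem() hunk above is the heart of this patch: once sections can be removed at runtime, pfn_valid() may turn false under a walker that holds no reference on the pages, so the whole pfn scan sits under the node's resize lock. Reduced to its skeleton, the pattern the series repeats in each arch's show_mem() looks like this (kernel-style fragment, not a standalone program; pgdat_resize_lock()/pgdat_resize_unlock() wrap pgdat->node_size_lock):

    unsigned long flags;
    unsigned long i;

    pgdat_resize_lock(pgdat, &flags);	/* stop sections vanishing under us */
    for (i = 0; i < pgdat->node_spanned_pages; i++) {
    	unsigned long pfn = pgdat->node_start_pfn + i;

    	if (!pfn_valid(pfn))		/* may be false for removed sections */
    		continue;
    	/* ... examine pfn_to_page(pfn) without taking a page reference ... */
    }
    pgdat_resize_unlock(pgdat, &flags);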
592 /** 597 /**
593 * call_pernode_memory - use SRAT to call callback functions with node info 598 * call_pernode_memory - use SRAT to call callback functions with node info
594 * @start: physical start of range 599 * @start: physical start of range
595 * @len: length of range 600 * @len: length of range
596 * @arg: function to call for each range 601 * @arg: function to call for each range
597 * 602 *
598 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find 603 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
599 * out to which node a block of memory belongs. Ignore memory that we cannot 604 * out to which node a block of memory belongs. Ignore memory that we cannot
600 * identify, and split blocks that run across multiple nodes. 605 * identify, and split blocks that run across multiple nodes.
601 * 606 *
602 * Take this opportunity to round the start address up and the end address 607 * Take this opportunity to round the start address up and the end address
603 * down to page boundaries. 608 * down to page boundaries.
604 */ 609 */
605 void call_pernode_memory(unsigned long start, unsigned long len, void *arg) 610 void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
606 { 611 {
607 unsigned long rs, re, end = start + len; 612 unsigned long rs, re, end = start + len;
608 void (*func)(unsigned long, unsigned long, int); 613 void (*func)(unsigned long, unsigned long, int);
609 int i; 614 int i;
610 615
611 start = PAGE_ALIGN(start); 616 start = PAGE_ALIGN(start);
612 end &= PAGE_MASK; 617 end &= PAGE_MASK;
613 if (start >= end) 618 if (start >= end)
614 return; 619 return;
615 620
616 func = arg; 621 func = arg;
617 622
618 if (!num_node_memblks) { 623 if (!num_node_memblks) {
619 /* No SRAT table, so assume one node (node 0) */ 624 /* No SRAT table, so assume one node (node 0) */
620 if (start < end) 625 if (start < end)
621 (*func)(start, end - start, 0); 626 (*func)(start, end - start, 0);
622 return; 627 return;
623 } 628 }
624 629
625 for (i = 0; i < num_node_memblks; i++) { 630 for (i = 0; i < num_node_memblks; i++) {
626 rs = max(start, node_memblk[i].start_paddr); 631 rs = max(start, node_memblk[i].start_paddr);
627 re = min(end, node_memblk[i].start_paddr + 632 re = min(end, node_memblk[i].start_paddr +
628 node_memblk[i].size); 633 node_memblk[i].size);
629 634
630 if (rs < re) 635 if (rs < re)
631 (*func)(rs, re - rs, node_memblk[i].nid); 636 (*func)(rs, re - rs, node_memblk[i].nid);
632 637
633 if (re == end) 638 if (re == end)
634 break; 639 break;
635 } 640 }
636 } 641 }
637 642
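call_pernode_memory() is interval clipping: every EFI range is intersected with each SRAT memory block, and each non-empty overlap is handed to the callback tagged with that block's node. A self-contained analogue with a made-up two-node layout (memblk[], the node ids and the callback are all illustrative):

    #include <stdio.h>

    struct memblk { unsigned long start, size; int nid; };

    /* Hypothetical SRAT-style blocks, sorted by start address. */
    static const struct memblk memblk[] = {
    	{ 0x00000000, 0x40000000, 0 },	/* 1 GB on node 0 */
    	{ 0x40000000, 0x40000000, 1 },	/* 1 GB on node 1 */
    };

    static void callback(unsigned long start, unsigned long len, int nid)
    {
    	printf("node %d: [%#lx, %#lx)\n", nid, start, start + len);
    }

    /* Clip [start, start+len) against each block, like call_pernode_memory(). */
    static void per_node(unsigned long start, unsigned long len)
    {
    	unsigned long end = start + len, rs, re;

    	for (unsigned i = 0; i < sizeof(memblk) / sizeof(memblk[0]); i++) {
    		rs = start > memblk[i].start ? start : memblk[i].start;
    		re = end < memblk[i].start + memblk[i].size ?
    			end : memblk[i].start + memblk[i].size;
    		if (rs < re)
    			callback(rs, re - rs, memblk[i].nid);
    		if (re == end)
    			break;
    	}
    }

    int main(void)
    {
    	/* A 1 GB range straddling the node 0 / node 1 boundary. */
    	per_node(0x20000000, 0x40000000);
    	return 0;
    }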
638 /** 643 /**
639 * count_node_pages - callback to build per-node memory info structures 644 * count_node_pages - callback to build per-node memory info structures
640 * @start: physical start of range 645 * @start: physical start of range
641 * @len: length of range 646 * @len: length of range
642 * @node: node where this range resides 647 * @node: node where this range resides
643 * 648 *
644 * Each node has its own number of physical pages, DMAable pages, start, and 649 * Each node has its own number of physical pages, DMAable pages, start, and
645 * end page frame number. This routine will be called by call_pernode_memory() 650 * end page frame number. This routine will be called by call_pernode_memory()
646 * for each piece of usable memory and will set up these values for each node. 651 * for each piece of usable memory and will set up these values for each node.
647 * Very similar to build_maps(). 652 * Very similar to build_maps().
648 */ 653 */
649 static __init int count_node_pages(unsigned long start, unsigned long len, int node) 654 static __init int count_node_pages(unsigned long start, unsigned long len, int node)
650 { 655 {
651 unsigned long end = start + len; 656 unsigned long end = start + len;
652 657
653 mem_data[node].num_physpages += len >> PAGE_SHIFT; 658 mem_data[node].num_physpages += len >> PAGE_SHIFT;
654 if (start <= __pa(MAX_DMA_ADDRESS)) 659 if (start <= __pa(MAX_DMA_ADDRESS))
655 mem_data[node].num_dma_physpages += 660 mem_data[node].num_dma_physpages +=
656 (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; 661 (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT;
657 start = GRANULEROUNDDOWN(start); 662 start = GRANULEROUNDDOWN(start);
658 start = ORDERROUNDDOWN(start); 663 start = ORDERROUNDDOWN(start);
659 end = GRANULEROUNDUP(end); 664 end = GRANULEROUNDUP(end);
660 mem_data[node].max_pfn = max(mem_data[node].max_pfn, 665 mem_data[node].max_pfn = max(mem_data[node].max_pfn,
661 end >> PAGE_SHIFT); 666 end >> PAGE_SHIFT);
662 mem_data[node].min_pfn = min(mem_data[node].min_pfn, 667 mem_data[node].min_pfn = min(mem_data[node].min_pfn,
663 start >> PAGE_SHIFT); 668 start >> PAGE_SHIFT);
664 669
665 return 0; 670 return 0;
666 } 671 }
667 672
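count_node_pages() credits a range's pages to the node and, for the part below MAX_DMA_ADDRESS, to the DMA counter as well. The same bookkeeping as a standalone function, assuming 16 KB pages and a 4 GB DMA boundary purely for illustration (and ignoring the granule/order rounding of the pfn limits):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT	14		/* illustrative: 16 KB pages */
    #define DMA_LIMIT	0x100000000ULL	/* illustrative 4 GB ZONE_DMA boundary */

    /* Per-range accounting in the style of count_node_pages(). */
    static void count_range(uint64_t start, uint64_t len,
    			uint64_t *physpages, uint64_t *dma_physpages)
    {
    	uint64_t end = start + len;

    	*physpages += len >> PAGE_SHIFT;
    	if (start <= DMA_LIMIT)
    		*dma_physpages += ((end < DMA_LIMIT ? end : DMA_LIMIT) - start)
    				  >> PAGE_SHIFT;
    }

    int main(void)
    {
    	uint64_t phys = 0, dma = 0;

    	/* a 4 GB range from 2 GB to 6 GB: half of it is DMA-able */
    	count_range(0x080000000ULL, 0x100000000ULL, &phys, &dma);
    	printf("total pages %llu, DMA-able pages %llu\n",
    	       (unsigned long long)phys, (unsigned long long)dma);
    	return 0;
    }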
668 /** 673 /**
669 * paging_init - setup page tables 674 * paging_init - setup page tables
670 * 675 *
671 * paging_init() sets up the page tables for each node of the system and frees 676 * paging_init() sets up the page tables for each node of the system and frees
672 * the bootmem allocator memory for general use. 677 * the bootmem allocator memory for general use.
673 */ 678 */
674 void __init paging_init(void) 679 void __init paging_init(void)
675 { 680 {
676 unsigned long max_dma; 681 unsigned long max_dma;
677 unsigned long zones_size[MAX_NR_ZONES]; 682 unsigned long zones_size[MAX_NR_ZONES];
678 unsigned long zholes_size[MAX_NR_ZONES]; 683 unsigned long zholes_size[MAX_NR_ZONES];
679 unsigned long pfn_offset = 0; 684 unsigned long pfn_offset = 0;
680 int node; 685 int node;
681 686
682 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; 687 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
683 688
684 arch_sparse_init(); 689 arch_sparse_init();
685 690
686 efi_memmap_walk(filter_rsvd_memory, count_node_pages); 691 efi_memmap_walk(filter_rsvd_memory, count_node_pages);
687 692
688 #ifdef CONFIG_VIRTUAL_MEM_MAP 693 #ifdef CONFIG_VIRTUAL_MEM_MAP
689 vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); 694 vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
690 vmem_map = (struct page *) vmalloc_end; 695 vmem_map = (struct page *) vmalloc_end;
691 efi_memmap_walk(create_mem_map_page_table, NULL); 696 efi_memmap_walk(create_mem_map_page_table, NULL);
692 printk("Virtual mem_map starts at 0x%p\n", vmem_map); 697 printk("Virtual mem_map starts at 0x%p\n", vmem_map);
693 #endif 698 #endif
694 699
695 for_each_online_node(node) { 700 for_each_online_node(node) {
696 memset(zones_size, 0, sizeof(zones_size)); 701 memset(zones_size, 0, sizeof(zones_size));
697 memset(zholes_size, 0, sizeof(zholes_size)); 702 memset(zholes_size, 0, sizeof(zholes_size));
698 703
699 num_physpages += mem_data[node].num_physpages; 704 num_physpages += mem_data[node].num_physpages;
700 705
701 if (mem_data[node].min_pfn >= max_dma) { 706 if (mem_data[node].min_pfn >= max_dma) {
702 /* All of this node's memory is above ZONE_DMA */ 707 /* All of this node's memory is above ZONE_DMA */
703 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - 708 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
704 mem_data[node].min_pfn; 709 mem_data[node].min_pfn;
705 zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - 710 zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
706 mem_data[node].min_pfn - 711 mem_data[node].min_pfn -
707 mem_data[node].num_physpages; 712 mem_data[node].num_physpages;
708 } else if (mem_data[node].max_pfn < max_dma) { 713 } else if (mem_data[node].max_pfn < max_dma) {
709 /* All of this node's memory is in ZONE_DMA */ 714 /* All of this node's memory is in ZONE_DMA */
710 zones_size[ZONE_DMA] = mem_data[node].max_pfn - 715 zones_size[ZONE_DMA] = mem_data[node].max_pfn -
711 mem_data[node].min_pfn; 716 mem_data[node].min_pfn;
712 zholes_size[ZONE_DMA] = mem_data[node].max_pfn - 717 zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
713 mem_data[node].min_pfn - 718 mem_data[node].min_pfn -
714 mem_data[node].num_dma_physpages; 719 mem_data[node].num_dma_physpages;
715 } else { 720 } else {
716 /* This node has memory in both zones */ 721 /* This node has memory in both zones */
717 zones_size[ZONE_DMA] = max_dma - 722 zones_size[ZONE_DMA] = max_dma -
718 mem_data[node].min_pfn; 723 mem_data[node].min_pfn;
719 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - 724 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
720 mem_data[node].num_dma_physpages; 725 mem_data[node].num_dma_physpages;
721 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - 726 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
722 max_dma; 727 max_dma;
723 zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - 728 zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
724 (mem_data[node].num_physpages - 729 (mem_data[node].num_physpages -
725 mem_data[node].num_dma_physpages); 730 mem_data[node].num_dma_physpages);
726 } 731 }
727 732
728 pfn_offset = mem_data[node].min_pfn; 733 pfn_offset = mem_data[node].min_pfn;
729 734
730 #ifdef CONFIG_VIRTUAL_MEM_MAP 735 #ifdef CONFIG_VIRTUAL_MEM_MAP
731 NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; 736 NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
732 #endif 737 #endif
733 free_area_init_node(node, NODE_DATA(node), zones_size, 738 free_area_init_node(node, NODE_DATA(node), zones_size,
734 pfn_offset, zholes_size); 739 pfn_offset, zholes_size);
735 } 740 }
736 741
737 /* 742 /*
738 * Make memoryless nodes members of the list of known nodes. 743 * Make memoryless nodes members of the list of known nodes.
739 */ 744 */
740 for_each_node_mask(node, memory_less_mask) 745 for_each_node_mask(node, memory_less_mask)
741 pgdat_insert(mem_data[node].pgdat); 746 pgdat_insert(mem_data[node].pgdat);
742 747
743 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); 748 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
744 } 749 }
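The zone sizing in paging_init() above has three cases depending on where the node's pfn range falls relative to the DMA boundary. The same arithmetic as a pure function, with made-up pfn values in main() (the zone indices and inputs are illustrative, not the kernel's structures):

    #include <stdio.h>

    /* The three-way split used in paging_init(), as a pure function.
     * All inputs are page frame numbers / page counts; ZONE_DMA covers
     * pfns below max_dma. */
    static void split_zones(unsigned long min_pfn, unsigned long max_pfn,
    			unsigned long max_dma,
    			unsigned long physpages, unsigned long dma_physpages,
    			unsigned long zones[2], unsigned long holes[2])
    {
    	enum { DMA, NORMAL };

    	zones[DMA] = zones[NORMAL] = holes[DMA] = holes[NORMAL] = 0;

    	if (min_pfn >= max_dma) {		/* all memory above ZONE_DMA */
    		zones[NORMAL] = max_pfn - min_pfn;
    		holes[NORMAL] = zones[NORMAL] - physpages;
    	} else if (max_pfn < max_dma) {		/* all memory inside ZONE_DMA */
    		zones[DMA] = max_pfn - min_pfn;
    		holes[DMA] = zones[DMA] - dma_physpages;
    	} else {				/* memory in both zones */
    		zones[DMA] = max_dma - min_pfn;
    		holes[DMA] = zones[DMA] - dma_physpages;
    		zones[NORMAL] = max_pfn - max_dma;
    		holes[NORMAL] = zones[NORMAL] - (physpages - dma_physpages);
    	}
    }

    int main(void)
    {
    	unsigned long zones[2], holes[2];

    	/* made-up node spanning the DMA boundary: pfns 0x1000-0x50000,
    	 * boundary at 0x40000, 0x3e000 present pages, 0x3c000 DMA-able */
    	split_zones(0x1000, 0x50000, 0x40000, 0x3e000, 0x3c000, zones, holes);
    	printf("DMA %lx (holes %lx), NORMAL %lx (holes %lx)\n",
    	       zones[0], holes[0], zones[1], holes[1]);
    	return 0;
    }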
745 750
arch/m32r/mm/init.c
1 /* 1 /*
2 * linux/arch/m32r/mm/init.c 2 * linux/arch/m32r/mm/init.c
3 * 3 *
4 * Copyright (c) 2001, 2002 Hitoshi Yamamoto 4 * Copyright (c) 2001, 2002 Hitoshi Yamamoto
5 * 5 *
6 * Some code taken from sh version. 6 * Some code taken from sh version.
7 * Copyright (C) 1999 Niibe Yutaka 7 * Copyright (C) 1999 Niibe Yutaka
8 * Based on linux/arch/i386/mm/init.c: 8 * Based on linux/arch/i386/mm/init.c:
9 * Copyright (C) 1995 Linus Torvalds 9 * Copyright (C) 1995 Linus Torvalds
10 */ 10 */
11 11
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/bootmem.h> 16 #include <linux/bootmem.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/bitops.h> 19 #include <linux/bitops.h>
20 #include <linux/nodemask.h> 20 #include <linux/nodemask.h>
21 #include <asm/types.h> 21 #include <asm/types.h>
22 #include <asm/processor.h> 22 #include <asm/processor.h>
23 #include <asm/page.h> 23 #include <asm/page.h>
24 #include <asm/pgtable.h> 24 #include <asm/pgtable.h>
25 #include <asm/pgalloc.h> 25 #include <asm/pgalloc.h>
26 #include <asm/mmu_context.h> 26 #include <asm/mmu_context.h>
27 #include <asm/setup.h> 27 #include <asm/setup.h>
28 #include <asm/tlb.h> 28 #include <asm/tlb.h>
29 29
30 /* References to section boundaries */ 30 /* References to section boundaries */
31 extern char _text, _etext, _edata; 31 extern char _text, _etext, _edata;
32 extern char __init_begin, __init_end; 32 extern char __init_begin, __init_end;
33 33
34 pgd_t swapper_pg_dir[1024]; 34 pgd_t swapper_pg_dir[1024];
35 35
36 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 36 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
37 37
38 void show_mem(void) 38 void show_mem(void)
39 { 39 {
40 int total = 0, reserved = 0; 40 int total = 0, reserved = 0;
41 int shared = 0, cached = 0; 41 int shared = 0, cached = 0;
42 int highmem = 0; 42 int highmem = 0;
43 struct page *page; 43 struct page *page;
44 pg_data_t *pgdat; 44 pg_data_t *pgdat;
45 unsigned long i; 45 unsigned long i;
46 46
47 printk("Mem-info:\n"); 47 printk("Mem-info:\n");
48 show_free_areas(); 48 show_free_areas();
49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); 49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
50 for_each_pgdat(pgdat) { 50 for_each_pgdat(pgdat) {
51 unsigned long flags;
52 pgdat_resize_lock(pgdat, &flags);
51 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 53 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
52 page = pgdat_page_nr(pgdat, i); 54 page = pgdat_page_nr(pgdat, i);
53 total++; 55 total++;
54 if (PageHighMem(page)) 56 if (PageHighMem(page))
55 highmem++; 57 highmem++;
56 if (PageReserved(page)) 58 if (PageReserved(page))
57 reserved++; 59 reserved++;
58 else if (PageSwapCache(page)) 60 else if (PageSwapCache(page))
59 cached++; 61 cached++;
60 else if (page_count(page)) 62 else if (page_count(page))
61 shared += page_count(page) - 1; 63 shared += page_count(page) - 1;
62 } 64 }
65 pgdat_resize_unlock(pgdat, &flags);
63 } 66 }
64 printk("%d pages of RAM\n", total); 67 printk("%d pages of RAM\n", total);
65 printk("%d pages of HIGHMEM\n",highmem); 68 printk("%d pages of HIGHMEM\n",highmem);
66 printk("%d reserved pages\n",reserved); 69 printk("%d reserved pages\n",reserved);
67 printk("%d pages shared\n",shared); 70 printk("%d pages shared\n",shared);
68 printk("%d pages swap cached\n",cached); 71 printk("%d pages swap cached\n",cached);
69 } 72 }
70 73
71 /* 74 /*
72 * Cache of MMU context last used. 75 * Cache of MMU context last used.
73 */ 76 */
74 #ifndef CONFIG_SMP 77 #ifndef CONFIG_SMP
75 unsigned long mmu_context_cache_dat; 78 unsigned long mmu_context_cache_dat;
76 #else 79 #else
77 unsigned long mmu_context_cache_dat[NR_CPUS]; 80 unsigned long mmu_context_cache_dat[NR_CPUS];
78 #endif 81 #endif
79 static unsigned long hole_pages; 82 static unsigned long hole_pages;
80 83
81 /* 84 /*
82 * function prototype 85 * function prototype
83 */ 86 */
84 void __init paging_init(void); 87 void __init paging_init(void);
85 void __init mem_init(void); 88 void __init mem_init(void);
86 void free_initmem(void); 89 void free_initmem(void);
87 #ifdef CONFIG_BLK_DEV_INITRD 90 #ifdef CONFIG_BLK_DEV_INITRD
88 void free_initrd_mem(unsigned long, unsigned long); 91 void free_initrd_mem(unsigned long, unsigned long);
89 #endif 92 #endif
90 93
91 /* It'd be good if these lines were in the standard header file. */ 94 /* It'd be good if these lines were in the standard header file. */
92 #define START_PFN(nid) \ 95 #define START_PFN(nid) \
93 (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT) 96 (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT)
94 #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn) 97 #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn)
95 98
96 #ifndef CONFIG_DISCONTIGMEM 99 #ifndef CONFIG_DISCONTIGMEM
97 unsigned long __init zone_sizes_init(void) 100 unsigned long __init zone_sizes_init(void)
98 { 101 {
99 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; 102 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
100 unsigned long max_dma; 103 unsigned long max_dma;
101 unsigned long low; 104 unsigned long low;
102 unsigned long start_pfn; 105 unsigned long start_pfn;
103 106
104 #ifdef CONFIG_MMU 107 #ifdef CONFIG_MMU
105 start_pfn = START_PFN(0); 108 start_pfn = START_PFN(0);
106 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 109 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
107 low = MAX_LOW_PFN(0); 110 low = MAX_LOW_PFN(0);
108 111
109 if (low < max_dma){ 112 if (low < max_dma){
110 zones_size[ZONE_DMA] = low - start_pfn; 113 zones_size[ZONE_DMA] = low - start_pfn;
111 zones_size[ZONE_NORMAL] = 0; 114 zones_size[ZONE_NORMAL] = 0;
112 } else { 115 } else {
113 zones_size[ZONE_DMA] = low - start_pfn; 116 zones_size[ZONE_DMA] = low - start_pfn;
114 zones_size[ZONE_NORMAL] = low - max_dma; 117 zones_size[ZONE_NORMAL] = low - max_dma;
115 } 118 }
116 #else 119 #else
117 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; 120 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
118 zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT; 121 zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT;
119 start_pfn = __MEMORY_START >> PAGE_SHIFT; 122 start_pfn = __MEMORY_START >> PAGE_SHIFT;
120 #endif /* CONFIG_MMU */ 123 #endif /* CONFIG_MMU */
121 124
122 free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0); 125 free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0);
123 126
124 return 0; 127 return 0;
125 } 128 }
126 #else /* CONFIG_DISCONTIGMEM */ 129 #else /* CONFIG_DISCONTIGMEM */
127 extern unsigned long zone_sizes_init(void); 130 extern unsigned long zone_sizes_init(void);
128 #endif /* CONFIG_DISCONTIGMEM */ 131 #endif /* CONFIG_DISCONTIGMEM */
129 132
130 /*======================================================================* 133 /*======================================================================*
131 * paging_init() : sets up the page tables 134 * paging_init() : sets up the page tables
132 *======================================================================*/ 135 *======================================================================*/
133 void __init paging_init(void) 136 void __init paging_init(void)
134 { 137 {
135 #ifdef CONFIG_MMU 138 #ifdef CONFIG_MMU
136 int i; 139 int i;
137 pgd_t *pg_dir; 140 pgd_t *pg_dir;
138 141
139 /* We don't need kernel mapping as hardware supports that. */ 142 /* We don't need kernel mapping as hardware supports that. */
140 pg_dir = swapper_pg_dir; 143 pg_dir = swapper_pg_dir;
141 144
142 for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++) 145 for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++)
143 pgd_val(pg_dir[i]) = 0; 146 pgd_val(pg_dir[i]) = 0;
144 #endif /* CONFIG_MMU */ 147 #endif /* CONFIG_MMU */
145 hole_pages = zone_sizes_init(); 148 hole_pages = zone_sizes_init();
146 } 149 }
147 150
148 int __init reservedpages_count(void) 151 int __init reservedpages_count(void)
149 { 152 {
150 int reservedpages, nid, i; 153 int reservedpages, nid, i;
151 154
152 reservedpages = 0; 155 reservedpages = 0;
153 for_each_online_node(nid) 156 for_each_online_node(nid) {
157 unsigned long flags;
158 pgdat_resize_lock(NODE_DATA(nid), &flags);
154 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) 159 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
155 if (PageReserved(nid_page_nr(nid, i))) 160 if (PageReserved(nid_page_nr(nid, i)))
156 reservedpages++; 161 reservedpages++;
162 pgdat_resize_unlock(NODE_DATA(nid), &flags);
163 }
157 164
158 return reservedpages; 165 return reservedpages;
159 } 166 }
160 167
161 /*======================================================================* 168 /*======================================================================*
162 * mem_init() : 169 * mem_init() :
163 * orig : arch/sh/mm/init.c 170 * orig : arch/sh/mm/init.c
164 *======================================================================*/ 171 *======================================================================*/
165 void __init mem_init(void) 172 void __init mem_init(void)
166 { 173 {
167 int codesize, reservedpages, datasize, initsize; 174 int codesize, reservedpages, datasize, initsize;
168 int nid; 175 int nid;
169 #ifndef CONFIG_MMU 176 #ifndef CONFIG_MMU
170 extern unsigned long memory_end; 177 extern unsigned long memory_end;
171 #endif 178 #endif
172 179
173 num_physpages = 0; 180 num_physpages = 0;
174 for_each_online_node(nid) 181 for_each_online_node(nid)
175 num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1; 182 num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1;
176 183
177 num_physpages -= hole_pages; 184 num_physpages -= hole_pages;
178 185
179 #ifndef CONFIG_DISCONTIGMEM 186 #ifndef CONFIG_DISCONTIGMEM
180 max_mapnr = num_physpages; 187 max_mapnr = num_physpages;
181 #endif /* CONFIG_DISCONTIGMEM */ 188 #endif /* CONFIG_DISCONTIGMEM */
182 189
183 #ifdef CONFIG_MMU 190 #ifdef CONFIG_MMU
184 high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0))); 191 high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0)));
185 #else 192 #else
186 high_memory = (void *)(memory_end & PAGE_MASK); 193 high_memory = (void *)(memory_end & PAGE_MASK);
187 #endif /* CONFIG_MMU */ 194 #endif /* CONFIG_MMU */
188 195
189 /* clear the zero-page */ 196 /* clear the zero-page */
190 memset(empty_zero_page, 0, PAGE_SIZE); 197 memset(empty_zero_page, 0, PAGE_SIZE);
191 198
192 /* this will put all low memory onto the freelists */ 199 /* this will put all low memory onto the freelists */
193 for_each_online_node(nid) 200 for_each_online_node(nid)
194 totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); 201 totalram_pages += free_all_bootmem_node(NODE_DATA(nid));
195 202
196 reservedpages = reservedpages_count() - hole_pages; 203 reservedpages = reservedpages_count() - hole_pages;
197 codesize = (unsigned long) &_etext - (unsigned long)&_text; 204 codesize = (unsigned long) &_etext - (unsigned long)&_text;
198 datasize = (unsigned long) &_edata - (unsigned long)&_etext; 205 datasize = (unsigned long) &_edata - (unsigned long)&_etext;
199 initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin; 206 initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin;
200 207
201 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 208 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
202 "%dk reserved, %dk data, %dk init)\n", 209 "%dk reserved, %dk data, %dk init)\n",
203 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 210 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
204 num_physpages << (PAGE_SHIFT-10), 211 num_physpages << (PAGE_SHIFT-10),
205 codesize >> 10, 212 codesize >> 10,
206 reservedpages << (PAGE_SHIFT-10), 213 reservedpages << (PAGE_SHIFT-10),
207 datasize >> 10, 214 datasize >> 10,
208 initsize >> 10); 215 initsize >> 10);
209 } 216 }
210 217
211 /*======================================================================* 218 /*======================================================================*
212 * free_initmem() : 219 * free_initmem() :
213 * orig : arch/sh/mm/init.c 220 * orig : arch/sh/mm/init.c
214 *======================================================================*/ 221 *======================================================================*/
215 void free_initmem(void) 222 void free_initmem(void)
216 { 223 {
217 unsigned long addr; 224 unsigned long addr;
218 225
219 addr = (unsigned long)(&__init_begin); 226 addr = (unsigned long)(&__init_begin);
220 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 227 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
221 ClearPageReserved(virt_to_page(addr)); 228 ClearPageReserved(virt_to_page(addr));
222 set_page_count(virt_to_page(addr), 1); 229 set_page_count(virt_to_page(addr), 1);
223 free_page(addr); 230 free_page(addr);
224 totalram_pages++; 231 totalram_pages++;
225 } 232 }
226 printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \ 233 printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \
227 (int)(&__init_end - &__init_begin) >> 10); 234 (int)(&__init_end - &__init_begin) >> 10);
228 } 235 }
229 236
230 #ifdef CONFIG_BLK_DEV_INITRD 237 #ifdef CONFIG_BLK_DEV_INITRD
231 /*======================================================================* 238 /*======================================================================*
232 * free_initrd_mem() : 239 * free_initrd_mem() :
233 * orig : arch/sh/mm/init.c 240 * orig : arch/sh/mm/init.c
234 *======================================================================*/ 241 *======================================================================*/
235 void free_initrd_mem(unsigned long start, unsigned long end) 242 void free_initrd_mem(unsigned long start, unsigned long end)
236 { 243 {
237 unsigned long p; 244 unsigned long p;
238 for (p = start; p < end; p += PAGE_SIZE) { 245 for (p = start; p < end; p += PAGE_SIZE) {
239 ClearPageReserved(virt_to_page(p)); 246 ClearPageReserved(virt_to_page(p));
240 set_page_count(virt_to_page(p), 1); 247 set_page_count(virt_to_page(p), 1);
241 free_page(p); 248 free_page(p);
242 totalram_pages++; 249 totalram_pages++;
243 } 250 }
244 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 251 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
245 } 252 }
246 #endif 253 #endif
247 254
248 255
arch/parisc/mm/init.c
1 /* 1 /*
2 * linux/arch/parisc/mm/init.c 2 * linux/arch/parisc/mm/init.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright 1999 SuSE GmbH 5 * Copyright 1999 SuSE GmbH
6 * changed by Philipp Rumpf 6 * changed by Philipp Rumpf
7 * Copyright 1999 Philipp Rumpf (prumpf@tux.org) 7 * Copyright 1999 Philipp Rumpf (prumpf@tux.org)
8 * Copyright 2004 Randolph Chung (tausq@debian.org) 8 * Copyright 2004 Randolph Chung (tausq@debian.org)
9 * 9 *
10 */ 10 */
11 11
12 #include <linux/config.h> 12 #include <linux/config.h>
13 13
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/mm.h> 15 #include <linux/mm.h>
16 #include <linux/bootmem.h> 16 #include <linux/bootmem.h>
17 #include <linux/delay.h> 17 #include <linux/delay.h>
18 #include <linux/init.h> 18 #include <linux/init.h>
19 #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */ 19 #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */
20 #include <linux/initrd.h> 20 #include <linux/initrd.h>
21 #include <linux/swap.h> 21 #include <linux/swap.h>
22 #include <linux/unistd.h> 22 #include <linux/unistd.h>
23 #include <linux/nodemask.h> /* for node_online_map */ 23 #include <linux/nodemask.h> /* for node_online_map */
24 #include <linux/pagemap.h> /* for release_pages and page_cache_release */ 24 #include <linux/pagemap.h> /* for release_pages and page_cache_release */
25 25
26 #include <asm/pgalloc.h> 26 #include <asm/pgalloc.h>
27 #include <asm/tlb.h> 27 #include <asm/tlb.h>
28 #include <asm/pdc_chassis.h> 28 #include <asm/pdc_chassis.h>
29 #include <asm/mmzone.h> 29 #include <asm/mmzone.h>
30 30
31 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 31 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
32 32
33 extern char _text; /* start of kernel code, defined by linker */ 33 extern char _text; /* start of kernel code, defined by linker */
34 extern int data_start; 34 extern int data_start;
35 extern char _end; /* end of BSS, defined by linker */ 35 extern char _end; /* end of BSS, defined by linker */
36 extern char __init_begin, __init_end; 36 extern char __init_begin, __init_end;
37 37
38 #ifdef CONFIG_DISCONTIGMEM 38 #ifdef CONFIG_DISCONTIGMEM
39 struct node_map_data node_data[MAX_NUMNODES]; 39 struct node_map_data node_data[MAX_NUMNODES];
40 bootmem_data_t bmem_data[MAX_NUMNODES]; 40 bootmem_data_t bmem_data[MAX_NUMNODES];
41 unsigned char pfnnid_map[PFNNID_MAP_MAX]; 41 unsigned char pfnnid_map[PFNNID_MAP_MAX];
42 #endif 42 #endif
43 43
44 static struct resource data_resource = { 44 static struct resource data_resource = {
45 .name = "Kernel data", 45 .name = "Kernel data",
46 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 46 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
47 }; 47 };
48 48
49 static struct resource code_resource = { 49 static struct resource code_resource = {
50 .name = "Kernel code", 50 .name = "Kernel code",
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
52 }; 52 };
53 53
54 static struct resource pdcdata_resource = { 54 static struct resource pdcdata_resource = {
55 .name = "PDC data (Page Zero)", 55 .name = "PDC data (Page Zero)",
56 .start = 0, 56 .start = 0,
57 .end = 0x9ff, 57 .end = 0x9ff,
58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
59 }; 59 };
60 60
61 static struct resource sysram_resources[MAX_PHYSMEM_RANGES]; 61 static struct resource sysram_resources[MAX_PHYSMEM_RANGES];
62 62
63 /* The following array is initialized from the firmware specific 63 /* The following array is initialized from the firmware specific
64 * information retrieved in kernel/inventory.c. 64 * information retrieved in kernel/inventory.c.
65 */ 65 */
66 66
67 physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES]; 67 physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES];
68 int npmem_ranges; 68 int npmem_ranges;
69 69
70 #ifdef __LP64__ 70 #ifdef __LP64__
71 #define MAX_MEM (~0UL) 71 #define MAX_MEM (~0UL)
72 #else /* !__LP64__ */ 72 #else /* !__LP64__ */
73 #define MAX_MEM (3584U*1024U*1024U) 73 #define MAX_MEM (3584U*1024U*1024U)
74 #endif /* !__LP64__ */ 74 #endif /* !__LP64__ */
75 75
76 static unsigned long mem_limit = MAX_MEM; 76 static unsigned long mem_limit = MAX_MEM;
77 77
78 static void __init mem_limit_func(void) 78 static void __init mem_limit_func(void)
79 { 79 {
80 char *cp, *end; 80 char *cp, *end;
81 unsigned long limit; 81 unsigned long limit;
82 extern char saved_command_line[]; 82 extern char saved_command_line[];
83 83
84 /* We need this before __setup() functions are called */ 84 /* We need this before __setup() functions are called */
85 85
86 limit = MAX_MEM; 86 limit = MAX_MEM;
87 for (cp = saved_command_line; *cp; ) { 87 for (cp = saved_command_line; *cp; ) {
88 if (memcmp(cp, "mem=", 4) == 0) { 88 if (memcmp(cp, "mem=", 4) == 0) {
89 cp += 4; 89 cp += 4;
90 limit = memparse(cp, &end); 90 limit = memparse(cp, &end);
91 if (end != cp) 91 if (end != cp)
92 break; 92 break;
93 cp = end; 93 cp = end;
94 } else { 94 } else {
95 while (*cp != ' ' && *cp) 95 while (*cp != ' ' && *cp)
96 ++cp; 96 ++cp;
97 while (*cp == ' ') 97 while (*cp == ' ')
98 ++cp; 98 ++cp;
99 } 99 }
100 } 100 }
101 101
102 if (limit < mem_limit) 102 if (limit < mem_limit)
103 mem_limit = limit; 103 mem_limit = limit;
104 } 104 }
105 105
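mem_limit_func() walks the raw command line by hand because it runs before the normal __setup() parsing, and memparse() accepts a number with an optional K/M/G suffix. A rough user-space approximation of that scan (the suffix handling only mimics memparse(); it is not the kernel routine):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Approximate memparse(): decimal/hex number with optional K/M/G suffix. */
    static unsigned long long parse_size(const char *s, char **end)
    {
    	unsigned long long v = strtoull(s, end, 0);

    	switch (**end) {
    	case 'G': case 'g': v <<= 10; /* fall through */
    	case 'M': case 'm': v <<= 10; /* fall through */
    	case 'K': case 'k': v <<= 10; (*end)++; break;
    	}
    	return v;
    }

    /* Walk a command line the way mem_limit_func() does, looking for "mem=". */
    static unsigned long long mem_limit_from(const char *cmdline,
    					 unsigned long long max_mem)
    {
    	const char *cp = cmdline;
    	char *end;

    	while (*cp) {
    		if (!memcmp(cp, "mem=", 4)) {
    			unsigned long long limit = parse_size(cp + 4, &end);
    			if (end != cp + 4)
    				return limit < max_mem ? limit : max_mem;
    			cp = end;
    		} else {
    			while (*cp && *cp != ' ')
    				cp++;
    			while (*cp == ' ')
    				cp++;
    		}
    	}
    	return max_mem;
    }

    int main(void)
    {
    	/* prints 536870912 (512 MB) */
    	printf("%llu\n", mem_limit_from("console=ttyS0 mem=512M ro", ~0ULL));
    	return 0;
    }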
106 #define MAX_GAP (0x40000000UL >> PAGE_SHIFT) 106 #define MAX_GAP (0x40000000UL >> PAGE_SHIFT)
107 107
108 static void __init setup_bootmem(void) 108 static void __init setup_bootmem(void)
109 { 109 {
110 unsigned long bootmap_size; 110 unsigned long bootmap_size;
111 unsigned long mem_max; 111 unsigned long mem_max;
112 unsigned long bootmap_pages; 112 unsigned long bootmap_pages;
113 unsigned long bootmap_start_pfn; 113 unsigned long bootmap_start_pfn;
114 unsigned long bootmap_pfn; 114 unsigned long bootmap_pfn;
115 #ifndef CONFIG_DISCONTIGMEM 115 #ifndef CONFIG_DISCONTIGMEM
116 physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1]; 116 physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1];
117 int npmem_holes; 117 int npmem_holes;
118 #endif 118 #endif
119 int i, sysram_resource_count; 119 int i, sysram_resource_count;
120 120
121 disable_sr_hashing(); /* Turn off space register hashing */ 121 disable_sr_hashing(); /* Turn off space register hashing */
122 122
123 /* 123 /*
124 * Sort the ranges. Since the number of ranges is typically 124 * Sort the ranges. Since the number of ranges is typically
125 * small, and performance is not an issue here, just do 125 * small, and performance is not an issue here, just do
126 * a simple insertion sort. 126 * a simple insertion sort.
127 */ 127 */
128 128
129 for (i = 1; i < npmem_ranges; i++) { 129 for (i = 1; i < npmem_ranges; i++) {
130 int j; 130 int j;
131 131
132 for (j = i; j > 0; j--) { 132 for (j = i; j > 0; j--) {
133 unsigned long tmp; 133 unsigned long tmp;
134 134
135 if (pmem_ranges[j-1].start_pfn < 135 if (pmem_ranges[j-1].start_pfn <
136 pmem_ranges[j].start_pfn) { 136 pmem_ranges[j].start_pfn) {
137 137
138 break; 138 break;
139 } 139 }
140 tmp = pmem_ranges[j-1].start_pfn; 140 tmp = pmem_ranges[j-1].start_pfn;
141 pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn; 141 pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn;
142 pmem_ranges[j].start_pfn = tmp; 142 pmem_ranges[j].start_pfn = tmp;
143 tmp = pmem_ranges[j-1].pages; 143 tmp = pmem_ranges[j-1].pages;
144 pmem_ranges[j-1].pages = pmem_ranges[j].pages; 144 pmem_ranges[j-1].pages = pmem_ranges[j].pages;
145 pmem_ranges[j].pages = tmp; 145 pmem_ranges[j].pages = tmp;
146 } 146 }
147 } 147 }
148 148
149 #ifndef CONFIG_DISCONTIGMEM 149 #ifndef CONFIG_DISCONTIGMEM
150 /* 150 /*
151 * Throw out ranges that are too far apart (controlled by 151 * Throw out ranges that are too far apart (controlled by
152 * MAX_GAP). 152 * MAX_GAP).
153 */ 153 */
154 154
155 for (i = 1; i < npmem_ranges; i++) { 155 for (i = 1; i < npmem_ranges; i++) {
156 if (pmem_ranges[i].start_pfn - 156 if (pmem_ranges[i].start_pfn -
157 (pmem_ranges[i-1].start_pfn + 157 (pmem_ranges[i-1].start_pfn +
158 pmem_ranges[i-1].pages) > MAX_GAP) { 158 pmem_ranges[i-1].pages) > MAX_GAP) {
159 npmem_ranges = i; 159 npmem_ranges = i;
160 printk("Large gap in memory detected (%ld pages). " 160 printk("Large gap in memory detected (%ld pages). "
161 "Consider turning on CONFIG_DISCONTIGMEM\n", 161 "Consider turning on CONFIG_DISCONTIGMEM\n",
162 pmem_ranges[i].start_pfn - 162 pmem_ranges[i].start_pfn -
163 (pmem_ranges[i-1].start_pfn + 163 (pmem_ranges[i-1].start_pfn +
164 pmem_ranges[i-1].pages)); 164 pmem_ranges[i-1].pages));
165 break; 165 break;
166 } 166 }
167 } 167 }
168 #endif 168 #endif
169 169
170 if (npmem_ranges > 1) { 170 if (npmem_ranges > 1) {
171 171
172 /* Print the memory ranges */ 172 /* Print the memory ranges */
173 173
174 printk(KERN_INFO "Memory Ranges:\n"); 174 printk(KERN_INFO "Memory Ranges:\n");
175 175
176 for (i = 0; i < npmem_ranges; i++) { 176 for (i = 0; i < npmem_ranges; i++) {
177 unsigned long start; 177 unsigned long start;
178 unsigned long size; 178 unsigned long size;
179 179
180 size = (pmem_ranges[i].pages << PAGE_SHIFT); 180 size = (pmem_ranges[i].pages << PAGE_SHIFT);
181 start = (pmem_ranges[i].start_pfn << PAGE_SHIFT); 181 start = (pmem_ranges[i].start_pfn << PAGE_SHIFT);
182 printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n", 182 printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n",
183 i,start, start + (size - 1), size >> 20); 183 i,start, start + (size - 1), size >> 20);
184 } 184 }
185 } 185 }
186 186
187 sysram_resource_count = npmem_ranges; 187 sysram_resource_count = npmem_ranges;
188 for (i = 0; i < sysram_resource_count; i++) { 188 for (i = 0; i < sysram_resource_count; i++) {
189 struct resource *res = &sysram_resources[i]; 189 struct resource *res = &sysram_resources[i];
190 res->name = "System RAM"; 190 res->name = "System RAM";
191 res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT; 191 res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT;
192 res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1; 192 res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1;
193 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 193 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
194 request_resource(&iomem_resource, res); 194 request_resource(&iomem_resource, res);
195 } 195 }
196 196
197 /* 197 /*
198 * For 32 bit kernels we limit the amount of memory we can 198 * For 32 bit kernels we limit the amount of memory we can
199 * support, in order to preserve enough kernel address space 199 * support, in order to preserve enough kernel address space
200 * for other purposes. For 64 bit kernels we don't normally 200 * for other purposes. For 64 bit kernels we don't normally
201 * limit the memory, but this mechanism can be used to 201 * limit the memory, but this mechanism can be used to
202 * artificially limit the amount of memory (and it is written 202 * artificially limit the amount of memory (and it is written
203 * to work with multiple memory ranges). 203 * to work with multiple memory ranges).
204 */ 204 */
205 205
206 mem_limit_func(); /* check for "mem=" argument */ 206 mem_limit_func(); /* check for "mem=" argument */
207 207
208 mem_max = 0; 208 mem_max = 0;
209 num_physpages = 0; 209 num_physpages = 0;
210 for (i = 0; i < npmem_ranges; i++) { 210 for (i = 0; i < npmem_ranges; i++) {
211 unsigned long rsize; 211 unsigned long rsize;
212 212
213 rsize = pmem_ranges[i].pages << PAGE_SHIFT; 213 rsize = pmem_ranges[i].pages << PAGE_SHIFT;
214 if ((mem_max + rsize) > mem_limit) { 214 if ((mem_max + rsize) > mem_limit) {
215 printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20); 215 printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20);
216 if (mem_max == mem_limit) 216 if (mem_max == mem_limit)
217 npmem_ranges = i; 217 npmem_ranges = i;
218 else { 218 else {
219 pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT) 219 pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT)
220 - (mem_max >> PAGE_SHIFT); 220 - (mem_max >> PAGE_SHIFT);
221 npmem_ranges = i + 1; 221 npmem_ranges = i + 1;
222 mem_max = mem_limit; 222 mem_max = mem_limit;
223 } 223 }
224 num_physpages += pmem_ranges[i].pages; 224 num_physpages += pmem_ranges[i].pages;
225 break; 225 break;
226 } 226 }
227 num_physpages += pmem_ranges[i].pages; 227 num_physpages += pmem_ranges[i].pages;
228 mem_max += rsize; 228 mem_max += rsize;
229 } 229 }
230 230
231 printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20); 231 printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20);
232 232
233 #ifndef CONFIG_DISCONTIGMEM 233 #ifndef CONFIG_DISCONTIGMEM
234 /* Merge the ranges, keeping track of the holes */ 234 /* Merge the ranges, keeping track of the holes */
235 235
236 { 236 {
237 unsigned long end_pfn; 237 unsigned long end_pfn;
238 unsigned long hole_pages; 238 unsigned long hole_pages;
239 239
240 npmem_holes = 0; 240 npmem_holes = 0;
241 end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages; 241 end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages;
242 for (i = 1; i < npmem_ranges; i++) { 242 for (i = 1; i < npmem_ranges; i++) {
243 243
244 hole_pages = pmem_ranges[i].start_pfn - end_pfn; 244 hole_pages = pmem_ranges[i].start_pfn - end_pfn;
245 if (hole_pages) { 245 if (hole_pages) {
246 pmem_holes[npmem_holes].start_pfn = end_pfn; 246 pmem_holes[npmem_holes].start_pfn = end_pfn;
247 pmem_holes[npmem_holes++].pages = hole_pages; 247 pmem_holes[npmem_holes++].pages = hole_pages;
248 end_pfn += hole_pages; 248 end_pfn += hole_pages;
249 } 249 }
250 end_pfn += pmem_ranges[i].pages; 250 end_pfn += pmem_ranges[i].pages;
251 } 251 }
252 252
253 pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn; 253 pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn;
254 npmem_ranges = 1; 254 npmem_ranges = 1;
255 } 255 }
256 #endif 256 #endif
257 257
258 bootmap_pages = 0; 258 bootmap_pages = 0;
259 for (i = 0; i < npmem_ranges; i++) 259 for (i = 0; i < npmem_ranges; i++)
260 bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages); 260 bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages);
261 261
262 bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT; 262 bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT;
263 263
264 #ifdef CONFIG_DISCONTIGMEM 264 #ifdef CONFIG_DISCONTIGMEM
265 for (i = 0; i < MAX_PHYSMEM_RANGES; i++) { 265 for (i = 0; i < MAX_PHYSMEM_RANGES; i++) {
266 memset(NODE_DATA(i), 0, sizeof(pg_data_t)); 266 memset(NODE_DATA(i), 0, sizeof(pg_data_t));
267 NODE_DATA(i)->bdata = &bmem_data[i]; 267 NODE_DATA(i)->bdata = &bmem_data[i];
268 } 268 }
269 memset(pfnnid_map, 0xff, sizeof(pfnnid_map)); 269 memset(pfnnid_map, 0xff, sizeof(pfnnid_map));
270 270
271 for (i = 0; i < npmem_ranges; i++) 271 for (i = 0; i < npmem_ranges; i++)
272 node_set_online(i); 272 node_set_online(i);
273 #endif 273 #endif
274 274
275 /* 275 /*
276 * Initialize and free the full range of memory in each range. 276 * Initialize and free the full range of memory in each range.
277 * Note that the only writing these routines do are to the bootmap, 277 * Note that the only writing these routines do are to the bootmap,
278 * and we've made sure to locate the bootmap properly so that they 278 * and we've made sure to locate the bootmap properly so that they
279 * won't be writing over anything important. 279 * won't be writing over anything important.
280 */ 280 */
281 281
282 bootmap_pfn = bootmap_start_pfn; 282 bootmap_pfn = bootmap_start_pfn;
283 max_pfn = 0; 283 max_pfn = 0;
284 for (i = 0; i < npmem_ranges; i++) { 284 for (i = 0; i < npmem_ranges; i++) {
285 unsigned long start_pfn; 285 unsigned long start_pfn;
286 unsigned long npages; 286 unsigned long npages;
287 287
288 start_pfn = pmem_ranges[i].start_pfn; 288 start_pfn = pmem_ranges[i].start_pfn;
289 npages = pmem_ranges[i].pages; 289 npages = pmem_ranges[i].pages;
290 290
291 bootmap_size = init_bootmem_node(NODE_DATA(i), 291 bootmap_size = init_bootmem_node(NODE_DATA(i),
292 bootmap_pfn, 292 bootmap_pfn,
293 start_pfn, 293 start_pfn,
294 (start_pfn + npages) ); 294 (start_pfn + npages) );
295 free_bootmem_node(NODE_DATA(i), 295 free_bootmem_node(NODE_DATA(i),
296 (start_pfn << PAGE_SHIFT), 296 (start_pfn << PAGE_SHIFT),
297 (npages << PAGE_SHIFT) ); 297 (npages << PAGE_SHIFT) );
298 bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 298 bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
299 if ((start_pfn + npages) > max_pfn) 299 if ((start_pfn + npages) > max_pfn)
300 max_pfn = start_pfn + npages; 300 max_pfn = start_pfn + npages;
301 } 301 }
302 302
303 if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) { 303 if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) {
304 printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n"); 304 printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n");
305 BUG(); 305 BUG();
306 } 306 }
307 307
308 /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */ 308 /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */
309 309
310 #define PDC_CONSOLE_IO_IODC_SIZE 32768 310 #define PDC_CONSOLE_IO_IODC_SIZE 32768
311 311
312 reserve_bootmem_node(NODE_DATA(0), 0UL, 312 reserve_bootmem_node(NODE_DATA(0), 0UL,
313 (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE)); 313 (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE));
314 reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text), 314 reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text),
315 (unsigned long)(&_end - &_text)); 315 (unsigned long)(&_end - &_text));
316 reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT), 316 reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT),
317 ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT)); 317 ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT));
318 318
319 #ifndef CONFIG_DISCONTIGMEM 319 #ifndef CONFIG_DISCONTIGMEM
320 320
321 /* reserve the holes */ 321 /* reserve the holes */
322 322
323 for (i = 0; i < npmem_holes; i++) { 323 for (i = 0; i < npmem_holes; i++) {
324 reserve_bootmem_node(NODE_DATA(0), 324 reserve_bootmem_node(NODE_DATA(0),
325 (pmem_holes[i].start_pfn << PAGE_SHIFT), 325 (pmem_holes[i].start_pfn << PAGE_SHIFT),
326 (pmem_holes[i].pages << PAGE_SHIFT)); 326 (pmem_holes[i].pages << PAGE_SHIFT));
327 } 327 }
328 #endif 328 #endif
329 329
330 #ifdef CONFIG_BLK_DEV_INITRD 330 #ifdef CONFIG_BLK_DEV_INITRD
331 if (initrd_start) { 331 if (initrd_start) {
332 printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end); 332 printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end);
333 if (__pa(initrd_start) < mem_max) { 333 if (__pa(initrd_start) < mem_max) {
334 unsigned long initrd_reserve; 334 unsigned long initrd_reserve;
335 335
336 if (__pa(initrd_end) > mem_max) { 336 if (__pa(initrd_end) > mem_max) {
337 initrd_reserve = mem_max - __pa(initrd_start); 337 initrd_reserve = mem_max - __pa(initrd_start);
338 } else { 338 } else {
339 initrd_reserve = initrd_end - initrd_start; 339 initrd_reserve = initrd_end - initrd_start;
340 } 340 }
341 initrd_below_start_ok = 1; 341 initrd_below_start_ok = 1;
342 printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max); 342 printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max);
343 343
344 reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve); 344 reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve);
345 } 345 }
346 } 346 }
347 #endif 347 #endif
348 348
349 data_resource.start = virt_to_phys(&data_start); 349 data_resource.start = virt_to_phys(&data_start);
350 data_resource.end = virt_to_phys(&_end)-1; 350 data_resource.end = virt_to_phys(&_end)-1;
351 code_resource.start = virt_to_phys(&_text); 351 code_resource.start = virt_to_phys(&_text);
352 code_resource.end = virt_to_phys(&data_start)-1; 352 code_resource.end = virt_to_phys(&data_start)-1;
353 353
354 /* We don't know which region the kernel will be in, so try 354 /* We don't know which region the kernel will be in, so try
355 * all of them. 355 * all of them.
356 */ 356 */
357 for (i = 0; i < sysram_resource_count; i++) { 357 for (i = 0; i < sysram_resource_count; i++) {
358 struct resource *res = &sysram_resources[i]; 358 struct resource *res = &sysram_resources[i];
359 request_resource(res, &code_resource); 359 request_resource(res, &code_resource);
360 request_resource(res, &data_resource); 360 request_resource(res, &data_resource);
361 } 361 }
362 request_resource(&sysram_resources[0], &pdcdata_resource); 362 request_resource(&sysram_resources[0], &pdcdata_resource);
363 } 363 }
364 364
365 void free_initmem(void) 365 void free_initmem(void)
366 { 366 {
367 /* FIXME: */ 367 /* FIXME: */
368 #if 0 368 #if 0
369 printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n", 369 printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n",
370 (&__init_end - &__init_begin) >> 10); 370 (&__init_end - &__init_begin) >> 10);
371 return; 371 return;
372 #else 372 #else
373 unsigned long addr; 373 unsigned long addr;
374 374
375 printk(KERN_INFO "Freeing unused kernel memory: "); 375 printk(KERN_INFO "Freeing unused kernel memory: ");
376 376
377 #if 1 377 #if 1
378 /* Attempt to catch anyone trying to execute code here 378 /* Attempt to catch anyone trying to execute code here
379 * by filling the page with BRK insns. 379 * by filling the page with BRK insns.
380 * 380 *
381 * If we disable interrupts for all CPUs, then IPI stops working. 381 * If we disable interrupts for all CPUs, then IPI stops working.
382 * Kinda breaks the global cache flushing. 382 * Kinda breaks the global cache flushing.
383 */ 383 */
384 local_irq_disable(); 384 local_irq_disable();
385 385
386 memset(&__init_begin, 0x00, 386 memset(&__init_begin, 0x00,
387 (unsigned long)&__init_end - (unsigned long)&__init_begin); 387 (unsigned long)&__init_end - (unsigned long)&__init_begin);
388 388
389 flush_data_cache(); 389 flush_data_cache();
390 asm volatile("sync" : : ); 390 asm volatile("sync" : : );
391 flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end); 391 flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end);
392 asm volatile("sync" : : ); 392 asm volatile("sync" : : );
393 393
394 local_irq_enable(); 394 local_irq_enable();
395 #endif 395 #endif
396 396
397 addr = (unsigned long)(&__init_begin); 397 addr = (unsigned long)(&__init_begin);
398 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 398 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
399 ClearPageReserved(virt_to_page(addr)); 399 ClearPageReserved(virt_to_page(addr));
400 set_page_count(virt_to_page(addr), 1); 400 set_page_count(virt_to_page(addr), 1);
401 free_page(addr); 401 free_page(addr);
402 num_physpages++; 402 num_physpages++;
403 totalram_pages++; 403 totalram_pages++;
404 } 404 }
405 405
406 /* set up a new led state on systems shipped with an LED State panel */ 406 /* set up a new led state on systems shipped with an LED State panel */
407 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); 407 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE);
408 408
409 printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10); 409 printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10);
410 #endif 410 #endif
411 } 411 }
412 412
413 /* 413 /*
414 * Just an arbitrary offset to serve as a "hole" between mapping areas 414 * Just an arbitrary offset to serve as a "hole" between mapping areas
415 * (between top of physical memory and a potential pcxl dma mapping 415 * (between top of physical memory and a potential pcxl dma mapping
416 * area, and below the vmalloc mapping area). 416 * area, and below the vmalloc mapping area).
417 * 417 *
418 * The current 32K value just means that there will be a 32K "hole" 418 * The current 32K value just means that there will be a 32K "hole"
419 * between mapping areas. That means that any out-of-bounds memory 419 * between mapping areas. That means that any out-of-bounds memory
420 * accesses will hopefully be caught. The vmalloc() routine leaves 420 * accesses will hopefully be caught. The vmalloc() routine leaves
421 * a hole of 4kB between each vmalloced area for the same reason. 421 * a hole of 4kB between each vmalloced area for the same reason.
422 */ 422 */
423 423
424 /* Leave room for gateway page expansion */ 424 /* Leave room for gateway page expansion */
425 #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE 425 #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE
426 #error KERNEL_MAP_START is in gateway reserved region 426 #error KERNEL_MAP_START is in gateway reserved region
427 #endif 427 #endif
428 #define MAP_START (KERNEL_MAP_START) 428 #define MAP_START (KERNEL_MAP_START)
429 429
430 #define VM_MAP_OFFSET (32*1024) 430 #define VM_MAP_OFFSET (32*1024)
431 #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \ 431 #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \
432 & ~(VM_MAP_OFFSET-1))) 432 & ~(VM_MAP_OFFSET-1)))
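(Worked example with hypothetical addresses: SET_MAP_OFFSET() above adds VM_MAP_OFFSET and then masks with ~(VM_MAP_OFFSET-1), i.e. it rounds up to the next 32 KB boundary, so the full 32 KB hole the comment describes appears only when the input is already 32 KB aligned.)

	SET_MAP_OFFSET(0x40000000)  /* == (0x40000000 + 0x8000) & ~0x7fff == 0x40008000, a 32 KB hole */
	SET_MAP_OFFSET(0x40001000)  /* == (0x40001000 + 0x8000) & ~0x7fff == 0x40008000, a 28 KB hole:
	                               the result is simply the next 32 KB boundary above the input  */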
433 433
434 void *vmalloc_start; 434 void *vmalloc_start;
435 EXPORT_SYMBOL(vmalloc_start); 435 EXPORT_SYMBOL(vmalloc_start);
436 436
437 #ifdef CONFIG_PA11 437 #ifdef CONFIG_PA11
438 unsigned long pcxl_dma_start; 438 unsigned long pcxl_dma_start;
439 #endif 439 #endif
440 440
441 void __init mem_init(void) 441 void __init mem_init(void)
442 { 442 {
443 high_memory = __va((max_pfn << PAGE_SHIFT)); 443 high_memory = __va((max_pfn << PAGE_SHIFT));
444 444
445 #ifndef CONFIG_DISCONTIGMEM 445 #ifndef CONFIG_DISCONTIGMEM
446 max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1; 446 max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1;
447 totalram_pages += free_all_bootmem(); 447 totalram_pages += free_all_bootmem();
448 #else 448 #else
449 { 449 {
450 int i; 450 int i;
451 451
452 for (i = 0; i < npmem_ranges; i++) 452 for (i = 0; i < npmem_ranges; i++)
453 totalram_pages += free_all_bootmem_node(NODE_DATA(i)); 453 totalram_pages += free_all_bootmem_node(NODE_DATA(i));
454 } 454 }
455 #endif 455 #endif
456 456
457 printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10)); 457 printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10));
458 458
459 #ifdef CONFIG_PA11 459 #ifdef CONFIG_PA11
460 if (hppa_dma_ops == &pcxl_dma_ops) { 460 if (hppa_dma_ops == &pcxl_dma_ops) {
461 pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START); 461 pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START);
462 vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE); 462 vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE);
463 } else { 463 } else {
464 pcxl_dma_start = 0; 464 pcxl_dma_start = 0;
465 vmalloc_start = SET_MAP_OFFSET(MAP_START); 465 vmalloc_start = SET_MAP_OFFSET(MAP_START);
466 } 466 }
467 #else 467 #else
468 vmalloc_start = SET_MAP_OFFSET(MAP_START); 468 vmalloc_start = SET_MAP_OFFSET(MAP_START);
469 #endif 469 #endif
470 470
471 } 471 }
472 472
473 int do_check_pgt_cache(int low, int high) 473 int do_check_pgt_cache(int low, int high)
474 { 474 {
475 return 0; 475 return 0;
476 } 476 }
477 477
478 unsigned long *empty_zero_page; 478 unsigned long *empty_zero_page;
479 479
480 void show_mem(void) 480 void show_mem(void)
481 { 481 {
482 int i,free = 0,total = 0,reserved = 0; 482 int i,free = 0,total = 0,reserved = 0;
483 int shared = 0, cached = 0; 483 int shared = 0, cached = 0;
484 484
485 printk(KERN_INFO "Mem-info:\n"); 485 printk(KERN_INFO "Mem-info:\n");
486 show_free_areas(); 486 show_free_areas();
487 printk(KERN_INFO "Free swap: %6ldkB\n", 487 printk(KERN_INFO "Free swap: %6ldkB\n",
488 nr_swap_pages<<(PAGE_SHIFT-10)); 488 nr_swap_pages<<(PAGE_SHIFT-10));
489 #ifndef CONFIG_DISCONTIGMEM 489 #ifndef CONFIG_DISCONTIGMEM
490 i = max_mapnr; 490 i = max_mapnr;
491 while (i-- > 0) { 491 while (i-- > 0) {
492 total++; 492 total++;
493 if (PageReserved(mem_map+i)) 493 if (PageReserved(mem_map+i))
494 reserved++; 494 reserved++;
495 else if (PageSwapCache(mem_map+i)) 495 else if (PageSwapCache(mem_map+i))
496 cached++; 496 cached++;
497 else if (!page_count(&mem_map[i])) 497 else if (!page_count(&mem_map[i]))
498 free++; 498 free++;
499 else 499 else
500 shared += page_count(&mem_map[i]) - 1; 500 shared += page_count(&mem_map[i]) - 1;
501 } 501 }
502 #else 502 #else
503 for (i = 0; i < npmem_ranges; i++) { 503 for (i = 0; i < npmem_ranges; i++) {
504 int j; 504 int j;
505 505
506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { 506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
507 struct page *p; 507 struct page *p;
508 unsigned long flags;
508 509
510 pgdat_resize_lock(NODE_DATA(i), &flags);
509 p = nid_page_nr(i, j) - node_start_pfn(i); 511 p = nid_page_nr(i, j) - node_start_pfn(i);
510 512
511 total++; 513 total++;
512 if (PageReserved(p)) 514 if (PageReserved(p))
513 reserved++; 515 reserved++;
514 else if (PageSwapCache(p)) 516 else if (PageSwapCache(p))
515 cached++; 517 cached++;
516 else if (!page_count(p)) 518 else if (!page_count(p))
517 free++; 519 free++;
518 else 520 else
519 shared += page_count(p) - 1; 521 shared += page_count(p) - 1;
522 pgdat_resize_unlock(NODE_DATA(i), &flags);
520 } 523 }
521 } 524 }
522 #endif 525 #endif
523 printk(KERN_INFO "%d pages of RAM\n", total); 526 printk(KERN_INFO "%d pages of RAM\n", total);
524 printk(KERN_INFO "%d reserved pages\n", reserved); 527 printk(KERN_INFO "%d reserved pages\n", reserved);
525 printk(KERN_INFO "%d pages shared\n", shared); 528 printk(KERN_INFO "%d pages shared\n", shared);
526 printk(KERN_INFO "%d pages swap cached\n", cached); 529 printk(KERN_INFO "%d pages swap cached\n", cached);
527 530
528 531
529 #ifdef CONFIG_DISCONTIGMEM 532 #ifdef CONFIG_DISCONTIGMEM
530 { 533 {
531 struct zonelist *zl; 534 struct zonelist *zl;
532 int i, j, k; 535 int i, j, k;
533 536
534 for (i = 0; i < npmem_ranges; i++) { 537 for (i = 0; i < npmem_ranges; i++) {
535 for (j = 0; j < MAX_NR_ZONES; j++) { 538 for (j = 0; j < MAX_NR_ZONES; j++) {
536 zl = NODE_DATA(i)->node_zonelists + j; 539 zl = NODE_DATA(i)->node_zonelists + j;
537 540
538 printk("Zone list for zone %d on node %d: ", j, i); 541 printk("Zone list for zone %d on node %d: ", j, i);
539 for (k = 0; zl->zones[k] != NULL; k++) 542 for (k = 0; zl->zones[k] != NULL; k++)
540 printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name); 543 printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name);
541 printk("\n"); 544 printk("\n");
542 } 545 }
543 } 546 }
544 } 547 }
545 #endif 548 #endif
546 } 549 }
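The DISCONTIGMEM hunk above is the part of this patch that touches parisc: the per-node page walk in show_mem() now runs under the node's resize lock. A minimal sketch of the pattern, assuming pgdat_resize_lock()/pgdat_resize_unlock() (added elsewhere in this patch, not shown in this excerpt) are irq-saving wrappers around the new per-node lock; the helper function name below is illustrative only:

	/* Sketch only: walk one node's pfns under the resize lock. */
	static void walk_node_pages(int nid)
	{
		pg_data_t *pgdat = NODE_DATA(nid);
		unsigned long pfn, end, flags;

		pgdat_resize_lock(pgdat, &flags);	/* irqs off, node span stable */
		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
		for (pfn = pgdat->node_start_pfn; pfn < end; pfn++) {
			if (!pfn_valid(pfn))
				continue;
			/* pfn_to_page(pfn) may be inspected here */
		}
		pgdat_resize_unlock(pgdat, &flags);
	}

Note the granularity difference between the two architectures in this excerpt: the parisc hunk takes and drops the lock once per pfn inside its inner loop, while the ppc64 hunk below holds it across a whole per-node walk.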
547 550
548 551
549 static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot) 552 static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot)
550 { 553 {
551 pgd_t *pg_dir; 554 pgd_t *pg_dir;
552 pmd_t *pmd; 555 pmd_t *pmd;
553 pte_t *pg_table; 556 pte_t *pg_table;
554 unsigned long end_paddr; 557 unsigned long end_paddr;
555 unsigned long start_pmd; 558 unsigned long start_pmd;
556 unsigned long start_pte; 559 unsigned long start_pte;
557 unsigned long tmp1; 560 unsigned long tmp1;
558 unsigned long tmp2; 561 unsigned long tmp2;
559 unsigned long address; 562 unsigned long address;
560 unsigned long ro_start; 563 unsigned long ro_start;
561 unsigned long ro_end; 564 unsigned long ro_end;
562 unsigned long fv_addr; 565 unsigned long fv_addr;
563 unsigned long gw_addr; 566 unsigned long gw_addr;
564 extern const unsigned long fault_vector_20; 567 extern const unsigned long fault_vector_20;
565 extern void * const linux_gateway_page; 568 extern void * const linux_gateway_page;
566 569
567 ro_start = __pa((unsigned long)&_text); 570 ro_start = __pa((unsigned long)&_text);
568 ro_end = __pa((unsigned long)&data_start); 571 ro_end = __pa((unsigned long)&data_start);
569 fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; 572 fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK;
570 gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; 573 gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK;
571 574
572 end_paddr = start_paddr + size; 575 end_paddr = start_paddr + size;
573 576
574 pg_dir = pgd_offset_k(start_vaddr); 577 pg_dir = pgd_offset_k(start_vaddr);
575 578
576 #if PTRS_PER_PMD == 1 579 #if PTRS_PER_PMD == 1
577 start_pmd = 0; 580 start_pmd = 0;
578 #else 581 #else
579 start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); 582 start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
580 #endif 583 #endif
581 start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); 584 start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
582 585
583 address = start_paddr; 586 address = start_paddr;
584 while (address < end_paddr) { 587 while (address < end_paddr) {
585 #if PTRS_PER_PMD == 1 588 #if PTRS_PER_PMD == 1
586 pmd = (pmd_t *)__pa(pg_dir); 589 pmd = (pmd_t *)__pa(pg_dir);
587 #else 590 #else
588 pmd = (pmd_t *)pgd_address(*pg_dir); 591 pmd = (pmd_t *)pgd_address(*pg_dir);
589 592
590 /* 593 /*
591 * pmd is physical at this point 594 * pmd is physical at this point
592 */ 595 */
593 596
594 if (!pmd) { 597 if (!pmd) {
595 pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER); 598 pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER);
596 pmd = (pmd_t *) __pa(pmd); 599 pmd = (pmd_t *) __pa(pmd);
597 } 600 }
598 601
599 pgd_populate(NULL, pg_dir, __va(pmd)); 602 pgd_populate(NULL, pg_dir, __va(pmd));
600 #endif 603 #endif
601 pg_dir++; 604 pg_dir++;
602 605
603 /* now change pmd to kernel virtual addresses */ 606 /* now change pmd to kernel virtual addresses */
604 607
605 pmd = (pmd_t *)__va(pmd) + start_pmd; 608 pmd = (pmd_t *)__va(pmd) + start_pmd;
606 for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) { 609 for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) {
607 610
608 /* 611 /*
609 * pg_table is physical at this point 612 * pg_table is physical at this point
610 */ 613 */
611 614
612 pg_table = (pte_t *)pmd_address(*pmd); 615 pg_table = (pte_t *)pmd_address(*pmd);
613 if (!pg_table) { 616 if (!pg_table) {
614 pg_table = (pte_t *) 617 pg_table = (pte_t *)
615 alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE); 618 alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE);
616 pg_table = (pte_t *) __pa(pg_table); 619 pg_table = (pte_t *) __pa(pg_table);
617 } 620 }
618 621
619 pmd_populate_kernel(NULL, pmd, __va(pg_table)); 622 pmd_populate_kernel(NULL, pmd, __va(pg_table));
620 623
621 /* now change pg_table to kernel virtual addresses */ 624 /* now change pg_table to kernel virtual addresses */
622 625
623 pg_table = (pte_t *) __va(pg_table) + start_pte; 626 pg_table = (pte_t *) __va(pg_table) + start_pte;
624 for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) { 627 for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) {
625 pte_t pte; 628 pte_t pte;
626 629
627 /* 630 /*
628 * Map the fault vector writable so we can 631 * Map the fault vector writable so we can
629 * write the HPMC checksum. 632 * write the HPMC checksum.
630 */ 633 */
631 if (address >= ro_start && address < ro_end 634 if (address >= ro_start && address < ro_end
632 && address != fv_addr 635 && address != fv_addr
633 && address != gw_addr) 636 && address != gw_addr)
634 pte = __mk_pte(address, PAGE_KERNEL_RO); 637 pte = __mk_pte(address, PAGE_KERNEL_RO);
635 else 638 else
636 pte = __mk_pte(address, pgprot); 639 pte = __mk_pte(address, pgprot);
637 640
638 if (address >= end_paddr) 641 if (address >= end_paddr)
639 pte_val(pte) = 0; 642 pte_val(pte) = 0;
640 643
641 set_pte(pg_table, pte); 644 set_pte(pg_table, pte);
642 645
643 address += PAGE_SIZE; 646 address += PAGE_SIZE;
644 } 647 }
645 start_pte = 0; 648 start_pte = 0;
646 649
647 if (address >= end_paddr) 650 if (address >= end_paddr)
648 break; 651 break;
649 } 652 }
650 start_pmd = 0; 653 start_pmd = 0;
651 } 654 }
652 } 655 }
653 656
654 /* 657 /*
655 * pagetable_init() sets up the page tables 658 * pagetable_init() sets up the page tables
656 * 659 *
657 * Note that gateway_init() places the Linux gateway page at page 0. 660 * Note that gateway_init() places the Linux gateway page at page 0.
658 * Since gateway pages cannot be dereferenced this has the desirable 661 * Since gateway pages cannot be dereferenced this has the desirable
659 * side effect of trapping those pesky NULL-reference errors in the 662 * side effect of trapping those pesky NULL-reference errors in the
660 * kernel. 663 * kernel.
661 */ 664 */
662 static void __init pagetable_init(void) 665 static void __init pagetable_init(void)
663 { 666 {
664 int range; 667 int range;
665 668
666 /* Map each physical memory range to its kernel vaddr */ 669 /* Map each physical memory range to its kernel vaddr */
667 670
668 for (range = 0; range < npmem_ranges; range++) { 671 for (range = 0; range < npmem_ranges; range++) {
669 unsigned long start_paddr; 672 unsigned long start_paddr;
670 unsigned long end_paddr; 673 unsigned long end_paddr;
671 unsigned long size; 674 unsigned long size;
672 675
673 start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT; 676 start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT;
674 end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT); 677 end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT);
675 size = pmem_ranges[range].pages << PAGE_SHIFT; 678 size = pmem_ranges[range].pages << PAGE_SHIFT;
676 679
677 map_pages((unsigned long)__va(start_paddr), start_paddr, 680 map_pages((unsigned long)__va(start_paddr), start_paddr,
678 size, PAGE_KERNEL); 681 size, PAGE_KERNEL);
679 } 682 }
680 683
681 #ifdef CONFIG_BLK_DEV_INITRD 684 #ifdef CONFIG_BLK_DEV_INITRD
682 if (initrd_end && initrd_end > mem_limit) { 685 if (initrd_end && initrd_end > mem_limit) {
683 printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end); 686 printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end);
684 map_pages(initrd_start, __pa(initrd_start), 687 map_pages(initrd_start, __pa(initrd_start),
685 initrd_end - initrd_start, PAGE_KERNEL); 688 initrd_end - initrd_start, PAGE_KERNEL);
686 } 689 }
687 #endif 690 #endif
688 691
689 empty_zero_page = alloc_bootmem_pages(PAGE_SIZE); 692 empty_zero_page = alloc_bootmem_pages(PAGE_SIZE);
690 memset(empty_zero_page, 0, PAGE_SIZE); 693 memset(empty_zero_page, 0, PAGE_SIZE);
691 } 694 }
692 695
693 static void __init gateway_init(void) 696 static void __init gateway_init(void)
694 { 697 {
695 unsigned long linux_gateway_page_addr; 698 unsigned long linux_gateway_page_addr;
696 /* FIXME: This is 'const' in order to trick the compiler 699 /* FIXME: This is 'const' in order to trick the compiler
697 into not treating it as DP-relative data. */ 700 into not treating it as DP-relative data. */
698 extern void * const linux_gateway_page; 701 extern void * const linux_gateway_page;
699 702
700 linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK; 703 linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK;
701 704
702 /* 705 /*
703 * Setup Linux Gateway page. 706 * Setup Linux Gateway page.
704 * 707 *
705 * The Linux gateway page will reside in kernel space (on virtual 708 * The Linux gateway page will reside in kernel space (on virtual
706 * page 0), so it doesn't need to be aliased into user space. 709 * page 0), so it doesn't need to be aliased into user space.
707 */ 710 */
708 711
709 map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page), 712 map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page),
710 PAGE_SIZE, PAGE_GATEWAY); 713 PAGE_SIZE, PAGE_GATEWAY);
711 } 714 }
712 715
713 #ifdef CONFIG_HPUX 716 #ifdef CONFIG_HPUX
714 void 717 void
715 map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm) 718 map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm)
716 { 719 {
717 pgd_t *pg_dir; 720 pgd_t *pg_dir;
718 pmd_t *pmd; 721 pmd_t *pmd;
719 pte_t *pg_table; 722 pte_t *pg_table;
720 unsigned long start_pmd; 723 unsigned long start_pmd;
721 unsigned long start_pte; 724 unsigned long start_pte;
722 unsigned long address; 725 unsigned long address;
723 unsigned long hpux_gw_page_addr; 726 unsigned long hpux_gw_page_addr;
724 /* FIXME: This is 'const' in order to trick the compiler 727 /* FIXME: This is 'const' in order to trick the compiler
725 into not treating it as DP-relative data. */ 728 into not treating it as DP-relative data. */
726 extern void * const hpux_gateway_page; 729 extern void * const hpux_gateway_page;
727 730
728 hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK; 731 hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK;
729 732
730 /* 733 /*
731 * Setup HP-UX Gateway page. 734 * Setup HP-UX Gateway page.
732 * 735 *
733 * The HP-UX gateway page resides in the user address space, 736 * The HP-UX gateway page resides in the user address space,
734 * so it needs to be aliased into each process. 737 * so it needs to be aliased into each process.
735 */ 738 */
736 739
737 pg_dir = pgd_offset(mm,hpux_gw_page_addr); 740 pg_dir = pgd_offset(mm,hpux_gw_page_addr);
738 741
739 #if PTRS_PER_PMD == 1 742 #if PTRS_PER_PMD == 1
740 start_pmd = 0; 743 start_pmd = 0;
741 #else 744 #else
742 start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); 745 start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
743 #endif 746 #endif
744 start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); 747 start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
745 748
746 address = __pa(&hpux_gateway_page); 749 address = __pa(&hpux_gateway_page);
747 #if PTRS_PER_PMD == 1 750 #if PTRS_PER_PMD == 1
748 pmd = (pmd_t *)__pa(pg_dir); 751 pmd = (pmd_t *)__pa(pg_dir);
749 #else 752 #else
750 pmd = (pmd_t *) pgd_address(*pg_dir); 753 pmd = (pmd_t *) pgd_address(*pg_dir);
751 754
752 /* 755 /*
753 * pmd is physical at this point 756 * pmd is physical at this point
754 */ 757 */
755 758
756 if (!pmd) { 759 if (!pmd) {
757 pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL); 760 pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL);
758 pmd = (pmd_t *) __pa(pmd); 761 pmd = (pmd_t *) __pa(pmd);
759 } 762 }
760 763
761 __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd); 764 __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd);
762 #endif 765 #endif
763 /* now change pmd to kernel virtual addresses */ 766 /* now change pmd to kernel virtual addresses */
764 767
765 pmd = (pmd_t *)__va(pmd) + start_pmd; 768 pmd = (pmd_t *)__va(pmd) + start_pmd;
766 769
767 /* 770 /*
768 * pg_table is physical at this point 771 * pg_table is physical at this point
769 */ 772 */
770 773
771 pg_table = (pte_t *) pmd_address(*pmd); 774 pg_table = (pte_t *) pmd_address(*pmd);
772 if (!pg_table) 775 if (!pg_table)
773 pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL)); 776 pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL));
774 777
775 __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table); 778 __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table);
776 779
777 /* now change pg_table to kernel virtual addresses */ 780 /* now change pg_table to kernel virtual addresses */
778 781
779 pg_table = (pte_t *) __va(pg_table) + start_pte; 782 pg_table = (pte_t *) __va(pg_table) + start_pte;
780 set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY)); 783 set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY));
781 } 784 }
782 EXPORT_SYMBOL(map_hpux_gateway_page); 785 EXPORT_SYMBOL(map_hpux_gateway_page);
783 #endif 786 #endif
784 787
785 extern void flush_tlb_all_local(void); 788 extern void flush_tlb_all_local(void);
786 789
787 void __init paging_init(void) 790 void __init paging_init(void)
788 { 791 {
789 int i; 792 int i;
790 793
791 setup_bootmem(); 794 setup_bootmem();
792 pagetable_init(); 795 pagetable_init();
793 gateway_init(); 796 gateway_init();
794 flush_cache_all_local(); /* start with known state */ 797 flush_cache_all_local(); /* start with known state */
795 flush_tlb_all_local(); 798 flush_tlb_all_local();
796 799
797 for (i = 0; i < npmem_ranges; i++) { 800 for (i = 0; i < npmem_ranges; i++) {
798 unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 }; 801 unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };
799 802
800 /* We have an IOMMU, so all memory can go into a single 803 /* We have an IOMMU, so all memory can go into a single
801 ZONE_DMA zone. */ 804 ZONE_DMA zone. */
802 zones_size[ZONE_DMA] = pmem_ranges[i].pages; 805 zones_size[ZONE_DMA] = pmem_ranges[i].pages;
803 806
804 #ifdef CONFIG_DISCONTIGMEM 807 #ifdef CONFIG_DISCONTIGMEM
805 /* Need to initialize the pfnnid_map before we can initialize 808 /* Need to initialize the pfnnid_map before we can initialize
806 the zone */ 809 the zone */
807 { 810 {
808 int j; 811 int j;
809 for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT); 812 for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT);
810 j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT); 813 j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT);
811 j++) { 814 j++) {
812 pfnnid_map[j] = i; 815 pfnnid_map[j] = i;
813 } 816 }
814 } 817 }
815 #endif 818 #endif
816 819
817 free_area_init_node(i, NODE_DATA(i), zones_size, 820 free_area_init_node(i, NODE_DATA(i), zones_size,
818 pmem_ranges[i].start_pfn, NULL); 821 pmem_ranges[i].start_pfn, NULL);
819 } 822 }
820 } 823 }
821 824
822 #ifdef CONFIG_PA20 825 #ifdef CONFIG_PA20
823 826
824 /* 827 /*
825 * Currently, all PA20 chips have 18 bit protection id's, which is the 828 * Currently, all PA20 chips have 18 bit protection id's, which is the
826 * limiting factor (space ids are 32 bits). 829 * limiting factor (space ids are 32 bits).
827 */ 830 */
828 831
829 #define NR_SPACE_IDS 262144 832 #define NR_SPACE_IDS 262144
830 833
831 #else 834 #else
832 835
833 /* 836 /*
834 * Currently we have a one-to-one relationship between space id's and 837 * Currently we have a one-to-one relationship between space id's and
835 * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only 838 * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only
836 * support 15 bit protection id's, so that is the limiting factor. 839 * support 15 bit protection id's, so that is the limiting factor.
837 * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's 840 * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's
838 * probably not worth the effort for a special case here. 841 * probably not worth the effort for a special case here.
839 */ 842 */
840 843
841 #define NR_SPACE_IDS 32768 844 #define NR_SPACE_IDS 32768
842 845
843 #endif /* !CONFIG_PA20 */ 846 #endif /* !CONFIG_PA20 */
844 847
845 #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2) 848 #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2)
846 #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long))) 849 #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long)))
847 850
848 static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */ 851 static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */
849 static unsigned long dirty_space_id[SID_ARRAY_SIZE]; 852 static unsigned long dirty_space_id[SID_ARRAY_SIZE];
850 static unsigned long space_id_index; 853 static unsigned long space_id_index;
851 static unsigned long free_space_ids = NR_SPACE_IDS - 1; 854 static unsigned long free_space_ids = NR_SPACE_IDS - 1;
852 static unsigned long dirty_space_ids = 0; 855 static unsigned long dirty_space_ids = 0;
853 856
854 static DEFINE_SPINLOCK(sid_lock); 857 static DEFINE_SPINLOCK(sid_lock);
855 858
856 unsigned long alloc_sid(void) 859 unsigned long alloc_sid(void)
857 { 860 {
858 unsigned long index; 861 unsigned long index;
859 862
860 spin_lock(&sid_lock); 863 spin_lock(&sid_lock);
861 864
862 if (free_space_ids == 0) { 865 if (free_space_ids == 0) {
863 if (dirty_space_ids != 0) { 866 if (dirty_space_ids != 0) {
864 spin_unlock(&sid_lock); 867 spin_unlock(&sid_lock);
865 flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */ 868 flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */
866 spin_lock(&sid_lock); 869 spin_lock(&sid_lock);
867 } 870 }
868 if (free_space_ids == 0) 871 if (free_space_ids == 0)
869 BUG(); 872 BUG();
870 } 873 }
871 874
872 free_space_ids--; 875 free_space_ids--;
873 876
874 index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index); 877 index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index);
875 space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1))); 878 space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1)));
876 space_id_index = index; 879 space_id_index = index;
877 880
878 spin_unlock(&sid_lock); 881 spin_unlock(&sid_lock);
879 882
880 return index << SPACEID_SHIFT; 883 return index << SPACEID_SHIFT;
881 } 884 }
882 885
883 void free_sid(unsigned long spaceid) 886 void free_sid(unsigned long spaceid)
884 { 887 {
885 unsigned long index = spaceid >> SPACEID_SHIFT; 888 unsigned long index = spaceid >> SPACEID_SHIFT;
886 unsigned long *dirty_space_offset; 889 unsigned long *dirty_space_offset;
887 890
888 dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG); 891 dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG);
889 index &= (BITS_PER_LONG - 1); 892 index &= (BITS_PER_LONG - 1);
890 893
891 spin_lock(&sid_lock); 894 spin_lock(&sid_lock);
892 895
893 if (*dirty_space_offset & (1L << index)) 896 if (*dirty_space_offset & (1L << index))
894 BUG(); /* attempt to free space id twice */ 897 BUG(); /* attempt to free space id twice */
895 898
896 *dirty_space_offset |= (1L << index); 899 *dirty_space_offset |= (1L << index);
897 dirty_space_ids++; 900 dirty_space_ids++;
898 901
899 spin_unlock(&sid_lock); 902 spin_unlock(&sid_lock);
900 } 903 }
901 904
902 905
903 #ifdef CONFIG_SMP 906 #ifdef CONFIG_SMP
904 static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array) 907 static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array)
905 { 908 {
906 int i; 909 int i;
907 910
908 /* NOTE: sid_lock must be held upon entry */ 911 /* NOTE: sid_lock must be held upon entry */
909 912
910 *ndirtyptr = dirty_space_ids; 913 *ndirtyptr = dirty_space_ids;
911 if (dirty_space_ids != 0) { 914 if (dirty_space_ids != 0) {
912 for (i = 0; i < SID_ARRAY_SIZE; i++) { 915 for (i = 0; i < SID_ARRAY_SIZE; i++) {
913 dirty_array[i] = dirty_space_id[i]; 916 dirty_array[i] = dirty_space_id[i];
914 dirty_space_id[i] = 0; 917 dirty_space_id[i] = 0;
915 } 918 }
916 dirty_space_ids = 0; 919 dirty_space_ids = 0;
917 } 920 }
918 921
919 return; 922 return;
920 } 923 }
921 924
922 static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array) 925 static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array)
923 { 926 {
924 int i; 927 int i;
925 928
926 /* NOTE: sid_lock must be held upon entry */ 929 /* NOTE: sid_lock must be held upon entry */
927 930
928 if (ndirty != 0) { 931 if (ndirty != 0) {
929 for (i = 0; i < SID_ARRAY_SIZE; i++) { 932 for (i = 0; i < SID_ARRAY_SIZE; i++) {
930 space_id[i] ^= dirty_array[i]; 933 space_id[i] ^= dirty_array[i];
931 } 934 }
932 935
933 free_space_ids += ndirty; 936 free_space_ids += ndirty;
934 space_id_index = 0; 937 space_id_index = 0;
935 } 938 }
936 } 939 }
937 940
938 #else /* CONFIG_SMP */ 941 #else /* CONFIG_SMP */
939 942
940 static void recycle_sids(void) 943 static void recycle_sids(void)
941 { 944 {
942 int i; 945 int i;
943 946
944 /* NOTE: sid_lock must be held upon entry */ 947 /* NOTE: sid_lock must be held upon entry */
945 948
946 if (dirty_space_ids != 0) { 949 if (dirty_space_ids != 0) {
947 for (i = 0; i < SID_ARRAY_SIZE; i++) { 950 for (i = 0; i < SID_ARRAY_SIZE; i++) {
948 space_id[i] ^= dirty_space_id[i]; 951 space_id[i] ^= dirty_space_id[i];
949 dirty_space_id[i] = 0; 952 dirty_space_id[i] = 0;
950 } 953 }
951 954
952 free_space_ids += dirty_space_ids; 955 free_space_ids += dirty_space_ids;
953 dirty_space_ids = 0; 956 dirty_space_ids = 0;
954 space_id_index = 0; 957 space_id_index = 0;
955 } 958 }
956 } 959 }
957 #endif 960 #endif
958 961
959 /* 962 /*
960 * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is 963 * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is
961 * purged, we can safely reuse the space ids that were released but 964 * purged, we can safely reuse the space ids that were released but
962 * not flushed from the tlb. 965 * not flushed from the tlb.
963 */ 966 */
964 967
965 #ifdef CONFIG_SMP 968 #ifdef CONFIG_SMP
966 969
967 static unsigned long recycle_ndirty; 970 static unsigned long recycle_ndirty;
968 static unsigned long recycle_dirty_array[SID_ARRAY_SIZE]; 971 static unsigned long recycle_dirty_array[SID_ARRAY_SIZE];
969 static unsigned int recycle_inuse = 0; 972 static unsigned int recycle_inuse = 0;
970 973
971 void flush_tlb_all(void) 974 void flush_tlb_all(void)
972 { 975 {
973 int do_recycle; 976 int do_recycle;
974 977
975 do_recycle = 0; 978 do_recycle = 0;
976 spin_lock(&sid_lock); 979 spin_lock(&sid_lock);
977 if (dirty_space_ids > RECYCLE_THRESHOLD) { 980 if (dirty_space_ids > RECYCLE_THRESHOLD) {
978 if (recycle_inuse) { 981 if (recycle_inuse) {
979 BUG(); /* FIXME: Use a semaphore/wait queue here */ 982 BUG(); /* FIXME: Use a semaphore/wait queue here */
980 } 983 }
981 get_dirty_sids(&recycle_ndirty,recycle_dirty_array); 984 get_dirty_sids(&recycle_ndirty,recycle_dirty_array);
982 recycle_inuse++; 985 recycle_inuse++;
983 do_recycle++; 986 do_recycle++;
984 } 987 }
985 spin_unlock(&sid_lock); 988 spin_unlock(&sid_lock);
986 on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1); 989 on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1);
987 if (do_recycle) { 990 if (do_recycle) {
988 spin_lock(&sid_lock); 991 spin_lock(&sid_lock);
989 recycle_sids(recycle_ndirty,recycle_dirty_array); 992 recycle_sids(recycle_ndirty,recycle_dirty_array);
990 recycle_inuse = 0; 993 recycle_inuse = 0;
991 spin_unlock(&sid_lock); 994 spin_unlock(&sid_lock);
992 } 995 }
993 } 996 }
994 #else 997 #else
995 void flush_tlb_all(void) 998 void flush_tlb_all(void)
996 { 999 {
997 spin_lock(&sid_lock); 1000 spin_lock(&sid_lock);
998 flush_tlb_all_local(); 1001 flush_tlb_all_local();
999 recycle_sids(); 1002 recycle_sids();
1000 spin_unlock(&sid_lock); 1003 spin_unlock(&sid_lock);
1001 } 1004 }
1002 #endif 1005 #endif
1003 1006
1004 #ifdef CONFIG_BLK_DEV_INITRD 1007 #ifdef CONFIG_BLK_DEV_INITRD
1005 void free_initrd_mem(unsigned long start, unsigned long end) 1008 void free_initrd_mem(unsigned long start, unsigned long end)
1006 { 1009 {
1007 #if 0 1010 #if 0
1008 if (start < end) 1011 if (start < end)
1009 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 1012 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
1010 for (; start < end; start += PAGE_SIZE) { 1013 for (; start < end; start += PAGE_SIZE) {
1011 ClearPageReserved(virt_to_page(start)); 1014 ClearPageReserved(virt_to_page(start));
1012 set_page_count(virt_to_page(start), 1); 1015 set_page_count(virt_to_page(start), 1);
1013 free_page(start); 1016 free_page(start);
1014 num_physpages++; 1017 num_physpages++;
1015 totalram_pages++; 1018 totalram_pages++;
1016 } 1019 }
1017 #endif 1020 #endif
1018 } 1021 }
1019 #endif 1022 #endif
1020 1023
arch/ppc64/mm/init.c
1 /* 1 /*
2 * PowerPC version 2 * PowerPC version
3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
4 * 4 *
5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) 5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
6 * and Cort Dougan (PReP) (cort@cs.nmt.edu) 6 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
7 * Copyright (C) 1996 Paul Mackerras 7 * Copyright (C) 1996 Paul Mackerras
8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). 8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
9 * 9 *
10 * Derived from "arch/i386/mm/init.c" 10 * Derived from "arch/i386/mm/init.c"
11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
12 * 12 *
13 * Dave Engebretsen <engebret@us.ibm.com> 13 * Dave Engebretsen <engebret@us.ibm.com>
14 * Rework for PPC64 port. 14 * Rework for PPC64 port.
15 * 15 *
16 * This program is free software; you can redistribute it and/or 16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License 17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version. 19 * 2 of the License, or (at your option) any later version.
20 * 20 *
21 */ 21 */
22 22
23 #include <linux/config.h> 23 #include <linux/config.h>
24 #include <linux/signal.h> 24 #include <linux/signal.h>
25 #include <linux/sched.h> 25 #include <linux/sched.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/errno.h> 27 #include <linux/errno.h>
28 #include <linux/string.h> 28 #include <linux/string.h>
29 #include <linux/types.h> 29 #include <linux/types.h>
30 #include <linux/mman.h> 30 #include <linux/mman.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/swap.h> 32 #include <linux/swap.h>
33 #include <linux/stddef.h> 33 #include <linux/stddef.h>
34 #include <linux/vmalloc.h> 34 #include <linux/vmalloc.h>
35 #include <linux/init.h> 35 #include <linux/init.h>
36 #include <linux/delay.h> 36 #include <linux/delay.h>
37 #include <linux/bootmem.h> 37 #include <linux/bootmem.h>
38 #include <linux/highmem.h> 38 #include <linux/highmem.h>
39 #include <linux/idr.h> 39 #include <linux/idr.h>
40 #include <linux/nodemask.h> 40 #include <linux/nodemask.h>
41 #include <linux/module.h> 41 #include <linux/module.h>
42 42
43 #include <asm/pgalloc.h> 43 #include <asm/pgalloc.h>
44 #include <asm/page.h> 44 #include <asm/page.h>
45 #include <asm/prom.h> 45 #include <asm/prom.h>
46 #include <asm/lmb.h> 46 #include <asm/lmb.h>
47 #include <asm/rtas.h> 47 #include <asm/rtas.h>
48 #include <asm/io.h> 48 #include <asm/io.h>
49 #include <asm/mmu_context.h> 49 #include <asm/mmu_context.h>
50 #include <asm/pgtable.h> 50 #include <asm/pgtable.h>
51 #include <asm/mmu.h> 51 #include <asm/mmu.h>
52 #include <asm/uaccess.h> 52 #include <asm/uaccess.h>
53 #include <asm/smp.h> 53 #include <asm/smp.h>
54 #include <asm/machdep.h> 54 #include <asm/machdep.h>
55 #include <asm/tlb.h> 55 #include <asm/tlb.h>
56 #include <asm/eeh.h> 56 #include <asm/eeh.h>
57 #include <asm/processor.h> 57 #include <asm/processor.h>
58 #include <asm/mmzone.h> 58 #include <asm/mmzone.h>
59 #include <asm/cputable.h> 59 #include <asm/cputable.h>
60 #include <asm/ppcdebug.h> 60 #include <asm/ppcdebug.h>
61 #include <asm/sections.h> 61 #include <asm/sections.h>
62 #include <asm/system.h> 62 #include <asm/system.h>
63 #include <asm/iommu.h> 63 #include <asm/iommu.h>
64 #include <asm/abs_addr.h> 64 #include <asm/abs_addr.h>
65 #include <asm/vdso.h> 65 #include <asm/vdso.h>
66 #include <asm/imalloc.h> 66 #include <asm/imalloc.h>
67 67
68 #if PGTABLE_RANGE > USER_VSID_RANGE 68 #if PGTABLE_RANGE > USER_VSID_RANGE
69 #warning Limited user VSID range means pagetable space is wasted 69 #warning Limited user VSID range means pagetable space is wasted
70 #endif 70 #endif
71 71
72 #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) 72 #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
73 #warning TASK_SIZE is smaller than it needs to be. 73 #warning TASK_SIZE is smaller than it needs to be.
74 #endif 74 #endif
75 75
76 int mem_init_done; 76 int mem_init_done;
77 unsigned long ioremap_bot = IMALLOC_BASE; 77 unsigned long ioremap_bot = IMALLOC_BASE;
78 static unsigned long phbs_io_bot = PHBS_IO_BASE; 78 static unsigned long phbs_io_bot = PHBS_IO_BASE;
79 79
80 extern pgd_t swapper_pg_dir[]; 80 extern pgd_t swapper_pg_dir[];
81 extern struct task_struct *current_set[NR_CPUS]; 81 extern struct task_struct *current_set[NR_CPUS];
82 82
83 unsigned long klimit = (unsigned long)_end; 83 unsigned long klimit = (unsigned long)_end;
84 84
85 unsigned long _SDR1=0; 85 unsigned long _SDR1=0;
86 unsigned long _ASR=0; 86 unsigned long _ASR=0;
87 87
88 /* max amount of RAM to use */ 88 /* max amount of RAM to use */
89 unsigned long __max_memory; 89 unsigned long __max_memory;
90 90
91 /* info on what we think the IO hole is */ 91 /* info on what we think the IO hole is */
92 unsigned long io_hole_start; 92 unsigned long io_hole_start;
93 unsigned long io_hole_size; 93 unsigned long io_hole_size;
94 94
95 void show_mem(void) 95 void show_mem(void)
96 { 96 {
97 unsigned long total = 0, reserved = 0; 97 unsigned long total = 0, reserved = 0;
98 unsigned long shared = 0, cached = 0; 98 unsigned long shared = 0, cached = 0;
99 struct page *page; 99 struct page *page;
100 pg_data_t *pgdat; 100 pg_data_t *pgdat;
101 unsigned long i; 101 unsigned long i;
102 102
103 printk("Mem-info:\n"); 103 printk("Mem-info:\n");
104 show_free_areas(); 104 show_free_areas();
105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
106 for_each_pgdat(pgdat) { 106 for_each_pgdat(pgdat) {
107 unsigned long flags;
108 pgdat_resize_lock(pgdat, &flags);
107 for (i = 0; i < pgdat->node_spanned_pages; i++) { 109 for (i = 0; i < pgdat->node_spanned_pages; i++) {
108 page = pgdat_page_nr(pgdat, i); 110 page = pgdat_page_nr(pgdat, i);
109 total++; 111 total++;
110 if (PageReserved(page)) 112 if (PageReserved(page))
111 reserved++; 113 reserved++;
112 else if (PageSwapCache(page)) 114 else if (PageSwapCache(page))
113 cached++; 115 cached++;
114 else if (page_count(page)) 116 else if (page_count(page))
115 shared += page_count(page) - 1; 117 shared += page_count(page) - 1;
116 } 118 }
119 pgdat_resize_unlock(pgdat, &flags);
117 } 120 }
118 printk("%ld pages of RAM\n", total); 121 printk("%ld pages of RAM\n", total);
119 printk("%ld reserved pages\n", reserved); 122 printk("%ld reserved pages\n", reserved);
120 printk("%ld pages shared\n", shared); 123 printk("%ld pages shared\n", shared);
121 printk("%ld pages swap cached\n", cached); 124 printk("%ld pages swap cached\n", cached);
122 } 125 }
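Both show_mem() hunks in this excerpt lean on the same helper pair. Their definitions are not shown here; presumably they are thin irq-saving wrappers around the per-node size lock this patch introduces, roughly along these lines (a sketch, not copied from the patch):

	/* Assumed shape of the helpers used above; names outside this
	 * excerpt are inferred, not quoted from the patch. */
	static inline void pgdat_resize_lock(struct pglist_data *pgdat,
					     unsigned long *flags)
	{
		spin_lock_irqsave(&pgdat->node_size_lock, *flags);
	}

	static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
					       unsigned long *flags)
	{
		spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
	}

Holding the lock across the whole for_each_pgdat() body, as ppc64 does here, trades a longer irq-off hold time for a simpler loop.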
123 126
124 #ifdef CONFIG_PPC_ISERIES 127 #ifdef CONFIG_PPC_ISERIES
125 128
126 void __iomem *ioremap(unsigned long addr, unsigned long size) 129 void __iomem *ioremap(unsigned long addr, unsigned long size)
127 { 130 {
128 return (void __iomem *)addr; 131 return (void __iomem *)addr;
129 } 132 }
130 133
131 extern void __iomem *__ioremap(unsigned long addr, unsigned long size, 134 extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
132 unsigned long flags) 135 unsigned long flags)
133 { 136 {
134 return (void __iomem *)addr; 137 return (void __iomem *)addr;
135 } 138 }
136 139
137 void iounmap(volatile void __iomem *addr) 140 void iounmap(volatile void __iomem *addr)
138 { 141 {
139 return; 142 return;
140 } 143 }
141 144
142 #else 145 #else
143 146
144 /* 147 /*
145 * map_io_page currently only called by __ioremap 148 * map_io_page currently only called by __ioremap
146 * map_io_page adds an entry to the ioremap page table 149 * map_io_page adds an entry to the ioremap page table
147 * and adds an entry to the HPT, possibly bolting it 150 * and adds an entry to the HPT, possibly bolting it
148 */ 151 */
149 static int map_io_page(unsigned long ea, unsigned long pa, int flags) 152 static int map_io_page(unsigned long ea, unsigned long pa, int flags)
150 { 153 {
151 pgd_t *pgdp; 154 pgd_t *pgdp;
152 pud_t *pudp; 155 pud_t *pudp;
153 pmd_t *pmdp; 156 pmd_t *pmdp;
154 pte_t *ptep; 157 pte_t *ptep;
155 unsigned long vsid; 158 unsigned long vsid;
156 159
157 if (mem_init_done) { 160 if (mem_init_done) {
158 pgdp = pgd_offset_k(ea); 161 pgdp = pgd_offset_k(ea);
159 pudp = pud_alloc(&init_mm, pgdp, ea); 162 pudp = pud_alloc(&init_mm, pgdp, ea);
160 if (!pudp) 163 if (!pudp)
161 return -ENOMEM; 164 return -ENOMEM;
162 pmdp = pmd_alloc(&init_mm, pudp, ea); 165 pmdp = pmd_alloc(&init_mm, pudp, ea);
163 if (!pmdp) 166 if (!pmdp)
164 return -ENOMEM; 167 return -ENOMEM;
165 ptep = pte_alloc_kernel(pmdp, ea); 168 ptep = pte_alloc_kernel(pmdp, ea);
166 if (!ptep) 169 if (!ptep)
167 return -ENOMEM; 170 return -ENOMEM;
168 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 171 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
169 __pgprot(flags))); 172 __pgprot(flags)));
170 } else { 173 } else {
171 unsigned long va, vpn, hash, hpteg; 174 unsigned long va, vpn, hash, hpteg;
172 175
173 /* 176 /*
174 * If the mm subsystem is not fully up, we cannot create a 177 * If the mm subsystem is not fully up, we cannot create a
175 * linux page table entry for this mapping. Simply bolt an 178 * linux page table entry for this mapping. Simply bolt an
176 * entry in the hardware page table. 179 * entry in the hardware page table.
177 */ 180 */
178 vsid = get_kernel_vsid(ea); 181 vsid = get_kernel_vsid(ea);
179 va = (vsid << 28) | (ea & 0xFFFFFFF); 182 va = (vsid << 28) | (ea & 0xFFFFFFF);
180 vpn = va >> PAGE_SHIFT; 183 vpn = va >> PAGE_SHIFT;
181 184
182 hash = hpt_hash(vpn, 0); 185 hash = hpt_hash(vpn, 0);
183 186
184 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); 187 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
185 188
186 /* Panic if a pte group is full */ 189 /* Panic if a pte group is full */
187 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, 190 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
188 HPTE_V_BOLTED, 191 HPTE_V_BOLTED,
189 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) 192 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
190 == -1) { 193 == -1) {
191 panic("map_io_page: could not insert mapping"); 194 panic("map_io_page: could not insert mapping");
192 } 195 }
193 } 196 }
194 return 0; 197 return 0;
195 } 198 }
196 199
197 200
198 static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, 201 static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
199 unsigned long ea, unsigned long size, 202 unsigned long ea, unsigned long size,
200 unsigned long flags) 203 unsigned long flags)
201 { 204 {
202 unsigned long i; 205 unsigned long i;
203 206
204 if ((flags & _PAGE_PRESENT) == 0) 207 if ((flags & _PAGE_PRESENT) == 0)
205 flags |= pgprot_val(PAGE_KERNEL); 208 flags |= pgprot_val(PAGE_KERNEL);
206 209
207 for (i = 0; i < size; i += PAGE_SIZE) 210 for (i = 0; i < size; i += PAGE_SIZE)
208 if (map_io_page(ea+i, pa+i, flags)) 211 if (map_io_page(ea+i, pa+i, flags))
209 return NULL; 212 return NULL;
210 213
211 return (void __iomem *) (ea + (addr & ~PAGE_MASK)); 214 return (void __iomem *) (ea + (addr & ~PAGE_MASK));
212 } 215 }
213 216
214 217
215 void __iomem * 218 void __iomem *
216 ioremap(unsigned long addr, unsigned long size) 219 ioremap(unsigned long addr, unsigned long size)
217 { 220 {
218 return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); 221 return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
219 } 222 }
220 223
221 void __iomem * __ioremap(unsigned long addr, unsigned long size, 224 void __iomem * __ioremap(unsigned long addr, unsigned long size,
222 unsigned long flags) 225 unsigned long flags)
223 { 226 {
224 unsigned long pa, ea; 227 unsigned long pa, ea;
225 void __iomem *ret; 228 void __iomem *ret;
226 229
227 /* 230 /*
228 * Choose an address to map it to. 231 * Choose an address to map it to.
229 * Once the imalloc system is running, we use it. 232 * Once the imalloc system is running, we use it.
230 * Before that, we map using addresses going 233 * Before that, we map using addresses going
231 * up from ioremap_bot. imalloc will use 234 * up from ioremap_bot. imalloc will use
232 * the addresses from ioremap_bot through 235 * the addresses from ioremap_bot through
233 * IMALLOC_END 236 * IMALLOC_END
234 * 237 *
235 */ 238 */
236 pa = addr & PAGE_MASK; 239 pa = addr & PAGE_MASK;
237 size = PAGE_ALIGN(addr + size) - pa; 240 size = PAGE_ALIGN(addr + size) - pa;
238 241
239 if (size == 0) 242 if (size == 0)
240 return NULL; 243 return NULL;
241 244
242 if (mem_init_done) { 245 if (mem_init_done) {
243 struct vm_struct *area; 246 struct vm_struct *area;
244 area = im_get_free_area(size); 247 area = im_get_free_area(size);
245 if (area == NULL) 248 if (area == NULL)
246 return NULL; 249 return NULL;
247 ea = (unsigned long)(area->addr); 250 ea = (unsigned long)(area->addr);
248 ret = __ioremap_com(addr, pa, ea, size, flags); 251 ret = __ioremap_com(addr, pa, ea, size, flags);
249 if (!ret) 252 if (!ret)
250 im_free(area->addr); 253 im_free(area->addr);
251 } else { 254 } else {
252 ea = ioremap_bot; 255 ea = ioremap_bot;
253 ret = __ioremap_com(addr, pa, ea, size, flags); 256 ret = __ioremap_com(addr, pa, ea, size, flags);
254 if (ret) 257 if (ret)
255 ioremap_bot += size; 258 ioremap_bot += size;
256 } 259 }
257 return ret; 260 return ret;
258 } 261 }
259 262
260 #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) 263 #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
261 264
262 int __ioremap_explicit(unsigned long pa, unsigned long ea, 265 int __ioremap_explicit(unsigned long pa, unsigned long ea,
263 unsigned long size, unsigned long flags) 266 unsigned long size, unsigned long flags)
264 { 267 {
265 struct vm_struct *area; 268 struct vm_struct *area;
266 void __iomem *ret; 269 void __iomem *ret;
267 270
268 /* For now, require page-aligned values for pa, ea, and size */ 271 /* For now, require page-aligned values for pa, ea, and size */
269 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || 272 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
270 !IS_PAGE_ALIGNED(size)) { 273 !IS_PAGE_ALIGNED(size)) {
271 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); 274 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
272 return 1; 275 return 1;
273 } 276 }
274 277
275 if (!mem_init_done) { 278 if (!mem_init_done) {
276 /* Two things to consider in this case: 279 /* Two things to consider in this case:
277 * 1) No records will be kept (imalloc, etc) that the region 280 * 1) No records will be kept (imalloc, etc) that the region
278 * has been remapped 281 * has been remapped
279 * 2) It won't be easy to iounmap() the region later (because 282 * 2) It won't be easy to iounmap() the region later (because
280 * of 1) 283 * of 1)
281 */ 284 */
282 ; 285 ;
283 } else { 286 } else {
284 area = im_get_area(ea, size, 287 area = im_get_area(ea, size,
285 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); 288 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
286 if (area == NULL) { 289 if (area == NULL) {
287 /* Expected when PHB-dlpar is in play */ 290 /* Expected when PHB-dlpar is in play */
288 return 1; 291 return 1;
289 } 292 }
290 if (ea != (unsigned long) area->addr) { 293 if (ea != (unsigned long) area->addr) {
291 printk(KERN_ERR "unexpected addr return from " 294 printk(KERN_ERR "unexpected addr return from "
292 "im_get_area\n"); 295 "im_get_area\n");
293 return 1; 296 return 1;
294 } 297 }
295 } 298 }
296 299
297 ret = __ioremap_com(pa, pa, ea, size, flags); 300 ret = __ioremap_com(pa, pa, ea, size, flags);
298 if (ret == NULL) { 301 if (ret == NULL) {
299 printk(KERN_ERR "ioremap_explicit() allocation failure !\n"); 302 printk(KERN_ERR "ioremap_explicit() allocation failure !\n");
300 return 1; 303 return 1;
301 } 304 }
302 if (ret != (void *) ea) { 305 if (ret != (void *) ea) {
303 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); 306 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
304 return 1; 307 return 1;
305 } 308 }
306 309
307 return 0; 310 return 0;
308 } 311 }
309 312
310 /* 313 /*
311 * Unmap an IO region and remove it from imalloc'd list. 314 * Unmap an IO region and remove it from imalloc'd list.
312 * Access to IO memory should be serialized by driver. 315 * Access to IO memory should be serialized by driver.
313 * This code is modeled after vmalloc code - unmap_vm_area() 316 * This code is modeled after vmalloc code - unmap_vm_area()
314 * 317 *
315 * XXX what about calls before mem_init_done (ie python_countermeasures()) 318 * XXX what about calls before mem_init_done (ie python_countermeasures())
316 */ 319 */
317 void iounmap(volatile void __iomem *token) 320 void iounmap(volatile void __iomem *token)
318 { 321 {
319 void *addr; 322 void *addr;
320 323
321 if (!mem_init_done) 324 if (!mem_init_done)
322 return; 325 return;
323 326
324 addr = (void *) ((unsigned long __force) token & PAGE_MASK); 327 addr = (void *) ((unsigned long __force) token & PAGE_MASK);
325 328
326 im_free(addr); 329 im_free(addr);
327 } 330 }
328 331
329 static int iounmap_subset_regions(unsigned long addr, unsigned long size) 332 static int iounmap_subset_regions(unsigned long addr, unsigned long size)
330 { 333 {
331 struct vm_struct *area; 334 struct vm_struct *area;
332 335
333 /* Check whether subsets of this region exist */ 336 /* Check whether subsets of this region exist */
334 area = im_get_area(addr, size, IM_REGION_SUPERSET); 337 area = im_get_area(addr, size, IM_REGION_SUPERSET);
335 if (area == NULL) 338 if (area == NULL)
336 return 1; 339 return 1;
337 340
338 while (area) { 341 while (area) {
339 iounmap((void __iomem *) area->addr); 342 iounmap((void __iomem *) area->addr);
340 area = im_get_area(addr, size, 343 area = im_get_area(addr, size,
341 IM_REGION_SUPERSET); 344 IM_REGION_SUPERSET);
342 } 345 }
343 346
344 return 0; 347 return 0;
345 } 348 }
346 349
347 int iounmap_explicit(volatile void __iomem *start, unsigned long size) 350 int iounmap_explicit(volatile void __iomem *start, unsigned long size)
348 { 351 {
349 struct vm_struct *area; 352 struct vm_struct *area;
350 unsigned long addr; 353 unsigned long addr;
351 int rc; 354 int rc;
352 355
353 addr = (unsigned long __force) start & PAGE_MASK; 356 addr = (unsigned long __force) start & PAGE_MASK;
354 357
355 /* Verify that the region either exists or is a subset of an existing 358 /* Verify that the region either exists or is a subset of an existing
356 * region. In the latter case, split the parent region to create 359 * region. In the latter case, split the parent region to create
357 * the exact region 360 * the exact region
358 */ 361 */
359 area = im_get_area(addr, size, 362 area = im_get_area(addr, size,
360 IM_REGION_EXISTS | IM_REGION_SUBSET); 363 IM_REGION_EXISTS | IM_REGION_SUBSET);
361 if (area == NULL) { 364 if (area == NULL) {
362 /* Determine whether subset regions exist. If so, unmap */ 365 /* Determine whether subset regions exist. If so, unmap */
363 rc = iounmap_subset_regions(addr, size); 366 rc = iounmap_subset_regions(addr, size);
364 if (rc) { 367 if (rc) {
365 printk(KERN_ERR 368 printk(KERN_ERR
366 "%s() cannot unmap nonexistent range 0x%lx\n", 369 "%s() cannot unmap nonexistent range 0x%lx\n",
367 __FUNCTION__, addr); 370 __FUNCTION__, addr);
368 return 1; 371 return 1;
369 } 372 }
370 } else { 373 } else {
371 iounmap((void __iomem *) area->addr); 374 iounmap((void __iomem *) area->addr);
372 } 375 }
373 /* 376 /*
374 * FIXME! This can't be right: 377 * FIXME! This can't be right:
375 iounmap(area->addr); 378 iounmap(area->addr);
376 * Maybe it should be "iounmap(area);" 379 * Maybe it should be "iounmap(area);"
377 */ 380 */
378 return 0; 381 return 0;
379 } 382 }
380 383
381 #endif 384 #endif
382 385
383 EXPORT_SYMBOL(ioremap); 386 EXPORT_SYMBOL(ioremap);
384 EXPORT_SYMBOL(__ioremap); 387 EXPORT_SYMBOL(__ioremap);
385 EXPORT_SYMBOL(iounmap); 388 EXPORT_SYMBOL(iounmap);
386 389
387 void free_initmem(void) 390 void free_initmem(void)
388 { 391 {
389 unsigned long addr; 392 unsigned long addr;
390 393
391 addr = (unsigned long)__init_begin; 394 addr = (unsigned long)__init_begin;
392 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { 395 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
393 memset((void *)addr, 0xcc, PAGE_SIZE); 396 memset((void *)addr, 0xcc, PAGE_SIZE);
394 ClearPageReserved(virt_to_page(addr)); 397 ClearPageReserved(virt_to_page(addr));
395 set_page_count(virt_to_page(addr), 1); 398 set_page_count(virt_to_page(addr), 1);
396 free_page(addr); 399 free_page(addr);
397 totalram_pages++; 400 totalram_pages++;
398 } 401 }
399 printk ("Freeing unused kernel memory: %luk freed\n", 402 printk ("Freeing unused kernel memory: %luk freed\n",
400 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); 403 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
401 } 404 }
402 405
403 #ifdef CONFIG_BLK_DEV_INITRD 406 #ifdef CONFIG_BLK_DEV_INITRD
404 void free_initrd_mem(unsigned long start, unsigned long end) 407 void free_initrd_mem(unsigned long start, unsigned long end)
405 { 408 {
406 if (start < end) 409 if (start < end)
407 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 410 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
408 for (; start < end; start += PAGE_SIZE) { 411 for (; start < end; start += PAGE_SIZE) {
409 ClearPageReserved(virt_to_page(start)); 412 ClearPageReserved(virt_to_page(start));
410 set_page_count(virt_to_page(start), 1); 413 set_page_count(virt_to_page(start), 1);
411 free_page(start); 414 free_page(start);
412 totalram_pages++; 415 totalram_pages++;
413 } 416 }
414 } 417 }
415 #endif 418 #endif
416 419
417 static DEFINE_SPINLOCK(mmu_context_lock); 420 static DEFINE_SPINLOCK(mmu_context_lock);
418 static DEFINE_IDR(mmu_context_idr); 421 static DEFINE_IDR(mmu_context_idr);
419 422
420 int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 423 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
421 { 424 {
422 int index; 425 int index;
423 int err; 426 int err;
424 427
425 again: 428 again:
426 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) 429 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
427 return -ENOMEM; 430 return -ENOMEM;
428 431
429 spin_lock(&mmu_context_lock); 432 spin_lock(&mmu_context_lock);
430 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); 433 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
431 spin_unlock(&mmu_context_lock); 434 spin_unlock(&mmu_context_lock);
432 435
433 if (err == -EAGAIN) 436 if (err == -EAGAIN)
434 goto again; 437 goto again;
435 else if (err) 438 else if (err)
436 return err; 439 return err;
437 440
438 if (index > MAX_CONTEXT) { 441 if (index > MAX_CONTEXT) {
439 idr_remove(&mmu_context_idr, index); 442 idr_remove(&mmu_context_idr, index);
440 return -ENOMEM; 443 return -ENOMEM;
441 } 444 }
442 445
443 mm->context.id = index; 446 mm->context.id = index;
444 447
445 return 0; 448 return 0;
446 } 449 }
447 450
448 void destroy_context(struct mm_struct *mm) 451 void destroy_context(struct mm_struct *mm)
449 { 452 {
450 spin_lock(&mmu_context_lock); 453 spin_lock(&mmu_context_lock);
451 idr_remove(&mmu_context_idr, mm->context.id); 454 idr_remove(&mmu_context_idr, mm->context.id);
452 spin_unlock(&mmu_context_lock); 455 spin_unlock(&mmu_context_lock);
453 456
454 mm->context.id = NO_CONTEXT; 457 mm->context.id = NO_CONTEXT;
455 } 458 }
456 459
457 /* 460 /*
458 * Do very early mm setup. 461 * Do very early mm setup.
459 */ 462 */
460 void __init mm_init_ppc64(void) 463 void __init mm_init_ppc64(void)
461 { 464 {
462 #ifndef CONFIG_PPC_ISERIES 465 #ifndef CONFIG_PPC_ISERIES
463 unsigned long i; 466 unsigned long i;
464 #endif 467 #endif
465 468
466 ppc64_boot_msg(0x100, "MM Init"); 469 ppc64_boot_msg(0x100, "MM Init");
467 470
468 /* This is the story of the IO hole... please, keep seated, 471 /* This is the story of the IO hole... please, keep seated,
469 * unfortunately, we are out of oxygen masks at the moment. 472 * unfortunately, we are out of oxygen masks at the moment.
470 * So we need some rough way to tell where your big IO hole 473 * So we need some rough way to tell where your big IO hole
471 * is. On pmac, it's between 2G and 4G, on POWER3, it's around 474 * is. On pmac, it's between 2G and 4G, on POWER3, it's around
472 * that area as well, on POWER4 we don't have one, etc... 475 * that area as well, on POWER4 we don't have one, etc...
473 * We need that as a "hint" when sizing the TCE table on POWER3 476 * We need that as a "hint" when sizing the TCE table on POWER3
474 * So far, the simplest way that seem work well enough for us it 477 * So far, the simplest way that seem work well enough for us it
475 * to just assume that the first discontinuity in our physical 478 * to just assume that the first discontinuity in our physical
476 * RAM layout is the IO hole. That may not be correct in the future 479 * RAM layout is the IO hole. That may not be correct in the future
477 * (and isn't on iSeries but then we don't care ;) 480 * (and isn't on iSeries but then we don't care ;)
478 */ 481 */
479 482
480 #ifndef CONFIG_PPC_ISERIES 483 #ifndef CONFIG_PPC_ISERIES
481 for (i = 1; i < lmb.memory.cnt; i++) { 484 for (i = 1; i < lmb.memory.cnt; i++) {
482 unsigned long base, prevbase, prevsize; 485 unsigned long base, prevbase, prevsize;
483 486
484 prevbase = lmb.memory.region[i-1].base; 487 prevbase = lmb.memory.region[i-1].base;
485 prevsize = lmb.memory.region[i-1].size; 488 prevsize = lmb.memory.region[i-1].size;
486 base = lmb.memory.region[i].base; 489 base = lmb.memory.region[i].base;
487 if (base > (prevbase + prevsize)) { 490 if (base > (prevbase + prevsize)) {
488 io_hole_start = prevbase + prevsize; 491 io_hole_start = prevbase + prevsize;
489 io_hole_size = base - (prevbase + prevsize); 492 io_hole_size = base - (prevbase + prevsize);
490 break; 493 break;
491 } 494 }
492 } 495 }
493 #endif /* CONFIG_PPC_ISERIES */ 496 #endif /* CONFIG_PPC_ISERIES */
494 if (io_hole_start) 497 if (io_hole_start)
495 printk("IO Hole assumed to be %lx -> %lx\n", 498 printk("IO Hole assumed to be %lx -> %lx\n",
496 io_hole_start, io_hole_start + io_hole_size - 1); 499 io_hole_start, io_hole_start + io_hole_size - 1);
497 500
498 ppc64_boot_msg(0x100, "MM Init Done"); 501 ppc64_boot_msg(0x100, "MM Init Done");
499 } 502 }
500 503
501 /* 504 /*
502 * This is called by /dev/mem to know if a given address has to 505 * This is called by /dev/mem to know if a given address has to
503 * be mapped non-cacheable or not 506 * be mapped non-cacheable or not
504 */ 507 */
505 int page_is_ram(unsigned long pfn) 508 int page_is_ram(unsigned long pfn)
506 { 509 {
507 int i; 510 int i;
508 unsigned long paddr = (pfn << PAGE_SHIFT); 511 unsigned long paddr = (pfn << PAGE_SHIFT);
509 512
510 for (i=0; i < lmb.memory.cnt; i++) { 513 for (i=0; i < lmb.memory.cnt; i++) {
511 unsigned long base; 514 unsigned long base;
512 515
513 base = lmb.memory.region[i].base; 516 base = lmb.memory.region[i].base;
514 517
515 if ((paddr >= base) && 518 if ((paddr >= base) &&
516 (paddr < (base + lmb.memory.region[i].size))) { 519 (paddr < (base + lmb.memory.region[i].size))) {
517 return 1; 520 return 1;
518 } 521 }
519 } 522 }
520 523
521 return 0; 524 return 0;
522 } 525 }
523 EXPORT_SYMBOL(page_is_ram); 526 EXPORT_SYMBOL(page_is_ram);
524 527
525 /* 528 /*
526 * Initialize the bootmem system and give it all the memory we 529 * Initialize the bootmem system and give it all the memory we
527 * have available. 530 * have available.
528 */ 531 */
529 #ifndef CONFIG_NEED_MULTIPLE_NODES 532 #ifndef CONFIG_NEED_MULTIPLE_NODES
530 void __init do_init_bootmem(void) 533 void __init do_init_bootmem(void)
531 { 534 {
532 unsigned long i; 535 unsigned long i;
533 unsigned long start, bootmap_pages; 536 unsigned long start, bootmap_pages;
534 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; 537 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
535 int boot_mapsize; 538 int boot_mapsize;
536 539
537 /* 540 /*
538 * Find an area to use for the bootmem bitmap. Calculate the size of 541 * Find an area to use for the bootmem bitmap. Calculate the size of
539 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. 542 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
540 * Add 1 additional page in case the address isn't page-aligned. 543 * Add 1 additional page in case the address isn't page-aligned.
541 */ 544 */
542 bootmap_pages = bootmem_bootmap_pages(total_pages); 545 bootmap_pages = bootmem_bootmap_pages(total_pages);
543 546
544 start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 547 start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
545 BUG_ON(!start); 548 BUG_ON(!start);
546 549
547 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); 550 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
548 551
549 max_pfn = max_low_pfn; 552 max_pfn = max_low_pfn;
550 553
551 /* Add all physical memory to the bootmem map, mark each area 554 /* Add all physical memory to the bootmem map, mark each area
552 * present. 555 * present.
553 */ 556 */
554 for (i=0; i < lmb.memory.cnt; i++) 557 for (i=0; i < lmb.memory.cnt; i++)
555 free_bootmem(lmb.memory.region[i].base, 558 free_bootmem(lmb.memory.region[i].base,
556 lmb_size_bytes(&lmb.memory, i)); 559 lmb_size_bytes(&lmb.memory, i));
557 560
558 /* reserve the sections we're already using */ 561 /* reserve the sections we're already using */
559 for (i=0; i < lmb.reserved.cnt; i++) 562 for (i=0; i < lmb.reserved.cnt; i++)
560 reserve_bootmem(lmb.reserved.region[i].base, 563 reserve_bootmem(lmb.reserved.region[i].base,
561 lmb_size_bytes(&lmb.reserved, i)); 564 lmb_size_bytes(&lmb.reserved, i));
562 565
563 for (i=0; i < lmb.memory.cnt; i++) 566 for (i=0; i < lmb.memory.cnt; i++)
564 memory_present(0, lmb_start_pfn(&lmb.memory, i), 567 memory_present(0, lmb_start_pfn(&lmb.memory, i),
565 lmb_end_pfn(&lmb.memory, i)); 568 lmb_end_pfn(&lmb.memory, i));
566 } 569 }
567 570
568 /* 571 /*
569 * paging_init() sets up the page tables - in fact we've already done this. 572 * paging_init() sets up the page tables - in fact we've already done this.
570 */ 573 */
571 void __init paging_init(void) 574 void __init paging_init(void)
572 { 575 {
573 unsigned long zones_size[MAX_NR_ZONES]; 576 unsigned long zones_size[MAX_NR_ZONES];
574 unsigned long zholes_size[MAX_NR_ZONES]; 577 unsigned long zholes_size[MAX_NR_ZONES];
575 unsigned long total_ram = lmb_phys_mem_size(); 578 unsigned long total_ram = lmb_phys_mem_size();
576 unsigned long top_of_ram = lmb_end_of_DRAM(); 579 unsigned long top_of_ram = lmb_end_of_DRAM();
577 580
578 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 581 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
579 top_of_ram, total_ram); 582 top_of_ram, total_ram);
580 printk(KERN_INFO "Memory hole size: %ldMB\n", 583 printk(KERN_INFO "Memory hole size: %ldMB\n",
581 (top_of_ram - total_ram) >> 20); 584 (top_of_ram - total_ram) >> 20);
582 /* 585 /*
583 * All pages are DMA-able so we put them all in the DMA zone. 586 * All pages are DMA-able so we put them all in the DMA zone.
584 */ 587 */
585 memset(zones_size, 0, sizeof(zones_size)); 588 memset(zones_size, 0, sizeof(zones_size));
586 memset(zholes_size, 0, sizeof(zholes_size)); 589 memset(zholes_size, 0, sizeof(zholes_size));
587 590
588 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; 591 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
589 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; 592 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
590 593
591 free_area_init_node(0, NODE_DATA(0), zones_size, 594 free_area_init_node(0, NODE_DATA(0), zones_size,
592 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); 595 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
593 } 596 }
594 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ 597 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */
595 598
596 static struct kcore_list kcore_vmem; 599 static struct kcore_list kcore_vmem;
597 600
598 static int __init setup_kcore(void) 601 static int __init setup_kcore(void)
599 { 602 {
600 int i; 603 int i;
601 604
602 for (i=0; i < lmb.memory.cnt; i++) { 605 for (i=0; i < lmb.memory.cnt; i++) {
603 unsigned long base, size; 606 unsigned long base, size;
604 struct kcore_list *kcore_mem; 607 struct kcore_list *kcore_mem;
605 608
606 base = lmb.memory.region[i].base; 609 base = lmb.memory.region[i].base;
607 size = lmb.memory.region[i].size; 610 size = lmb.memory.region[i].size;
608 611
609 /* GFP_ATOMIC to avoid might_sleep warnings during boot */ 612 /* GFP_ATOMIC to avoid might_sleep warnings during boot */
610 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); 613 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
611 if (!kcore_mem) 614 if (!kcore_mem)
612 panic("mem_init: kmalloc failed\n"); 615 panic("mem_init: kmalloc failed\n");
613 616
614 kclist_add(kcore_mem, __va(base), size); 617 kclist_add(kcore_mem, __va(base), size);
615 } 618 }
616 619
617 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); 620 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
618 621
619 return 0; 622 return 0;
620 } 623 }
621 module_init(setup_kcore); 624 module_init(setup_kcore);
622 625
623 void __init mem_init(void) 626 void __init mem_init(void)
624 { 627 {
625 #ifdef CONFIG_NEED_MULTIPLE_NODES 628 #ifdef CONFIG_NEED_MULTIPLE_NODES
626 int nid; 629 int nid;
627 #endif 630 #endif
628 pg_data_t *pgdat; 631 pg_data_t *pgdat;
629 unsigned long i; 632 unsigned long i;
630 struct page *page; 633 struct page *page;
631 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; 634 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
632 635
633 num_physpages = max_low_pfn; /* RAM is assumed contiguous */ 636 num_physpages = max_low_pfn; /* RAM is assumed contiguous */
634 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 637 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
635 638
636 #ifdef CONFIG_NEED_MULTIPLE_NODES 639 #ifdef CONFIG_NEED_MULTIPLE_NODES
637 for_each_online_node(nid) { 640 for_each_online_node(nid) {
638 if (NODE_DATA(nid)->node_spanned_pages != 0) { 641 if (NODE_DATA(nid)->node_spanned_pages != 0) {
639 printk("freeing bootmem node %x\n", nid); 642 printk("freeing bootmem node %x\n", nid);
640 totalram_pages += 643 totalram_pages +=
641 free_all_bootmem_node(NODE_DATA(nid)); 644 free_all_bootmem_node(NODE_DATA(nid));
642 } 645 }
643 } 646 }
644 #else 647 #else
645 max_mapnr = num_physpages; 648 max_mapnr = num_physpages;
646 totalram_pages += free_all_bootmem(); 649 totalram_pages += free_all_bootmem();
647 #endif 650 #endif
648 651
649 for_each_pgdat(pgdat) { 652 for_each_pgdat(pgdat) {
653 unsigned long flags;
654 pgdat_resize_lock(pgdat, &flags);
650 for (i = 0; i < pgdat->node_spanned_pages; i++) { 655 for (i = 0; i < pgdat->node_spanned_pages; i++) {
651 page = pgdat_page_nr(pgdat, i); 656 page = pgdat_page_nr(pgdat, i);
652 if (PageReserved(page)) 657 if (PageReserved(page))
653 reservedpages++; 658 reservedpages++;
654 } 659 }
660 pgdat_resize_unlock(pgdat, &flags);
655 } 661 }
656 662
657 codesize = (unsigned long)&_etext - (unsigned long)&_stext; 663 codesize = (unsigned long)&_etext - (unsigned long)&_stext;
658 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; 664 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
659 datasize = (unsigned long)&_edata - (unsigned long)&__init_end; 665 datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
660 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; 666 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
661 667
662 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " 668 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
663 "%luk reserved, %luk data, %luk bss, %luk init)\n", 669 "%luk reserved, %luk data, %luk bss, %luk init)\n",
664 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), 670 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
665 num_physpages << (PAGE_SHIFT-10), 671 num_physpages << (PAGE_SHIFT-10),
666 codesize >> 10, 672 codesize >> 10,
667 reservedpages << (PAGE_SHIFT-10), 673 reservedpages << (PAGE_SHIFT-10),
668 datasize >> 10, 674 datasize >> 10,
669 bsssize >> 10, 675 bsssize >> 10,
670 initsize >> 10); 676 initsize >> 10);
671 677
672 mem_init_done = 1; 678 mem_init_done = 1;
673 679
674 /* Initialize the vDSO */ 680 /* Initialize the vDSO */
675 vdso_init(); 681 vdso_init();
676 } 682 }
677 683
678 /* 684 /*
679 * This is called when a page has been modified by the kernel. 685 * This is called when a page has been modified by the kernel.
680 * It just marks the page as not i-cache clean. We do the i-cache 686 * It just marks the page as not i-cache clean. We do the i-cache
681 * flush later when the page is given to a user process, if necessary. 687 * flush later when the page is given to a user process, if necessary.
682 */ 688 */
683 void flush_dcache_page(struct page *page) 689 void flush_dcache_page(struct page *page)
684 { 690 {
685 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 691 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
686 return; 692 return;
687 /* avoid an atomic op if possible */ 693 /* avoid an atomic op if possible */
688 if (test_bit(PG_arch_1, &page->flags)) 694 if (test_bit(PG_arch_1, &page->flags))
689 clear_bit(PG_arch_1, &page->flags); 695 clear_bit(PG_arch_1, &page->flags);
690 } 696 }
691 EXPORT_SYMBOL(flush_dcache_page); 697 EXPORT_SYMBOL(flush_dcache_page);
692 698
693 void clear_user_page(void *page, unsigned long vaddr, struct page *pg) 699 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
694 { 700 {
695 clear_page(page); 701 clear_page(page);
696 702
697 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 703 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
698 return; 704 return;
699 /* 705 /*
700 * We shouldnt have to do this, but some versions of glibc 706 * We shouldnt have to do this, but some versions of glibc
701 * require it (ld.so assumes zero filled pages are icache clean) 707 * require it (ld.so assumes zero filled pages are icache clean)
702 * - Anton 708 * - Anton
703 */ 709 */
704 710
705 /* avoid an atomic op if possible */ 711 /* avoid an atomic op if possible */
706 if (test_bit(PG_arch_1, &pg->flags)) 712 if (test_bit(PG_arch_1, &pg->flags))
707 clear_bit(PG_arch_1, &pg->flags); 713 clear_bit(PG_arch_1, &pg->flags);
708 } 714 }
709 EXPORT_SYMBOL(clear_user_page); 715 EXPORT_SYMBOL(clear_user_page);
710 716
711 void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, 717 void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
712 struct page *pg) 718 struct page *pg)
713 { 719 {
714 copy_page(vto, vfrom); 720 copy_page(vto, vfrom);
715 721
716 /* 722 /*
717 * We should be able to use the following optimisation, however 723 * We should be able to use the following optimisation, however
718 * there are two problems. 724 * there are two problems.
719 * Firstly a bug in some versions of binutils meant PLT sections 725 * Firstly a bug in some versions of binutils meant PLT sections
720 * were not marked executable. 726 * were not marked executable.
721 * Secondly the first word in the GOT section is blrl, used 727 * Secondly the first word in the GOT section is blrl, used
722 * to establish the GOT address. Until recently the GOT was 728 * to establish the GOT address. Until recently the GOT was
723 * not marked executable. 729 * not marked executable.
724 * - Anton 730 * - Anton
725 */ 731 */
726 #if 0 732 #if 0
727 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) 733 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
728 return; 734 return;
729 #endif 735 #endif
730 736
731 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 737 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
732 return; 738 return;
733 739
734 /* avoid an atomic op if possible */ 740 /* avoid an atomic op if possible */
735 if (test_bit(PG_arch_1, &pg->flags)) 741 if (test_bit(PG_arch_1, &pg->flags))
736 clear_bit(PG_arch_1, &pg->flags); 742 clear_bit(PG_arch_1, &pg->flags);
737 } 743 }
738 744
739 void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, 745 void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
740 unsigned long addr, int len) 746 unsigned long addr, int len)
741 { 747 {
742 unsigned long maddr; 748 unsigned long maddr;
743 749
744 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); 750 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
745 flush_icache_range(maddr, maddr + len); 751 flush_icache_range(maddr, maddr + len);
746 } 752 }
747 EXPORT_SYMBOL(flush_icache_user_range); 753 EXPORT_SYMBOL(flush_icache_user_range);
748 754
749 /* 755 /*
750 * This is called at the end of handling a user page fault, when the 756 * This is called at the end of handling a user page fault, when the
751 * fault has been handled by updating a PTE in the linux page tables. 757 * fault has been handled by updating a PTE in the linux page tables.
752 * We use it to preload an HPTE into the hash table corresponding to 758 * We use it to preload an HPTE into the hash table corresponding to
753 * the updated linux PTE. 759 * the updated linux PTE.
754 * 760 *
755 * This must always be called with the mm->page_table_lock held 761 * This must always be called with the mm->page_table_lock held
756 */ 762 */
757 void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, 763 void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
758 pte_t pte) 764 pte_t pte)
759 { 765 {
760 unsigned long vsid; 766 unsigned long vsid;
761 void *pgdir; 767 void *pgdir;
762 pte_t *ptep; 768 pte_t *ptep;
763 int local = 0; 769 int local = 0;
764 cpumask_t tmp; 770 cpumask_t tmp;
765 unsigned long flags; 771 unsigned long flags;
766 772
767 /* handle i-cache coherency */ 773 /* handle i-cache coherency */
768 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && 774 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
769 !cpu_has_feature(CPU_FTR_NOEXECUTE)) { 775 !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
770 unsigned long pfn = pte_pfn(pte); 776 unsigned long pfn = pte_pfn(pte);
771 if (pfn_valid(pfn)) { 777 if (pfn_valid(pfn)) {
772 struct page *page = pfn_to_page(pfn); 778 struct page *page = pfn_to_page(pfn);
773 if (!PageReserved(page) 779 if (!PageReserved(page)
774 && !test_bit(PG_arch_1, &page->flags)) { 780 && !test_bit(PG_arch_1, &page->flags)) {
775 __flush_dcache_icache(page_address(page)); 781 __flush_dcache_icache(page_address(page));
776 set_bit(PG_arch_1, &page->flags); 782 set_bit(PG_arch_1, &page->flags);
777 } 783 }
778 } 784 }
779 } 785 }
780 786
781 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 787 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
782 if (!pte_young(pte)) 788 if (!pte_young(pte))
783 return; 789 return;
784 790
785 pgdir = vma->vm_mm->pgd; 791 pgdir = vma->vm_mm->pgd;
786 if (pgdir == NULL) 792 if (pgdir == NULL)
787 return; 793 return;
788 794
789 ptep = find_linux_pte(pgdir, ea); 795 ptep = find_linux_pte(pgdir, ea);
790 if (!ptep) 796 if (!ptep)
791 return; 797 return;
792 798
793 vsid = get_vsid(vma->vm_mm->context.id, ea); 799 vsid = get_vsid(vma->vm_mm->context.id, ea);
794 800
795 local_irq_save(flags); 801 local_irq_save(flags);
796 tmp = cpumask_of_cpu(smp_processor_id()); 802 tmp = cpumask_of_cpu(smp_processor_id());
797 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) 803 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
798 local = 1; 804 local = 1;
799 805
800 __hash_page(ea, 0, vsid, ptep, 0x300, local); 806 __hash_page(ea, 0, vsid, ptep, 0x300, local);
801 local_irq_restore(flags); 807 local_irq_restore(flags);
802 } 808 }
803 809
804 void __iomem * reserve_phb_iospace(unsigned long size) 810 void __iomem * reserve_phb_iospace(unsigned long size)
805 { 811 {
806 void __iomem *virt_addr; 812 void __iomem *virt_addr;
807 813
808 if (phbs_io_bot >= IMALLOC_BASE) 814 if (phbs_io_bot >= IMALLOC_BASE)
809 panic("reserve_phb_iospace(): phb io space overflow\n"); 815 panic("reserve_phb_iospace(): phb io space overflow\n");
810 816
811 virt_addr = (void __iomem *) phbs_io_bot; 817 virt_addr = (void __iomem *) phbs_io_bot;
812 phbs_io_bot += size; 818 phbs_io_bot += size;
813 819
814 return virt_addr; 820 return virt_addr;
815 } 821 }
816 822
817 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) 823 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
818 { 824 {
819 memset(addr, 0, kmem_cache_size(cache)); 825 memset(addr, 0, kmem_cache_size(cache));
820 } 826 }
821 827
822 static const int pgtable_cache_size[2] = { 828 static const int pgtable_cache_size[2] = {
823 PTE_TABLE_SIZE, PMD_TABLE_SIZE 829 PTE_TABLE_SIZE, PMD_TABLE_SIZE
824 }; 830 };
825 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 831 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
826 "pgd_pte_cache", "pud_pmd_cache", 832 "pgd_pte_cache", "pud_pmd_cache",
827 }; 833 };
828 834
829 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 835 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
830 836
831 void pgtable_cache_init(void) 837 void pgtable_cache_init(void)
832 { 838 {
833 int i; 839 int i;
834 840
835 BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); 841 BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
836 BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); 842 BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
837 BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); 843 BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
838 BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); 844 BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
839 845
840 for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { 846 for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
841 int size = pgtable_cache_size[i]; 847 int size = pgtable_cache_size[i];
842 const char *name = pgtable_cache_name[i]; 848 const char *name = pgtable_cache_name[i];
843 849
844 pgtable_cache[i] = kmem_cache_create(name, 850 pgtable_cache[i] = kmem_cache_create(name,
845 size, size, 851 size, size,
846 SLAB_HWCACHE_ALIGN 852 SLAB_HWCACHE_ALIGN
847 | SLAB_MUST_HWCACHE_ALIGN, 853 | SLAB_MUST_HWCACHE_ALIGN,
848 zero_ctor, 854 zero_ctor,
849 NULL); 855 NULL);
850 if (! pgtable_cache[i]) 856 if (! pgtable_cache[i])
851 panic("pgtable_cache_init(): could not create %s!\n", 857 panic("pgtable_cache_init(): could not create %s!\n",
852 name); 858 name);
853 } 859 }
854 } 860 }
855 861
856 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, 862 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
857 unsigned long size, pgprot_t vma_prot) 863 unsigned long size, pgprot_t vma_prot)
858 { 864 {
859 if (ppc_md.phys_mem_access_prot) 865 if (ppc_md.phys_mem_access_prot)
860 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); 866 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
861 867
862 if (!page_is_ram(addr >> PAGE_SHIFT)) 868 if (!page_is_ram(addr >> PAGE_SHIFT))
863 vma_prot = __pgprot(pgprot_val(vma_prot) 869 vma_prot = __pgprot(pgprot_val(vma_prot)
864 | _PAGE_GUARDED | _PAGE_NO_CACHE); 870 | _PAGE_GUARDED | _PAGE_NO_CACHE);
865 return vma_prot; 871 return vma_prot;
866 } 872 }
867 EXPORT_SYMBOL(phys_mem_access_prot); 873 EXPORT_SYMBOL(phys_mem_access_prot);
868 874
include/linux/memory_hotplug.h
File was created 1 #ifndef __LINUX_MEMORY_HOTPLUG_H
2 #define __LINUX_MEMORY_HOTPLUG_H
3
4 #include <linux/mmzone.h>
5 #include <linux/spinlock.h>
6
7 #ifdef CONFIG_MEMORY_HOTPLUG
8 /*
9 * pgdat resizing functions
10 */
11 static inline
12 void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
13 {
14 spin_lock_irqsave(&pgdat->node_size_lock, *flags);
15 }
16 static inline
17 void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
18 {
19 spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
20 }
21 static inline
22 void pgdat_resize_init(struct pglist_data *pgdat)
23 {
24 spin_lock_init(&pgdat->node_size_lock);
25 }
26 #else /* ! CONFIG_MEMORY_HOTPLUG */
27 /*
28 * Stub functions for when hotplug is off
29 */
30 static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
31 static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
32 static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
33 #endif
34 #endif /* __LINUX_MEMORY_HOTPLUG_H */
35
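For context only: a minimal, hedged sketch (not part of this commit) of how a caller would be expected to use the pgdat resizing helpers above around a pfn_valid()-based walk, mirroring the mem_init() hunk earlier in this diff. The function name and the assumption that the caller already has a pgdat pointer are illustrative.

/* Sketch: count reserved pages in one node while its size cannot change.
 * Assumes <linux/mm.h>, <linux/mmzone.h> and <linux/memory_hotplug.h>. */
static unsigned long count_node_reserved(struct pglist_data *pgdat)
{
	unsigned long i, flags, reserved = 0;

	pgdat_resize_lock(pgdat, &flags);	/* node_*_pages and pfn_valid() stay stable */
	for (i = 0; i < pgdat->node_spanned_pages; i++) {
		unsigned long pfn = pgdat->node_start_pfn + i;

		if (pfn_valid(pfn) && PageReserved(pfn_to_page(pfn)))
			reserved++;
	}
	pgdat_resize_unlock(pgdat, &flags);

	return reserved;
}

When CONFIG_MEMORY_HOTPLUG is off, the stub versions above reduce this to the plain loop, so the pattern costs nothing on non-hotplug configurations.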
include/linux/mmzone.h
1 #ifndef _LINUX_MMZONE_H 1 #ifndef _LINUX_MMZONE_H
2 #define _LINUX_MMZONE_H 2 #define _LINUX_MMZONE_H
3 3
4 #ifdef __KERNEL__ 4 #ifdef __KERNEL__
5 #ifndef __ASSEMBLY__ 5 #ifndef __ASSEMBLY__
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/wait.h> 10 #include <linux/wait.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/threads.h> 12 #include <linux/threads.h>
13 #include <linux/numa.h> 13 #include <linux/numa.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 /* Free memory management - zoned buddy allocator. */ 17 /* Free memory management - zoned buddy allocator. */
18 #ifndef CONFIG_FORCE_MAX_ZONEORDER 18 #ifndef CONFIG_FORCE_MAX_ZONEORDER
19 #define MAX_ORDER 11 19 #define MAX_ORDER 11
20 #else 20 #else
21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER 21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
22 #endif 22 #endif
23 23
24 struct free_area { 24 struct free_area {
25 struct list_head free_list; 25 struct list_head free_list;
26 unsigned long nr_free; 26 unsigned long nr_free;
27 }; 27 };
28 28
29 struct pglist_data; 29 struct pglist_data;
30 30
31 /* 31 /*
32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. 32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
33 * So add a wild amount of padding here to ensure that they fall into separate 33 * So add a wild amount of padding here to ensure that they fall into separate
34 * cachelines. There are very few zone structures in the machine, so space 34 * cachelines. There are very few zone structures in the machine, so space
35 * consumption is not a concern here. 35 * consumption is not a concern here.
36 */ 36 */
37 #if defined(CONFIG_SMP) 37 #if defined(CONFIG_SMP)
38 struct zone_padding { 38 struct zone_padding {
39 char x[0]; 39 char x[0];
40 } ____cacheline_maxaligned_in_smp; 40 } ____cacheline_maxaligned_in_smp;
41 #define ZONE_PADDING(name) struct zone_padding name; 41 #define ZONE_PADDING(name) struct zone_padding name;
42 #else 42 #else
43 #define ZONE_PADDING(name) 43 #define ZONE_PADDING(name)
44 #endif 44 #endif
45 45
46 struct per_cpu_pages { 46 struct per_cpu_pages {
47 int count; /* number of pages in the list */ 47 int count; /* number of pages in the list */
48 int low; /* low watermark, refill needed */ 48 int low; /* low watermark, refill needed */
49 int high; /* high watermark, emptying needed */ 49 int high; /* high watermark, emptying needed */
50 int batch; /* chunk size for buddy add/remove */ 50 int batch; /* chunk size for buddy add/remove */
51 struct list_head list; /* the list of pages */ 51 struct list_head list; /* the list of pages */
52 }; 52 };
53 53
54 struct per_cpu_pageset { 54 struct per_cpu_pageset {
55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
56 #ifdef CONFIG_NUMA 56 #ifdef CONFIG_NUMA
57 unsigned long numa_hit; /* allocated in intended node */ 57 unsigned long numa_hit; /* allocated in intended node */
58 unsigned long numa_miss; /* allocated in non intended node */ 58 unsigned long numa_miss; /* allocated in non intended node */
59 unsigned long numa_foreign; /* was intended here, hit elsewhere */ 59 unsigned long numa_foreign; /* was intended here, hit elsewhere */
60 unsigned long interleave_hit; /* interleaver prefered this zone */ 60 unsigned long interleave_hit; /* interleaver prefered this zone */
61 unsigned long local_node; /* allocation from local node */ 61 unsigned long local_node; /* allocation from local node */
62 unsigned long other_node; /* allocation from other node */ 62 unsigned long other_node; /* allocation from other node */
63 #endif 63 #endif
64 } ____cacheline_aligned_in_smp; 64 } ____cacheline_aligned_in_smp;
65 65
66 #ifdef CONFIG_NUMA 66 #ifdef CONFIG_NUMA
67 #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) 67 #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
68 #else 68 #else
69 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) 69 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
70 #endif 70 #endif
71 71
72 #define ZONE_DMA 0 72 #define ZONE_DMA 0
73 #define ZONE_NORMAL 1 73 #define ZONE_NORMAL 1
74 #define ZONE_HIGHMEM 2 74 #define ZONE_HIGHMEM 2
75 75
76 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ 76 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
77 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ 77 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
78 78
79 79
80 /* 80 /*
81 * When a memory allocation must conform to specific limitations (such 81 * When a memory allocation must conform to specific limitations (such
82 * as being suitable for DMA) the caller will pass in hints to the 82 * as being suitable for DMA) the caller will pass in hints to the
83 * allocator in the gfp_mask, in the zone modifier bits. These bits 83 * allocator in the gfp_mask, in the zone modifier bits. These bits
84 * are used to select a priority ordered list of memory zones which 84 * are used to select a priority ordered list of memory zones which
85 * match the requested limits. GFP_ZONEMASK defines which bits within 85 * match the requested limits. GFP_ZONEMASK defines which bits within
86 * the gfp_mask should be considered as zone modifiers. Each valid 86 * the gfp_mask should be considered as zone modifiers. Each valid
87 * combination of the zone modifier bits has a corresponding list 87 * combination of the zone modifier bits has a corresponding list
88 * of zones (in node_zonelists). Thus for two zone modifiers there 88 * of zones (in node_zonelists). Thus for two zone modifiers there
89 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will 89 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
90 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible 90 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
91 * combinations of zone modifiers in "zone modifier space". 91 * combinations of zone modifiers in "zone modifier space".
92 */ 92 */
93 #define GFP_ZONEMASK 0x03 93 #define GFP_ZONEMASK 0x03
94 /* 94 /*
95 * As an optimisation any zone modifier bits which are only valid when 95 * As an optimisation any zone modifier bits which are only valid when
96 * no other zone modifier bits are set (loners) should be placed in 96 * no other zone modifier bits are set (loners) should be placed in
97 * the highest order bits of this field. This allows us to reduce the 97 * the highest order bits of this field. This allows us to reduce the
98 * extent of the zonelists thus saving space. For example in the case 98 * extent of the zonelists thus saving space. For example in the case
99 * of three zone modifier bits, we could require up to eight zonelists. 99 * of three zone modifier bits, we could require up to eight zonelists.
100 * If the left most zone modifier is a "loner" then the highest valid 100 * If the left most zone modifier is a "loner" then the highest valid
101 * zonelist would be four allowing us to allocate only five zonelists. 101 * zonelist would be four allowing us to allocate only five zonelists.
102 * Use the first form when the left most bit is not a "loner", otherwise 102 * Use the first form when the left most bit is not a "loner", otherwise
103 * use the second. 103 * use the second.
104 */ 104 */
105 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ 105 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */
106 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ 106 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */
107 107
108 /* 108 /*
109 * On machines where it is needed (eg PCs) we divide physical memory 109 * On machines where it is needed (eg PCs) we divide physical memory
110 * into multiple physical zones. On a PC we have 3 zones: 110 * into multiple physical zones. On a PC we have 3 zones:
111 * 111 *
112 * ZONE_DMA < 16 MB ISA DMA capable memory 112 * ZONE_DMA < 16 MB ISA DMA capable memory
113 * ZONE_NORMAL 16-896 MB direct mapped by the kernel 113 * ZONE_NORMAL 16-896 MB direct mapped by the kernel
114 * ZONE_HIGHMEM > 896 MB only page cache and user processes 114 * ZONE_HIGHMEM > 896 MB only page cache and user processes
115 */ 115 */
116 116
117 struct zone { 117 struct zone {
118 /* Fields commonly accessed by the page allocator */ 118 /* Fields commonly accessed by the page allocator */
119 unsigned long free_pages; 119 unsigned long free_pages;
120 unsigned long pages_min, pages_low, pages_high; 120 unsigned long pages_min, pages_low, pages_high;
121 /* 121 /*
122 * We don't know if the memory that we're going to allocate will be freeable 122 * We don't know if the memory that we're going to allocate will be freeable
123 * or/and it will be released eventually, so to avoid totally wasting several 123 * or/and it will be released eventually, so to avoid totally wasting several
124 * GB of ram we must reserve some of the lower zone memory (otherwise we risk 124 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
125 * to run OOM on the lower zones despite there's tons of freeable ram 125 * to run OOM on the lower zones despite there's tons of freeable ram
126 * on the higher zones). This array is recalculated at runtime if the 126 * on the higher zones). This array is recalculated at runtime if the
127 * sysctl_lowmem_reserve_ratio sysctl changes. 127 * sysctl_lowmem_reserve_ratio sysctl changes.
128 */ 128 */
129 unsigned long lowmem_reserve[MAX_NR_ZONES]; 129 unsigned long lowmem_reserve[MAX_NR_ZONES];
130 130
131 #ifdef CONFIG_NUMA 131 #ifdef CONFIG_NUMA
132 struct per_cpu_pageset *pageset[NR_CPUS]; 132 struct per_cpu_pageset *pageset[NR_CPUS];
133 #else 133 #else
134 struct per_cpu_pageset pageset[NR_CPUS]; 134 struct per_cpu_pageset pageset[NR_CPUS];
135 #endif 135 #endif
136 /* 136 /*
137 * free areas of different sizes 137 * free areas of different sizes
138 */ 138 */
139 spinlock_t lock; 139 spinlock_t lock;
140 struct free_area free_area[MAX_ORDER]; 140 struct free_area free_area[MAX_ORDER];
141 141
142 142
143 ZONE_PADDING(_pad1_) 143 ZONE_PADDING(_pad1_)
144 144
145 /* Fields commonly accessed by the page reclaim scanner */ 145 /* Fields commonly accessed by the page reclaim scanner */
146 spinlock_t lru_lock; 146 spinlock_t lru_lock;
147 struct list_head active_list; 147 struct list_head active_list;
148 struct list_head inactive_list; 148 struct list_head inactive_list;
149 unsigned long nr_scan_active; 149 unsigned long nr_scan_active;
150 unsigned long nr_scan_inactive; 150 unsigned long nr_scan_inactive;
151 unsigned long nr_active; 151 unsigned long nr_active;
152 unsigned long nr_inactive; 152 unsigned long nr_inactive;
153 unsigned long pages_scanned; /* since last reclaim */ 153 unsigned long pages_scanned; /* since last reclaim */
154 int all_unreclaimable; /* All pages pinned */ 154 int all_unreclaimable; /* All pages pinned */
155 155
156 /* 156 /*
157 * Does the allocator try to reclaim pages from the zone as soon 157 * Does the allocator try to reclaim pages from the zone as soon
158 * as it fails a watermark_ok() in __alloc_pages? 158 * as it fails a watermark_ok() in __alloc_pages?
159 */ 159 */
160 int reclaim_pages; 160 int reclaim_pages;
161 /* A count of how many reclaimers are scanning this zone */ 161 /* A count of how many reclaimers are scanning this zone */
162 atomic_t reclaim_in_progress; 162 atomic_t reclaim_in_progress;
163 163
164 /* 164 /*
165 * prev_priority holds the scanning priority for this zone. It is 165 * prev_priority holds the scanning priority for this zone. It is
166 * defined as the scanning priority at which we achieved our reclaim 166 * defined as the scanning priority at which we achieved our reclaim
167 * target at the previous try_to_free_pages() or balance_pgdat() 167 * target at the previous try_to_free_pages() or balance_pgdat()
168 * invokation. 168 * invokation.
169 * 169 *
170 * We use prev_priority as a measure of how much stress page reclaim is 170 * We use prev_priority as a measure of how much stress page reclaim is
171 * under - it drives the swappiness decision: whether to unmap mapped 171 * under - it drives the swappiness decision: whether to unmap mapped
172 * pages. 172 * pages.
173 * 173 *
174 * temp_priority is used to remember the scanning priority at which 174 * temp_priority is used to remember the scanning priority at which
175 * this zone was successfully refilled to free_pages == pages_high. 175 * this zone was successfully refilled to free_pages == pages_high.
176 * 176 *
177 * Access to both these fields is quite racy even on uniprocessor. But 177 * Access to both these fields is quite racy even on uniprocessor. But
178 * it is expected to average out OK. 178 * it is expected to average out OK.
179 */ 179 */
180 int temp_priority; 180 int temp_priority;
181 int prev_priority; 181 int prev_priority;
182 182
183 183
184 ZONE_PADDING(_pad2_) 184 ZONE_PADDING(_pad2_)
185 /* Rarely used or read-mostly fields */ 185 /* Rarely used or read-mostly fields */
186 186
187 /* 187 /*
188 * wait_table -- the array holding the hash table 188 * wait_table -- the array holding the hash table
189 * wait_table_size -- the size of the hash table array 189 * wait_table_size -- the size of the hash table array
190 * wait_table_bits -- wait_table_size == (1 << wait_table_bits) 190 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
191 * 191 *
192 * The purpose of all these is to keep track of the people 192 * The purpose of all these is to keep track of the people
193 * waiting for a page to become available and make them 193 * waiting for a page to become available and make them
194 * runnable again when possible. The trouble is that this 194 * runnable again when possible. The trouble is that this
195 * consumes a lot of space, especially when so few things 195 * consumes a lot of space, especially when so few things
196 * wait on pages at a given time. So instead of using 196 * wait on pages at a given time. So instead of using
197 * per-page waitqueues, we use a waitqueue hash table. 197 * per-page waitqueues, we use a waitqueue hash table.
198 * 198 *
199 * The bucket discipline is to sleep on the same queue when 199 * The bucket discipline is to sleep on the same queue when
200 * colliding and wake all in that wait queue when removing. 200 * colliding and wake all in that wait queue when removing.
201 * When something wakes, it must check to be sure its page is 201 * When something wakes, it must check to be sure its page is
202 * truly available, a la thundering herd. The cost of a 202 * truly available, a la thundering herd. The cost of a
203 * collision is great, but given the expected load of the 203 * collision is great, but given the expected load of the
204 * table, they should be so rare as to be outweighed by the 204 * table, they should be so rare as to be outweighed by the
205 * benefits from the saved space. 205 * benefits from the saved space.
206 * 206 *
207 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the 207 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
208 * primary users of these fields, and in mm/page_alloc.c 208 * primary users of these fields, and in mm/page_alloc.c
209 * free_area_init_core() performs the initialization of them. 209 * free_area_init_core() performs the initialization of them.
210 */ 210 */
211 wait_queue_head_t * wait_table; 211 wait_queue_head_t * wait_table;
212 unsigned long wait_table_size; 212 unsigned long wait_table_size;
213 unsigned long wait_table_bits; 213 unsigned long wait_table_bits;
214 214
215 /* 215 /*
216 * Discontig memory support fields. 216 * Discontig memory support fields.
217 */ 217 */
218 struct pglist_data *zone_pgdat; 218 struct pglist_data *zone_pgdat;
219 struct page *zone_mem_map; 219 struct page *zone_mem_map;
220 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 220 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
221 unsigned long zone_start_pfn; 221 unsigned long zone_start_pfn;
222 222
223 unsigned long spanned_pages; /* total size, including holes */ 223 unsigned long spanned_pages; /* total size, including holes */
224 unsigned long present_pages; /* amount of memory (excluding holes) */ 224 unsigned long present_pages; /* amount of memory (excluding holes) */
225 225
226 /* 226 /*
227 * rarely used fields: 227 * rarely used fields:
228 */ 228 */
229 char *name; 229 char *name;
230 } ____cacheline_maxaligned_in_smp; 230 } ____cacheline_maxaligned_in_smp;
231 231
232 232
233 /* 233 /*
234 * The "priority" of VM scanning is how much of the queues we will scan in one 234 * The "priority" of VM scanning is how much of the queues we will scan in one
235 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 235 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
236 * queues ("queue_length >> 12") during an aging round. 236 * queues ("queue_length >> 12") during an aging round.
237 */ 237 */
238 #define DEF_PRIORITY 12 238 #define DEF_PRIORITY 12
239 239
240 /* 240 /*
241 * One allocation request operates on a zonelist. A zonelist 241 * One allocation request operates on a zonelist. A zonelist
242 * is a list of zones, the first one is the 'goal' of the 242 * is a list of zones, the first one is the 'goal' of the
243 * allocation, the other zones are fallback zones, in decreasing 243 * allocation, the other zones are fallback zones, in decreasing
244 * priority. 244 * priority.
245 * 245 *
246 * Right now a zonelist takes up less than a cacheline. We never 246 * Right now a zonelist takes up less than a cacheline. We never
247 * modify it apart from boot-up, and only a few indices are used, 247 * modify it apart from boot-up, and only a few indices are used,
248 * so despite the zonelist table being relatively big, the cache 248 * so despite the zonelist table being relatively big, the cache
249 * footprint of this construct is very small. 249 * footprint of this construct is very small.
250 */ 250 */
251 struct zonelist { 251 struct zonelist {
252 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited 252 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
253 }; 253 };
254 254
255 255
256 /* 256 /*
257 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM 257 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
258 * (mostly NUMA machines?) to denote a higher-level memory zone than the 258 * (mostly NUMA machines?) to denote a higher-level memory zone than the
259 * zone denotes. 259 * zone denotes.
260 * 260 *
261 * On NUMA machines, each NUMA node would have a pg_data_t to describe 261 * On NUMA machines, each NUMA node would have a pg_data_t to describe
262 * it's memory layout. 262 * it's memory layout.
263 * 263 *
264 * Memory statistics and page replacement data structures are maintained on a 264 * Memory statistics and page replacement data structures are maintained on a
265 * per-zone basis. 265 * per-zone basis.
266 */ 266 */
267 struct bootmem_data; 267 struct bootmem_data;
268 typedef struct pglist_data { 268 typedef struct pglist_data {
269 struct zone node_zones[MAX_NR_ZONES]; 269 struct zone node_zones[MAX_NR_ZONES];
270 struct zonelist node_zonelists[GFP_ZONETYPES]; 270 struct zonelist node_zonelists[GFP_ZONETYPES];
271 int nr_zones; 271 int nr_zones;
272 #ifdef CONFIG_FLAT_NODE_MEM_MAP 272 #ifdef CONFIG_FLAT_NODE_MEM_MAP
273 struct page *node_mem_map; 273 struct page *node_mem_map;
274 #endif 274 #endif
275 struct bootmem_data *bdata; 275 struct bootmem_data *bdata;
276 #ifdef CONFIG_MEMORY_HOTPLUG
277 /*
278 * Must be held any time you expect node_start_pfn, node_present_pages
279 * or node_spanned_pages stay constant. Holding this will also
280 * guarantee that any pfn_valid() stays that way.
281 *
282 * Nests above zone->lock and zone->size_seqlock.
283 */
284 spinlock_t node_size_lock;
285 #endif
276 unsigned long node_start_pfn; 286 unsigned long node_start_pfn;
277 unsigned long node_present_pages; /* total number of physical pages */ 287 unsigned long node_present_pages; /* total number of physical pages */
278 unsigned long node_spanned_pages; /* total size of physical page 288 unsigned long node_spanned_pages; /* total size of physical page
279 range, including holes */ 289 range, including holes */
280 int node_id; 290 int node_id;
281 struct pglist_data *pgdat_next; 291 struct pglist_data *pgdat_next;
282 wait_queue_head_t kswapd_wait; 292 wait_queue_head_t kswapd_wait;
283 struct task_struct *kswapd; 293 struct task_struct *kswapd;
284 int kswapd_max_order; 294 int kswapd_max_order;
285 } pg_data_t; 295 } pg_data_t;
286 296
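For context only: a hedged sketch (not from this commit) of the intended lifecycle and ordering for node_size_lock, based on the comment added to pg_data_t above. "Nests above zone->lock" is read here as: node_size_lock is the outer lock, taken before zone->lock when node and zone sizes are updated. The function name and the resize body are placeholders; pgdat_resize_init() would normally run once at node bringup and is shown only for completeness.

/* Sketch: names are illustrative; the init call and lock ordering are the point. */
static void example_node_resize(struct pglist_data *pgdat)
{
	struct zone *zone = pgdat->node_zones + ZONE_DMA;
	unsigned long flags;

	pgdat_resize_init(pgdat);		/* once, when the pgdat is set up */

	pgdat_resize_lock(pgdat, &flags);	/* outer: node_size_lock */
	spin_lock(&zone->lock);			/* inner: zone->lock nests below it */
	/* ... adjust node_spanned_pages / zone span here ... */
	spin_unlock(&zone->lock);
	pgdat_resize_unlock(pgdat, &flags);
}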
287 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 297 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
288 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 298 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
289 #ifdef CONFIG_FLAT_NODE_MEM_MAP 299 #ifdef CONFIG_FLAT_NODE_MEM_MAP
290 #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) 300 #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
291 #else 301 #else
292 #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) 302 #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
293 #endif 303 #endif
294 #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 304 #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
305
306 #include <linux/memory_hotplug.h>
295 307
296 extern struct pglist_data *pgdat_list; 308 extern struct pglist_data *pgdat_list;
297 309
298 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 310 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
299 unsigned long *free, struct pglist_data *pgdat); 311 unsigned long *free, struct pglist_data *pgdat);
300 void get_zone_counts(unsigned long *active, unsigned long *inactive, 312 void get_zone_counts(unsigned long *active, unsigned long *inactive,
301 unsigned long *free); 313 unsigned long *free);
302 void build_all_zonelists(void); 314 void build_all_zonelists(void);
303 void wakeup_kswapd(struct zone *zone, int order); 315 void wakeup_kswapd(struct zone *zone, int order);
304 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 316 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
305 int alloc_type, int can_try_harder, gfp_t gfp_high); 317 int alloc_type, int can_try_harder, gfp_t gfp_high);
306 318
307 #ifdef CONFIG_HAVE_MEMORY_PRESENT 319 #ifdef CONFIG_HAVE_MEMORY_PRESENT
308 void memory_present(int nid, unsigned long start, unsigned long end); 320 void memory_present(int nid, unsigned long start, unsigned long end);
309 #else 321 #else
310 static inline void memory_present(int nid, unsigned long start, unsigned long end) {} 322 static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
311 #endif 323 #endif
312 324
313 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE 325 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
314 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 326 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
315 #endif 327 #endif
316 328
317 /* 329 /*
318 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 330 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
319 */ 331 */
320 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) 332 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
321 333
322 /** 334 /**
323 * for_each_pgdat - helper macro to iterate over all nodes 335 * for_each_pgdat - helper macro to iterate over all nodes
324 * @pgdat - pointer to a pg_data_t variable 336 * @pgdat - pointer to a pg_data_t variable
325 * 337 *
326 * Meant to help with common loops of the form 338 * Meant to help with common loops of the form
327 * pgdat = pgdat_list; 339 * pgdat = pgdat_list;
328 * while(pgdat) { 340 * while(pgdat) {
329 * ... 341 * ...
330 * pgdat = pgdat->pgdat_next; 342 * pgdat = pgdat->pgdat_next;
331 * } 343 * }
332 */ 344 */
333 #define for_each_pgdat(pgdat) \ 345 #define for_each_pgdat(pgdat) \
334 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) 346 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
335 347
336 /* 348 /*
337 * next_zone - helper magic for for_each_zone() 349 * next_zone - helper magic for for_each_zone()
338 * Thanks to William Lee Irwin III for this piece of ingenuity. 350 * Thanks to William Lee Irwin III for this piece of ingenuity.
339 */ 351 */
340 static inline struct zone *next_zone(struct zone *zone) 352 static inline struct zone *next_zone(struct zone *zone)
341 { 353 {
342 pg_data_t *pgdat = zone->zone_pgdat; 354 pg_data_t *pgdat = zone->zone_pgdat;
343 355
344 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) 356 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
345 zone++; 357 zone++;
346 else if (pgdat->pgdat_next) { 358 else if (pgdat->pgdat_next) {
347 pgdat = pgdat->pgdat_next; 359 pgdat = pgdat->pgdat_next;
348 zone = pgdat->node_zones; 360 zone = pgdat->node_zones;
349 } else 361 } else
350 zone = NULL; 362 zone = NULL;
351 363
352 return zone; 364 return zone;
353 } 365 }
354 366
355 /** 367 /**
356 * for_each_zone - helper macro to iterate over all memory zones 368 * for_each_zone - helper macro to iterate over all memory zones
357 * @zone - pointer to struct zone variable 369 * @zone - pointer to struct zone variable
358 * 370 *
359 * The user only needs to declare the zone variable, for_each_zone 371 * The user only needs to declare the zone variable, for_each_zone
360 * fills it in. This basically means for_each_zone() is an 372 * fills it in. This basically means for_each_zone() is an
361 * easier to read version of this piece of code: 373 * easier to read version of this piece of code:
362 * 374 *
363 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) 375 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
364 * for (i = 0; i < MAX_NR_ZONES; ++i) { 376 * for (i = 0; i < MAX_NR_ZONES; ++i) {
365 * struct zone * z = pgdat->node_zones + i; 377 * struct zone * z = pgdat->node_zones + i;
366 * ... 378 * ...
367 * } 379 * }
368 * } 380 * }
369 */ 381 */
370 #define for_each_zone(zone) \ 382 #define for_each_zone(zone) \
371 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) 383 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
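For example, a quick tally of free pages across the whole system can be written as below; total_free is a local illustration variable, and zone->free_pages is the buddy allocator's per-zone counter:

    struct zone *zone;
    unsigned long total_free = 0;

    /* visits every zone of every node, following pgdat_list order */
    for_each_zone(zone)
            total_free += zone->free_pages;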
372 384
373 static inline int is_highmem_idx(int idx) 385 static inline int is_highmem_idx(int idx)
374 { 386 {
375 return (idx == ZONE_HIGHMEM); 387 return (idx == ZONE_HIGHMEM);
376 } 388 }
377 389
378 static inline int is_normal_idx(int idx) 390 static inline int is_normal_idx(int idx)
379 { 391 {
380 return (idx == ZONE_NORMAL); 392 return (idx == ZONE_NORMAL);
381 } 393 }
382 /** 394 /**
383 * is_highmem - helper function to quickly check if a struct zone is a 395 * is_highmem - helper function to quickly check if a struct zone is a
384 * highmem zone or not. This is an attempt to keep references 396 * highmem zone or not. This is an attempt to keep references
385 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 397 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
386 * @zone - pointer to struct zone variable 398 * @zone - pointer to struct zone variable
387 */ 399 */
388 static inline int is_highmem(struct zone *zone) 400 static inline int is_highmem(struct zone *zone)
389 { 401 {
390 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; 402 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
391 } 403 }
392 404
393 static inline int is_normal(struct zone *zone) 405 static inline int is_normal(struct zone *zone)
394 { 406 {
395 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; 407 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
396 } 408 }
397 409
398 /* These two functions are used to setup the per zone pages min values */ 410 /* These two functions are used to setup the per zone pages min values */
399 struct ctl_table; 411 struct ctl_table;
400 struct file; 412 struct file;
401 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 413 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
402 void __user *, size_t *, loff_t *); 414 void __user *, size_t *, loff_t *);
403 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 415 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
404 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 416 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
405 void __user *, size_t *, loff_t *); 417 void __user *, size_t *, loff_t *);
406 418
407 #include <linux/topology.h> 419 #include <linux/topology.h>
408 /* Returns the number of the current Node. */ 420 /* Returns the number of the current Node. */
409 #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) 421 #define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
410 422
411 #ifndef CONFIG_NEED_MULTIPLE_NODES 423 #ifndef CONFIG_NEED_MULTIPLE_NODES
412 424
413 extern struct pglist_data contig_page_data; 425 extern struct pglist_data contig_page_data;
414 #define NODE_DATA(nid) (&contig_page_data) 426 #define NODE_DATA(nid) (&contig_page_data)
415 #define NODE_MEM_MAP(nid) mem_map 427 #define NODE_MEM_MAP(nid) mem_map
416 #define MAX_NODES_SHIFT 1 428 #define MAX_NODES_SHIFT 1
417 #define pfn_to_nid(pfn) (0) 429 #define pfn_to_nid(pfn) (0)
418 430
419 #else /* CONFIG_NEED_MULTIPLE_NODES */ 431 #else /* CONFIG_NEED_MULTIPLE_NODES */
420 432
421 #include <asm/mmzone.h> 433 #include <asm/mmzone.h>
422 434
423 #endif /* !CONFIG_NEED_MULTIPLE_NODES */ 435 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
424 436
425 #ifdef CONFIG_SPARSEMEM 437 #ifdef CONFIG_SPARSEMEM
426 #include <asm/sparsemem.h> 438 #include <asm/sparsemem.h>
427 #endif 439 #endif
428 440
429 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 441 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
430 /* 442 /*
431 * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 443 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
432 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. 444 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
433 */ 445 */
434 #define FLAGS_RESERVED 8 446 #define FLAGS_RESERVED 8
435 447
436 #elif BITS_PER_LONG == 64 448 #elif BITS_PER_LONG == 64
437 /* 449 /*
438 * with 64 bit flags field, there's plenty of room. 450 * with 64 bit flags field, there's plenty of room.
439 */ 451 */
440 #define FLAGS_RESERVED 32 452 #define FLAGS_RESERVED 32
441 453
442 #else 454 #else
443 455
444 #error BITS_PER_LONG not defined 456 #error BITS_PER_LONG not defined
445 457
446 #endif 458 #endif
447 459
448 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 460 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
449 #define early_pfn_to_nid(nid) (0UL) 461 #define early_pfn_to_nid(nid) (0UL)
450 #endif 462 #endif
451 463
452 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) 464 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
453 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) 465 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
454 466
455 #ifdef CONFIG_SPARSEMEM 467 #ifdef CONFIG_SPARSEMEM
456 468
457 /* 469 /*
458 * SECTIONS_SHIFT #bits space required to store a section # 470 * SECTIONS_SHIFT #bits space required to store a section #
459 * 471 *
460 * PA_SECTION_SHIFT physical address to/from section number 472 * PA_SECTION_SHIFT physical address to/from section number
461 * PFN_SECTION_SHIFT pfn to/from section number 473 * PFN_SECTION_SHIFT pfn to/from section number
462 */ 474 */
463 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) 475 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
464 476
465 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 477 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
466 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 478 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
467 479
468 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) 480 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
469 481
470 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) 482 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
471 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) 483 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
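To make the shift arithmetic concrete, assume a hypothetical SECTION_SIZE_BITS of 28 (256MB sections) and PAGE_SHIFT of 12 (4KB pages); the real values come from the per-architecture <asm/sparsemem.h>:

    /* SECTION_SIZE_BITS = 28, PAGE_SHIFT = 12 (hypothetical values)      */
    /* PA_SECTION_SHIFT  = 28, so each section spans 1 << 28 = 256MB      */
    /* PFN_SECTION_SHIFT = 28 - 12 = 16                                   */
    /* PAGES_PER_SECTION = 1 << 16 = 65536 pages                          */
    /* pfn_to_section_nr(0x12345) = 0x12345 >> 16 = 1                     */
    /* section_nr_to_pfn(1)       = 1 << 16      = 0x10000                */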
472 484
473 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS 485 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
474 #error Allocator MAX_ORDER exceeds SECTION_SIZE 486 #error Allocator MAX_ORDER exceeds SECTION_SIZE
475 #endif 487 #endif
476 488
477 struct page; 489 struct page;
478 struct mem_section { 490 struct mem_section {
479 /* 491 /*
480 * This is, logically, a pointer to an array of struct 492 * This is, logically, a pointer to an array of struct
481 * pages. However, it is stored with some other magic. 493 * pages. However, it is stored with some other magic.
482 * (see sparse.c::sparse_init_one_section()) 494 * (see sparse.c::sparse_init_one_section())
483 * 495 *
484 * Making it a UL at least makes someone do a cast 496 * Making it a UL at least makes someone do a cast
485 * before using it wrong. 497 * before using it wrong.
486 */ 498 */
487 unsigned long section_mem_map; 499 unsigned long section_mem_map;
488 }; 500 };
489 501
490 #ifdef CONFIG_SPARSEMEM_EXTREME 502 #ifdef CONFIG_SPARSEMEM_EXTREME
491 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) 503 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
492 #else 504 #else
493 #define SECTIONS_PER_ROOT 1 505 #define SECTIONS_PER_ROOT 1
494 #endif 506 #endif
495 507
496 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 508 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
497 #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) 509 #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT)
498 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 510 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
499 511
500 #ifdef CONFIG_SPARSEMEM_EXTREME 512 #ifdef CONFIG_SPARSEMEM_EXTREME
501 extern struct mem_section *mem_section[NR_SECTION_ROOTS]; 513 extern struct mem_section *mem_section[NR_SECTION_ROOTS];
502 #else 514 #else
503 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; 515 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
504 #endif 516 #endif
505 517
506 static inline struct mem_section *__nr_to_section(unsigned long nr) 518 static inline struct mem_section *__nr_to_section(unsigned long nr)
507 { 519 {
508 if (!mem_section[SECTION_NR_TO_ROOT(nr)]) 520 if (!mem_section[SECTION_NR_TO_ROOT(nr)])
509 return NULL; 521 return NULL;
510 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 522 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
511 } 523 }
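A worked example of the two-level lookup, with hypothetical numbers (4KB pages and an 8-byte struct mem_section give SECTIONS_PER_ROOT = 512 in the SPARSEMEM_EXTREME case):

    /* SECTIONS_PER_ROOT        = 4096 / 8   = 512                        */
    /* SECTION_NR_TO_ROOT(1000) = 1000 / 512 = 1                          */
    /* 1000 & SECTION_ROOT_MASK = 1000 & 511 = 488                        */
    /* so __nr_to_section(1000) resolves to &mem_section[1][488]          */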
512 extern int __section_nr(struct mem_section* ms); 524 extern int __section_nr(struct mem_section* ms);
513 525
514 /* 526 /*
515 * We use the lower bits of the mem_map pointer to store 527 * We use the lower bits of the mem_map pointer to store
516 * a little bit of information. There should be at least 528 * a little bit of information. There should be at least
517 * 3 bits here due to 32-bit alignment. 529 * 3 bits here due to 32-bit alignment.
518 */ 530 */
519 #define SECTION_MARKED_PRESENT (1UL<<0) 531 #define SECTION_MARKED_PRESENT (1UL<<0)
520 #define SECTION_HAS_MEM_MAP (1UL<<1) 532 #define SECTION_HAS_MEM_MAP (1UL<<1)
521 #define SECTION_MAP_LAST_BIT (1UL<<2) 533 #define SECTION_MAP_LAST_BIT (1UL<<2)
522 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) 534 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
523 535
524 static inline struct page *__section_mem_map_addr(struct mem_section *section) 536 static inline struct page *__section_mem_map_addr(struct mem_section *section)
525 { 537 {
526 unsigned long map = section->section_mem_map; 538 unsigned long map = section->section_mem_map;
527 map &= SECTION_MAP_MASK; 539 map &= SECTION_MAP_MASK;
528 return (struct page *)map; 540 return (struct page *)map;
529 } 541 }
530 542
531 static inline int valid_section(struct mem_section *section) 543 static inline int valid_section(struct mem_section *section)
532 { 544 {
533 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); 545 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
534 } 546 }
535 547
536 static inline int section_has_mem_map(struct mem_section *section) 548 static inline int section_has_mem_map(struct mem_section *section)
537 { 549 {
538 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); 550 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
539 } 551 }
540 552
541 static inline int valid_section_nr(unsigned long nr) 553 static inline int valid_section_nr(unsigned long nr)
542 { 554 {
543 return valid_section(__nr_to_section(nr)); 555 return valid_section(__nr_to_section(nr));
544 } 556 }
545 557
546 /* 558 /*
547 * Given a kernel address, find the home node of the underlying memory. 559 * Given a kernel address, find the home node of the underlying memory.
548 */ 560 */
549 #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) 561 #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
550 562
551 static inline struct mem_section *__pfn_to_section(unsigned long pfn) 563 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
552 { 564 {
553 return __nr_to_section(pfn_to_section_nr(pfn)); 565 return __nr_to_section(pfn_to_section_nr(pfn));
554 } 566 }
555 567
556 #define pfn_to_page(pfn) \ 568 #define pfn_to_page(pfn) \
557 ({ \ 569 ({ \
558 unsigned long __pfn = (pfn); \ 570 unsigned long __pfn = (pfn); \
559 __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ 571 __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \
560 }) 572 })
561 #define page_to_pfn(page) \ 573 #define page_to_pfn(page) \
562 ({ \ 574 ({ \
563 page - __section_mem_map_addr(__nr_to_section( \ 575 page - __section_mem_map_addr(__nr_to_section( \
564 page_to_section(page))); \ 576 page_to_section(page))); \
565 }) 577 })
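A hedged reading of the "other magic" referred to in struct mem_section (see sparse.c): the stored value is roughly the section's mem_map pointer minus section_nr_to_pfn(section_nr), with the SECTION_* flag bits kept in the low bits. That built-in offset is why pfn_to_page() above can add the full pfn rather than the pfn's offset within its section:

    /* Approximately (illustration only):                                  */
    /*   section_mem_map = (mem_map_for_section - section_nr_to_pfn(snr))  */
    /*                     | <SECTION_* flag bits>                         */
    /* __section_mem_map_addr() masks the flags off, and adding __pfn      */
    /* then lands on the correct struct page.                              */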
566 578
567 static inline int pfn_valid(unsigned long pfn) 579 static inline int pfn_valid(unsigned long pfn)
568 { 580 {
569 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 581 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
570 return 0; 582 return 0;
571 return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); 583 return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
572 } 584 }
573 585
574 /* 586 /*
575 * These are _only_ used during initialisation, therefore they 587 * These are _only_ used during initialisation, therefore they
576 * can use __initdata ... They could have names to indicate 588 * can use __initdata ... They could have names to indicate
577 * this restriction. 589 * this restriction.
578 */ 590 */
579 #ifdef CONFIG_NUMA 591 #ifdef CONFIG_NUMA
580 #define pfn_to_nid early_pfn_to_nid 592 #define pfn_to_nid early_pfn_to_nid
581 #endif 593 #endif
582 594
583 #define pfn_to_pgdat(pfn) \ 595 #define pfn_to_pgdat(pfn) \
584 ({ \ 596 ({ \
585 NODE_DATA(pfn_to_nid(pfn)); \ 597 NODE_DATA(pfn_to_nid(pfn)); \
586 }) 598 })
587 599
588 #define early_pfn_valid(pfn) pfn_valid(pfn) 600 #define early_pfn_valid(pfn) pfn_valid(pfn)
589 void sparse_init(void); 601 void sparse_init(void);
590 #else 602 #else
591 #define sparse_init() do {} while (0) 603 #define sparse_init() do {} while (0)
592 #define sparse_index_init(_sec, _nid) do {} while (0) 604 #define sparse_index_init(_sec, _nid) do {} while (0)
593 #endif /* CONFIG_SPARSEMEM */ 605 #endif /* CONFIG_SPARSEMEM */
594 606
595 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 607 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
596 #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) 608 #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid))
597 #else 609 #else
598 #define early_pfn_in_nid(pfn, nid) (1) 610 #define early_pfn_in_nid(pfn, nid) (1)
599 #endif 611 #endif
600 612
601 #ifndef early_pfn_valid 613 #ifndef early_pfn_valid
602 #define early_pfn_valid(pfn) (1) 614 #define early_pfn_valid(pfn) (1)
603 #endif 615 #endif
604 616
605 void memory_present(int nid, unsigned long start, unsigned long end); 617 void memory_present(int nid, unsigned long start, unsigned long end);
606 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 618 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
607 619
608 #endif /* !__ASSEMBLY__ */ 620 #endif /* !__ASSEMBLY__ */
609 #endif /* __KERNEL__ */ 621 #endif /* __KERNEL__ */
610 #endif /* _LINUX_MMZONE_H */ 622 #endif /* _LINUX_MMZONE_H */
611 623
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/config.h> 17 #include <linux/config.h>
18 #include <linux/stddef.h> 18 #include <linux/stddef.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/swap.h> 20 #include <linux/swap.h>
21 #include <linux/interrupt.h> 21 #include <linux/interrupt.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/compiler.h> 24 #include <linux/compiler.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/suspend.h> 27 #include <linux/suspend.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 #include <linux/blkdev.h> 29 #include <linux/blkdev.h>
30 #include <linux/slab.h> 30 #include <linux/slab.h>
31 #include <linux/notifier.h> 31 #include <linux/notifier.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/sysctl.h> 33 #include <linux/sysctl.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/cpuset.h> 35 #include <linux/cpuset.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 38
39 #include <asm/tlbflush.h> 39 #include <asm/tlbflush.h>
40 #include "internal.h" 40 #include "internal.h"
41 41
42 /* 42 /*
43 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 43 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
44 * initializer cleaner 44 * initializer cleaner
45 */ 45 */
46 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 46 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
47 EXPORT_SYMBOL(node_online_map); 47 EXPORT_SYMBOL(node_online_map);
48 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 48 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
49 EXPORT_SYMBOL(node_possible_map); 49 EXPORT_SYMBOL(node_possible_map);
50 struct pglist_data *pgdat_list __read_mostly; 50 struct pglist_data *pgdat_list __read_mostly;
51 unsigned long totalram_pages __read_mostly; 51 unsigned long totalram_pages __read_mostly;
52 unsigned long totalhigh_pages __read_mostly; 52 unsigned long totalhigh_pages __read_mostly;
53 long nr_swap_pages; 53 long nr_swap_pages;
54 54
55 /* 55 /*
56 * results with 256, 32 in the lowmem_reserve sysctl: 56 * results with 256, 32 in the lowmem_reserve sysctl:
57 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 57 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
58 * 1G machine -> (16M dma, 784M normal, 224M high) 58 * 1G machine -> (16M dma, 784M normal, 224M high)
59 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 59 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
60 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 60 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
61 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 61 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
62 */ 62 */
63 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 63 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
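Spelling out the arithmetic behind the 1G example above (approximate figures, for illustration only):

    /* NORMAL allocation against ZONE_DMA:      784M / 256        ~= 3M kept free  */
    /* HIGHMEM allocation against ZONE_NORMAL:  224M / 32          = 7M kept free  */
    /* HIGHMEM allocation against ZONE_DMA:    (224M + 784M) / 256 ~= 4M kept free */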
64 64
65 EXPORT_SYMBOL(totalram_pages); 65 EXPORT_SYMBOL(totalram_pages);
66 EXPORT_SYMBOL(nr_swap_pages); 66 EXPORT_SYMBOL(nr_swap_pages);
67 67
68 /* 68 /*
69 * Used by page_zone() to look up the address of the struct zone whose 69 * Used by page_zone() to look up the address of the struct zone whose
70 * id is encoded in the upper bits of page->flags 70 * id is encoded in the upper bits of page->flags
71 */ 71 */
72 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 72 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
73 EXPORT_SYMBOL(zone_table); 73 EXPORT_SYMBOL(zone_table);
74 74
75 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 75 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
76 int min_free_kbytes = 1024; 76 int min_free_kbytes = 1024;
77 77
78 unsigned long __initdata nr_kernel_pages; 78 unsigned long __initdata nr_kernel_pages;
79 unsigned long __initdata nr_all_pages; 79 unsigned long __initdata nr_all_pages;
80 80
81 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 81 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
82 { 82 {
83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
84 return 1; 84 return 1;
85 if (page_to_pfn(page) < zone->zone_start_pfn) 85 if (page_to_pfn(page) < zone->zone_start_pfn)
86 return 1; 86 return 1;
87 87
88 return 0; 88 return 0;
89 } 89 }
90 90
91 static int page_is_consistent(struct zone *zone, struct page *page) 91 static int page_is_consistent(struct zone *zone, struct page *page)
92 { 92 {
93 #ifdef CONFIG_HOLES_IN_ZONE 93 #ifdef CONFIG_HOLES_IN_ZONE
94 if (!pfn_valid(page_to_pfn(page))) 94 if (!pfn_valid(page_to_pfn(page)))
95 return 0; 95 return 0;
96 #endif 96 #endif
97 if (zone != page_zone(page)) 97 if (zone != page_zone(page))
98 return 0; 98 return 0;
99 99
100 return 1; 100 return 1;
101 } 101 }
102 /* 102 /*
103 * Temporary debugging check for pages not lying within a given zone. 103 * Temporary debugging check for pages not lying within a given zone.
104 */ 104 */
105 static int bad_range(struct zone *zone, struct page *page) 105 static int bad_range(struct zone *zone, struct page *page)
106 { 106 {
107 if (page_outside_zone_boundaries(zone, page)) 107 if (page_outside_zone_boundaries(zone, page))
108 return 1; 108 return 1;
109 if (!page_is_consistent(zone, page)) 109 if (!page_is_consistent(zone, page))
110 return 1; 110 return 1;
111 111
112 return 0; 112 return 0;
113 } 113 }
114 114
115 static void bad_page(const char *function, struct page *page) 115 static void bad_page(const char *function, struct page *page)
116 { 116 {
117 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 117 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
118 function, current->comm, page); 118 function, current->comm, page);
119 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 119 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
120 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 120 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
121 page->mapping, page_mapcount(page), page_count(page)); 121 page->mapping, page_mapcount(page), page_count(page));
122 printk(KERN_EMERG "Backtrace:\n"); 122 printk(KERN_EMERG "Backtrace:\n");
123 dump_stack(); 123 dump_stack();
124 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 124 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
125 page->flags &= ~(1 << PG_lru | 125 page->flags &= ~(1 << PG_lru |
126 1 << PG_private | 126 1 << PG_private |
127 1 << PG_locked | 127 1 << PG_locked |
128 1 << PG_active | 128 1 << PG_active |
129 1 << PG_dirty | 129 1 << PG_dirty |
130 1 << PG_reclaim | 130 1 << PG_reclaim |
131 1 << PG_slab | 131 1 << PG_slab |
132 1 << PG_swapcache | 132 1 << PG_swapcache |
133 1 << PG_writeback | 133 1 << PG_writeback |
134 1 << PG_reserved ); 134 1 << PG_reserved );
135 set_page_count(page, 0); 135 set_page_count(page, 0);
136 reset_page_mapcount(page); 136 reset_page_mapcount(page);
137 page->mapping = NULL; 137 page->mapping = NULL;
138 add_taint(TAINT_BAD_PAGE); 138 add_taint(TAINT_BAD_PAGE);
139 } 139 }
140 140
141 #ifndef CONFIG_HUGETLB_PAGE 141 #ifndef CONFIG_HUGETLB_PAGE
142 #define prep_compound_page(page, order) do { } while (0) 142 #define prep_compound_page(page, order) do { } while (0)
143 #define destroy_compound_page(page, order) do { } while (0) 143 #define destroy_compound_page(page, order) do { } while (0)
144 #else 144 #else
145 /* 145 /*
146 * Higher-order pages are called "compound pages". They are structured thusly: 146 * Higher-order pages are called "compound pages". They are structured thusly:
147 * 147 *
148 * The first PAGE_SIZE page is called the "head page". 148 * The first PAGE_SIZE page is called the "head page".
149 * 149 *
150 * The remaining PAGE_SIZE pages are called "tail pages". 150 * The remaining PAGE_SIZE pages are called "tail pages".
151 * 151 *
152 * All pages have PG_compound set. All pages have their ->private pointing at 152 * All pages have PG_compound set. All pages have their ->private pointing at
153 * the head page (even the head page has this). 153 * the head page (even the head page has this).
154 * 154 *
155 * The first tail page's ->mapping, if non-zero, holds the address of the 155 * The first tail page's ->mapping, if non-zero, holds the address of the
156 * compound page's put_page() function. 156 * compound page's put_page() function.
157 * 157 *
158 * The order of the allocation is stored in the first tail page's ->index 158 * The order of the allocation is stored in the first tail page's ->index
159 * This is only for debug at present. This usage means that zero-order pages 159 * This is only for debug at present. This usage means that zero-order pages
160 * may not be compound. 160 * may not be compound.
161 */ 161 */
162 static void prep_compound_page(struct page *page, unsigned long order) 162 static void prep_compound_page(struct page *page, unsigned long order)
163 { 163 {
164 int i; 164 int i;
165 int nr_pages = 1 << order; 165 int nr_pages = 1 << order;
166 166
167 page[1].mapping = NULL; 167 page[1].mapping = NULL;
168 page[1].index = order; 168 page[1].index = order;
169 for (i = 0; i < nr_pages; i++) { 169 for (i = 0; i < nr_pages; i++) {
170 struct page *p = page + i; 170 struct page *p = page + i;
171 171
172 SetPageCompound(p); 172 SetPageCompound(p);
173 set_page_private(p, (unsigned long)page); 173 set_page_private(p, (unsigned long)page);
174 } 174 }
175 } 175 }
176 176
177 static void destroy_compound_page(struct page *page, unsigned long order) 177 static void destroy_compound_page(struct page *page, unsigned long order)
178 { 178 {
179 int i; 179 int i;
180 int nr_pages = 1 << order; 180 int nr_pages = 1 << order;
181 181
182 if (!PageCompound(page)) 182 if (!PageCompound(page))
183 return; 183 return;
184 184
185 if (page[1].index != order) 185 if (page[1].index != order)
186 bad_page(__FUNCTION__, page); 186 bad_page(__FUNCTION__, page);
187 187
188 for (i = 0; i < nr_pages; i++) { 188 for (i = 0; i < nr_pages; i++) {
189 struct page *p = page + i; 189 struct page *p = page + i;
190 190
191 if (!PageCompound(p)) 191 if (!PageCompound(p))
192 bad_page(__FUNCTION__, page); 192 bad_page(__FUNCTION__, page);
193 if (page_private(p) != (unsigned long)page) 193 if (page_private(p) != (unsigned long)page)
194 bad_page(__FUNCTION__, page); 194 bad_page(__FUNCTION__, page);
195 ClearPageCompound(p); 195 ClearPageCompound(p);
196 } 196 }
197 } 197 }
198 #endif /* CONFIG_HUGETLB_PAGE */ 198 #endif /* CONFIG_HUGETLB_PAGE */
199 199
200 /* 200 /*
201 * function for dealing with page's order in buddy system. 201 * function for dealing with page's order in buddy system.
202 * zone->lock is already acquired when we use these. 202 * zone->lock is already acquired when we use these.
203 * So, we don't need atomic page->flags operations here. 203 * So, we don't need atomic page->flags operations here.
204 */ 204 */
205 static inline unsigned long page_order(struct page *page) { 205 static inline unsigned long page_order(struct page *page) {
206 return page_private(page); 206 return page_private(page);
207 } 207 }
208 208
209 static inline void set_page_order(struct page *page, int order) { 209 static inline void set_page_order(struct page *page, int order) {
210 set_page_private(page, order); 210 set_page_private(page, order);
211 __SetPagePrivate(page); 211 __SetPagePrivate(page);
212 } 212 }
213 213
214 static inline void rmv_page_order(struct page *page) 214 static inline void rmv_page_order(struct page *page)
215 { 215 {
216 __ClearPagePrivate(page); 216 __ClearPagePrivate(page);
217 set_page_private(page, 0); 217 set_page_private(page, 0);
218 } 218 }
219 219
220 /* 220 /*
221 * Locate the struct page for both the matching buddy in our 221 * Locate the struct page for both the matching buddy in our
222 * pair (buddy1) and the combined order O+1 page they form (page). 222 * pair (buddy1) and the combined order O+1 page they form (page).
223 * 223 *
224 * 1) Any buddy B1 will have an order O twin B2 which satisfies 224 * 1) Any buddy B1 will have an order O twin B2 which satisfies
225 * the following equation: 225 * the following equation:
226 * B2 = B1 ^ (1 << O) 226 * B2 = B1 ^ (1 << O)
227 * For example, if the starting buddy (buddy2) is #8 its order 227 * For example, if the starting buddy (buddy2) is #8 its order
228 * 1 buddy is #10: 228 * 1 buddy is #10:
229 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 229 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
230 * 230 *
231 * 2) Any buddy B will have an order O+1 parent P which 231 * 2) Any buddy B will have an order O+1 parent P which
232 * satisfies the following equation: 232 * satisfies the following equation:
233 * P = B & ~(1 << O) 233 * P = B & ~(1 << O)
234 * 234 *
235 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 235 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
236 */ 236 */
237 static inline struct page * 237 static inline struct page *
238 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 238 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
239 { 239 {
240 unsigned long buddy_idx = page_idx ^ (1 << order); 240 unsigned long buddy_idx = page_idx ^ (1 << order);
241 241
242 return page + (buddy_idx - page_idx); 242 return page + (buddy_idx - page_idx);
243 } 243 }
244 244
245 static inline unsigned long 245 static inline unsigned long
246 __find_combined_index(unsigned long page_idx, unsigned int order) 246 __find_combined_index(unsigned long page_idx, unsigned int order)
247 { 247 {
248 return (page_idx & ~(1 << order)); 248 return (page_idx & ~(1 << order));
249 } 249 }
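Tracing the two equations for one concrete, purely illustrative case: a block at page_idx 12 with order 2.

    /* buddy_idx    = 12 ^ (1 << 2)  = 12 ^ 4  = 8                         */
    /* combined_idx = 12 & ~(1 << 2) = 12 & ~4 = 8                         */
    /* i.e. the buddy block starts at index 8 and the merged order-3       */
    /* block they form also starts at index 8 (pages 8..15).               */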
250 250
251 /* 251 /*
252 * This function checks whether a page is free && is the buddy 252 * This function checks whether a page is free && is the buddy
253 * we can coalesce a page and its buddy if 253 * we can coalesce a page and its buddy if
254 * (a) the buddy is free && 254 * (a) the buddy is free &&
255 * (b) the buddy is on the buddy system && 255 * (b) the buddy is on the buddy system &&
256 * (c) a page and its buddy have the same order. 256 * (c) a page and its buddy have the same order.
257 * for recording page's order, we use page_private(page) and PG_private. 257 * for recording page's order, we use page_private(page) and PG_private.
258 * 258 *
259 */ 259 */
260 static inline int page_is_buddy(struct page *page, int order) 260 static inline int page_is_buddy(struct page *page, int order)
261 { 261 {
262 if (PagePrivate(page) && 262 if (PagePrivate(page) &&
263 (page_order(page) == order) && 263 (page_order(page) == order) &&
264 page_count(page) == 0) 264 page_count(page) == 0)
265 return 1; 265 return 1;
266 return 0; 266 return 0;
267 } 267 }
268 268
269 /* 269 /*
270 * Freeing function for a buddy system allocator. 270 * Freeing function for a buddy system allocator.
271 * 271 *
272 * The concept of a buddy system is to maintain direct-mapped table 272 * The concept of a buddy system is to maintain direct-mapped table
273 * (containing bit values) for memory blocks of various "orders". 273 * (containing bit values) for memory blocks of various "orders".
274 * The bottom level table contains the map for the smallest allocatable 274 * The bottom level table contains the map for the smallest allocatable
275 * units of memory (here, pages), and each level above it describes 275 * units of memory (here, pages), and each level above it describes
276 * pairs of units from the levels below, hence, "buddies". 276 * pairs of units from the levels below, hence, "buddies".
277 * At a high level, all that happens here is marking the table entry 277 * At a high level, all that happens here is marking the table entry
278 * at the bottom level available, and propagating the changes upward 278 * at the bottom level available, and propagating the changes upward
279 * as necessary, plus some accounting needed to play nicely with other 279 * as necessary, plus some accounting needed to play nicely with other
280 * parts of the VM system. 280 * parts of the VM system.
281 * At each level, we keep a list of pages, which are heads of contiguous 281 * At each level, we keep a list of pages, which are heads of contiguous
282 * free pages of length (1 << order), marked with PG_private. A page's 282 * free pages of length (1 << order), marked with PG_private. A page's
283 * order is recorded in the page_private(page) field. 283 * order is recorded in the page_private(page) field.
284 * So when we are allocating or freeing one, we can derive the state of the 284 * So when we are allocating or freeing one, we can derive the state of the
285 * other. That is, if we allocate a small block, and both were 285 * other. That is, if we allocate a small block, and both were
286 * free, the remainder of the region must be split into blocks. 286 * free, the remainder of the region must be split into blocks.
287 * If a block is freed, and its buddy is also free, then this 287 * If a block is freed, and its buddy is also free, then this
288 * triggers coalescing into a block of larger size. 288 * triggers coalescing into a block of larger size.
289 * 289 *
290 * -- wli 290 * -- wli
291 */ 291 */
292 292
293 static inline void __free_pages_bulk (struct page *page, 293 static inline void __free_pages_bulk (struct page *page,
294 struct zone *zone, unsigned int order) 294 struct zone *zone, unsigned int order)
295 { 295 {
296 unsigned long page_idx; 296 unsigned long page_idx;
297 int order_size = 1 << order; 297 int order_size = 1 << order;
298 298
299 if (unlikely(order)) 299 if (unlikely(order))
300 destroy_compound_page(page, order); 300 destroy_compound_page(page, order);
301 301
302 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 302 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
303 303
304 BUG_ON(page_idx & (order_size - 1)); 304 BUG_ON(page_idx & (order_size - 1));
305 BUG_ON(bad_range(zone, page)); 305 BUG_ON(bad_range(zone, page));
306 306
307 zone->free_pages += order_size; 307 zone->free_pages += order_size;
308 while (order < MAX_ORDER-1) { 308 while (order < MAX_ORDER-1) {
309 unsigned long combined_idx; 309 unsigned long combined_idx;
310 struct free_area *area; 310 struct free_area *area;
311 struct page *buddy; 311 struct page *buddy;
312 312
313 combined_idx = __find_combined_index(page_idx, order); 313 combined_idx = __find_combined_index(page_idx, order);
314 buddy = __page_find_buddy(page, page_idx, order); 314 buddy = __page_find_buddy(page, page_idx, order);
315 315
316 if (bad_range(zone, buddy)) 316 if (bad_range(zone, buddy))
317 break; 317 break;
318 if (!page_is_buddy(buddy, order)) 318 if (!page_is_buddy(buddy, order))
319 break; /* Move the buddy up one level. */ 319 break; /* Move the buddy up one level. */
320 list_del(&buddy->lru); 320 list_del(&buddy->lru);
321 area = zone->free_area + order; 321 area = zone->free_area + order;
322 area->nr_free--; 322 area->nr_free--;
323 rmv_page_order(buddy); 323 rmv_page_order(buddy);
324 page = page + (combined_idx - page_idx); 324 page = page + (combined_idx - page_idx);
325 page_idx = combined_idx; 325 page_idx = combined_idx;
326 order++; 326 order++;
327 } 327 }
328 set_page_order(page, order); 328 set_page_order(page, order);
329 list_add(&page->lru, &zone->free_area[order].free_list); 329 list_add(&page->lru, &zone->free_area[order].free_list);
330 zone->free_area[order].nr_free++; 330 zone->free_area[order].nr_free++;
331 } 331 }
332 332
333 static inline void free_pages_check(const char *function, struct page *page) 333 static inline void free_pages_check(const char *function, struct page *page)
334 { 334 {
335 if ( page_mapcount(page) || 335 if ( page_mapcount(page) ||
336 page->mapping != NULL || 336 page->mapping != NULL ||
337 page_count(page) != 0 || 337 page_count(page) != 0 ||
338 (page->flags & ( 338 (page->flags & (
339 1 << PG_lru | 339 1 << PG_lru |
340 1 << PG_private | 340 1 << PG_private |
341 1 << PG_locked | 341 1 << PG_locked |
342 1 << PG_active | 342 1 << PG_active |
343 1 << PG_reclaim | 343 1 << PG_reclaim |
344 1 << PG_slab | 344 1 << PG_slab |
345 1 << PG_swapcache | 345 1 << PG_swapcache |
346 1 << PG_writeback | 346 1 << PG_writeback |
347 1 << PG_reserved ))) 347 1 << PG_reserved )))
348 bad_page(function, page); 348 bad_page(function, page);
349 if (PageDirty(page)) 349 if (PageDirty(page))
350 __ClearPageDirty(page); 350 __ClearPageDirty(page);
351 } 351 }
352 352
353 /* 353 /*
354 * Frees a list of pages. 354 * Frees a list of pages.
355 * Assumes all pages on list are in same zone, and of same order. 355 * Assumes all pages on list are in same zone, and of same order.
356 * count is the number of pages to free. 356 * count is the number of pages to free.
357 * 357 *
358 * If the zone was previously in an "all pages pinned" state then look to 358 * If the zone was previously in an "all pages pinned" state then look to
359 * see if this freeing clears that state. 359 * see if this freeing clears that state.
360 * 360 *
361 * And clear the zone's pages_scanned counter, to hold off the "all pages are 361 * And clear the zone's pages_scanned counter, to hold off the "all pages are
362 * pinned" detection logic. 362 * pinned" detection logic.
363 */ 363 */
364 static int 364 static int
365 free_pages_bulk(struct zone *zone, int count, 365 free_pages_bulk(struct zone *zone, int count,
366 struct list_head *list, unsigned int order) 366 struct list_head *list, unsigned int order)
367 { 367 {
368 unsigned long flags; 368 unsigned long flags;
369 struct page *page = NULL; 369 struct page *page = NULL;
370 int ret = 0; 370 int ret = 0;
371 371
372 spin_lock_irqsave(&zone->lock, flags); 372 spin_lock_irqsave(&zone->lock, flags);
373 zone->all_unreclaimable = 0; 373 zone->all_unreclaimable = 0;
374 zone->pages_scanned = 0; 374 zone->pages_scanned = 0;
375 while (!list_empty(list) && count--) { 375 while (!list_empty(list) && count--) {
376 page = list_entry(list->prev, struct page, lru); 376 page = list_entry(list->prev, struct page, lru);
377 /* have to delete it as __free_pages_bulk list manipulates */ 377 /* have to delete it as __free_pages_bulk list manipulates */
378 list_del(&page->lru); 378 list_del(&page->lru);
379 __free_pages_bulk(page, zone, order); 379 __free_pages_bulk(page, zone, order);
380 ret++; 380 ret++;
381 } 381 }
382 spin_unlock_irqrestore(&zone->lock, flags); 382 spin_unlock_irqrestore(&zone->lock, flags);
383 return ret; 383 return ret;
384 } 384 }
385 385
386 void __free_pages_ok(struct page *page, unsigned int order) 386 void __free_pages_ok(struct page *page, unsigned int order)
387 { 387 {
388 LIST_HEAD(list); 388 LIST_HEAD(list);
389 int i; 389 int i;
390 390
391 arch_free_page(page, order); 391 arch_free_page(page, order);
392 392
393 mod_page_state(pgfree, 1 << order); 393 mod_page_state(pgfree, 1 << order);
394 394
395 #ifndef CONFIG_MMU 395 #ifndef CONFIG_MMU
396 if (order > 0) 396 if (order > 0)
397 for (i = 1 ; i < (1 << order) ; ++i) 397 for (i = 1 ; i < (1 << order) ; ++i)
398 __put_page(page + i); 398 __put_page(page + i);
399 #endif 399 #endif
400 400
401 for (i = 0 ; i < (1 << order) ; ++i) 401 for (i = 0 ; i < (1 << order) ; ++i)
402 free_pages_check(__FUNCTION__, page + i); 402 free_pages_check(__FUNCTION__, page + i);
403 list_add(&page->lru, &list); 403 list_add(&page->lru, &list);
404 kernel_map_pages(page, 1<<order, 0); 404 kernel_map_pages(page, 1<<order, 0);
405 free_pages_bulk(page_zone(page), 1, &list, order); 405 free_pages_bulk(page_zone(page), 1, &list, order);
406 } 406 }
407 407
408 408
409 /* 409 /*
410 * The order of subdivision here is critical for the IO subsystem. 410 * The order of subdivision here is critical for the IO subsystem.
411 * Please do not alter this order without good reasons and regression 411 * Please do not alter this order without good reasons and regression
412 * testing. Specifically, as large blocks of memory are subdivided, 412 * testing. Specifically, as large blocks of memory are subdivided,
413 * the order in which smaller blocks are delivered depends on the order 413 * the order in which smaller blocks are delivered depends on the order
414 * they're subdivided in this function. This is the primary factor 414 * they're subdivided in this function. This is the primary factor
415 * influencing the order in which pages are delivered to the IO 415 * influencing the order in which pages are delivered to the IO
416 * subsystem according to empirical testing, and this is also justified 416 * subsystem according to empirical testing, and this is also justified
417 * by considering the behavior of a buddy system containing a single 417 * by considering the behavior of a buddy system containing a single
418 * large block of memory acted on by a series of small allocations. 418 * large block of memory acted on by a series of small allocations.
419 * This behavior is a critical factor in sglist merging's success. 419 * This behavior is a critical factor in sglist merging's success.
420 * 420 *
421 * -- wli 421 * -- wli
422 */ 422 */
423 static inline struct page * 423 static inline struct page *
424 expand(struct zone *zone, struct page *page, 424 expand(struct zone *zone, struct page *page,
425 int low, int high, struct free_area *area) 425 int low, int high, struct free_area *area)
426 { 426 {
427 unsigned long size = 1 << high; 427 unsigned long size = 1 << high;
428 428
429 while (high > low) { 429 while (high > low) {
430 area--; 430 area--;
431 high--; 431 high--;
432 size >>= 1; 432 size >>= 1;
433 BUG_ON(bad_range(zone, &page[size])); 433 BUG_ON(bad_range(zone, &page[size]));
434 list_add(&page[size].lru, &area->free_list); 434 list_add(&page[size].lru, &area->free_list);
435 area->nr_free++; 435 area->nr_free++;
436 set_page_order(&page[size], high); 436 set_page_order(&page[size], high);
437 } 437 }
438 return page; 438 return page;
439 } 439 }
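An illustrative trace (not code from the patch): satisfying an order-0 request from an order-3 free block of eight pages, expand() splits off and re-queues the upper halves at successively smaller orders and hands the first page back:

    /* low = 0, high = 3, size = 8, block = page[0..7]                     */
    /* pass 1: high = 2, size = 4 -> page[4..7] queued at order 2          */
    /* pass 2: high = 1, size = 2 -> page[2..3] queued at order 1          */
    /* pass 3: high = 0, size = 1 -> page[1]    queued at order 0          */
    /* page[0] is returned to satisfy the order-0 allocation               */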
440 440
441 void set_page_refs(struct page *page, int order) 441 void set_page_refs(struct page *page, int order)
442 { 442 {
443 #ifdef CONFIG_MMU 443 #ifdef CONFIG_MMU
444 set_page_count(page, 1); 444 set_page_count(page, 1);
445 #else 445 #else
446 int i; 446 int i;
447 447
448 /* 448 /*
449 * We need to reference all the pages for this order, otherwise if 449 * We need to reference all the pages for this order, otherwise if
450 * anyone accesses one of the pages with (get/put) it will be freed. 450 * anyone accesses one of the pages with (get/put) it will be freed.
451 * - eg: access_process_vm() 451 * - eg: access_process_vm()
452 */ 452 */
453 for (i = 0; i < (1 << order); i++) 453 for (i = 0; i < (1 << order); i++)
454 set_page_count(page + i, 1); 454 set_page_count(page + i, 1);
455 #endif /* CONFIG_MMU */ 455 #endif /* CONFIG_MMU */
456 } 456 }
457 457
458 /* 458 /*
459 * This page is about to be returned from the page allocator 459 * This page is about to be returned from the page allocator
460 */ 460 */
461 static void prep_new_page(struct page *page, int order) 461 static void prep_new_page(struct page *page, int order)
462 { 462 {
463 if ( page_mapcount(page) || 463 if ( page_mapcount(page) ||
464 page->mapping != NULL || 464 page->mapping != NULL ||
465 page_count(page) != 0 || 465 page_count(page) != 0 ||
466 (page->flags & ( 466 (page->flags & (
467 1 << PG_lru | 467 1 << PG_lru |
468 1 << PG_private | 468 1 << PG_private |
469 1 << PG_locked | 469 1 << PG_locked |
470 1 << PG_active | 470 1 << PG_active |
471 1 << PG_dirty | 471 1 << PG_dirty |
472 1 << PG_reclaim | 472 1 << PG_reclaim |
473 1 << PG_slab | 473 1 << PG_slab |
474 1 << PG_swapcache | 474 1 << PG_swapcache |
475 1 << PG_writeback | 475 1 << PG_writeback |
476 1 << PG_reserved ))) 476 1 << PG_reserved )))
477 bad_page(__FUNCTION__, page); 477 bad_page(__FUNCTION__, page);
478 478
479 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 479 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
480 1 << PG_referenced | 1 << PG_arch_1 | 480 1 << PG_referenced | 1 << PG_arch_1 |
481 1 << PG_checked | 1 << PG_mappedtodisk); 481 1 << PG_checked | 1 << PG_mappedtodisk);
482 set_page_private(page, 0); 482 set_page_private(page, 0);
483 set_page_refs(page, order); 483 set_page_refs(page, order);
484 kernel_map_pages(page, 1 << order, 1); 484 kernel_map_pages(page, 1 << order, 1);
485 } 485 }
486 486
487 /* 487 /*
488 * Do the hard work of removing an element from the buddy allocator. 488 * Do the hard work of removing an element from the buddy allocator.
489 * Call me with the zone->lock already held. 489 * Call me with the zone->lock already held.
490 */ 490 */
491 static struct page *__rmqueue(struct zone *zone, unsigned int order) 491 static struct page *__rmqueue(struct zone *zone, unsigned int order)
492 { 492 {
493 struct free_area * area; 493 struct free_area * area;
494 unsigned int current_order; 494 unsigned int current_order;
495 struct page *page; 495 struct page *page;
496 496
497 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 497 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
498 area = zone->free_area + current_order; 498 area = zone->free_area + current_order;
499 if (list_empty(&area->free_list)) 499 if (list_empty(&area->free_list))
500 continue; 500 continue;
501 501
502 page = list_entry(area->free_list.next, struct page, lru); 502 page = list_entry(area->free_list.next, struct page, lru);
503 list_del(&page->lru); 503 list_del(&page->lru);
504 rmv_page_order(page); 504 rmv_page_order(page);
505 area->nr_free--; 505 area->nr_free--;
506 zone->free_pages -= 1UL << order; 506 zone->free_pages -= 1UL << order;
507 return expand(zone, page, order, current_order, area); 507 return expand(zone, page, order, current_order, area);
508 } 508 }
509 509
510 return NULL; 510 return NULL;
511 } 511 }
512 512
513 /* 513 /*
514 * Obtain a specified number of elements from the buddy allocator, all under 514 * Obtain a specified number of elements from the buddy allocator, all under
515 * a single hold of the lock, for efficiency. Add them to the supplied list. 515 * a single hold of the lock, for efficiency. Add them to the supplied list.
516 * Returns the number of new pages which were placed at *list. 516 * Returns the number of new pages which were placed at *list.
517 */ 517 */
518 static int rmqueue_bulk(struct zone *zone, unsigned int order, 518 static int rmqueue_bulk(struct zone *zone, unsigned int order,
519 unsigned long count, struct list_head *list) 519 unsigned long count, struct list_head *list)
520 { 520 {
521 unsigned long flags; 521 unsigned long flags;
522 int i; 522 int i;
523 int allocated = 0; 523 int allocated = 0;
524 struct page *page; 524 struct page *page;
525 525
526 spin_lock_irqsave(&zone->lock, flags); 526 spin_lock_irqsave(&zone->lock, flags);
527 for (i = 0; i < count; ++i) { 527 for (i = 0; i < count; ++i) {
528 page = __rmqueue(zone, order); 528 page = __rmqueue(zone, order);
529 if (page == NULL) 529 if (page == NULL)
530 break; 530 break;
531 allocated++; 531 allocated++;
532 list_add_tail(&page->lru, list); 532 list_add_tail(&page->lru, list);
533 } 533 }
534 spin_unlock_irqrestore(&zone->lock, flags); 534 spin_unlock_irqrestore(&zone->lock, flags);
535 return allocated; 535 return allocated;
536 } 536 }
537 537
538 #ifdef CONFIG_NUMA 538 #ifdef CONFIG_NUMA
539 /* Called from the slab reaper to drain remote pagesets */ 539 /* Called from the slab reaper to drain remote pagesets */
540 void drain_remote_pages(void) 540 void drain_remote_pages(void)
541 { 541 {
542 struct zone *zone; 542 struct zone *zone;
543 int i; 543 int i;
544 unsigned long flags; 544 unsigned long flags;
545 545
546 local_irq_save(flags); 546 local_irq_save(flags);
547 for_each_zone(zone) { 547 for_each_zone(zone) {
548 struct per_cpu_pageset *pset; 548 struct per_cpu_pageset *pset;
549 549
550 /* Do not drain local pagesets */ 550 /* Do not drain local pagesets */
551 if (zone->zone_pgdat->node_id == numa_node_id()) 551 if (zone->zone_pgdat->node_id == numa_node_id())
552 continue; 552 continue;
553 553
554 pset = zone->pageset[smp_processor_id()]; 554 pset = zone->pageset[smp_processor_id()];
555 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 555 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
556 struct per_cpu_pages *pcp; 556 struct per_cpu_pages *pcp;
557 557
558 pcp = &pset->pcp[i]; 558 pcp = &pset->pcp[i];
559 if (pcp->count) 559 if (pcp->count)
560 pcp->count -= free_pages_bulk(zone, pcp->count, 560 pcp->count -= free_pages_bulk(zone, pcp->count,
561 &pcp->list, 0); 561 &pcp->list, 0);
562 } 562 }
563 } 563 }
564 local_irq_restore(flags); 564 local_irq_restore(flags);
565 } 565 }
566 #endif 566 #endif
567 567
568 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 568 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
569 static void __drain_pages(unsigned int cpu) 569 static void __drain_pages(unsigned int cpu)
570 { 570 {
571 struct zone *zone; 571 struct zone *zone;
572 int i; 572 int i;
573 573
574 for_each_zone(zone) { 574 for_each_zone(zone) {
575 struct per_cpu_pageset *pset; 575 struct per_cpu_pageset *pset;
576 576
577 pset = zone_pcp(zone, cpu); 577 pset = zone_pcp(zone, cpu);
578 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 578 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
579 struct per_cpu_pages *pcp; 579 struct per_cpu_pages *pcp;
580 580
581 pcp = &pset->pcp[i]; 581 pcp = &pset->pcp[i];
582 pcp->count -= free_pages_bulk(zone, pcp->count, 582 pcp->count -= free_pages_bulk(zone, pcp->count,
583 &pcp->list, 0); 583 &pcp->list, 0);
584 } 584 }
585 } 585 }
586 } 586 }
587 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 587 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
588 588
589 #ifdef CONFIG_PM 589 #ifdef CONFIG_PM
590 590
591 void mark_free_pages(struct zone *zone) 591 void mark_free_pages(struct zone *zone)
592 { 592 {
593 unsigned long zone_pfn, flags; 593 unsigned long zone_pfn, flags;
594 int order; 594 int order;
595 struct list_head *curr; 595 struct list_head *curr;
596 596
597 if (!zone->spanned_pages) 597 if (!zone->spanned_pages)
598 return; 598 return;
599 599
600 spin_lock_irqsave(&zone->lock, flags); 600 spin_lock_irqsave(&zone->lock, flags);
601 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 601 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
602 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 602 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
603 603
604 for (order = MAX_ORDER - 1; order >= 0; --order) 604 for (order = MAX_ORDER - 1; order >= 0; --order)
605 list_for_each(curr, &zone->free_area[order].free_list) { 605 list_for_each(curr, &zone->free_area[order].free_list) {
606 unsigned long start_pfn, i; 606 unsigned long start_pfn, i;
607 607
608 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 608 start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
609 609
610 for (i=0; i < (1<<order); i++) 610 for (i=0; i < (1<<order); i++)
611 SetPageNosaveFree(pfn_to_page(start_pfn+i)); 611 SetPageNosaveFree(pfn_to_page(start_pfn+i));
612 } 612 }
613 spin_unlock_irqrestore(&zone->lock, flags); 613 spin_unlock_irqrestore(&zone->lock, flags);
614 } 614 }
615 615
616 /* 616 /*
617 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 617 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
618 */ 618 */
619 void drain_local_pages(void) 619 void drain_local_pages(void)
620 { 620 {
621 unsigned long flags; 621 unsigned long flags;
622 622
623 local_irq_save(flags); 623 local_irq_save(flags);
624 __drain_pages(smp_processor_id()); 624 __drain_pages(smp_processor_id());
625 local_irq_restore(flags); 625 local_irq_restore(flags);
626 } 626 }
627 #endif /* CONFIG_PM */ 627 #endif /* CONFIG_PM */
628 628
629 static void zone_statistics(struct zonelist *zonelist, struct zone *z) 629 static void zone_statistics(struct zonelist *zonelist, struct zone *z)
630 { 630 {
631 #ifdef CONFIG_NUMA 631 #ifdef CONFIG_NUMA
632 unsigned long flags; 632 unsigned long flags;
633 int cpu; 633 int cpu;
634 pg_data_t *pg = z->zone_pgdat; 634 pg_data_t *pg = z->zone_pgdat;
635 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 635 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
636 struct per_cpu_pageset *p; 636 struct per_cpu_pageset *p;
637 637
638 local_irq_save(flags); 638 local_irq_save(flags);
639 cpu = smp_processor_id(); 639 cpu = smp_processor_id();
640 p = zone_pcp(z,cpu); 640 p = zone_pcp(z,cpu);
641 if (pg == orig) { 641 if (pg == orig) {
642 p->numa_hit++; 642 p->numa_hit++;
643 } else { 643 } else {
644 p->numa_miss++; 644 p->numa_miss++;
645 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; 645 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
646 } 646 }
647 if (pg == NODE_DATA(numa_node_id())) 647 if (pg == NODE_DATA(numa_node_id()))
648 p->local_node++; 648 p->local_node++;
649 else 649 else
650 p->other_node++; 650 p->other_node++;
651 local_irq_restore(flags); 651 local_irq_restore(flags);
652 #endif 652 #endif
653 } 653 }
654 654
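Editor's note: the counters above are easier to follow with a toy model. A "hit" is charged when the zone that satisfied the allocation belongs to the preferred (first-in-zonelist) node; otherwise the allocating node records a miss and the preferred node records a foreign allocation. The sketch below is a standalone user-space model, not kernel API; the struct, the node IDs, and the omission of the local_node/other_node pair are simplifications for illustration.

/* Toy model of the NUMA counters updated in zone_statistics(). */
#include <stdio.h>

struct numa_stats { unsigned long hit, miss, foreign; };

static void account(struct numa_stats *stats, int alloc_node, int preferred_node)
{
	if (alloc_node == preferred_node) {
		stats[alloc_node].hit++;
	} else {
		stats[alloc_node].miss++;		/* page came from here... */
		stats[preferred_node].foreign++;	/* ...instead of from here */
	}
}

int main(void)
{
	struct numa_stats stats[2] = { { 0 } };

	account(stats, 0, 0);	/* local allocation */
	account(stats, 1, 0);	/* spilled over to node 1 */
	printf("node0: hit=%lu foreign=%lu, node1: miss=%lu\n",
	       stats[0].hit, stats[0].foreign, stats[1].miss);
	return 0;
}
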
655 /* 655 /*
656 * Free a 0-order page 656 * Free a 0-order page
657 */ 657 */
658 static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); 658 static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
659 static void fastcall free_hot_cold_page(struct page *page, int cold) 659 static void fastcall free_hot_cold_page(struct page *page, int cold)
660 { 660 {
661 struct zone *zone = page_zone(page); 661 struct zone *zone = page_zone(page);
662 struct per_cpu_pages *pcp; 662 struct per_cpu_pages *pcp;
663 unsigned long flags; 663 unsigned long flags;
664 664
665 arch_free_page(page, 0); 665 arch_free_page(page, 0);
666 666
667 kernel_map_pages(page, 1, 0); 667 kernel_map_pages(page, 1, 0);
668 inc_page_state(pgfree); 668 inc_page_state(pgfree);
669 if (PageAnon(page)) 669 if (PageAnon(page))
670 page->mapping = NULL; 670 page->mapping = NULL;
671 free_pages_check(__FUNCTION__, page); 671 free_pages_check(__FUNCTION__, page);
672 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 672 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
673 local_irq_save(flags); 673 local_irq_save(flags);
674 list_add(&page->lru, &pcp->list); 674 list_add(&page->lru, &pcp->list);
675 pcp->count++; 675 pcp->count++;
676 if (pcp->count >= pcp->high) 676 if (pcp->count >= pcp->high)
677 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 677 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
678 local_irq_restore(flags); 678 local_irq_restore(flags);
679 put_cpu(); 679 put_cpu();
680 } 680 }
681 681
682 void fastcall free_hot_page(struct page *page) 682 void fastcall free_hot_page(struct page *page)
683 { 683 {
684 free_hot_cold_page(page, 0); 684 free_hot_cold_page(page, 0);
685 } 685 }
686 686
687 void fastcall free_cold_page(struct page *page) 687 void fastcall free_cold_page(struct page *page)
688 { 688 {
689 free_hot_cold_page(page, 1); 689 free_hot_cold_page(page, 1);
690 } 690 }
691 691
692 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 692 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
693 { 693 {
694 int i; 694 int i;
695 695
696 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 696 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
697 for(i = 0; i < (1 << order); i++) 697 for(i = 0; i < (1 << order); i++)
698 clear_highpage(page + i); 698 clear_highpage(page + i);
699 } 699 }
700 700
701 /* 701 /*
702 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 702 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
703 * we cheat by calling it from here, in the order > 0 path. Saves a branch 703 * we cheat by calling it from here, in the order > 0 path. Saves a branch
704 * or two. 704 * or two.
705 */ 705 */
706 static struct page * 706 static struct page *
707 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 707 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
708 { 708 {
709 unsigned long flags; 709 unsigned long flags;
710 struct page *page = NULL; 710 struct page *page = NULL;
711 int cold = !!(gfp_flags & __GFP_COLD); 711 int cold = !!(gfp_flags & __GFP_COLD);
712 712
713 if (order == 0) { 713 if (order == 0) {
714 struct per_cpu_pages *pcp; 714 struct per_cpu_pages *pcp;
715 715
716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
717 local_irq_save(flags); 717 local_irq_save(flags);
718 if (pcp->count <= pcp->low) 718 if (pcp->count <= pcp->low)
719 pcp->count += rmqueue_bulk(zone, 0, 719 pcp->count += rmqueue_bulk(zone, 0,
720 pcp->batch, &pcp->list); 720 pcp->batch, &pcp->list);
721 if (pcp->count) { 721 if (pcp->count) {
722 page = list_entry(pcp->list.next, struct page, lru); 722 page = list_entry(pcp->list.next, struct page, lru);
723 list_del(&page->lru); 723 list_del(&page->lru);
724 pcp->count--; 724 pcp->count--;
725 } 725 }
726 local_irq_restore(flags); 726 local_irq_restore(flags);
727 put_cpu(); 727 put_cpu();
728 } 728 }
729 729
730 if (page == NULL) { 730 if (page == NULL) {
731 spin_lock_irqsave(&zone->lock, flags); 731 spin_lock_irqsave(&zone->lock, flags);
732 page = __rmqueue(zone, order); 732 page = __rmqueue(zone, order);
733 spin_unlock_irqrestore(&zone->lock, flags); 733 spin_unlock_irqrestore(&zone->lock, flags);
734 } 734 }
735 735
736 if (page != NULL) { 736 if (page != NULL) {
737 BUG_ON(bad_range(zone, page)); 737 BUG_ON(bad_range(zone, page));
738 mod_page_state_zone(zone, pgalloc, 1 << order); 738 mod_page_state_zone(zone, pgalloc, 1 << order);
739 prep_new_page(page, order); 739 prep_new_page(page, order);
740 740
741 if (gfp_flags & __GFP_ZERO) 741 if (gfp_flags & __GFP_ZERO)
742 prep_zero_page(page, order, gfp_flags); 742 prep_zero_page(page, order, gfp_flags);
743 743
744 if (order && (gfp_flags & __GFP_COMP)) 744 if (order && (gfp_flags & __GFP_COMP))
745 prep_compound_page(page, order); 745 prep_compound_page(page, order);
746 } 746 }
747 return page; 747 return page;
748 } 748 }
749 749
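Editor's note: to make the low/high/batch thresholds used by free_hot_cold_page() and buffered_rmqueue() concrete, here is a standalone user-space model of just the counters: a free pushes one page onto the per-cpu list and spills a batch back to the buddy lists once the list grows past high; an order-0 allocation refills a batch when the list has dropped to low, then takes one page off the list. The struct name and the threshold values are invented for illustration.

/* Counter-only model of the per-cpu page (pcp) lists. */
#include <stdio.h>

struct pcp_model { int count, low, high, batch; };

static void model_free(struct pcp_model *p)
{
	p->count++;			/* page pushed onto the pcp list */
	if (p->count >= p->high)
		p->count -= p->batch;	/* spill a batch back to the buddy lists */
}

static int model_alloc(struct pcp_model *p)
{
	if (p->count <= p->low)
		p->count += p->batch;	/* pull a batch from the buddy lists */
	if (p->count) {
		p->count--;		/* hand one page to the caller */
		return 1;
	}
	return 0;			/* fall back to the zone->lock path */
}

int main(void)
{
	struct pcp_model p = { .count = 0, .low = 2, .high = 16, .batch = 8 };
	int i;

	for (i = 0; i < 5; i++)
		model_alloc(&p);
	model_free(&p);
	printf("pcp count now %d\n", p.count);
	return 0;
}
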
750 /* 750 /*
751 * Return 1 if free pages are above 'mark'. This takes into account the order 751 * Return 1 if free pages are above 'mark'. This takes into account the order
752 * of the allocation. 752 * of the allocation.
753 */ 753 */
754 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 754 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
755 int classzone_idx, int can_try_harder, gfp_t gfp_high) 755 int classzone_idx, int can_try_harder, gfp_t gfp_high)
756 { 756 {
 757 /* free_pages may go negative - that's OK */ 757 /* free_pages may go negative - that's OK */
758 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 758 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
759 int o; 759 int o;
760 760
761 if (gfp_high) 761 if (gfp_high)
762 min -= min / 2; 762 min -= min / 2;
763 if (can_try_harder) 763 if (can_try_harder)
764 min -= min / 4; 764 min -= min / 4;
765 765
766 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 766 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
767 return 0; 767 return 0;
768 for (o = 0; o < order; o++) { 768 for (o = 0; o < order; o++) {
769 /* At the next order, this order's pages become unavailable */ 769 /* At the next order, this order's pages become unavailable */
770 free_pages -= z->free_area[o].nr_free << o; 770 free_pages -= z->free_area[o].nr_free << o;
771 771
772 /* Require fewer higher order pages to be free */ 772 /* Require fewer higher order pages to be free */
773 min >>= 1; 773 min >>= 1;
774 774
775 if (free_pages <= min) 775 if (free_pages <= min)
776 return 0; 776 return 0;
777 } 777 }
778 return 1; 778 return 1;
779 } 779 }
780 780
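Editor's note: a standalone model of the check above, useful for working through a concrete case: subtract the pages the current request would consume, compare against the (possibly reduced) mark plus the lowmem reserve, then walk the lower orders, discounting their free pages and halving the requirement at each step. All numbers below are invented and the per-order nr_free[] array is a stand-in for zone->free_area[].nr_free.

/* User-space model of the order-aware watermark test. */
#include <stdio.h>

#define MAX_ORDER 11

static int watermark_ok(long free_pages, long mark, long lowmem_reserve,
			int order, const long nr_free[MAX_ORDER])
{
	long min = mark;
	long free = free_pages - (1 << order) + 1;
	int o;

	if (free <= min + lowmem_reserve)
		return 0;
	for (o = 0; o < order; o++) {
		/* pages of this order cannot satisfy the request */
		free -= nr_free[o] << o;
		/* require fewer free pages at each higher order */
		min >>= 1;
		if (free <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	long nr_free[MAX_ORDER] = { 512, 128, 32, 8, 2, 0 };

	/* order-3 request against a hypothetical zone */
	printf("%d\n", watermark_ok(1200, 256, 0, 3, nr_free));
	return 0;
}
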
781 static inline int 781 static inline int
782 should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 782 should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
783 { 783 {
784 if (!z->reclaim_pages) 784 if (!z->reclaim_pages)
785 return 0; 785 return 0;
786 if (gfp_mask & __GFP_NORECLAIM) 786 if (gfp_mask & __GFP_NORECLAIM)
787 return 0; 787 return 0;
788 return 1; 788 return 1;
789 } 789 }
790 790
791 /* 791 /*
792 * This is the 'heart' of the zoned buddy allocator. 792 * This is the 'heart' of the zoned buddy allocator.
793 */ 793 */
794 struct page * fastcall 794 struct page * fastcall
795 __alloc_pages(gfp_t gfp_mask, unsigned int order, 795 __alloc_pages(gfp_t gfp_mask, unsigned int order,
796 struct zonelist *zonelist) 796 struct zonelist *zonelist)
797 { 797 {
798 const gfp_t wait = gfp_mask & __GFP_WAIT; 798 const gfp_t wait = gfp_mask & __GFP_WAIT;
799 struct zone **zones, *z; 799 struct zone **zones, *z;
800 struct page *page; 800 struct page *page;
801 struct reclaim_state reclaim_state; 801 struct reclaim_state reclaim_state;
802 struct task_struct *p = current; 802 struct task_struct *p = current;
803 int i; 803 int i;
804 int classzone_idx; 804 int classzone_idx;
805 int do_retry; 805 int do_retry;
806 int can_try_harder; 806 int can_try_harder;
807 int did_some_progress; 807 int did_some_progress;
808 808
809 might_sleep_if(wait); 809 might_sleep_if(wait);
810 810
811 /* 811 /*
812 * The caller may dip into page reserves a bit more if the caller 812 * The caller may dip into page reserves a bit more if the caller
 813 * cannot run direct reclaim, or if the caller has realtime scheduling 813 * cannot run direct reclaim, or if the caller has realtime scheduling
814 * policy 814 * policy
815 */ 815 */
816 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; 816 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
817 817
818 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ 818 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
819 819
820 if (unlikely(zones[0] == NULL)) { 820 if (unlikely(zones[0] == NULL)) {
821 /* Should this ever happen?? */ 821 /* Should this ever happen?? */
822 return NULL; 822 return NULL;
823 } 823 }
824 824
825 classzone_idx = zone_idx(zones[0]); 825 classzone_idx = zone_idx(zones[0]);
826 826
827 restart: 827 restart:
828 /* 828 /*
829 * Go through the zonelist once, looking for a zone with enough free. 829 * Go through the zonelist once, looking for a zone with enough free.
830 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 830 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
831 */ 831 */
832 for (i = 0; (z = zones[i]) != NULL; i++) { 832 for (i = 0; (z = zones[i]) != NULL; i++) {
833 int do_reclaim = should_reclaim_zone(z, gfp_mask); 833 int do_reclaim = should_reclaim_zone(z, gfp_mask);
834 834
835 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 835 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
836 continue; 836 continue;
837 837
838 /* 838 /*
839 * If the zone is to attempt early page reclaim then this loop 839 * If the zone is to attempt early page reclaim then this loop
840 * will try to reclaim pages and check the watermark a second 840 * will try to reclaim pages and check the watermark a second
841 * time before giving up and falling back to the next zone. 841 * time before giving up and falling back to the next zone.
842 */ 842 */
843 zone_reclaim_retry: 843 zone_reclaim_retry:
844 if (!zone_watermark_ok(z, order, z->pages_low, 844 if (!zone_watermark_ok(z, order, z->pages_low,
845 classzone_idx, 0, 0)) { 845 classzone_idx, 0, 0)) {
846 if (!do_reclaim) 846 if (!do_reclaim)
847 continue; 847 continue;
848 else { 848 else {
849 zone_reclaim(z, gfp_mask, order); 849 zone_reclaim(z, gfp_mask, order);
850 /* Only try reclaim once */ 850 /* Only try reclaim once */
851 do_reclaim = 0; 851 do_reclaim = 0;
852 goto zone_reclaim_retry; 852 goto zone_reclaim_retry;
853 } 853 }
854 } 854 }
855 855
856 page = buffered_rmqueue(z, order, gfp_mask); 856 page = buffered_rmqueue(z, order, gfp_mask);
857 if (page) 857 if (page)
858 goto got_pg; 858 goto got_pg;
859 } 859 }
860 860
861 for (i = 0; (z = zones[i]) != NULL; i++) 861 for (i = 0; (z = zones[i]) != NULL; i++)
862 wakeup_kswapd(z, order); 862 wakeup_kswapd(z, order);
863 863
864 /* 864 /*
865 * Go through the zonelist again. Let __GFP_HIGH and allocations 865 * Go through the zonelist again. Let __GFP_HIGH and allocations
 866 * coming from realtime tasks go deeper into reserves 866 * coming from realtime tasks go deeper into reserves
867 * 867 *
868 * This is the last chance, in general, before the goto nopage. 868 * This is the last chance, in general, before the goto nopage.
869 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 869 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
870 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 870 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
871 */ 871 */
872 for (i = 0; (z = zones[i]) != NULL; i++) { 872 for (i = 0; (z = zones[i]) != NULL; i++) {
873 if (!zone_watermark_ok(z, order, z->pages_min, 873 if (!zone_watermark_ok(z, order, z->pages_min,
874 classzone_idx, can_try_harder, 874 classzone_idx, can_try_harder,
875 gfp_mask & __GFP_HIGH)) 875 gfp_mask & __GFP_HIGH))
876 continue; 876 continue;
877 877
878 if (wait && !cpuset_zone_allowed(z, gfp_mask)) 878 if (wait && !cpuset_zone_allowed(z, gfp_mask))
879 continue; 879 continue;
880 880
881 page = buffered_rmqueue(z, order, gfp_mask); 881 page = buffered_rmqueue(z, order, gfp_mask);
882 if (page) 882 if (page)
883 goto got_pg; 883 goto got_pg;
884 } 884 }
885 885
886 /* This allocation should allow future memory freeing. */ 886 /* This allocation should allow future memory freeing. */
887 887
888 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 888 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
889 && !in_interrupt()) { 889 && !in_interrupt()) {
890 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 890 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
891 /* go through the zonelist yet again, ignoring mins */ 891 /* go through the zonelist yet again, ignoring mins */
892 for (i = 0; (z = zones[i]) != NULL; i++) { 892 for (i = 0; (z = zones[i]) != NULL; i++) {
893 if (!cpuset_zone_allowed(z, gfp_mask)) 893 if (!cpuset_zone_allowed(z, gfp_mask))
894 continue; 894 continue;
895 page = buffered_rmqueue(z, order, gfp_mask); 895 page = buffered_rmqueue(z, order, gfp_mask);
896 if (page) 896 if (page)
897 goto got_pg; 897 goto got_pg;
898 } 898 }
899 } 899 }
900 goto nopage; 900 goto nopage;
901 } 901 }
902 902
903 /* Atomic allocations - we can't balance anything */ 903 /* Atomic allocations - we can't balance anything */
904 if (!wait) 904 if (!wait)
905 goto nopage; 905 goto nopage;
906 906
907 rebalance: 907 rebalance:
908 cond_resched(); 908 cond_resched();
909 909
910 /* We now go into synchronous reclaim */ 910 /* We now go into synchronous reclaim */
911 p->flags |= PF_MEMALLOC; 911 p->flags |= PF_MEMALLOC;
912 reclaim_state.reclaimed_slab = 0; 912 reclaim_state.reclaimed_slab = 0;
913 p->reclaim_state = &reclaim_state; 913 p->reclaim_state = &reclaim_state;
914 914
915 did_some_progress = try_to_free_pages(zones, gfp_mask); 915 did_some_progress = try_to_free_pages(zones, gfp_mask);
916 916
917 p->reclaim_state = NULL; 917 p->reclaim_state = NULL;
918 p->flags &= ~PF_MEMALLOC; 918 p->flags &= ~PF_MEMALLOC;
919 919
920 cond_resched(); 920 cond_resched();
921 921
922 if (likely(did_some_progress)) { 922 if (likely(did_some_progress)) {
923 for (i = 0; (z = zones[i]) != NULL; i++) { 923 for (i = 0; (z = zones[i]) != NULL; i++) {
924 if (!zone_watermark_ok(z, order, z->pages_min, 924 if (!zone_watermark_ok(z, order, z->pages_min,
925 classzone_idx, can_try_harder, 925 classzone_idx, can_try_harder,
926 gfp_mask & __GFP_HIGH)) 926 gfp_mask & __GFP_HIGH))
927 continue; 927 continue;
928 928
929 if (!cpuset_zone_allowed(z, gfp_mask)) 929 if (!cpuset_zone_allowed(z, gfp_mask))
930 continue; 930 continue;
931 931
932 page = buffered_rmqueue(z, order, gfp_mask); 932 page = buffered_rmqueue(z, order, gfp_mask);
933 if (page) 933 if (page)
934 goto got_pg; 934 goto got_pg;
935 } 935 }
936 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 936 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
937 /* 937 /*
938 * Go through the zonelist yet one more time, keep 938 * Go through the zonelist yet one more time, keep
939 * very high watermark here, this is only to catch 939 * very high watermark here, this is only to catch
940 * a parallel oom killing, we must fail if we're still 940 * a parallel oom killing, we must fail if we're still
941 * under heavy pressure. 941 * under heavy pressure.
942 */ 942 */
943 for (i = 0; (z = zones[i]) != NULL; i++) { 943 for (i = 0; (z = zones[i]) != NULL; i++) {
944 if (!zone_watermark_ok(z, order, z->pages_high, 944 if (!zone_watermark_ok(z, order, z->pages_high,
945 classzone_idx, 0, 0)) 945 classzone_idx, 0, 0))
946 continue; 946 continue;
947 947
948 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 948 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
949 continue; 949 continue;
950 950
951 page = buffered_rmqueue(z, order, gfp_mask); 951 page = buffered_rmqueue(z, order, gfp_mask);
952 if (page) 952 if (page)
953 goto got_pg; 953 goto got_pg;
954 } 954 }
955 955
956 out_of_memory(gfp_mask, order); 956 out_of_memory(gfp_mask, order);
957 goto restart; 957 goto restart;
958 } 958 }
959 959
960 /* 960 /*
961 * Don't let big-order allocations loop unless the caller explicitly 961 * Don't let big-order allocations loop unless the caller explicitly
962 * requests that. Wait for some write requests to complete then retry. 962 * requests that. Wait for some write requests to complete then retry.
963 * 963 *
964 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 964 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
965 * <= 3, but that may not be true in other implementations. 965 * <= 3, but that may not be true in other implementations.
966 */ 966 */
967 do_retry = 0; 967 do_retry = 0;
968 if (!(gfp_mask & __GFP_NORETRY)) { 968 if (!(gfp_mask & __GFP_NORETRY)) {
969 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 969 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
970 do_retry = 1; 970 do_retry = 1;
971 if (gfp_mask & __GFP_NOFAIL) 971 if (gfp_mask & __GFP_NOFAIL)
972 do_retry = 1; 972 do_retry = 1;
973 } 973 }
974 if (do_retry) { 974 if (do_retry) {
975 blk_congestion_wait(WRITE, HZ/50); 975 blk_congestion_wait(WRITE, HZ/50);
976 goto rebalance; 976 goto rebalance;
977 } 977 }
978 978
979 nopage: 979 nopage:
980 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 980 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
981 printk(KERN_WARNING "%s: page allocation failure." 981 printk(KERN_WARNING "%s: page allocation failure."
982 " order:%d, mode:0x%x\n", 982 " order:%d, mode:0x%x\n",
983 p->comm, order, gfp_mask); 983 p->comm, order, gfp_mask);
984 dump_stack(); 984 dump_stack();
985 show_mem(); 985 show_mem();
986 } 986 }
987 return NULL; 987 return NULL;
988 got_pg: 988 got_pg:
989 zone_statistics(zonelist, z); 989 zone_statistics(zonelist, z);
990 return page; 990 return page;
991 } 991 }
992 992
993 EXPORT_SYMBOL(__alloc_pages); 993 EXPORT_SYMBOL(__alloc_pages);
994 994
995 /* 995 /*
996 * Common helper functions. 996 * Common helper functions.
997 */ 997 */
998 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 998 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
999 { 999 {
1000 struct page * page; 1000 struct page * page;
1001 page = alloc_pages(gfp_mask, order); 1001 page = alloc_pages(gfp_mask, order);
1002 if (!page) 1002 if (!page)
1003 return 0; 1003 return 0;
1004 return (unsigned long) page_address(page); 1004 return (unsigned long) page_address(page);
1005 } 1005 }
1006 1006
1007 EXPORT_SYMBOL(__get_free_pages); 1007 EXPORT_SYMBOL(__get_free_pages);
1008 1008
1009 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1009 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1010 { 1010 {
1011 struct page * page; 1011 struct page * page;
1012 1012
1013 /* 1013 /*
1014 * get_zeroed_page() returns a 32-bit address, which cannot represent 1014 * get_zeroed_page() returns a 32-bit address, which cannot represent
1015 * a highmem page 1015 * a highmem page
1016 */ 1016 */
1017 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1017 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1018 1018
1019 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1019 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1020 if (page) 1020 if (page)
1021 return (unsigned long) page_address(page); 1021 return (unsigned long) page_address(page);
1022 return 0; 1022 return 0;
1023 } 1023 }
1024 1024
1025 EXPORT_SYMBOL(get_zeroed_page); 1025 EXPORT_SYMBOL(get_zeroed_page);
1026 1026
1027 void __pagevec_free(struct pagevec *pvec) 1027 void __pagevec_free(struct pagevec *pvec)
1028 { 1028 {
1029 int i = pagevec_count(pvec); 1029 int i = pagevec_count(pvec);
1030 1030
1031 while (--i >= 0) 1031 while (--i >= 0)
1032 free_hot_cold_page(pvec->pages[i], pvec->cold); 1032 free_hot_cold_page(pvec->pages[i], pvec->cold);
1033 } 1033 }
1034 1034
1035 fastcall void __free_pages(struct page *page, unsigned int order) 1035 fastcall void __free_pages(struct page *page, unsigned int order)
1036 { 1036 {
1037 if (put_page_testzero(page)) { 1037 if (put_page_testzero(page)) {
1038 if (order == 0) 1038 if (order == 0)
1039 free_hot_page(page); 1039 free_hot_page(page);
1040 else 1040 else
1041 __free_pages_ok(page, order); 1041 __free_pages_ok(page, order);
1042 } 1042 }
1043 } 1043 }
1044 1044
1045 EXPORT_SYMBOL(__free_pages); 1045 EXPORT_SYMBOL(__free_pages);
1046 1046
1047 fastcall void free_pages(unsigned long addr, unsigned int order) 1047 fastcall void free_pages(unsigned long addr, unsigned int order)
1048 { 1048 {
1049 if (addr != 0) { 1049 if (addr != 0) {
1050 BUG_ON(!virt_addr_valid((void *)addr)); 1050 BUG_ON(!virt_addr_valid((void *)addr));
1051 __free_pages(virt_to_page((void *)addr), order); 1051 __free_pages(virt_to_page((void *)addr), order);
1052 } 1052 }
1053 } 1053 }
1054 1054
1055 EXPORT_SYMBOL(free_pages); 1055 EXPORT_SYMBOL(free_pages);
1056 1056
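Editor's note: for reference, a minimal kernel-module-style sketch of how callers typically use these helpers. The module boilerplate is illustrative only and is not part of this patch; error handling is reduced to the bare minimum.

/* Allocate two zeroed, physically contiguous pages and free them on exit. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static unsigned long buf;

static int __init demo_init(void)
{
	/* order 1 == two contiguous pages, zeroed before being returned */
	buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
	if (!buf)
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	free_pages(buf, 1);	/* order must match the allocation */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
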
1057 /* 1057 /*
1058 * Total amount of free (allocatable) RAM: 1058 * Total amount of free (allocatable) RAM:
1059 */ 1059 */
1060 unsigned int nr_free_pages(void) 1060 unsigned int nr_free_pages(void)
1061 { 1061 {
1062 unsigned int sum = 0; 1062 unsigned int sum = 0;
1063 struct zone *zone; 1063 struct zone *zone;
1064 1064
1065 for_each_zone(zone) 1065 for_each_zone(zone)
1066 sum += zone->free_pages; 1066 sum += zone->free_pages;
1067 1067
1068 return sum; 1068 return sum;
1069 } 1069 }
1070 1070
1071 EXPORT_SYMBOL(nr_free_pages); 1071 EXPORT_SYMBOL(nr_free_pages);
1072 1072
1073 #ifdef CONFIG_NUMA 1073 #ifdef CONFIG_NUMA
1074 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1074 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1075 { 1075 {
1076 unsigned int i, sum = 0; 1076 unsigned int i, sum = 0;
1077 1077
1078 for (i = 0; i < MAX_NR_ZONES; i++) 1078 for (i = 0; i < MAX_NR_ZONES; i++)
1079 sum += pgdat->node_zones[i].free_pages; 1079 sum += pgdat->node_zones[i].free_pages;
1080 1080
1081 return sum; 1081 return sum;
1082 } 1082 }
1083 #endif 1083 #endif
1084 1084
1085 static unsigned int nr_free_zone_pages(int offset) 1085 static unsigned int nr_free_zone_pages(int offset)
1086 { 1086 {
1087 /* Just pick one node, since fallback list is circular */ 1087 /* Just pick one node, since fallback list is circular */
1088 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1088 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1089 unsigned int sum = 0; 1089 unsigned int sum = 0;
1090 1090
1091 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1091 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1092 struct zone **zonep = zonelist->zones; 1092 struct zone **zonep = zonelist->zones;
1093 struct zone *zone; 1093 struct zone *zone;
1094 1094
1095 for (zone = *zonep++; zone; zone = *zonep++) { 1095 for (zone = *zonep++; zone; zone = *zonep++) {
1096 unsigned long size = zone->present_pages; 1096 unsigned long size = zone->present_pages;
1097 unsigned long high = zone->pages_high; 1097 unsigned long high = zone->pages_high;
1098 if (size > high) 1098 if (size > high)
1099 sum += size - high; 1099 sum += size - high;
1100 } 1100 }
1101 1101
1102 return sum; 1102 return sum;
1103 } 1103 }
1104 1104
1105 /* 1105 /*
1106 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1106 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1107 */ 1107 */
1108 unsigned int nr_free_buffer_pages(void) 1108 unsigned int nr_free_buffer_pages(void)
1109 { 1109 {
1110 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1110 return nr_free_zone_pages(gfp_zone(GFP_USER));
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Amount of free RAM allocatable within all zones 1114 * Amount of free RAM allocatable within all zones
1115 */ 1115 */
1116 unsigned int nr_free_pagecache_pages(void) 1116 unsigned int nr_free_pagecache_pages(void)
1117 { 1117 {
1118 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1118 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1119 } 1119 }
1120 1120
1121 #ifdef CONFIG_HIGHMEM 1121 #ifdef CONFIG_HIGHMEM
1122 unsigned int nr_free_highpages (void) 1122 unsigned int nr_free_highpages (void)
1123 { 1123 {
1124 pg_data_t *pgdat; 1124 pg_data_t *pgdat;
1125 unsigned int pages = 0; 1125 unsigned int pages = 0;
1126 1126
1127 for_each_pgdat(pgdat) 1127 for_each_pgdat(pgdat)
1128 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1128 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1129 1129
1130 return pages; 1130 return pages;
1131 } 1131 }
1132 #endif 1132 #endif
1133 1133
1134 #ifdef CONFIG_NUMA 1134 #ifdef CONFIG_NUMA
1135 static void show_node(struct zone *zone) 1135 static void show_node(struct zone *zone)
1136 { 1136 {
1137 printk("Node %d ", zone->zone_pgdat->node_id); 1137 printk("Node %d ", zone->zone_pgdat->node_id);
1138 } 1138 }
1139 #else 1139 #else
1140 #define show_node(zone) do { } while (0) 1140 #define show_node(zone) do { } while (0)
1141 #endif 1141 #endif
1142 1142
1143 /* 1143 /*
1144 * Accumulate the page_state information across all CPUs. 1144 * Accumulate the page_state information across all CPUs.
1145 * The result is unavoidably approximate - it can change 1145 * The result is unavoidably approximate - it can change
1146 * during and after execution of this function. 1146 * during and after execution of this function.
1147 */ 1147 */
1148 static DEFINE_PER_CPU(struct page_state, page_states) = {0}; 1148 static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1149 1149
1150 atomic_t nr_pagecache = ATOMIC_INIT(0); 1150 atomic_t nr_pagecache = ATOMIC_INIT(0);
1151 EXPORT_SYMBOL(nr_pagecache); 1151 EXPORT_SYMBOL(nr_pagecache);
1152 #ifdef CONFIG_SMP 1152 #ifdef CONFIG_SMP
1153 DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1153 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1154 #endif 1154 #endif
1155 1155
1156 void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1156 void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1157 { 1157 {
1158 int cpu = 0; 1158 int cpu = 0;
1159 1159
1160 memset(ret, 0, sizeof(*ret)); 1160 memset(ret, 0, sizeof(*ret));
1161 cpus_and(*cpumask, *cpumask, cpu_online_map); 1161 cpus_and(*cpumask, *cpumask, cpu_online_map);
1162 1162
1163 cpu = first_cpu(*cpumask); 1163 cpu = first_cpu(*cpumask);
1164 while (cpu < NR_CPUS) { 1164 while (cpu < NR_CPUS) {
1165 unsigned long *in, *out, off; 1165 unsigned long *in, *out, off;
1166 1166
1167 in = (unsigned long *)&per_cpu(page_states, cpu); 1167 in = (unsigned long *)&per_cpu(page_states, cpu);
1168 1168
1169 cpu = next_cpu(cpu, *cpumask); 1169 cpu = next_cpu(cpu, *cpumask);
1170 1170
1171 if (cpu < NR_CPUS) 1171 if (cpu < NR_CPUS)
1172 prefetch(&per_cpu(page_states, cpu)); 1172 prefetch(&per_cpu(page_states, cpu));
1173 1173
1174 out = (unsigned long *)ret; 1174 out = (unsigned long *)ret;
1175 for (off = 0; off < nr; off++) 1175 for (off = 0; off < nr; off++)
1176 *out++ += *in++; 1176 *out++ += *in++;
1177 } 1177 }
1178 } 1178 }
1179 1179
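Editor's note: the accumulation loop above treats struct page_state as a flat array of unsigned longs and sums it field by field across CPUs. Below is a standalone model of that idea with an invented three-field struct and made-up per-CPU values; it is not the kernel's page_state layout.

/* Sum per-CPU counter structs into one snapshot, field by field. */
#include <stdio.h>
#include <string.h>

#define NCPUS 4

struct page_state_model { unsigned long nr_dirty, nr_writeback, pgfree; };

static struct page_state_model per_cpu_state[NCPUS] = {
	{ 3, 1, 100 }, { 0, 2, 250 }, { 5, 0, 80 }, { 1, 1, 40 },
};

static void get_state(struct page_state_model *ret)
{
	size_t nr = sizeof(*ret) / sizeof(unsigned long);
	unsigned long *in, *out;
	size_t off;
	int cpu;

	memset(ret, 0, sizeof(*ret));
	for (cpu = 0; cpu < NCPUS; cpu++) {
		in = (unsigned long *)&per_cpu_state[cpu];
		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			out[off] += in[off];
	}
}

int main(void)
{
	struct page_state_model s;

	get_state(&s);
	printf("dirty=%lu writeback=%lu pgfree=%lu\n",
	       s.nr_dirty, s.nr_writeback, s.pgfree);
	return 0;
}
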
1180 void get_page_state_node(struct page_state *ret, int node) 1180 void get_page_state_node(struct page_state *ret, int node)
1181 { 1181 {
1182 int nr; 1182 int nr;
1183 cpumask_t mask = node_to_cpumask(node); 1183 cpumask_t mask = node_to_cpumask(node);
1184 1184
1185 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1185 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1186 nr /= sizeof(unsigned long); 1186 nr /= sizeof(unsigned long);
1187 1187
1188 __get_page_state(ret, nr+1, &mask); 1188 __get_page_state(ret, nr+1, &mask);
1189 } 1189 }
1190 1190
1191 void get_page_state(struct page_state *ret) 1191 void get_page_state(struct page_state *ret)
1192 { 1192 {
1193 int nr; 1193 int nr;
1194 cpumask_t mask = CPU_MASK_ALL; 1194 cpumask_t mask = CPU_MASK_ALL;
1195 1195
1196 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1196 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1197 nr /= sizeof(unsigned long); 1197 nr /= sizeof(unsigned long);
1198 1198
1199 __get_page_state(ret, nr + 1, &mask); 1199 __get_page_state(ret, nr + 1, &mask);
1200 } 1200 }
1201 1201
1202 void get_full_page_state(struct page_state *ret) 1202 void get_full_page_state(struct page_state *ret)
1203 { 1203 {
1204 cpumask_t mask = CPU_MASK_ALL; 1204 cpumask_t mask = CPU_MASK_ALL;
1205 1205
1206 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1206 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1207 } 1207 }
1208 1208
1209 unsigned long __read_page_state(unsigned long offset) 1209 unsigned long __read_page_state(unsigned long offset)
1210 { 1210 {
1211 unsigned long ret = 0; 1211 unsigned long ret = 0;
1212 int cpu; 1212 int cpu;
1213 1213
1214 for_each_online_cpu(cpu) { 1214 for_each_online_cpu(cpu) {
1215 unsigned long in; 1215 unsigned long in;
1216 1216
1217 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1217 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1218 ret += *((unsigned long *)in); 1218 ret += *((unsigned long *)in);
1219 } 1219 }
1220 return ret; 1220 return ret;
1221 } 1221 }
1222 1222
1223 void __mod_page_state(unsigned long offset, unsigned long delta) 1223 void __mod_page_state(unsigned long offset, unsigned long delta)
1224 { 1224 {
1225 unsigned long flags; 1225 unsigned long flags;
1226 void* ptr; 1226 void* ptr;
1227 1227
1228 local_irq_save(flags); 1228 local_irq_save(flags);
1229 ptr = &__get_cpu_var(page_states); 1229 ptr = &__get_cpu_var(page_states);
1230 *(unsigned long*)(ptr + offset) += delta; 1230 *(unsigned long*)(ptr + offset) += delta;
1231 local_irq_restore(flags); 1231 local_irq_restore(flags);
1232 } 1232 }
1233 1233
1234 EXPORT_SYMBOL(__mod_page_state); 1234 EXPORT_SYMBOL(__mod_page_state);
1235 1235
1236 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1236 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1237 unsigned long *free, struct pglist_data *pgdat) 1237 unsigned long *free, struct pglist_data *pgdat)
1238 { 1238 {
1239 struct zone *zones = pgdat->node_zones; 1239 struct zone *zones = pgdat->node_zones;
1240 int i; 1240 int i;
1241 1241
1242 *active = 0; 1242 *active = 0;
1243 *inactive = 0; 1243 *inactive = 0;
1244 *free = 0; 1244 *free = 0;
1245 for (i = 0; i < MAX_NR_ZONES; i++) { 1245 for (i = 0; i < MAX_NR_ZONES; i++) {
1246 *active += zones[i].nr_active; 1246 *active += zones[i].nr_active;
1247 *inactive += zones[i].nr_inactive; 1247 *inactive += zones[i].nr_inactive;
1248 *free += zones[i].free_pages; 1248 *free += zones[i].free_pages;
1249 } 1249 }
1250 } 1250 }
1251 1251
1252 void get_zone_counts(unsigned long *active, 1252 void get_zone_counts(unsigned long *active,
1253 unsigned long *inactive, unsigned long *free) 1253 unsigned long *inactive, unsigned long *free)
1254 { 1254 {
1255 struct pglist_data *pgdat; 1255 struct pglist_data *pgdat;
1256 1256
1257 *active = 0; 1257 *active = 0;
1258 *inactive = 0; 1258 *inactive = 0;
1259 *free = 0; 1259 *free = 0;
1260 for_each_pgdat(pgdat) { 1260 for_each_pgdat(pgdat) {
1261 unsigned long l, m, n; 1261 unsigned long l, m, n;
1262 __get_zone_counts(&l, &m, &n, pgdat); 1262 __get_zone_counts(&l, &m, &n, pgdat);
1263 *active += l; 1263 *active += l;
1264 *inactive += m; 1264 *inactive += m;
1265 *free += n; 1265 *free += n;
1266 } 1266 }
1267 } 1267 }
1268 1268
1269 void si_meminfo(struct sysinfo *val) 1269 void si_meminfo(struct sysinfo *val)
1270 { 1270 {
1271 val->totalram = totalram_pages; 1271 val->totalram = totalram_pages;
1272 val->sharedram = 0; 1272 val->sharedram = 0;
1273 val->freeram = nr_free_pages(); 1273 val->freeram = nr_free_pages();
1274 val->bufferram = nr_blockdev_pages(); 1274 val->bufferram = nr_blockdev_pages();
1275 #ifdef CONFIG_HIGHMEM 1275 #ifdef CONFIG_HIGHMEM
1276 val->totalhigh = totalhigh_pages; 1276 val->totalhigh = totalhigh_pages;
1277 val->freehigh = nr_free_highpages(); 1277 val->freehigh = nr_free_highpages();
1278 #else 1278 #else
1279 val->totalhigh = 0; 1279 val->totalhigh = 0;
1280 val->freehigh = 0; 1280 val->freehigh = 0;
1281 #endif 1281 #endif
1282 val->mem_unit = PAGE_SIZE; 1282 val->mem_unit = PAGE_SIZE;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(si_meminfo); 1285 EXPORT_SYMBOL(si_meminfo);
1286 1286
1287 #ifdef CONFIG_NUMA 1287 #ifdef CONFIG_NUMA
1288 void si_meminfo_node(struct sysinfo *val, int nid) 1288 void si_meminfo_node(struct sysinfo *val, int nid)
1289 { 1289 {
1290 pg_data_t *pgdat = NODE_DATA(nid); 1290 pg_data_t *pgdat = NODE_DATA(nid);
1291 1291
1292 val->totalram = pgdat->node_present_pages; 1292 val->totalram = pgdat->node_present_pages;
1293 val->freeram = nr_free_pages_pgdat(pgdat); 1293 val->freeram = nr_free_pages_pgdat(pgdat);
1294 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1294 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1295 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1295 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1296 val->mem_unit = PAGE_SIZE; 1296 val->mem_unit = PAGE_SIZE;
1297 } 1297 }
1298 #endif 1298 #endif
1299 1299
1300 #define K(x) ((x) << (PAGE_SHIFT-10)) 1300 #define K(x) ((x) << (PAGE_SHIFT-10))
1301 1301
1302 /* 1302 /*
1303 * Show free area list (used inside shift_scroll-lock stuff) 1303 * Show free area list (used inside shift_scroll-lock stuff)
1304 * We also calculate the percentage fragmentation. We do this by counting the 1304 * We also calculate the percentage fragmentation. We do this by counting the
1305 * memory on each free list with the exception of the first item on the list. 1305 * memory on each free list with the exception of the first item on the list.
1306 */ 1306 */
1307 void show_free_areas(void) 1307 void show_free_areas(void)
1308 { 1308 {
1309 struct page_state ps; 1309 struct page_state ps;
1310 int cpu, temperature; 1310 int cpu, temperature;
1311 unsigned long active; 1311 unsigned long active;
1312 unsigned long inactive; 1312 unsigned long inactive;
1313 unsigned long free; 1313 unsigned long free;
1314 struct zone *zone; 1314 struct zone *zone;
1315 1315
1316 for_each_zone(zone) { 1316 for_each_zone(zone) {
1317 show_node(zone); 1317 show_node(zone);
1318 printk("%s per-cpu:", zone->name); 1318 printk("%s per-cpu:", zone->name);
1319 1319
1320 if (!zone->present_pages) { 1320 if (!zone->present_pages) {
1321 printk(" empty\n"); 1321 printk(" empty\n");
1322 continue; 1322 continue;
1323 } else 1323 } else
1324 printk("\n"); 1324 printk("\n");
1325 1325
1326 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1326 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
1327 struct per_cpu_pageset *pageset; 1327 struct per_cpu_pageset *pageset;
1328 1328
1329 if (!cpu_possible(cpu)) 1329 if (!cpu_possible(cpu))
1330 continue; 1330 continue;
1331 1331
1332 pageset = zone_pcp(zone, cpu); 1332 pageset = zone_pcp(zone, cpu);
1333 1333
1334 for (temperature = 0; temperature < 2; temperature++) 1334 for (temperature = 0; temperature < 2; temperature++)
1335 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1335 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
1336 cpu, 1336 cpu,
1337 temperature ? "cold" : "hot", 1337 temperature ? "cold" : "hot",
1338 pageset->pcp[temperature].low, 1338 pageset->pcp[temperature].low,
1339 pageset->pcp[temperature].high, 1339 pageset->pcp[temperature].high,
1340 pageset->pcp[temperature].batch, 1340 pageset->pcp[temperature].batch,
1341 pageset->pcp[temperature].count); 1341 pageset->pcp[temperature].count);
1342 } 1342 }
1343 } 1343 }
1344 1344
1345 get_page_state(&ps); 1345 get_page_state(&ps);
1346 get_zone_counts(&active, &inactive, &free); 1346 get_zone_counts(&active, &inactive, &free);
1347 1347
1348 printk("Free pages: %11ukB (%ukB HighMem)\n", 1348 printk("Free pages: %11ukB (%ukB HighMem)\n",
1349 K(nr_free_pages()), 1349 K(nr_free_pages()),
1350 K(nr_free_highpages())); 1350 K(nr_free_highpages()));
1351 1351
1352 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1352 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1353 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1353 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1354 active, 1354 active,
1355 inactive, 1355 inactive,
1356 ps.nr_dirty, 1356 ps.nr_dirty,
1357 ps.nr_writeback, 1357 ps.nr_writeback,
1358 ps.nr_unstable, 1358 ps.nr_unstable,
1359 nr_free_pages(), 1359 nr_free_pages(),
1360 ps.nr_slab, 1360 ps.nr_slab,
1361 ps.nr_mapped, 1361 ps.nr_mapped,
1362 ps.nr_page_table_pages); 1362 ps.nr_page_table_pages);
1363 1363
1364 for_each_zone(zone) { 1364 for_each_zone(zone) {
1365 int i; 1365 int i;
1366 1366
1367 show_node(zone); 1367 show_node(zone);
1368 printk("%s" 1368 printk("%s"
1369 " free:%lukB" 1369 " free:%lukB"
1370 " min:%lukB" 1370 " min:%lukB"
1371 " low:%lukB" 1371 " low:%lukB"
1372 " high:%lukB" 1372 " high:%lukB"
1373 " active:%lukB" 1373 " active:%lukB"
1374 " inactive:%lukB" 1374 " inactive:%lukB"
1375 " present:%lukB" 1375 " present:%lukB"
1376 " pages_scanned:%lu" 1376 " pages_scanned:%lu"
1377 " all_unreclaimable? %s" 1377 " all_unreclaimable? %s"
1378 "\n", 1378 "\n",
1379 zone->name, 1379 zone->name,
1380 K(zone->free_pages), 1380 K(zone->free_pages),
1381 K(zone->pages_min), 1381 K(zone->pages_min),
1382 K(zone->pages_low), 1382 K(zone->pages_low),
1383 K(zone->pages_high), 1383 K(zone->pages_high),
1384 K(zone->nr_active), 1384 K(zone->nr_active),
1385 K(zone->nr_inactive), 1385 K(zone->nr_inactive),
1386 K(zone->present_pages), 1386 K(zone->present_pages),
1387 zone->pages_scanned, 1387 zone->pages_scanned,
1388 (zone->all_unreclaimable ? "yes" : "no") 1388 (zone->all_unreclaimable ? "yes" : "no")
1389 ); 1389 );
1390 printk("lowmem_reserve[]:"); 1390 printk("lowmem_reserve[]:");
1391 for (i = 0; i < MAX_NR_ZONES; i++) 1391 for (i = 0; i < MAX_NR_ZONES; i++)
1392 printk(" %lu", zone->lowmem_reserve[i]); 1392 printk(" %lu", zone->lowmem_reserve[i]);
1393 printk("\n"); 1393 printk("\n");
1394 } 1394 }
1395 1395
1396 for_each_zone(zone) { 1396 for_each_zone(zone) {
1397 unsigned long nr, flags, order, total = 0; 1397 unsigned long nr, flags, order, total = 0;
1398 1398
1399 show_node(zone); 1399 show_node(zone);
1400 printk("%s: ", zone->name); 1400 printk("%s: ", zone->name);
1401 if (!zone->present_pages) { 1401 if (!zone->present_pages) {
1402 printk("empty\n"); 1402 printk("empty\n");
1403 continue; 1403 continue;
1404 } 1404 }
1405 1405
1406 spin_lock_irqsave(&zone->lock, flags); 1406 spin_lock_irqsave(&zone->lock, flags);
1407 for (order = 0; order < MAX_ORDER; order++) { 1407 for (order = 0; order < MAX_ORDER; order++) {
1408 nr = zone->free_area[order].nr_free; 1408 nr = zone->free_area[order].nr_free;
1409 total += nr << order; 1409 total += nr << order;
1410 printk("%lu*%lukB ", nr, K(1UL) << order); 1410 printk("%lu*%lukB ", nr, K(1UL) << order);
1411 } 1411 }
1412 spin_unlock_irqrestore(&zone->lock, flags); 1412 spin_unlock_irqrestore(&zone->lock, flags);
1413 printk("= %lukB\n", K(total)); 1413 printk("= %lukB\n", K(total));
1414 } 1414 }
1415 1415
1416 show_swap_cache_info(); 1416 show_swap_cache_info();
1417 } 1417 }
1418 1418
1419 /* 1419 /*
1420 * Builds allocation fallback zone lists. 1420 * Builds allocation fallback zone lists.
1421 */ 1421 */
1422 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1422 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1423 { 1423 {
1424 switch (k) { 1424 switch (k) {
1425 struct zone *zone; 1425 struct zone *zone;
1426 default: 1426 default:
1427 BUG(); 1427 BUG();
1428 case ZONE_HIGHMEM: 1428 case ZONE_HIGHMEM:
1429 zone = pgdat->node_zones + ZONE_HIGHMEM; 1429 zone = pgdat->node_zones + ZONE_HIGHMEM;
1430 if (zone->present_pages) { 1430 if (zone->present_pages) {
1431 #ifndef CONFIG_HIGHMEM 1431 #ifndef CONFIG_HIGHMEM
1432 BUG(); 1432 BUG();
1433 #endif 1433 #endif
1434 zonelist->zones[j++] = zone; 1434 zonelist->zones[j++] = zone;
1435 } 1435 }
1436 case ZONE_NORMAL: 1436 case ZONE_NORMAL:
1437 zone = pgdat->node_zones + ZONE_NORMAL; 1437 zone = pgdat->node_zones + ZONE_NORMAL;
1438 if (zone->present_pages) 1438 if (zone->present_pages)
1439 zonelist->zones[j++] = zone; 1439 zonelist->zones[j++] = zone;
1440 case ZONE_DMA: 1440 case ZONE_DMA:
1441 zone = pgdat->node_zones + ZONE_DMA; 1441 zone = pgdat->node_zones + ZONE_DMA;
1442 if (zone->present_pages) 1442 if (zone->present_pages)
1443 zonelist->zones[j++] = zone; 1443 zonelist->zones[j++] = zone;
1444 } 1444 }
1445 1445
1446 return j; 1446 return j;
1447 } 1447 }
1448 1448
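Editor's note: the switch above falls through deliberately. Starting from the highest zone the allocation may use, every lower zone that has memory is appended to the zonelist as well. A standalone illustration of that ordering, with invented zone sizes (the enum here only mimics the kernel's zone indices):

/* Build a fallback order by walking down from the highest allowed zone. */
#include <stdio.h>

enum { MODEL_ZONE_DMA, MODEL_ZONE_NORMAL, MODEL_ZONE_HIGHMEM, MODEL_NR_ZONES };

int main(void)
{
	const char *name[MODEL_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	unsigned long present[MODEL_NR_ZONES] = { 4096, 221184, 32768 };
	int highest = MODEL_ZONE_HIGHMEM;	/* e.g. a GFP_HIGHUSER allocation */
	int k;

	printf("fallback order:");
	for (k = highest; k >= MODEL_ZONE_DMA; k--)
		if (present[k])		/* skip zones with no memory */
			printf(" %s", name[k]);
	printf("\n");
	return 0;
}
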
1449 static inline int highest_zone(int zone_bits) 1449 static inline int highest_zone(int zone_bits)
1450 { 1450 {
1451 int res = ZONE_NORMAL; 1451 int res = ZONE_NORMAL;
1452 if (zone_bits & (__force int)__GFP_HIGHMEM) 1452 if (zone_bits & (__force int)__GFP_HIGHMEM)
1453 res = ZONE_HIGHMEM; 1453 res = ZONE_HIGHMEM;
1454 if (zone_bits & (__force int)__GFP_DMA) 1454 if (zone_bits & (__force int)__GFP_DMA)
1455 res = ZONE_DMA; 1455 res = ZONE_DMA;
1456 return res; 1456 return res;
1457 } 1457 }
1458 1458
1459 #ifdef CONFIG_NUMA 1459 #ifdef CONFIG_NUMA
1460 #define MAX_NODE_LOAD (num_online_nodes()) 1460 #define MAX_NODE_LOAD (num_online_nodes())
1461 static int __initdata node_load[MAX_NUMNODES]; 1461 static int __initdata node_load[MAX_NUMNODES];
1462 /** 1462 /**
1463 * find_next_best_node - find the next node that should appear in a given node's fallback list 1463 * find_next_best_node - find the next node that should appear in a given node's fallback list
1464 * @node: node whose fallback list we're appending 1464 * @node: node whose fallback list we're appending
1465 * @used_node_mask: nodemask_t of already used nodes 1465 * @used_node_mask: nodemask_t of already used nodes
1466 * 1466 *
1467 * We use a number of factors to determine which is the next node that should 1467 * We use a number of factors to determine which is the next node that should
1468 * appear on a given node's fallback list. The node should not have appeared 1468 * appear on a given node's fallback list. The node should not have appeared
1469 * already in @node's fallback list, and it should be the next closest node 1469 * already in @node's fallback list, and it should be the next closest node
1470 * according to the distance array (which contains arbitrary distance values 1470 * according to the distance array (which contains arbitrary distance values
1471 * from each node to each node in the system), and should also prefer nodes 1471 * from each node to each node in the system), and should also prefer nodes
1472 * with no CPUs, since presumably they'll have very little allocation pressure 1472 * with no CPUs, since presumably they'll have very little allocation pressure
1473 * on them otherwise. 1473 * on them otherwise.
1474 * It returns -1 if no node is found. 1474 * It returns -1 if no node is found.
1475 */ 1475 */
1476 static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1476 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1477 { 1477 {
1478 int i, n, val; 1478 int i, n, val;
1479 int min_val = INT_MAX; 1479 int min_val = INT_MAX;
1480 int best_node = -1; 1480 int best_node = -1;
1481 1481
1482 for_each_online_node(i) { 1482 for_each_online_node(i) {
1483 cpumask_t tmp; 1483 cpumask_t tmp;
1484 1484
1485 /* Start from local node */ 1485 /* Start from local node */
1486 n = (node+i) % num_online_nodes(); 1486 n = (node+i) % num_online_nodes();
1487 1487
1488 /* Don't want a node to appear more than once */ 1488 /* Don't want a node to appear more than once */
1489 if (node_isset(n, *used_node_mask)) 1489 if (node_isset(n, *used_node_mask))
1490 continue; 1490 continue;
1491 1491
1492 /* Use the local node if we haven't already */ 1492 /* Use the local node if we haven't already */
1493 if (!node_isset(node, *used_node_mask)) { 1493 if (!node_isset(node, *used_node_mask)) {
1494 best_node = node; 1494 best_node = node;
1495 break; 1495 break;
1496 } 1496 }
1497 1497
1498 /* Use the distance array to find the distance */ 1498 /* Use the distance array to find the distance */
1499 val = node_distance(node, n); 1499 val = node_distance(node, n);
1500 1500
1501 /* Give preference to headless and unused nodes */ 1501 /* Give preference to headless and unused nodes */
1502 tmp = node_to_cpumask(n); 1502 tmp = node_to_cpumask(n);
1503 if (!cpus_empty(tmp)) 1503 if (!cpus_empty(tmp))
1504 val += PENALTY_FOR_NODE_WITH_CPUS; 1504 val += PENALTY_FOR_NODE_WITH_CPUS;
1505 1505
1506 /* Slight preference for less loaded node */ 1506 /* Slight preference for less loaded node */
1507 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1507 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1508 val += node_load[n]; 1508 val += node_load[n];
1509 1509
1510 if (val < min_val) { 1510 if (val < min_val) {
1511 min_val = val; 1511 min_val = val;
1512 best_node = n; 1512 best_node = n;
1513 } 1513 }
1514 } 1514 }
1515 1515
1516 if (best_node >= 0) 1516 if (best_node >= 0)
1517 node_set(best_node, *used_node_mask); 1517 node_set(best_node, *used_node_mask);
1518 1518
1519 return best_node; 1519 return best_node;
1520 } 1520 }
1521 1521
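Editor's note: a standalone model of the distance-driven part of this heuristic: repeatedly pick the nearest node that has not been used yet. The CPU and node-load penalties are omitted here, and the distance matrix is invented, so this only sketches the core ordering idea.

/* Order NUMA nodes for node 0's fallback list by ascending distance. */
#include <stdio.h>
#include <limits.h>

#define NODES 4

static const int dist[NODES][NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int used[NODES] = { 0 };
	int order[NODES], n;

	for (n = 0; n < NODES; n++) {
		int best = -1, best_val = INT_MAX, i;

		for (i = 0; i < NODES; i++) {
			if (used[i])
				continue;
			/* distance from node 0, the node we build the list for */
			if (dist[0][i] < best_val) {
				best_val = dist[0][i];
				best = i;
			}
		}
		used[best] = 1;
		order[n] = best;
	}

	printf("fallback order for node 0:");
	for (n = 0; n < NODES; n++)
		printf(" %d", order[n]);
	printf("\n");
	return 0;
}
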
1522 static void __init build_zonelists(pg_data_t *pgdat) 1522 static void __init build_zonelists(pg_data_t *pgdat)
1523 { 1523 {
1524 int i, j, k, node, local_node; 1524 int i, j, k, node, local_node;
1525 int prev_node, load; 1525 int prev_node, load;
1526 struct zonelist *zonelist; 1526 struct zonelist *zonelist;
1527 nodemask_t used_mask; 1527 nodemask_t used_mask;
1528 1528
1529 /* initialize zonelists */ 1529 /* initialize zonelists */
1530 for (i = 0; i < GFP_ZONETYPES; i++) { 1530 for (i = 0; i < GFP_ZONETYPES; i++) {
1531 zonelist = pgdat->node_zonelists + i; 1531 zonelist = pgdat->node_zonelists + i;
1532 zonelist->zones[0] = NULL; 1532 zonelist->zones[0] = NULL;
1533 } 1533 }
1534 1534
1535 /* NUMA-aware ordering of nodes */ 1535 /* NUMA-aware ordering of nodes */
1536 local_node = pgdat->node_id; 1536 local_node = pgdat->node_id;
1537 load = num_online_nodes(); 1537 load = num_online_nodes();
1538 prev_node = local_node; 1538 prev_node = local_node;
1539 nodes_clear(used_mask); 1539 nodes_clear(used_mask);
1540 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1540 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1541 /* 1541 /*
1542 * We don't want to pressure a particular node. 1542 * We don't want to pressure a particular node.
 1543 * So we add a penalty to the first node in the same 1543 * So we add a penalty to the first node in the same
 1544 * distance group to make it round-robin. 1544 * distance group to make it round-robin.
1545 */ 1545 */
1546 if (node_distance(local_node, node) != 1546 if (node_distance(local_node, node) !=
1547 node_distance(local_node, prev_node)) 1547 node_distance(local_node, prev_node))
1548 node_load[node] += load; 1548 node_load[node] += load;
1549 prev_node = node; 1549 prev_node = node;
1550 load--; 1550 load--;
1551 for (i = 0; i < GFP_ZONETYPES; i++) { 1551 for (i = 0; i < GFP_ZONETYPES; i++) {
1552 zonelist = pgdat->node_zonelists + i; 1552 zonelist = pgdat->node_zonelists + i;
1553 for (j = 0; zonelist->zones[j] != NULL; j++); 1553 for (j = 0; zonelist->zones[j] != NULL; j++);
1554 1554
1555 k = highest_zone(i); 1555 k = highest_zone(i);
1556 1556
1557 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1557 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1558 zonelist->zones[j] = NULL; 1558 zonelist->zones[j] = NULL;
1559 } 1559 }
1560 } 1560 }
1561 } 1561 }
1562 1562
1563 #else /* CONFIG_NUMA */ 1563 #else /* CONFIG_NUMA */
1564 1564
1565 static void __init build_zonelists(pg_data_t *pgdat) 1565 static void __init build_zonelists(pg_data_t *pgdat)
1566 { 1566 {
1567 int i, j, k, node, local_node; 1567 int i, j, k, node, local_node;
1568 1568
1569 local_node = pgdat->node_id; 1569 local_node = pgdat->node_id;
1570 for (i = 0; i < GFP_ZONETYPES; i++) { 1570 for (i = 0; i < GFP_ZONETYPES; i++) {
1571 struct zonelist *zonelist; 1571 struct zonelist *zonelist;
1572 1572
1573 zonelist = pgdat->node_zonelists + i; 1573 zonelist = pgdat->node_zonelists + i;
1574 1574
1575 j = 0; 1575 j = 0;
1576 k = highest_zone(i); 1576 k = highest_zone(i);
1577 j = build_zonelists_node(pgdat, zonelist, j, k); 1577 j = build_zonelists_node(pgdat, zonelist, j, k);
1578 /* 1578 /*
1579 * Now we build the zonelist so that it contains the zones 1579 * Now we build the zonelist so that it contains the zones
1580 * of all the other nodes. 1580 * of all the other nodes.
1581 * We don't want to pressure a particular node, so when 1581 * We don't want to pressure a particular node, so when
1582 * building the zones for node N, we make sure that the 1582 * building the zones for node N, we make sure that the
1583 * zones coming right after the local ones are those from 1583 * zones coming right after the local ones are those from
1584 * node N+1 (modulo N) 1584 * node N+1 (modulo N)
1585 */ 1585 */
1586 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1586 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1587 if (!node_online(node)) 1587 if (!node_online(node))
1588 continue; 1588 continue;
1589 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1589 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1590 } 1590 }
1591 for (node = 0; node < local_node; node++) { 1591 for (node = 0; node < local_node; node++) {
1592 if (!node_online(node)) 1592 if (!node_online(node))
1593 continue; 1593 continue;
1594 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1594 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1595 } 1595 }
1596 1596
1597 zonelist->zones[j] = NULL; 1597 zonelist->zones[j] = NULL;
1598 } 1598 }
1599 } 1599 }
1600 1600
1601 #endif /* CONFIG_NUMA */ 1601 #endif /* CONFIG_NUMA */
1602 1602
1603 void __init build_all_zonelists(void) 1603 void __init build_all_zonelists(void)
1604 { 1604 {
1605 int i; 1605 int i;
1606 1606
1607 for_each_online_node(i) 1607 for_each_online_node(i)
1608 build_zonelists(NODE_DATA(i)); 1608 build_zonelists(NODE_DATA(i));
1609 printk("Built %i zonelists\n", num_online_nodes()); 1609 printk("Built %i zonelists\n", num_online_nodes());
1610 cpuset_init_current_mems_allowed(); 1610 cpuset_init_current_mems_allowed();
1611 } 1611 }
1612 1612
1613 /* 1613 /*
1614 * Helper functions to size the waitqueue hash table. 1614 * Helper functions to size the waitqueue hash table.
1615 * Essentially these want to choose hash table sizes sufficiently 1615 * Essentially these want to choose hash table sizes sufficiently
1616 * large so that collisions trying to wait on pages are rare. 1616 * large so that collisions trying to wait on pages are rare.
1617 * But in fact, the number of active page waitqueues on typical 1617 * But in fact, the number of active page waitqueues on typical
1618 * systems is ridiculously low, less than 200. So this is even 1618 * systems is ridiculously low, less than 200. So this is even
1619 * conservative, even though it seems large. 1619 * conservative, even though it seems large.
1620 * 1620 *
1621 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1621 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1622 * waitqueues, i.e. the size of the waitq table given the number of pages. 1622 * waitqueues, i.e. the size of the waitq table given the number of pages.
1623 */ 1623 */
1624 #define PAGES_PER_WAITQUEUE 256 1624 #define PAGES_PER_WAITQUEUE 256
1625 1625
1626 static inline unsigned long wait_table_size(unsigned long pages) 1626 static inline unsigned long wait_table_size(unsigned long pages)
1627 { 1627 {
1628 unsigned long size = 1; 1628 unsigned long size = 1;
1629 1629
1630 pages /= PAGES_PER_WAITQUEUE; 1630 pages /= PAGES_PER_WAITQUEUE;
1631 1631
1632 while (size < pages) 1632 while (size < pages)
1633 size <<= 1; 1633 size <<= 1;
1634 1634
1635 /* 1635 /*
1636 * Once we have dozens or even hundreds of threads sleeping 1636 * Once we have dozens or even hundreds of threads sleeping
1637 * on IO we've got bigger problems than wait queue collision. 1637 * on IO we've got bigger problems than wait queue collision.
1638 * Limit the size of the wait table to a reasonable size. 1638 * Limit the size of the wait table to a reasonable size.
1639 */ 1639 */
1640 size = min(size, 4096UL); 1640 size = min(size, 4096UL);
1641 1641
1642 return max(size, 4UL); 1642 return max(size, 4UL);
1643 } 1643 }
1644 1644
1645 /* 1645 /*
1646 * This is an integer logarithm so that shifts can be used later 1646 * This is an integer logarithm so that shifts can be used later
1647 * to extract the more random high bits from the multiplicative 1647 * to extract the more random high bits from the multiplicative
1648 * hash function before the remainder is taken. 1648 * hash function before the remainder is taken.
1649 */ 1649 */
1650 static inline unsigned long wait_table_bits(unsigned long size) 1650 static inline unsigned long wait_table_bits(unsigned long size)
1651 { 1651 {
1652 return ffz(~size); 1652 return ffz(~size);
1653 } 1653 }
1654 1654
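Editor's note: a standalone model of the sizing rule above: one waitqueue per 256 pages, rounded up to a power of two and clamped to the range [4, 4096]. The page counts in main() are examples only.

/* Round pages/256 up to a power of two, clamped to [4, 4096]. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256UL

static unsigned long model_wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size > 4UL ? size : 4UL;
}

int main(void)
{
	/* e.g. a 512 MB zone with 4 KB pages spans 131072 pages */
	printf("%lu\n", model_wait_table_size(131072));	/* -> 512 */
	printf("%lu\n", model_wait_table_size(1UL << 22));	/* clamped to 4096 */
	return 0;
}
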
1655 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1655 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1656 1656
1657 static void __init calculate_zone_totalpages(struct pglist_data *pgdat, 1657 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1658 unsigned long *zones_size, unsigned long *zholes_size) 1658 unsigned long *zones_size, unsigned long *zholes_size)
1659 { 1659 {
1660 unsigned long realtotalpages, totalpages = 0; 1660 unsigned long realtotalpages, totalpages = 0;
1661 int i; 1661 int i;
1662 1662
1663 for (i = 0; i < MAX_NR_ZONES; i++) 1663 for (i = 0; i < MAX_NR_ZONES; i++)
1664 totalpages += zones_size[i]; 1664 totalpages += zones_size[i];
1665 pgdat->node_spanned_pages = totalpages; 1665 pgdat->node_spanned_pages = totalpages;
1666 1666
1667 realtotalpages = totalpages; 1667 realtotalpages = totalpages;
1668 if (zholes_size) 1668 if (zholes_size)
1669 for (i = 0; i < MAX_NR_ZONES; i++) 1669 for (i = 0; i < MAX_NR_ZONES; i++)
1670 realtotalpages -= zholes_size[i]; 1670 realtotalpages -= zholes_size[i];
1671 pgdat->node_present_pages = realtotalpages; 1671 pgdat->node_present_pages = realtotalpages;
1672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1673 } 1673 }
1674 1674
1675 1675
1676 /* 1676 /*
1677 * Initially all pages are reserved - free ones are freed 1677 * Initially all pages are reserved - free ones are freed
1678 * up by free_all_bootmem() once the early boot process is 1678 * up by free_all_bootmem() once the early boot process is
1679 * done. Non-atomic initialization, single-pass. 1679 * done. Non-atomic initialization, single-pass.
1680 */ 1680 */
1681 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1681 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1682 unsigned long start_pfn) 1682 unsigned long start_pfn)
1683 { 1683 {
1684 struct page *page; 1684 struct page *page;
1685 unsigned long end_pfn = start_pfn + size; 1685 unsigned long end_pfn = start_pfn + size;
1686 unsigned long pfn; 1686 unsigned long pfn;
1687 1687
1688 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1688 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1689 if (!early_pfn_valid(pfn)) 1689 if (!early_pfn_valid(pfn))
1690 continue; 1690 continue;
1691 if (!early_pfn_in_nid(pfn, nid)) 1691 if (!early_pfn_in_nid(pfn, nid))
1692 continue; 1692 continue;
1693 page = pfn_to_page(pfn); 1693 page = pfn_to_page(pfn);
1694 set_page_links(page, zone, nid, pfn); 1694 set_page_links(page, zone, nid, pfn);
1695 set_page_count(page, 1); 1695 set_page_count(page, 1);
1696 reset_page_mapcount(page); 1696 reset_page_mapcount(page);
1697 SetPageReserved(page); 1697 SetPageReserved(page);
1698 INIT_LIST_HEAD(&page->lru); 1698 INIT_LIST_HEAD(&page->lru);
1699 #ifdef WANT_PAGE_VIRTUAL 1699 #ifdef WANT_PAGE_VIRTUAL
1700 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1700 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1701 if (!is_highmem_idx(zone)) 1701 if (!is_highmem_idx(zone))
1702 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1702 set_page_address(page, __va(pfn << PAGE_SHIFT));
1703 #endif 1703 #endif
1704 } 1704 }
1705 } 1705 }
1706 1706
1707 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1707 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1708 unsigned long size) 1708 unsigned long size)
1709 { 1709 {
1710 int order; 1710 int order;
1711 for (order = 0; order < MAX_ORDER ; order++) { 1711 for (order = 0; order < MAX_ORDER ; order++) {
1712 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1712 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1713 zone->free_area[order].nr_free = 0; 1713 zone->free_area[order].nr_free = 0;
1714 } 1714 }
1715 } 1715 }
1716 1716
1717 #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1717 #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1718 void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1718 void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1719 unsigned long size) 1719 unsigned long size)
1720 { 1720 {
1721 unsigned long snum = pfn_to_section_nr(pfn); 1721 unsigned long snum = pfn_to_section_nr(pfn);
1722 unsigned long end = pfn_to_section_nr(pfn + size); 1722 unsigned long end = pfn_to_section_nr(pfn + size);
1723 1723
1724 if (FLAGS_HAS_NODE) 1724 if (FLAGS_HAS_NODE)
1725 zone_table[ZONETABLE_INDEX(nid, zid)] = zone; 1725 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1726 else 1726 else
1727 for (; snum <= end; snum++) 1727 for (; snum <= end; snum++)
1728 zone_table[ZONETABLE_INDEX(snum, zid)] = zone; 1728 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1729 } 1729 }
1730 1730
1731 #ifndef __HAVE_ARCH_MEMMAP_INIT 1731 #ifndef __HAVE_ARCH_MEMMAP_INIT
1732 #define memmap_init(size, nid, zone, start_pfn) \ 1732 #define memmap_init(size, nid, zone, start_pfn) \
1733 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1733 memmap_init_zone((size), (nid), (zone), (start_pfn))
1734 #endif 1734 #endif
1735 1735
1736 static int __devinit zone_batchsize(struct zone *zone) 1736 static int __devinit zone_batchsize(struct zone *zone)
1737 { 1737 {
1738 int batch; 1738 int batch;
1739 1739
1740 /* 1740 /*
1741 * The per-cpu-pages pools are set to around 1000th of the 1741 * The per-cpu-pages pools are set to around 1000th of the
1742 * size of the zone. But no more than 1/2 of a meg. 1742 * size of the zone. But no more than 1/2 of a meg.
1743 * 1743 *
1744 * OK, so we don't know how big the cache is. So guess. 1744 * OK, so we don't know how big the cache is. So guess.
1745 */ 1745 */
1746 batch = zone->present_pages / 1024; 1746 batch = zone->present_pages / 1024;
1747 if (batch * PAGE_SIZE > 512 * 1024) 1747 if (batch * PAGE_SIZE > 512 * 1024)
1748 batch = (512 * 1024) / PAGE_SIZE; 1748 batch = (512 * 1024) / PAGE_SIZE;
1749 batch /= 4; /* We effectively *= 4 below */ 1749 batch /= 4; /* We effectively *= 4 below */
1750 if (batch < 1) 1750 if (batch < 1)
1751 batch = 1; 1751 batch = 1;
1752 1752
1753 /* 1753 /*
1754 * We will be trying to allocate bigger chunks of contiguous 1754 * We will be trying to allocate bigger chunks of contiguous
1755 * memory of the order of fls(batch). This should result in 1755 * memory of the order of fls(batch). This should result in
1756 * better cache coloring. 1756 * better cache coloring.
1757 * 1757 *
1758 * A sanity check also to ensure that batch is still within limits. 1758 * A sanity check also to ensure that batch is still within limits.
1759 */ 1759 */
1760 batch = (1 << fls(batch + batch/2)); 1760 batch = (1 << fls(batch + batch/2));
1761 1761
1762 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) 1762 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1763 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); 1763 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1764 1764
1765 return batch; 1765 return batch;
1766 } 1766 }
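
To make the sizing heuristic above concrete, here is a hedged user-space walk-through of the same arithmetic. It assumes PAGE_SHIFT = 12 and MAX_ORDER = 11 (both configuration dependent) and uses two made-up zone sizes; fls() is open-coded.

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define MAX_ORDER	11		/* assumed: the common default */

static int fls_ul(unsigned long x)	/* 1-based index of highest set bit */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int sketch_zone_batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)	/* cap at half a megabyte */
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;				/* effectively multiplied back below */
	if (batch < 1)
		batch = 1;

	batch = 1 << fls_ul(batch + batch / 2);	/* round up for cache coloring */
	if (fls_ul(batch) >= PAGE_SHIFT + MAX_ORDER - 2)
		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);
	return batch;
}

int main(void)
{
	printf("2 GiB zone: batch %d\n", sketch_zone_batchsize(524288UL));
	printf("16 MiB zone: batch %d\n", sketch_zone_batchsize(4096UL));
	return 0;
}

The 2 GiB zone comes out at a batch of 64 pages and the 16 MiB zone at 2; setup_pageset() below then turns that into per-list limits, high = 6 * batch for the hot list and 2 * batch for the cold list.
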
1767 1767
1768 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1768 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1769 { 1769 {
1770 struct per_cpu_pages *pcp; 1770 struct per_cpu_pages *pcp;
1771 1771
1772 memset(p, 0, sizeof(*p)); 1772 memset(p, 0, sizeof(*p));
1773 1773
1774 pcp = &p->pcp[0]; /* hot */ 1774 pcp = &p->pcp[0]; /* hot */
1775 pcp->count = 0; 1775 pcp->count = 0;
1776 pcp->low = 0; 1776 pcp->low = 0;
1777 pcp->high = 6 * batch; 1777 pcp->high = 6 * batch;
1778 pcp->batch = max(1UL, 1 * batch); 1778 pcp->batch = max(1UL, 1 * batch);
1779 INIT_LIST_HEAD(&pcp->list); 1779 INIT_LIST_HEAD(&pcp->list);
1780 1780
1781 pcp = &p->pcp[1]; /* cold*/ 1781 pcp = &p->pcp[1]; /* cold*/
1782 pcp->count = 0; 1782 pcp->count = 0;
1783 pcp->low = 0; 1783 pcp->low = 0;
1784 pcp->high = 2 * batch; 1784 pcp->high = 2 * batch;
1785 pcp->batch = max(1UL, batch/2); 1785 pcp->batch = max(1UL, batch/2);
1786 INIT_LIST_HEAD(&pcp->list); 1786 INIT_LIST_HEAD(&pcp->list);
1787 } 1787 }
1788 1788
1789 #ifdef CONFIG_NUMA 1789 #ifdef CONFIG_NUMA
1790 /* 1790 /*
1791 * Boot pageset table. One per cpu which is going to be used for all 1791 * Boot pageset table. One per cpu which is going to be used for all
1792 * zones and all nodes. The parameters will be set in such a way 1792 * zones and all nodes. The parameters will be set in such a way
1793 * that an item put on a list will immediately be handed over to 1793 * that an item put on a list will immediately be handed over to
1794 * the buddy list. This is safe since pageset manipulation is done 1794 * the buddy list. This is safe since pageset manipulation is done
1795 * with interrupts disabled. 1795 * with interrupts disabled.
1796 * 1796 *
1797 * Some NUMA counter updates may also be caught by the boot pagesets. 1797 * Some NUMA counter updates may also be caught by the boot pagesets.
1798 * 1798 *
1799 * The boot_pagesets must be kept even after bootup is complete for 1799 * The boot_pagesets must be kept even after bootup is complete for
1800 * unused processors and/or zones. They do play a role for bootstrapping 1800 * unused processors and/or zones. They do play a role for bootstrapping
1801 * hotplugged processors. 1801 * hotplugged processors.
1802 * 1802 *
1803 * zoneinfo_show() and maybe other functions do 1803 * zoneinfo_show() and maybe other functions do
1804 * not check if the processor is online before following the pageset pointer. 1804 * not check if the processor is online before following the pageset pointer.
1805 * Other parts of the kernel may not check if the zone is available. 1805 * Other parts of the kernel may not check if the zone is available.
1806 */ 1806 */
1807 static struct per_cpu_pageset 1807 static struct per_cpu_pageset
1808 boot_pageset[NR_CPUS]; 1808 boot_pageset[NR_CPUS];
1809 1809
1810 /* 1810 /*
1811 * Dynamically allocate memory for the 1811 * Dynamically allocate memory for the
1812 * per cpu pageset array in struct zone. 1812 * per cpu pageset array in struct zone.
1813 */ 1813 */
1814 static int __devinit process_zones(int cpu) 1814 static int __devinit process_zones(int cpu)
1815 { 1815 {
1816 struct zone *zone, *dzone; 1816 struct zone *zone, *dzone;
1817 1817
1818 for_each_zone(zone) { 1818 for_each_zone(zone) {
1819 1819
1820 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1820 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
1821 GFP_KERNEL, cpu_to_node(cpu)); 1821 GFP_KERNEL, cpu_to_node(cpu));
1822 if (!zone->pageset[cpu]) 1822 if (!zone->pageset[cpu])
1823 goto bad; 1823 goto bad;
1824 1824
1825 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1825 setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
1826 } 1826 }
1827 1827
1828 return 0; 1828 return 0;
1829 bad: 1829 bad:
1830 for_each_zone(dzone) { 1830 for_each_zone(dzone) {
1831 if (dzone == zone) 1831 if (dzone == zone)
1832 break; 1832 break;
1833 kfree(dzone->pageset[cpu]); 1833 kfree(dzone->pageset[cpu]);
1834 dzone->pageset[cpu] = NULL; 1834 dzone->pageset[cpu] = NULL;
1835 } 1835 }
1836 return -ENOMEM; 1836 return -ENOMEM;
1837 } 1837 }
1838 1838
1839 static inline void free_zone_pagesets(int cpu) 1839 static inline void free_zone_pagesets(int cpu)
1840 { 1840 {
1841 #ifdef CONFIG_NUMA 1841 #ifdef CONFIG_NUMA
1842 struct zone *zone; 1842 struct zone *zone;
1843 1843
1844 for_each_zone(zone) { 1844 for_each_zone(zone) {
1845 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1845 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1846 1846
1847 zone_pcp(zone, cpu) = NULL; 1847 zone_pcp(zone, cpu) = NULL;
1848 kfree(pset); 1848 kfree(pset);
1849 } 1849 }
1850 #endif 1850 #endif
1851 } 1851 }
1852 1852
1853 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1853 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1854 unsigned long action, 1854 unsigned long action,
1855 void *hcpu) 1855 void *hcpu)
1856 { 1856 {
1857 int cpu = (long)hcpu; 1857 int cpu = (long)hcpu;
1858 int ret = NOTIFY_OK; 1858 int ret = NOTIFY_OK;
1859 1859
1860 switch (action) { 1860 switch (action) {
1861 case CPU_UP_PREPARE: 1861 case CPU_UP_PREPARE:
1862 if (process_zones(cpu)) 1862 if (process_zones(cpu))
1863 ret = NOTIFY_BAD; 1863 ret = NOTIFY_BAD;
1864 break; 1864 break;
1865 #ifdef CONFIG_HOTPLUG_CPU 1865 #ifdef CONFIG_HOTPLUG_CPU
1866 case CPU_DEAD: 1866 case CPU_DEAD:
1867 free_zone_pagesets(cpu); 1867 free_zone_pagesets(cpu);
1868 break; 1868 break;
1869 #endif 1869 #endif
1870 default: 1870 default:
1871 break; 1871 break;
1872 } 1872 }
1873 return ret; 1873 return ret;
1874 } 1874 }
1875 1875
1876 static struct notifier_block pageset_notifier = 1876 static struct notifier_block pageset_notifier =
1877 { &pageset_cpuup_callback, NULL, 0 }; 1877 { &pageset_cpuup_callback, NULL, 0 };
1878 1878
1879 void __init setup_per_cpu_pageset() 1879 void __init setup_per_cpu_pageset()
1880 { 1880 {
1881 int err; 1881 int err;
1882 1882
1883 /* Initialize per_cpu_pageset for cpu 0. 1883 /* Initialize per_cpu_pageset for cpu 0.
1884 * A cpuup callback will do this for every cpu 1884 * A cpuup callback will do this for every cpu
1885 * as it comes online 1885 * as it comes online
1886 */ 1886 */
1887 err = process_zones(smp_processor_id()); 1887 err = process_zones(smp_processor_id());
1888 BUG_ON(err); 1888 BUG_ON(err);
1889 register_cpu_notifier(&pageset_notifier); 1889 register_cpu_notifier(&pageset_notifier);
1890 } 1890 }
1891 1891
1892 #endif 1892 #endif
1893 1893
1894 static __devinit 1894 static __devinit
1895 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1895 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1896 { 1896 {
1897 int i; 1897 int i;
1898 struct pglist_data *pgdat = zone->zone_pgdat; 1898 struct pglist_data *pgdat = zone->zone_pgdat;
1899 1899
1900 /* 1900 /*
1901 * The per-page waitqueue mechanism uses hashed waitqueues 1901 * The per-page waitqueue mechanism uses hashed waitqueues
1902 * per zone. 1902 * per zone.
1903 */ 1903 */
1904 zone->wait_table_size = wait_table_size(zone_size_pages); 1904 zone->wait_table_size = wait_table_size(zone_size_pages);
1905 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1905 zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
1906 zone->wait_table = (wait_queue_head_t *) 1906 zone->wait_table = (wait_queue_head_t *)
1907 alloc_bootmem_node(pgdat, zone->wait_table_size 1907 alloc_bootmem_node(pgdat, zone->wait_table_size
1908 * sizeof(wait_queue_head_t)); 1908 * sizeof(wait_queue_head_t));
1909 1909
1910 for(i = 0; i < zone->wait_table_size; ++i) 1910 for(i = 0; i < zone->wait_table_size; ++i)
1911 init_waitqueue_head(zone->wait_table + i); 1911 init_waitqueue_head(zone->wait_table + i);
1912 } 1912 }
1913 1913
1914 static __devinit void zone_pcp_init(struct zone *zone) 1914 static __devinit void zone_pcp_init(struct zone *zone)
1915 { 1915 {
1916 int cpu; 1916 int cpu;
1917 unsigned long batch = zone_batchsize(zone); 1917 unsigned long batch = zone_batchsize(zone);
1918 1918
1919 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1919 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1920 #ifdef CONFIG_NUMA 1920 #ifdef CONFIG_NUMA
1921 /* Early boot. Slab allocator not functional yet */ 1921 /* Early boot. Slab allocator not functional yet */
1922 zone->pageset[cpu] = &boot_pageset[cpu]; 1922 zone->pageset[cpu] = &boot_pageset[cpu];
1923 setup_pageset(&boot_pageset[cpu],0); 1923 setup_pageset(&boot_pageset[cpu],0);
1924 #else 1924 #else
1925 setup_pageset(zone_pcp(zone,cpu), batch); 1925 setup_pageset(zone_pcp(zone,cpu), batch);
1926 #endif 1926 #endif
1927 } 1927 }
1928 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1928 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1929 zone->name, zone->present_pages, batch); 1929 zone->name, zone->present_pages, batch);
1930 } 1930 }
1931 1931
1932 static __devinit void init_currently_empty_zone(struct zone *zone, 1932 static __devinit void init_currently_empty_zone(struct zone *zone,
1933 unsigned long zone_start_pfn, unsigned long size) 1933 unsigned long zone_start_pfn, unsigned long size)
1934 { 1934 {
1935 struct pglist_data *pgdat = zone->zone_pgdat; 1935 struct pglist_data *pgdat = zone->zone_pgdat;
1936 1936
1937 zone_wait_table_init(zone, size); 1937 zone_wait_table_init(zone, size);
1938 pgdat->nr_zones = zone_idx(zone) + 1; 1938 pgdat->nr_zones = zone_idx(zone) + 1;
1939 1939
1940 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1940 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1941 zone->zone_start_pfn = zone_start_pfn; 1941 zone->zone_start_pfn = zone_start_pfn;
1942 1942
1943 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1943 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1944 1944
1945 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1945 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * Set up the zone data structures: 1949 * Set up the zone data structures:
1950 * - mark all pages reserved 1950 * - mark all pages reserved
1951 * - mark all memory queues empty 1951 * - mark all memory queues empty
1952 * - clear the memory bitmaps 1952 * - clear the memory bitmaps
1953 */ 1953 */
1954 static void __init free_area_init_core(struct pglist_data *pgdat, 1954 static void __init free_area_init_core(struct pglist_data *pgdat,
1955 unsigned long *zones_size, unsigned long *zholes_size) 1955 unsigned long *zones_size, unsigned long *zholes_size)
1956 { 1956 {
1957 unsigned long j; 1957 unsigned long j;
1958 int nid = pgdat->node_id; 1958 int nid = pgdat->node_id;
1959 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1959 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1960 1960
1961 pgdat_resize_init(pgdat);
1961 pgdat->nr_zones = 0; 1962 pgdat->nr_zones = 0;
1962 init_waitqueue_head(&pgdat->kswapd_wait); 1963 init_waitqueue_head(&pgdat->kswapd_wait);
1963 pgdat->kswapd_max_order = 0; 1964 pgdat->kswapd_max_order = 0;
1964 1965
1965 for (j = 0; j < MAX_NR_ZONES; j++) { 1966 for (j = 0; j < MAX_NR_ZONES; j++) {
1966 struct zone *zone = pgdat->node_zones + j; 1967 struct zone *zone = pgdat->node_zones + j;
1967 unsigned long size, realsize; 1968 unsigned long size, realsize;
1968 1969
1969 realsize = size = zones_size[j]; 1970 realsize = size = zones_size[j];
1970 if (zholes_size) 1971 if (zholes_size)
1971 realsize -= zholes_size[j]; 1972 realsize -= zholes_size[j];
1972 1973
1973 if (j == ZONE_DMA || j == ZONE_NORMAL) 1974 if (j == ZONE_DMA || j == ZONE_NORMAL)
1974 nr_kernel_pages += realsize; 1975 nr_kernel_pages += realsize;
1975 nr_all_pages += realsize; 1976 nr_all_pages += realsize;
1976 1977
1977 zone->spanned_pages = size; 1978 zone->spanned_pages = size;
1978 zone->present_pages = realsize; 1979 zone->present_pages = realsize;
1979 zone->name = zone_names[j]; 1980 zone->name = zone_names[j];
1980 spin_lock_init(&zone->lock); 1981 spin_lock_init(&zone->lock);
1981 spin_lock_init(&zone->lru_lock); 1982 spin_lock_init(&zone->lru_lock);
1982 zone->zone_pgdat = pgdat; 1983 zone->zone_pgdat = pgdat;
1983 zone->free_pages = 0; 1984 zone->free_pages = 0;
1984 1985
1985 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1986 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1986 1987
1987 zone_pcp_init(zone); 1988 zone_pcp_init(zone);
1988 INIT_LIST_HEAD(&zone->active_list); 1989 INIT_LIST_HEAD(&zone->active_list);
1989 INIT_LIST_HEAD(&zone->inactive_list); 1990 INIT_LIST_HEAD(&zone->inactive_list);
1990 zone->nr_scan_active = 0; 1991 zone->nr_scan_active = 0;
1991 zone->nr_scan_inactive = 0; 1992 zone->nr_scan_inactive = 0;
1992 zone->nr_active = 0; 1993 zone->nr_active = 0;
1993 zone->nr_inactive = 0; 1994 zone->nr_inactive = 0;
1994 atomic_set(&zone->reclaim_in_progress, 0); 1995 atomic_set(&zone->reclaim_in_progress, 0);
1995 if (!size) 1996 if (!size)
1996 continue; 1997 continue;
1997 1998
1998 zonetable_add(zone, nid, j, zone_start_pfn, size); 1999 zonetable_add(zone, nid, j, zone_start_pfn, size);
1999 init_currently_empty_zone(zone, zone_start_pfn, size); 2000 init_currently_empty_zone(zone, zone_start_pfn, size);
2000 zone_start_pfn += size; 2001 zone_start_pfn += size;
2001 } 2002 }
2002 } 2003 }
2003 2004
2004 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2005 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2005 { 2006 {
2006 /* Skip empty nodes */ 2007 /* Skip empty nodes */
2007 if (!pgdat->node_spanned_pages) 2008 if (!pgdat->node_spanned_pages)
2008 return; 2009 return;
2009 2010
2010 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2011 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2011 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2012 /* ia64 gets its own node_mem_map, before this, without bootmem */
2012 if (!pgdat->node_mem_map) { 2013 if (!pgdat->node_mem_map) {
2013 unsigned long size; 2014 unsigned long size;
2014 struct page *map; 2015 struct page *map;
2015 2016
2016 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 2017 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
2017 map = alloc_remap(pgdat->node_id, size); 2018 map = alloc_remap(pgdat->node_id, size);
2018 if (!map) 2019 if (!map)
2019 map = alloc_bootmem_node(pgdat, size); 2020 map = alloc_bootmem_node(pgdat, size);
2020 pgdat->node_mem_map = map; 2021 pgdat->node_mem_map = map;
2021 } 2022 }
2022 #ifdef CONFIG_FLATMEM 2023 #ifdef CONFIG_FLATMEM
2023 /* 2024 /*
2024 * With no DISCONTIG, the global mem_map is just set as node 0's 2025 * With no DISCONTIG, the global mem_map is just set as node 0's
2025 */ 2026 */
2026 if (pgdat == NODE_DATA(0)) 2027 if (pgdat == NODE_DATA(0))
2027 mem_map = NODE_DATA(0)->node_mem_map; 2028 mem_map = NODE_DATA(0)->node_mem_map;
2028 #endif 2029 #endif
2029 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2030 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2030 } 2031 }
2031 2032
2032 void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2033 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
2033 unsigned long *zones_size, unsigned long node_start_pfn, 2034 unsigned long *zones_size, unsigned long node_start_pfn,
2034 unsigned long *zholes_size) 2035 unsigned long *zholes_size)
2035 { 2036 {
2036 pgdat->node_id = nid; 2037 pgdat->node_id = nid;
2037 pgdat->node_start_pfn = node_start_pfn; 2038 pgdat->node_start_pfn = node_start_pfn;
2038 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2039 calculate_zone_totalpages(pgdat, zones_size, zholes_size);
2039 2040
2040 alloc_node_mem_map(pgdat); 2041 alloc_node_mem_map(pgdat);
2041 2042
2042 free_area_init_core(pgdat, zones_size, zholes_size); 2043 free_area_init_core(pgdat, zones_size, zholes_size);
2043 } 2044 }
2044 2045
2045 #ifndef CONFIG_NEED_MULTIPLE_NODES 2046 #ifndef CONFIG_NEED_MULTIPLE_NODES
2046 static bootmem_data_t contig_bootmem_data; 2047 static bootmem_data_t contig_bootmem_data;
2047 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2048 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2048 2049
2049 EXPORT_SYMBOL(contig_page_data); 2050 EXPORT_SYMBOL(contig_page_data);
2050 #endif 2051 #endif
2051 2052
2052 void __init free_area_init(unsigned long *zones_size) 2053 void __init free_area_init(unsigned long *zones_size)
2053 { 2054 {
2054 free_area_init_node(0, NODE_DATA(0), zones_size, 2055 free_area_init_node(0, NODE_DATA(0), zones_size,
2055 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2056 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2056 } 2057 }
2057 2058
2058 #ifdef CONFIG_PROC_FS 2059 #ifdef CONFIG_PROC_FS
2059 2060
2060 #include <linux/seq_file.h> 2061 #include <linux/seq_file.h>
2061 2062
2062 static void *frag_start(struct seq_file *m, loff_t *pos) 2063 static void *frag_start(struct seq_file *m, loff_t *pos)
2063 { 2064 {
2064 pg_data_t *pgdat; 2065 pg_data_t *pgdat;
2065 loff_t node = *pos; 2066 loff_t node = *pos;
2066 2067
2067 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 2068 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
2068 --node; 2069 --node;
2069 2070
2070 return pgdat; 2071 return pgdat;
2071 } 2072 }
2072 2073
2073 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 2074 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2074 { 2075 {
2075 pg_data_t *pgdat = (pg_data_t *)arg; 2076 pg_data_t *pgdat = (pg_data_t *)arg;
2076 2077
2077 (*pos)++; 2078 (*pos)++;
2078 return pgdat->pgdat_next; 2079 return pgdat->pgdat_next;
2079 } 2080 }
2080 2081
2081 static void frag_stop(struct seq_file *m, void *arg) 2082 static void frag_stop(struct seq_file *m, void *arg)
2082 { 2083 {
2083 } 2084 }
2084 2085
2085 /* 2086 /*
2086 * This walks the free areas for each zone. 2087 * This walks the free areas for each zone.
2087 */ 2088 */
2088 static int frag_show(struct seq_file *m, void *arg) 2089 static int frag_show(struct seq_file *m, void *arg)
2089 { 2090 {
2090 pg_data_t *pgdat = (pg_data_t *)arg; 2091 pg_data_t *pgdat = (pg_data_t *)arg;
2091 struct zone *zone; 2092 struct zone *zone;
2092 struct zone *node_zones = pgdat->node_zones; 2093 struct zone *node_zones = pgdat->node_zones;
2093 unsigned long flags; 2094 unsigned long flags;
2094 int order; 2095 int order;
2095 2096
2096 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2097 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2097 if (!zone->present_pages) 2098 if (!zone->present_pages)
2098 continue; 2099 continue;
2099 2100
2100 spin_lock_irqsave(&zone->lock, flags); 2101 spin_lock_irqsave(&zone->lock, flags);
2101 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 2102 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2102 for (order = 0; order < MAX_ORDER; ++order) 2103 for (order = 0; order < MAX_ORDER; ++order)
2103 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 2104 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2104 spin_unlock_irqrestore(&zone->lock, flags); 2105 spin_unlock_irqrestore(&zone->lock, flags);
2105 seq_putc(m, '\n'); 2106 seq_putc(m, '\n');
2106 } 2107 }
2107 return 0; 2108 return 0;
2108 } 2109 }
2109 2110
2110 struct seq_operations fragmentation_op = { 2111 struct seq_operations fragmentation_op = {
2111 .start = frag_start, 2112 .start = frag_start,
2112 .next = frag_next, 2113 .next = frag_next,
2113 .stop = frag_stop, 2114 .stop = frag_stop,
2114 .show = frag_show, 2115 .show = frag_show,
2115 }; 2116 };
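
For reference, this seq_file backs /proc/buddyinfo in trees of this era: one line per populated zone, with one free-block count per order. The figures below are purely illustrative of the format frag_show() produces:

Node 0, zone      DMA      3      2      2      1      2      1      1      0      1      1      2
Node 0, zone   Normal   1204    545    236     98     41     17      6      3      1      1      0
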
2116 2117
2117 /* 2118 /*
2118 * Output information about zones in @pgdat. 2119 * Output information about zones in @pgdat.
2119 */ 2120 */
2120 static int zoneinfo_show(struct seq_file *m, void *arg) 2121 static int zoneinfo_show(struct seq_file *m, void *arg)
2121 { 2122 {
2122 pg_data_t *pgdat = arg; 2123 pg_data_t *pgdat = arg;
2123 struct zone *zone; 2124 struct zone *zone;
2124 struct zone *node_zones = pgdat->node_zones; 2125 struct zone *node_zones = pgdat->node_zones;
2125 unsigned long flags; 2126 unsigned long flags;
2126 2127
2127 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2128 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2128 int i; 2129 int i;
2129 2130
2130 if (!zone->present_pages) 2131 if (!zone->present_pages)
2131 continue; 2132 continue;
2132 2133
2133 spin_lock_irqsave(&zone->lock, flags); 2134 spin_lock_irqsave(&zone->lock, flags);
2134 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2135 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2135 seq_printf(m, 2136 seq_printf(m,
2136 "\n pages free %lu" 2137 "\n pages free %lu"
2137 "\n min %lu" 2138 "\n min %lu"
2138 "\n low %lu" 2139 "\n low %lu"
2139 "\n high %lu" 2140 "\n high %lu"
2140 "\n active %lu" 2141 "\n active %lu"
2141 "\n inactive %lu" 2142 "\n inactive %lu"
2142 "\n scanned %lu (a: %lu i: %lu)" 2143 "\n scanned %lu (a: %lu i: %lu)"
2143 "\n spanned %lu" 2144 "\n spanned %lu"
2144 "\n present %lu", 2145 "\n present %lu",
2145 zone->free_pages, 2146 zone->free_pages,
2146 zone->pages_min, 2147 zone->pages_min,
2147 zone->pages_low, 2148 zone->pages_low,
2148 zone->pages_high, 2149 zone->pages_high,
2149 zone->nr_active, 2150 zone->nr_active,
2150 zone->nr_inactive, 2151 zone->nr_inactive,
2151 zone->pages_scanned, 2152 zone->pages_scanned,
2152 zone->nr_scan_active, zone->nr_scan_inactive, 2153 zone->nr_scan_active, zone->nr_scan_inactive,
2153 zone->spanned_pages, 2154 zone->spanned_pages,
2154 zone->present_pages); 2155 zone->present_pages);
2155 seq_printf(m, 2156 seq_printf(m,
2156 "\n protection: (%lu", 2157 "\n protection: (%lu",
2157 zone->lowmem_reserve[0]); 2158 zone->lowmem_reserve[0]);
2158 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 2159 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2159 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 2160 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2160 seq_printf(m, 2161 seq_printf(m,
2161 ")" 2162 ")"
2162 "\n pagesets"); 2163 "\n pagesets");
2163 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2164 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
2164 struct per_cpu_pageset *pageset; 2165 struct per_cpu_pageset *pageset;
2165 int j; 2166 int j;
2166 2167
2167 pageset = zone_pcp(zone, i); 2168 pageset = zone_pcp(zone, i);
2168 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2169 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2169 if (pageset->pcp[j].count) 2170 if (pageset->pcp[j].count)
2170 break; 2171 break;
2171 } 2172 }
2172 if (j == ARRAY_SIZE(pageset->pcp)) 2173 if (j == ARRAY_SIZE(pageset->pcp))
2173 continue; 2174 continue;
2174 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2175 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2175 seq_printf(m, 2176 seq_printf(m,
2176 "\n cpu: %i pcp: %i" 2177 "\n cpu: %i pcp: %i"
2177 "\n count: %i" 2178 "\n count: %i"
2178 "\n low: %i" 2179 "\n low: %i"
2179 "\n high: %i" 2180 "\n high: %i"
2180 "\n batch: %i", 2181 "\n batch: %i",
2181 i, j, 2182 i, j,
2182 pageset->pcp[j].count, 2183 pageset->pcp[j].count,
2183 pageset->pcp[j].low, 2184 pageset->pcp[j].low,
2184 pageset->pcp[j].high, 2185 pageset->pcp[j].high,
2185 pageset->pcp[j].batch); 2186 pageset->pcp[j].batch);
2186 } 2187 }
2187 #ifdef CONFIG_NUMA 2188 #ifdef CONFIG_NUMA
2188 seq_printf(m, 2189 seq_printf(m,
2189 "\n numa_hit: %lu" 2190 "\n numa_hit: %lu"
2190 "\n numa_miss: %lu" 2191 "\n numa_miss: %lu"
2191 "\n numa_foreign: %lu" 2192 "\n numa_foreign: %lu"
2192 "\n interleave_hit: %lu" 2193 "\n interleave_hit: %lu"
2193 "\n local_node: %lu" 2194 "\n local_node: %lu"
2194 "\n other_node: %lu", 2195 "\n other_node: %lu",
2195 pageset->numa_hit, 2196 pageset->numa_hit,
2196 pageset->numa_miss, 2197 pageset->numa_miss,
2197 pageset->numa_foreign, 2198 pageset->numa_foreign,
2198 pageset->interleave_hit, 2199 pageset->interleave_hit,
2199 pageset->local_node, 2200 pageset->local_node,
2200 pageset->other_node); 2201 pageset->other_node);
2201 #endif 2202 #endif
2202 } 2203 }
2203 seq_printf(m, 2204 seq_printf(m,
2204 "\n all_unreclaimable: %u" 2205 "\n all_unreclaimable: %u"
2205 "\n prev_priority: %i" 2206 "\n prev_priority: %i"
2206 "\n temp_priority: %i" 2207 "\n temp_priority: %i"
2207 "\n start_pfn: %lu", 2208 "\n start_pfn: %lu",
2208 zone->all_unreclaimable, 2209 zone->all_unreclaimable,
2209 zone->prev_priority, 2210 zone->prev_priority,
2210 zone->temp_priority, 2211 zone->temp_priority,
2211 zone->zone_start_pfn); 2212 zone->zone_start_pfn);
2212 spin_unlock_irqrestore(&zone->lock, flags); 2213 spin_unlock_irqrestore(&zone->lock, flags);
2213 seq_putc(m, '\n'); 2214 seq_putc(m, '\n');
2214 } 2215 }
2215 return 0; 2216 return 0;
2216 } 2217 }
2217 2218
2218 struct seq_operations zoneinfo_op = { 2219 struct seq_operations zoneinfo_op = {
2219 .start = frag_start, /* iterate over all zones. The same as in 2220 .start = frag_start, /* iterate over all zones. The same as in
2220 * fragmentation. */ 2221 * fragmentation. */
2221 .next = frag_next, 2222 .next = frag_next,
2222 .stop = frag_stop, 2223 .stop = frag_stop,
2223 .show = zoneinfo_show, 2224 .show = zoneinfo_show,
2224 }; 2225 };
2225 2226
2226 static char *vmstat_text[] = { 2227 static char *vmstat_text[] = {
2227 "nr_dirty", 2228 "nr_dirty",
2228 "nr_writeback", 2229 "nr_writeback",
2229 "nr_unstable", 2230 "nr_unstable",
2230 "nr_page_table_pages", 2231 "nr_page_table_pages",
2231 "nr_mapped", 2232 "nr_mapped",
2232 "nr_slab", 2233 "nr_slab",
2233 2234
2234 "pgpgin", 2235 "pgpgin",
2235 "pgpgout", 2236 "pgpgout",
2236 "pswpin", 2237 "pswpin",
2237 "pswpout", 2238 "pswpout",
2238 "pgalloc_high", 2239 "pgalloc_high",
2239 2240
2240 "pgalloc_normal", 2241 "pgalloc_normal",
2241 "pgalloc_dma", 2242 "pgalloc_dma",
2242 "pgfree", 2243 "pgfree",
2243 "pgactivate", 2244 "pgactivate",
2244 "pgdeactivate", 2245 "pgdeactivate",
2245 2246
2246 "pgfault", 2247 "pgfault",
2247 "pgmajfault", 2248 "pgmajfault",
2248 "pgrefill_high", 2249 "pgrefill_high",
2249 "pgrefill_normal", 2250 "pgrefill_normal",
2250 "pgrefill_dma", 2251 "pgrefill_dma",
2251 2252
2252 "pgsteal_high", 2253 "pgsteal_high",
2253 "pgsteal_normal", 2254 "pgsteal_normal",
2254 "pgsteal_dma", 2255 "pgsteal_dma",
2255 "pgscan_kswapd_high", 2256 "pgscan_kswapd_high",
2256 "pgscan_kswapd_normal", 2257 "pgscan_kswapd_normal",
2257 2258
2258 "pgscan_kswapd_dma", 2259 "pgscan_kswapd_dma",
2259 "pgscan_direct_high", 2260 "pgscan_direct_high",
2260 "pgscan_direct_normal", 2261 "pgscan_direct_normal",
2261 "pgscan_direct_dma", 2262 "pgscan_direct_dma",
2262 "pginodesteal", 2263 "pginodesteal",
2263 2264
2264 "slabs_scanned", 2265 "slabs_scanned",
2265 "kswapd_steal", 2266 "kswapd_steal",
2266 "kswapd_inodesteal", 2267 "kswapd_inodesteal",
2267 "pageoutrun", 2268 "pageoutrun",
2268 "allocstall", 2269 "allocstall",
2269 2270
2270 "pgrotated", 2271 "pgrotated",
2271 "nr_bounce", 2272 "nr_bounce",
2272 }; 2273 };
2273 2274
2274 static void *vmstat_start(struct seq_file *m, loff_t *pos) 2275 static void *vmstat_start(struct seq_file *m, loff_t *pos)
2275 { 2276 {
2276 struct page_state *ps; 2277 struct page_state *ps;
2277 2278
2278 if (*pos >= ARRAY_SIZE(vmstat_text)) 2279 if (*pos >= ARRAY_SIZE(vmstat_text))
2279 return NULL; 2280 return NULL;
2280 2281
2281 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 2282 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2282 m->private = ps; 2283 m->private = ps;
2283 if (!ps) 2284 if (!ps)
2284 return ERR_PTR(-ENOMEM); 2285 return ERR_PTR(-ENOMEM);
2285 get_full_page_state(ps); 2286 get_full_page_state(ps);
2286 ps->pgpgin /= 2; /* sectors -> kbytes */ 2287 ps->pgpgin /= 2; /* sectors -> kbytes */
2287 ps->pgpgout /= 2; 2288 ps->pgpgout /= 2;
2288 return (unsigned long *)ps + *pos; 2289 return (unsigned long *)ps + *pos;
2289 } 2290 }
2290 2291
2291 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 2292 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
2292 { 2293 {
2293 (*pos)++; 2294 (*pos)++;
2294 if (*pos >= ARRAY_SIZE(vmstat_text)) 2295 if (*pos >= ARRAY_SIZE(vmstat_text))
2295 return NULL; 2296 return NULL;
2296 return (unsigned long *)m->private + *pos; 2297 return (unsigned long *)m->private + *pos;
2297 } 2298 }
2298 2299
2299 static int vmstat_show(struct seq_file *m, void *arg) 2300 static int vmstat_show(struct seq_file *m, void *arg)
2300 { 2301 {
2301 unsigned long *l = arg; 2302 unsigned long *l = arg;
2302 unsigned long off = l - (unsigned long *)m->private; 2303 unsigned long off = l - (unsigned long *)m->private;
2303 2304
2304 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 2305 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
2305 return 0; 2306 return 0;
2306 } 2307 }
2307 2308
2308 static void vmstat_stop(struct seq_file *m, void *arg) 2309 static void vmstat_stop(struct seq_file *m, void *arg)
2309 { 2310 {
2310 kfree(m->private); 2311 kfree(m->private);
2311 m->private = NULL; 2312 m->private = NULL;
2312 } 2313 }
2313 2314
2314 struct seq_operations vmstat_op = { 2315 struct seq_operations vmstat_op = {
2315 .start = vmstat_start, 2316 .start = vmstat_start,
2316 .next = vmstat_next, 2317 .next = vmstat_next,
2317 .stop = vmstat_stop, 2318 .stop = vmstat_stop,
2318 .show = vmstat_show, 2319 .show = vmstat_show,
2319 }; 2320 };
2320 2321
2321 #endif /* CONFIG_PROC_FS */ 2322 #endif /* CONFIG_PROC_FS */
2322 2323
2323 #ifdef CONFIG_HOTPLUG_CPU 2324 #ifdef CONFIG_HOTPLUG_CPU
2324 static int page_alloc_cpu_notify(struct notifier_block *self, 2325 static int page_alloc_cpu_notify(struct notifier_block *self,
2325 unsigned long action, void *hcpu) 2326 unsigned long action, void *hcpu)
2326 { 2327 {
2327 int cpu = (unsigned long)hcpu; 2328 int cpu = (unsigned long)hcpu;
2328 long *count; 2329 long *count;
2329 unsigned long *src, *dest; 2330 unsigned long *src, *dest;
2330 2331
2331 if (action == CPU_DEAD) { 2332 if (action == CPU_DEAD) {
2332 int i; 2333 int i;
2333 2334
2334 /* Drain local pagecache count. */ 2335 /* Drain local pagecache count. */
2335 count = &per_cpu(nr_pagecache_local, cpu); 2336 count = &per_cpu(nr_pagecache_local, cpu);
2336 atomic_add(*count, &nr_pagecache); 2337 atomic_add(*count, &nr_pagecache);
2337 *count = 0; 2338 *count = 0;
2338 local_irq_disable(); 2339 local_irq_disable();
2339 __drain_pages(cpu); 2340 __drain_pages(cpu);
2340 2341
2341 /* Add dead cpu's page_states to our own. */ 2342 /* Add dead cpu's page_states to our own. */
2342 dest = (unsigned long *)&__get_cpu_var(page_states); 2343 dest = (unsigned long *)&__get_cpu_var(page_states);
2343 src = (unsigned long *)&per_cpu(page_states, cpu); 2344 src = (unsigned long *)&per_cpu(page_states, cpu);
2344 2345
2345 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); 2346 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2346 i++) { 2347 i++) {
2347 dest[i] += src[i]; 2348 dest[i] += src[i];
2348 src[i] = 0; 2349 src[i] = 0;
2349 } 2350 }
2350 2351
2351 local_irq_enable(); 2352 local_irq_enable();
2352 } 2353 }
2353 return NOTIFY_OK; 2354 return NOTIFY_OK;
2354 } 2355 }
2355 #endif /* CONFIG_HOTPLUG_CPU */ 2356 #endif /* CONFIG_HOTPLUG_CPU */
2356 2357
2357 void __init page_alloc_init(void) 2358 void __init page_alloc_init(void)
2358 { 2359 {
2359 hotcpu_notifier(page_alloc_cpu_notify, 0); 2360 hotcpu_notifier(page_alloc_cpu_notify, 0);
2360 } 2361 }
2361 2362
2362 /* 2363 /*
2363 * setup_per_zone_lowmem_reserve - called whenever 2364 * setup_per_zone_lowmem_reserve - called whenever
2364 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 2365 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
2365 * has a correct pages reserved value, so an adequate number of 2366 * has a correct pages reserved value, so an adequate number of
2366 * pages are left in the zone after a successful __alloc_pages(). 2367 * pages are left in the zone after a successful __alloc_pages().
2367 */ 2368 */
2368 static void setup_per_zone_lowmem_reserve(void) 2369 static void setup_per_zone_lowmem_reserve(void)
2369 { 2370 {
2370 struct pglist_data *pgdat; 2371 struct pglist_data *pgdat;
2371 int j, idx; 2372 int j, idx;
2372 2373
2373 for_each_pgdat(pgdat) { 2374 for_each_pgdat(pgdat) {
2374 for (j = 0; j < MAX_NR_ZONES; j++) { 2375 for (j = 0; j < MAX_NR_ZONES; j++) {
2375 struct zone *zone = pgdat->node_zones + j; 2376 struct zone *zone = pgdat->node_zones + j;
2376 unsigned long present_pages = zone->present_pages; 2377 unsigned long present_pages = zone->present_pages;
2377 2378
2378 zone->lowmem_reserve[j] = 0; 2379 zone->lowmem_reserve[j] = 0;
2379 2380
2380 for (idx = j-1; idx >= 0; idx--) { 2381 for (idx = j-1; idx >= 0; idx--) {
2381 struct zone *lower_zone; 2382 struct zone *lower_zone;
2382 2383
2383 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2384 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2384 sysctl_lowmem_reserve_ratio[idx] = 1; 2385 sysctl_lowmem_reserve_ratio[idx] = 1;
2385 2386
2386 lower_zone = pgdat->node_zones + idx; 2387 lower_zone = pgdat->node_zones + idx;
2387 lower_zone->lowmem_reserve[j] = present_pages / 2388 lower_zone->lowmem_reserve[j] = present_pages /
2388 sysctl_lowmem_reserve_ratio[idx]; 2389 sysctl_lowmem_reserve_ratio[idx];
2389 present_pages += lower_zone->present_pages; 2390 present_pages += lower_zone->present_pages;
2390 } 2391 }
2391 } 2392 }
2392 } 2393 }
2393 } 2394 }
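
As a worked example of the loop above, here is a small user-space sketch. It is only a sketch: the zone sizes are made up, and the ratios of 256 for DMA and 32 for Normal are assumed from this era's defaults (defined elsewhere in this file).

#include <stdio.h>

#define MAX_NR_ZONES 3	/* DMA, Normal, HighMem for this sketch */

int main(void)
{
	/* Made-up zone sizes (in pages) and assumed default ratios. */
	unsigned long present[MAX_NR_ZONES] = { 4096, 225280, 262144 };
	unsigned long ratio[] = { 256, 32 };	/* DMA, Normal; HighMem is never a lower zone */
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	const char *name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	int j, idx;

	/* The same double loop as setup_per_zone_lowmem_reserve() above. */
	for (j = 0; j < MAX_NR_ZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		for (idx = j + 1; idx < MAX_NR_ZONES; idx++)
			printf("%s holds back %lu pages from %s allocations\n",
			       name[j], reserve[j][idx], name[idx]);
	return 0;
}

With those inputs, DMA holds back 880 pages from Normal allocations and 1904 from HighMem, while Normal holds back 8192 pages from HighMem; these are the values that appear in the "protection:" line printed by zoneinfo_show() above.
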
2394 2395
2395 /* 2396 /*
2396 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures 2397 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
2397 * that the pages_{min,low,high} values for each zone are set correctly 2398 * that the pages_{min,low,high} values for each zone are set correctly
2398 * with respect to min_free_kbytes. 2399 * with respect to min_free_kbytes.
2399 */ 2400 */
2400 static void setup_per_zone_pages_min(void) 2401 static void setup_per_zone_pages_min(void)
2401 { 2402 {
2402 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2403 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2403 unsigned long lowmem_pages = 0; 2404 unsigned long lowmem_pages = 0;
2404 struct zone *zone; 2405 struct zone *zone;
2405 unsigned long flags; 2406 unsigned long flags;
2406 2407
2407 /* Calculate total number of !ZONE_HIGHMEM pages */ 2408 /* Calculate total number of !ZONE_HIGHMEM pages */
2408 for_each_zone(zone) { 2409 for_each_zone(zone) {
2409 if (!is_highmem(zone)) 2410 if (!is_highmem(zone))
2410 lowmem_pages += zone->present_pages; 2411 lowmem_pages += zone->present_pages;
2411 } 2412 }
2412 2413
2413 for_each_zone(zone) { 2414 for_each_zone(zone) {
2414 spin_lock_irqsave(&zone->lru_lock, flags); 2415 spin_lock_irqsave(&zone->lru_lock, flags);
2415 if (is_highmem(zone)) { 2416 if (is_highmem(zone)) {
2416 /* 2417 /*
2417 * Often, highmem doesn't need to reserve any pages. 2418 * Often, highmem doesn't need to reserve any pages.
2418 * But the pages_min/low/high values are also used for 2419 * But the pages_min/low/high values are also used for
2419 * batching up page reclaim activity so we need a 2420 * batching up page reclaim activity so we need a
2420 * decent value here. 2421 * decent value here.
2421 */ 2422 */
2422 int min_pages; 2423 int min_pages;
2423 2424
2424 min_pages = zone->present_pages / 1024; 2425 min_pages = zone->present_pages / 1024;
2425 if (min_pages < SWAP_CLUSTER_MAX) 2426 if (min_pages < SWAP_CLUSTER_MAX)
2426 min_pages = SWAP_CLUSTER_MAX; 2427 min_pages = SWAP_CLUSTER_MAX;
2427 if (min_pages > 128) 2428 if (min_pages > 128)
2428 min_pages = 128; 2429 min_pages = 128;
2429 zone->pages_min = min_pages; 2430 zone->pages_min = min_pages;
2430 } else { 2431 } else {
2431 /* if it's a lowmem zone, reserve a number of pages 2432 /* if it's a lowmem zone, reserve a number of pages
2432 * proportionate to the zone's size. 2433 * proportionate to the zone's size.
2433 */ 2434 */
2434 zone->pages_min = (pages_min * zone->present_pages) / 2435 zone->pages_min = (pages_min * zone->present_pages) /
2435 lowmem_pages; 2436 lowmem_pages;
2436 } 2437 }
2437 2438
2438 /* 2439 /*
2439 * When interpreting these watermarks, just keep in mind that: 2440 * When interpreting these watermarks, just keep in mind that:
2440 * zone->pages_min == (zone->pages_min * 4) / 4; 2441 * zone->pages_min == (zone->pages_min * 4) / 4;
2441 */ 2442 */
2442 zone->pages_low = (zone->pages_min * 5) / 4; 2443 zone->pages_low = (zone->pages_min * 5) / 4;
2443 zone->pages_high = (zone->pages_min * 6) / 4; 2444 zone->pages_high = (zone->pages_min * 6) / 4;
2444 spin_unlock_irqrestore(&zone->lru_lock, flags); 2445 spin_unlock_irqrestore(&zone->lru_lock, flags);
2445 } 2446 }
2446 } 2447 }
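
A hedged numeric sketch of the watermark arithmetic above: it assumes min_free_kbytes = 4096, SWAP_CLUSTER_MAX = 32, 4 KiB pages and made-up zone sizes, none of which come from this diff.

#include <stdio.h>

#define PAGE_SHIFT	 12	/* assumed: 4 KiB pages */
#define SWAP_CLUSTER_MAX 32	/* assumed value for this era */

static void show(const char *name, unsigned long pages_min)
{
	printf("%-8s min=%lu low=%lu high=%lu\n", name,
	       pages_min, pages_min * 5 / 4, pages_min * 6 / 4);
}

int main(void)
{
	unsigned long min_free_kbytes = 4096;			/* assumed */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long dma = 4096, normal = 225280, highmem = 262144;
	unsigned long lowmem_pages = dma + normal;
	unsigned long hi_min;

	/* Lowmem zones split pages_min in proportion to their size. */
	show("DMA", pages_min * dma / lowmem_pages);
	show("Normal", pages_min * normal / lowmem_pages);

	/* Highmem only needs a small, clamped reserve for reclaim batching. */
	hi_min = highmem / 1024;
	if (hi_min < SWAP_CLUSTER_MAX)
		hi_min = SWAP_CLUSTER_MAX;
	if (hi_min > 128)
		hi_min = 128;
	show("HighMem", hi_min);
	return 0;
}
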
2447 2448
2448 /* 2449 /*
2449 * Initialise min_free_kbytes. 2450 * Initialise min_free_kbytes.
2450 * 2451 *
2451 * For small machines we want it small (128k min). For large machines 2452 * For small machines we want it small (128k min). For large machines
2452 * we want it large (64MB max). But it is not linear, because network 2453 * we want it large (64MB max). But it is not linear, because network
2453 * bandwidth does not increase linearly with machine size. We use 2454 * bandwidth does not increase linearly with machine size. We use
2454 * 2455 *
2455 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 2456 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
2456 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 2457 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
2457 * 2458 *
2458 * which yields 2459 * which yields
2459 * 2460 *
2460 * 16MB: 512k 2461 * 16MB: 512k
2461 * 32MB: 724k 2462 * 32MB: 724k
2462 * 64MB: 1024k 2463 * 64MB: 1024k
2463 * 128MB: 1448k 2464 * 128MB: 1448k
2464 * 256MB: 2048k 2465 * 256MB: 2048k
2465 * 512MB: 2896k 2466 * 512MB: 2896k
2466 * 1024MB: 4096k 2467 * 1024MB: 4096k
2467 * 2048MB: 5792k 2468 * 2048MB: 5792k
2468 * 4096MB: 8192k 2469 * 4096MB: 8192k
2469 * 8192MB: 11584k 2470 * 8192MB: 11584k
2470 * 16384MB: 16384k 2471 * 16384MB: 16384k
2471 */ 2472 */
2472 static int __init init_per_zone_pages_min(void) 2473 static int __init init_per_zone_pages_min(void)
2473 { 2474 {
2474 unsigned long lowmem_kbytes; 2475 unsigned long lowmem_kbytes;
2475 2476
2476 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 2477 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
2477 2478
2478 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 2479 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
2479 if (min_free_kbytes < 128) 2480 if (min_free_kbytes < 128)
2480 min_free_kbytes = 128; 2481 min_free_kbytes = 128;
2481 if (min_free_kbytes > 65536) 2482 if (min_free_kbytes > 65536)
2482 min_free_kbytes = 65536; 2483 min_free_kbytes = 65536;
2483 setup_per_zone_pages_min(); 2484 setup_per_zone_pages_min();
2484 setup_per_zone_lowmem_reserve(); 2485 setup_per_zone_lowmem_reserve();
2485 return 0; 2486 return 0;
2486 } 2487 }
2487 module_init(init_per_zone_pages_min) 2488 module_init(init_per_zone_pages_min)
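
The table in the comment above can be reproduced with a few lines of user-space C. The only liberty taken is a crude integer square root standing in for the kernel's int_sqrt():

#include <stdio.h>

/* Crude integer square root, a stand-in for int_sqrt(). */
static unsigned long isqrt(unsigned long long x)
{
	unsigned long r = 0;

	while ((unsigned long long)(r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long mb[] = { 16, 128, 1024, 16384 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long lowmem_kbytes = mb[i] * 1024;
		unsigned long min_free = isqrt((unsigned long long)lowmem_kbytes * 16);

		if (min_free < 128)		/* same clamps as above */
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n", mb[i], min_free);
	}
	return 0;
}

Running it prints 512, 1448, 4096 and 16384 for the four sizes checked, matching the table.
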
2488 2489
2489 /* 2490 /*
2490 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 2491 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
2491 * that we can call setup_per_zone_pages_min() whenever min_free_kbytes 2492 * that we can call setup_per_zone_pages_min() whenever min_free_kbytes
2492 * changes. 2493 * changes.
2493 */ 2494 */
2494 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 2495 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2495 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2496 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2496 { 2497 {
2497 proc_dointvec(table, write, file, buffer, length, ppos); 2498 proc_dointvec(table, write, file, buffer, length, ppos);
2498 setup_per_zone_pages_min(); 2499 setup_per_zone_pages_min();
2499 return 0; 2500 return 0;
2500 } 2501 }
2501 2502
2502 /* 2503 /*
2503 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 2504 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2504 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 2505 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
2505 * whenever sysctl_lowmem_reserve_ratio changes. 2506 * whenever sysctl_lowmem_reserve_ratio changes.
2506 * 2507 *
2507 * The reserve ratio obviously has absolutely no relation with the 2508 * The reserve ratio obviously has absolutely no relation with the
2508 * pages_min watermarks. The lowmem reserve ratio can only make sense 2509 * pages_min watermarks. The lowmem reserve ratio can only make sense
2509 * as a function of the boot-time zone sizes. 2510 * as a function of the boot-time zone sizes.
2510 */ 2511 */
2511 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 2512 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2512 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2513 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2513 { 2514 {
2514 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 2515 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2515 setup_per_zone_lowmem_reserve(); 2516 setup_per_zone_lowmem_reserve();
2516 return 0; 2517 return 0;
2517 } 2518 }
2518 2519
2519 __initdata int hashdist = HASHDIST_DEFAULT; 2520 __initdata int hashdist = HASHDIST_DEFAULT;
2520 2521
2521 #ifdef CONFIG_NUMA 2522 #ifdef CONFIG_NUMA
2522 static int __init set_hashdist(char *str) 2523 static int __init set_hashdist(char *str)
2523 { 2524 {
2524 if (!str) 2525 if (!str)
2525 return 0; 2526 return 0;
2526 hashdist = simple_strtoul(str, &str, 0); 2527 hashdist = simple_strtoul(str, &str, 0);
2527 return 1; 2528 return 1;
2528 } 2529 }
2529 __setup("hashdist=", set_hashdist); 2530 __setup("hashdist=", set_hashdist);
2530 #endif 2531 #endif
2531 2532
2532 /* 2533 /*
2533 * allocate a large system hash table from bootmem 2534 * allocate a large system hash table from bootmem
2534 * - it is assumed that the hash table must contain an exact power-of-2 2535 * - it is assumed that the hash table must contain an exact power-of-2
2535 * quantity of entries 2536 * quantity of entries
2536 * - limit is the number of hash buckets, not the total allocation size 2537 * - limit is the number of hash buckets, not the total allocation size
2537 */ 2538 */
2538 void *__init alloc_large_system_hash(const char *tablename, 2539 void *__init alloc_large_system_hash(const char *tablename,
2539 unsigned long bucketsize, 2540 unsigned long bucketsize,
2540 unsigned long numentries, 2541 unsigned long numentries,
2541 int scale, 2542 int scale,
2542 int flags, 2543 int flags,
2543 unsigned int *_hash_shift, 2544 unsigned int *_hash_shift,
2544 unsigned int *_hash_mask, 2545 unsigned int *_hash_mask,
2545 unsigned long limit) 2546 unsigned long limit)
2546 { 2547 {
2547 unsigned long long max = limit; 2548 unsigned long long max = limit;
2548 unsigned long log2qty, size; 2549 unsigned long log2qty, size;
2549 void *table = NULL; 2550 void *table = NULL;
2550 2551
2551 /* allow the kernel cmdline to have a say */ 2552 /* allow the kernel cmdline to have a say */
2552 if (!numentries) { 2553 if (!numentries) {
2553 /* round applicable memory size up to nearest megabyte */ 2554 /* round applicable memory size up to nearest megabyte */
2554 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 2555 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
2555 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 2556 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2556 numentries >>= 20 - PAGE_SHIFT; 2557 numentries >>= 20 - PAGE_SHIFT;
2557 numentries <<= 20 - PAGE_SHIFT; 2558 numentries <<= 20 - PAGE_SHIFT;
2558 2559
2559 /* limit to 1 bucket per 2^scale bytes of low memory */ 2560 /* limit to 1 bucket per 2^scale bytes of low memory */
2560 if (scale > PAGE_SHIFT) 2561 if (scale > PAGE_SHIFT)
2561 numentries >>= (scale - PAGE_SHIFT); 2562 numentries >>= (scale - PAGE_SHIFT);
2562 else 2563 else
2563 numentries <<= (PAGE_SHIFT - scale); 2564 numentries <<= (PAGE_SHIFT - scale);
2564 } 2565 }
2565 /* rounded up to nearest power of 2 in size */ 2566 /* rounded up to nearest power of 2 in size */
2566 numentries = 1UL << (long_log2(numentries) + 1); 2567 numentries = 1UL << (long_log2(numentries) + 1);
2567 2568
2568 /* limit allocation size to 1/16 total memory by default */ 2569 /* limit allocation size to 1/16 total memory by default */
2569 if (max == 0) { 2570 if (max == 0) {
2570 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 2571 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2571 do_div(max, bucketsize); 2572 do_div(max, bucketsize);
2572 } 2573 }
2573 2574
2574 if (numentries > max) 2575 if (numentries > max)
2575 numentries = max; 2576 numentries = max;
2576 2577
2577 log2qty = long_log2(numentries); 2578 log2qty = long_log2(numentries);
2578 2579
2579 do { 2580 do {
2580 size = bucketsize << log2qty; 2581 size = bucketsize << log2qty;
2581 if (flags & HASH_EARLY) 2582 if (flags & HASH_EARLY)
2582 table = alloc_bootmem(size); 2583 table = alloc_bootmem(size);
2583 else if (hashdist) 2584 else if (hashdist)
2584 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 2585 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
2585 else { 2586 else {
2586 unsigned long order; 2587 unsigned long order;
2587 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 2588 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
2588 ; 2589 ;
2589 table = (void*) __get_free_pages(GFP_ATOMIC, order); 2590 table = (void*) __get_free_pages(GFP_ATOMIC, order);
2590 } 2591 }
2591 } while (!table && size > PAGE_SIZE && --log2qty); 2592 } while (!table && size > PAGE_SIZE && --log2qty);
2592 2593
2593 if (!table) 2594 if (!table)
2594 panic("Failed to allocate %s hash table\n", tablename); 2595 panic("Failed to allocate %s hash table\n", tablename);
2595 2596
2596 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 2597 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2597 tablename, 2598 tablename,
2598 (1U << log2qty), 2599 (1U << log2qty),
2599 long_log2(size) - PAGE_SHIFT, 2600 long_log2(size) - PAGE_SHIFT,
2600 size); 2601 size);
2601 2602
2602 if (_hash_shift) 2603 if (_hash_shift)
2603 *_hash_shift = log2qty; 2604 *_hash_shift = log2qty;
2604 if (_hash_mask) 2605 if (_hash_mask)
2605 *_hash_mask = (1 << log2qty) - 1; 2606 *_hash_mask = (1 << log2qty) - 1;
2606 2607
2607 return table; 2608 return table;
2608 } 2609 }
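
To see how the sizing plays out, here is a hedged user-space trace of the same arithmetic for a hypothetical table. The 1 GiB of kernel lowmem, the bucket size of 8 bytes (one pointer-sized list head) and scale = 14 (one bucket per 16 KiB) are all illustrative choices, and long_log2() is approximated by an open-coded floor log2.

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 KiB pages */

static unsigned long ilog2_floor(unsigned long x)	/* stand-in for long_log2() */
{
	unsigned long r = 0;

	while (x > 1) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long nr_kernel_pages = 262144;	/* ~1 GiB of lowmem, illustrative */
	unsigned long bucketsize = 8;		/* one pointer-sized list head, assumed */
	int scale = 14;				/* one bucket per 16 KiB, illustrative */
	unsigned long numentries, log2qty, size;
	unsigned long long max;

	/* Round the page count up to a whole number of megabytes. */
	numentries = nr_kernel_pages + (1UL << (20 - PAGE_SHIFT)) - 1;
	numentries >>= 20 - PAGE_SHIFT;
	numentries <<= 20 - PAGE_SHIFT;

	/* One bucket per 2^scale bytes, then round up to the next power of two. */
	numentries >>= scale - PAGE_SHIFT;
	numentries = 1UL << (ilog2_floor(numentries) + 1);

	/* Default cap: at most 1/16 of memory (the kernel uses nr_all_pages
	 * here; with no highmem that equals nr_kernel_pages). */
	max = ((unsigned long long)nr_kernel_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2_floor(numentries);
	size = bucketsize << log2qty;
	printf("entries=%lu (order %lu, %lu bytes)\n",
	       1UL << log2qty, ilog2_floor(size) - PAGE_SHIFT, size);
	return 0;
}

This lands on 131072 buckets in a 1 MiB, order-8 allocation, the same figures the printk above would report for these inputs.
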
2609 2610