Commit 3a58a2a6c879b2e47daafd6e641661c50ac9da5a

Authored by Yinghai Lu
Committed by Ingo Molnar
1 parent cfb0e53b05

x86: introduce init_memory_mapping for 32bit #3

move the kva-related early remapping back into initmem_init() for numa32

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 3 changed files with 6 additions and 27 deletions

arch/x86/mm/discontig_32.c
1 /* 1 /*
2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation 2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
3 * August 2002: added remote node KVA remap - Martin J. Bligh 3 * August 2002: added remote node KVA remap - Martin J. Bligh
4 * 4 *
5 * Copyright (C) 2002, IBM Corp. 5 * Copyright (C) 2002, IBM Corp.
6 * 6 *
7 * All rights reserved. 7 * All rights reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version. 12 * (at your option) any later version.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, but 14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more 17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details. 18 * details.
19 * 19 *
20 * You should have received a copy of the GNU General Public License 20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software 21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25 #include <linux/mm.h> 25 #include <linux/mm.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/mmzone.h> 27 #include <linux/mmzone.h>
28 #include <linux/highmem.h> 28 #include <linux/highmem.h>
29 #include <linux/initrd.h> 29 #include <linux/initrd.h>
30 #include <linux/nodemask.h> 30 #include <linux/nodemask.h>
31 #include <linux/module.h> 31 #include <linux/module.h>
32 #include <linux/kexec.h> 32 #include <linux/kexec.h>
33 #include <linux/pfn.h> 33 #include <linux/pfn.h>
34 #include <linux/swap.h> 34 #include <linux/swap.h>
35 #include <linux/acpi.h> 35 #include <linux/acpi.h>
36 36
37 #include <asm/e820.h> 37 #include <asm/e820.h>
38 #include <asm/setup.h> 38 #include <asm/setup.h>
39 #include <asm/mmzone.h> 39 #include <asm/mmzone.h>
40 #include <asm/bios_ebda.h> 40 #include <asm/bios_ebda.h>
41 #include <asm/proto.h> 41 #include <asm/proto.h>
42 42
43 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44 EXPORT_SYMBOL(node_data); 44 EXPORT_SYMBOL(node_data);
45 static bootmem_data_t node0_bdata; 45 static bootmem_data_t node0_bdata;
46 46
47 /* 47 /*
48 * numa interface - we expect the numa architecture specific code to have 48 * numa interface - we expect the numa architecture specific code to have
49 * populated the following initialisation. 49 * populated the following initialisation.
50 * 50 *
51 * 1) node_online_map - the map of all nodes configured (online) in the system 51 * 1) node_online_map - the map of all nodes configured (online) in the system
52 * 2) node_start_pfn - the starting page frame number for a node 52 * 2) node_start_pfn - the starting page frame number for a node
53 * 3) node_end_pfn - the ending page fram number for a node 53 * 3) node_end_pfn - the ending page fram number for a node
54 */ 54 */
55 unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; 55 unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
56 unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; 56 unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
57 57
58 58
59 #ifdef CONFIG_DISCONTIGMEM 59 #ifdef CONFIG_DISCONTIGMEM
60 /* 60 /*
61 * 4) physnode_map - the mapping between a pfn and owning node 61 * 4) physnode_map - the mapping between a pfn and owning node
62 * physnode_map keeps track of the physical memory layout of a generic 62 * physnode_map keeps track of the physical memory layout of a generic
63 * numa node on a 64Mb break (each element of the array will 63 * numa node on a 64Mb break (each element of the array will
64 * represent 64Mb of memory and will be marked by the node id. so, 64 * represent 64Mb of memory and will be marked by the node id. so,
65 * if the first gig is on node 0, and the second gig is on node 1 65 * if the first gig is on node 0, and the second gig is on node 1
66 * physnode_map will contain: 66 * physnode_map will contain:
67 * 67 *
68 * physnode_map[0-15] = 0; 68 * physnode_map[0-15] = 0;
69 * physnode_map[16-31] = 1; 69 * physnode_map[16-31] = 1;
70 * physnode_map[32- ] = -1; 70 * physnode_map[32- ] = -1;
71 */ 71 */
72 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 72 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
73 EXPORT_SYMBOL(physnode_map); 73 EXPORT_SYMBOL(physnode_map);
74 74
75 void memory_present(int nid, unsigned long start, unsigned long end) 75 void memory_present(int nid, unsigned long start, unsigned long end)
76 { 76 {
77 unsigned long pfn; 77 unsigned long pfn;
78 78
79 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", 79 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
80 nid, start, end); 80 nid, start, end);
81 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 81 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
82 printk(KERN_DEBUG " "); 82 printk(KERN_DEBUG " ");
83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
85 printk(KERN_CONT "%lx ", pfn); 85 printk(KERN_CONT "%lx ", pfn);
86 } 86 }
87 printk(KERN_CONT "\n"); 87 printk(KERN_CONT "\n");
88 } 88 }
89 89
90 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 90 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
91 unsigned long end_pfn) 91 unsigned long end_pfn)
92 { 92 {
93 unsigned long nr_pages = end_pfn - start_pfn; 93 unsigned long nr_pages = end_pfn - start_pfn;
94 94
95 if (!nr_pages) 95 if (!nr_pages)
96 return 0; 96 return 0;
97 97
98 return (nr_pages + 1) * sizeof(struct page); 98 return (nr_pages + 1) * sizeof(struct page);
99 } 99 }
100 #endif 100 #endif
101 101
102 extern unsigned long find_max_low_pfn(void); 102 extern unsigned long find_max_low_pfn(void);
103 extern unsigned long highend_pfn, highstart_pfn; 103 extern unsigned long highend_pfn, highstart_pfn;
104 104
105 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 105 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
106 106
107 unsigned long node_remap_size[MAX_NUMNODES]; 107 unsigned long node_remap_size[MAX_NUMNODES];
108 static void *node_remap_start_vaddr[MAX_NUMNODES]; 108 static void *node_remap_start_vaddr[MAX_NUMNODES];
109 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 109 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111 static unsigned long kva_start_pfn; 111 static unsigned long kva_start_pfn;
112 static unsigned long kva_pages; 112 static unsigned long kva_pages;
113 /* 113 /*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 114 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 115 * a single node with all available processors in it with a flat
116 * memory map. 116 * memory map.
117 */ 117 */
118 int __init get_memcfg_numa_flat(void) 118 int __init get_memcfg_numa_flat(void)
119 { 119 {
120 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); 120 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 121
122 node_start_pfn[0] = 0; 122 node_start_pfn[0] = 0;
123 node_end_pfn[0] = max_pfn; 123 node_end_pfn[0] = max_pfn;
124 e820_register_active_regions(0, 0, max_pfn); 124 e820_register_active_regions(0, 0, max_pfn);
125 memory_present(0, 0, max_pfn); 125 memory_present(0, 0, max_pfn);
126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); 126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 127
128 /* Indicate there is one node available. */ 128 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 129 nodes_clear(node_online_map);
130 node_set_online(0); 130 node_set_online(0);
131 return 1; 131 return 1;
132 } 132 }
133 133
134 /* 134 /*
135 * Find the highest page frame number we have available for the node 135 * Find the highest page frame number we have available for the node
136 */ 136 */
137 static void __init propagate_e820_map_node(int nid) 137 static void __init propagate_e820_map_node(int nid)
138 { 138 {
139 if (node_end_pfn[nid] > max_pfn) 139 if (node_end_pfn[nid] > max_pfn)
140 node_end_pfn[nid] = max_pfn; 140 node_end_pfn[nid] = max_pfn;
141 /* 141 /*
142 * if a user has given mem=XXXX, then we need to make sure 142 * if a user has given mem=XXXX, then we need to make sure
143 * that the node _starts_ before that, too, not just ends 143 * that the node _starts_ before that, too, not just ends
144 */ 144 */
145 if (node_start_pfn[nid] > max_pfn) 145 if (node_start_pfn[nid] > max_pfn)
146 node_start_pfn[nid] = max_pfn; 146 node_start_pfn[nid] = max_pfn;
147 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); 147 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
148 } 148 }
149 149
150 /* 150 /*
151 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem 151 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
152 * method. For node zero take this from the bottom of memory, for 152 * method. For node zero take this from the bottom of memory, for
153 * subsequent nodes place them at node_remap_start_vaddr which contains 153 * subsequent nodes place them at node_remap_start_vaddr which contains
154 * node local data in physically node local memory. See setup_memory() 154 * node local data in physically node local memory. See setup_memory()
155 * for details. 155 * for details.
156 */ 156 */
157 static void __init allocate_pgdat(int nid) 157 static void __init allocate_pgdat(int nid)
158 { 158 {
159 if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) 159 if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 161 else {
162 unsigned long pgdat_phys; 162 unsigned long pgdat_phys;
163 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, 163 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
164 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT, 164 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
165 sizeof(pg_data_t), 165 sizeof(pg_data_t),
166 PAGE_SIZE); 166 PAGE_SIZE);
167 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); 167 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
168 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), 168 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
169 "NODE_DATA"); 169 "NODE_DATA");
170 } 170 }
171 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", 171 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
172 nid, (unsigned long)NODE_DATA(nid)); 172 nid, (unsigned long)NODE_DATA(nid));
173 } 173 }
174 174
175 /* 175 /*
176 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 176 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
177 * virtual address space (KVA) is reserved and portions of nodes are mapped 177 * virtual address space (KVA) is reserved and portions of nodes are mapped
178 * using it. This is to allow node-local memory to be allocated for 178 * using it. This is to allow node-local memory to be allocated for
179 * structures that would normally require ZONE_NORMAL. The memory is 179 * structures that would normally require ZONE_NORMAL. The memory is
180 * allocated with alloc_remap() and callers should be prepared to allocate 180 * allocated with alloc_remap() and callers should be prepared to allocate
181 * from the bootmem allocator instead. 181 * from the bootmem allocator instead.
182 */ 182 */
183 static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 183 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
184 static void *node_remap_end_vaddr[MAX_NUMNODES]; 184 static void *node_remap_end_vaddr[MAX_NUMNODES];
185 static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 185 static void *node_remap_alloc_vaddr[MAX_NUMNODES];
186 static unsigned long node_remap_offset[MAX_NUMNODES]; 186 static unsigned long node_remap_offset[MAX_NUMNODES];
187 187
188 void *alloc_remap(int nid, unsigned long size) 188 void *alloc_remap(int nid, unsigned long size)
189 { 189 {
190 void *allocation = node_remap_alloc_vaddr[nid]; 190 void *allocation = node_remap_alloc_vaddr[nid];
191 191
192 size = ALIGN(size, L1_CACHE_BYTES); 192 size = ALIGN(size, L1_CACHE_BYTES);
193 193
194 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 194 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
195 return 0; 195 return 0;
196 196
197 node_remap_alloc_vaddr[nid] += size; 197 node_remap_alloc_vaddr[nid] += size;
198 memset(allocation, 0, size); 198 memset(allocation, 0, size);
199 199
200 return allocation; 200 return allocation;
201 } 201 }
202 202
203 void __init remap_numa_kva(void) 203 static void __init remap_numa_kva(void)
204 { 204 {
205 void *vaddr; 205 void *vaddr;
206 unsigned long pfn; 206 unsigned long pfn;
207 int node; 207 int node;
208 208
209 for_each_online_node(node) { 209 for_each_online_node(node) {
210 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); 210 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
211 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 211 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
212 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 212 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
213 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", 213 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
214 (unsigned long)vaddr, 214 (unsigned long)vaddr,
215 node_remap_start_pfn[node] + pfn); 215 node_remap_start_pfn[node] + pfn);
216 set_pmd_pfn((ulong) vaddr, 216 set_pmd_pfn((ulong) vaddr,
217 node_remap_start_pfn[node] + pfn, 217 node_remap_start_pfn[node] + pfn,
218 PAGE_KERNEL_LARGE); 218 PAGE_KERNEL_LARGE);
219 } 219 }
220 } 220 }
221 } 221 }
222 222
223 static unsigned long calculate_numa_remap_pages(void) 223 static unsigned long calculate_numa_remap_pages(void)
224 { 224 {
225 int nid; 225 int nid;
226 unsigned long size, reserve_pages = 0; 226 unsigned long size, reserve_pages = 0;
227 227
228 for_each_online_node(nid) { 228 for_each_online_node(nid) {
229 u64 node_kva_target; 229 u64 node_kva_target;
230 u64 node_kva_final; 230 u64 node_kva_final;
231 231
232 /* 232 /*
233 * The acpi/srat node info can show hot-add memroy zones 233 * The acpi/srat node info can show hot-add memroy zones
234 * where memory could be added but not currently present. 234 * where memory could be added but not currently present.
235 */ 235 */
236 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", 236 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
237 nid, node_start_pfn[nid], node_end_pfn[nid]); 237 nid, node_start_pfn[nid], node_end_pfn[nid]);
238 if (node_start_pfn[nid] > max_pfn) 238 if (node_start_pfn[nid] > max_pfn)
239 continue; 239 continue;
240 if (!node_end_pfn[nid]) 240 if (!node_end_pfn[nid])
241 continue; 241 continue;
242 if (node_end_pfn[nid] > max_pfn) 242 if (node_end_pfn[nid] > max_pfn)
243 node_end_pfn[nid] = max_pfn; 243 node_end_pfn[nid] = max_pfn;
244 244
245 /* ensure the remap includes space for the pgdat. */ 245 /* ensure the remap includes space for the pgdat. */
246 size = node_remap_size[nid] + sizeof(pg_data_t); 246 size = node_remap_size[nid] + sizeof(pg_data_t);
247 247
248 /* convert size to large (pmd size) pages, rounding up */ 248 /* convert size to large (pmd size) pages, rounding up */
249 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; 249 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
250 /* now the roundup is correct, convert to PAGE_SIZE pages */ 250 /* now the roundup is correct, convert to PAGE_SIZE pages */
251 size = size * PTRS_PER_PTE; 251 size = size * PTRS_PER_PTE;
252 252
253 node_kva_target = round_down(node_end_pfn[nid] - size, 253 node_kva_target = round_down(node_end_pfn[nid] - size,
254 PTRS_PER_PTE); 254 PTRS_PER_PTE);
255 node_kva_target <<= PAGE_SHIFT; 255 node_kva_target <<= PAGE_SHIFT;
256 do { 256 do {
257 node_kva_final = find_e820_area(node_kva_target, 257 node_kva_final = find_e820_area(node_kva_target,
258 ((u64)node_end_pfn[nid])<<PAGE_SHIFT, 258 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
259 ((u64)size)<<PAGE_SHIFT, 259 ((u64)size)<<PAGE_SHIFT,
260 LARGE_PAGE_BYTES); 260 LARGE_PAGE_BYTES);
261 node_kva_target -= LARGE_PAGE_BYTES; 261 node_kva_target -= LARGE_PAGE_BYTES;
262 } while (node_kva_final == -1ULL && 262 } while (node_kva_final == -1ULL &&
263 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); 263 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
264 264
265 if (node_kva_final == -1ULL) 265 if (node_kva_final == -1ULL)
266 panic("Can not get kva ram\n"); 266 panic("Can not get kva ram\n");
267 267
268 node_remap_size[nid] = size; 268 node_remap_size[nid] = size;
269 node_remap_offset[nid] = reserve_pages; 269 node_remap_offset[nid] = reserve_pages;
270 reserve_pages += size; 270 reserve_pages += size;
271 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" 271 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
272 " node %d at %llx\n", 272 " node %d at %llx\n",
273 size, nid, node_kva_final>>PAGE_SHIFT); 273 size, nid, node_kva_final>>PAGE_SHIFT);
274 274
275 /* 275 /*
276 * prevent kva address below max_low_pfn want it on system 276 * prevent kva address below max_low_pfn want it on system
277 * with less memory later. 277 * with less memory later.
278 * layout will be: KVA address , KVA RAM 278 * layout will be: KVA address , KVA RAM
279 * 279 *
280 * we are supposed to only record the one less then max_low_pfn 280 * we are supposed to only record the one less then max_low_pfn
281 * but we could have some hole in high memory, and it will only 281 * but we could have some hole in high memory, and it will only
282 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide 282 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
283 * to use it as free. 283 * to use it as free.
284 * So reserve_early here, hope we don't run out of that array 284 * So reserve_early here, hope we don't run out of that array
285 */ 285 */
286 reserve_early(node_kva_final, 286 reserve_early(node_kva_final,
287 node_kva_final+(((u64)size)<<PAGE_SHIFT), 287 node_kva_final+(((u64)size)<<PAGE_SHIFT),
288 "KVA RAM"); 288 "KVA RAM");
289 289
290 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; 290 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
291 remove_active_range(nid, node_remap_start_pfn[nid], 291 remove_active_range(nid, node_remap_start_pfn[nid],
292 node_remap_start_pfn[nid] + size); 292 node_remap_start_pfn[nid] + size);
293 } 293 }
294 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", 294 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
295 reserve_pages); 295 reserve_pages);
296 return reserve_pages; 296 return reserve_pages;
297 } 297 }
298 298
299 static void init_remap_allocator(int nid) 299 static void init_remap_allocator(int nid)
300 { 300 {
301 node_remap_start_vaddr[nid] = pfn_to_kaddr( 301 node_remap_start_vaddr[nid] = pfn_to_kaddr(
302 kva_start_pfn + node_remap_offset[nid]); 302 kva_start_pfn + node_remap_offset[nid]);
303 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + 303 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
304 (node_remap_size[nid] * PAGE_SIZE); 304 (node_remap_size[nid] * PAGE_SIZE);
305 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 305 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
306 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 306 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
307 307
308 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, 308 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
309 (ulong) node_remap_start_vaddr[nid], 309 (ulong) node_remap_start_vaddr[nid],
310 (ulong) node_remap_end_vaddr[nid]); 310 (ulong) node_remap_end_vaddr[nid]);
311 } 311 }
312 312
313 void __init initmem_init(unsigned long start_pfn, 313 void __init initmem_init(unsigned long start_pfn,
314 unsigned long end_pfn) 314 unsigned long end_pfn)
315 { 315 {
316 int nid; 316 int nid;
317 long kva_target_pfn; 317 long kva_target_pfn;
318 318
319 /* 319 /*
320 * When mapping a NUMA machine we allocate the node_mem_map arrays 320 * When mapping a NUMA machine we allocate the node_mem_map arrays
321 * from node local memory. They are then mapped directly into KVA 321 * from node local memory. They are then mapped directly into KVA
322 * between zone normal and vmalloc space. Calculate the size of 322 * between zone normal and vmalloc space. Calculate the size of
323 * this space and use it to adjust the boundary between ZONE_NORMAL 323 * this space and use it to adjust the boundary between ZONE_NORMAL
324 * and ZONE_HIGHMEM. 324 * and ZONE_HIGHMEM.
325 */ 325 */
326 326
327 remove_all_active_ranges(); 327 remove_all_active_ranges();
328 get_memcfg_numa(); 328 get_memcfg_numa();
329 329
330 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); 330 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
331 331
332 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 332 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
333 do { 333 do {
334 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, 334 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
335 max_low_pfn<<PAGE_SHIFT, 335 max_low_pfn<<PAGE_SHIFT,
336 kva_pages<<PAGE_SHIFT, 336 kva_pages<<PAGE_SHIFT,
337 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; 337 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
338 kva_target_pfn -= PTRS_PER_PTE; 338 kva_target_pfn -= PTRS_PER_PTE;
339 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); 339 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
340 340
341 if (kva_start_pfn == -1UL) 341 if (kva_start_pfn == -1UL)
342 panic("Can not get kva space\n"); 342 panic("Can not get kva space\n");
343 343
344 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", 344 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
345 kva_start_pfn, max_low_pfn); 345 kva_start_pfn, max_low_pfn);
346 printk(KERN_INFO "max_pfn = %lx\n", max_pfn); 346 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
347 347
348 /* avoid clash with initrd */ 348 /* avoid clash with initrd */
349 reserve_early(kva_start_pfn<<PAGE_SHIFT, 349 reserve_early(kva_start_pfn<<PAGE_SHIFT,
350 (kva_start_pfn + kva_pages)<<PAGE_SHIFT, 350 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
351 "KVA PG"); 351 "KVA PG");
352 #ifdef CONFIG_HIGHMEM 352 #ifdef CONFIG_HIGHMEM
353 highstart_pfn = highend_pfn = max_pfn; 353 highstart_pfn = highend_pfn = max_pfn;
354 if (max_pfn > max_low_pfn) 354 if (max_pfn > max_low_pfn)
355 highstart_pfn = max_low_pfn; 355 highstart_pfn = max_low_pfn;
356 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 356 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
357 pages_to_mb(highend_pfn - highstart_pfn)); 357 pages_to_mb(highend_pfn - highstart_pfn));
358 num_physpages = highend_pfn; 358 num_physpages = highend_pfn;
359 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 359 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
360 #else 360 #else
361 num_physpages = max_low_pfn; 361 num_physpages = max_low_pfn;
362 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 362 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
363 #endif 363 #endif
364 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 364 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
365 pages_to_mb(max_low_pfn)); 365 pages_to_mb(max_low_pfn));
366 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", 366 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
367 max_low_pfn, highstart_pfn); 367 max_low_pfn, highstart_pfn);
368 368
369 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 369 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
370 (ulong) pfn_to_kaddr(max_low_pfn)); 370 (ulong) pfn_to_kaddr(max_low_pfn));
371 for_each_online_node(nid) { 371 for_each_online_node(nid) {
372 init_remap_allocator(nid); 372 init_remap_allocator(nid);
373 373
374 allocate_pgdat(nid); 374 allocate_pgdat(nid);
375 } 375 }
376 remap_numa_kva();
377
376 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 378 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
377 (ulong) pfn_to_kaddr(highstart_pfn)); 379 (ulong) pfn_to_kaddr(highstart_pfn));
378 for_each_online_node(nid) 380 for_each_online_node(nid)
379 propagate_e820_map_node(nid); 381 propagate_e820_map_node(nid);
380 382
381 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 383 for_each_online_node(nid)
384 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
385
382 NODE_DATA(0)->bdata = &node0_bdata; 386 NODE_DATA(0)->bdata = &node0_bdata;
383 setup_bootmem_allocator(); 387 setup_bootmem_allocator();
384 } 388 }
385 389
386 void __init zone_sizes_init(void) 390 void __init zone_sizes_init(void)
387 { 391 {
388 unsigned long max_zone_pfns[MAX_NR_ZONES]; 392 unsigned long max_zone_pfns[MAX_NR_ZONES];
389 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 393 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
390 max_zone_pfns[ZONE_DMA] = 394 max_zone_pfns[ZONE_DMA] =
391 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 395 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
392 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 396 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
393 #ifdef CONFIG_HIGHMEM 397 #ifdef CONFIG_HIGHMEM
394 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 398 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
395 #endif 399 #endif
396 400
397 free_area_init_nodes(max_zone_pfns); 401 free_area_init_nodes(max_zone_pfns);
398 return; 402 return;
399 } 403 }
400 404
401 void __init set_highmem_pages_init(void) 405 void __init set_highmem_pages_init(void)
402 { 406 {
403 #ifdef CONFIG_HIGHMEM 407 #ifdef CONFIG_HIGHMEM
404 struct zone *zone; 408 struct zone *zone;
405 int nid; 409 int nid;
406 410
407 for_each_zone(zone) { 411 for_each_zone(zone) {
408 unsigned long zone_start_pfn, zone_end_pfn; 412 unsigned long zone_start_pfn, zone_end_pfn;
409 413
410 if (!is_highmem(zone)) 414 if (!is_highmem(zone))
411 continue; 415 continue;
412 416
413 zone_start_pfn = zone->zone_start_pfn; 417 zone_start_pfn = zone->zone_start_pfn;
414 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 418 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
415 419
416 nid = zone_to_nid(zone); 420 nid = zone_to_nid(zone);
417 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", 421 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
418 zone->name, nid, zone_start_pfn, zone_end_pfn); 422 zone->name, nid, zone_start_pfn, zone_end_pfn);
419 423
420 add_highpages_with_active_regions(nid, zone_start_pfn, 424 add_highpages_with_active_regions(nid, zone_start_pfn,
421 zone_end_pfn); 425 zone_end_pfn);
422 } 426 }
423 totalram_pages += totalhigh_pages; 427 totalram_pages += totalhigh_pages;
424 #endif 428 #endif
425 } 429 }
426 430
427 #ifdef CONFIG_MEMORY_HOTPLUG 431 #ifdef CONFIG_MEMORY_HOTPLUG
428 static int paddr_to_nid(u64 addr) 432 static int paddr_to_nid(u64 addr)
429 { 433 {
430 int nid; 434 int nid;
431 unsigned long pfn = PFN_DOWN(addr); 435 unsigned long pfn = PFN_DOWN(addr);
432 436
433 for_each_node(nid) 437 for_each_node(nid)
434 if (node_start_pfn[nid] <= pfn && 438 if (node_start_pfn[nid] <= pfn &&
435 pfn < node_end_pfn[nid]) 439 pfn < node_end_pfn[nid])
436 return nid; 440 return nid;
437 441
438 return -1; 442 return -1;
439 } 443 }
440 444
441 /* 445 /*
442 * This function is used to ask node id BEFORE memmap and mem_section's 446 * This function is used to ask node id BEFORE memmap and mem_section's
443 * initialization (pfn_to_nid() can't be used yet). 447 * initialization (pfn_to_nid() can't be used yet).
444 * If _PXM is not defined on ACPI's DSDT, node id must be found by this. 448 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
445 */ 449 */
446 int memory_add_physaddr_to_nid(u64 addr) 450 int memory_add_physaddr_to_nid(u64 addr)
447 { 451 {
448 int nid = paddr_to_nid(addr); 452 int nid = paddr_to_nid(addr);
449 return (nid >= 0) ? nid : 0; 453 return (nid >= 0) ? nid : 0;
450 } 454 }
451 455
452 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 456 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
453 #endif 457 #endif
454 458
455 #if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) 459 #if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT)
456 /* 460 /*
457 * Dummy on 32-bit, for now: 461 * Dummy on 32-bit, for now:
458 */ 462 */
459 void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 463 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
460 { 464 {
461 } 465 }
462 466
463 void __init 467 void __init
464 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) 468 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
465 { 469 {
466 } 470 }
467 471
468 void __init acpi_numa_arch_fixup(void) 472 void __init acpi_numa_arch_fixup(void)
469 { 473 {
470 } 474 }
471 #endif 475 #endif
472 476
arch/x86/mm/init_32.c
1 /* 1 /*
2 * 2 *
3 * Copyright (C) 1995 Linus Torvalds 3 * Copyright (C) 1995 Linus Torvalds
4 * 4 *
5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 */ 6 */
7 7
8 #include <linux/module.h> 8 #include <linux/module.h>
9 #include <linux/signal.h> 9 #include <linux/signal.h>
10 #include <linux/sched.h> 10 #include <linux/sched.h>
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/errno.h> 12 #include <linux/errno.h>
13 #include <linux/string.h> 13 #include <linux/string.h>
14 #include <linux/types.h> 14 #include <linux/types.h>
15 #include <linux/ptrace.h> 15 #include <linux/ptrace.h>
16 #include <linux/mman.h> 16 #include <linux/mman.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/hugetlb.h> 18 #include <linux/hugetlb.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/smp.h> 20 #include <linux/smp.h>
21 #include <linux/init.h> 21 #include <linux/init.h>
22 #include <linux/highmem.h> 22 #include <linux/highmem.h>
23 #include <linux/pagemap.h> 23 #include <linux/pagemap.h>
24 #include <linux/pfn.h> 24 #include <linux/pfn.h>
25 #include <linux/poison.h> 25 #include <linux/poison.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/proc_fs.h> 28 #include <linux/proc_fs.h>
29 #include <linux/memory_hotplug.h> 29 #include <linux/memory_hotplug.h>
30 #include <linux/initrd.h> 30 #include <linux/initrd.h>
31 #include <linux/cpumask.h> 31 #include <linux/cpumask.h>
32 32
33 #include <asm/asm.h> 33 #include <asm/asm.h>
34 #include <asm/processor.h> 34 #include <asm/processor.h>
35 #include <asm/system.h> 35 #include <asm/system.h>
36 #include <asm/uaccess.h> 36 #include <asm/uaccess.h>
37 #include <asm/pgtable.h> 37 #include <asm/pgtable.h>
38 #include <asm/dma.h> 38 #include <asm/dma.h>
39 #include <asm/fixmap.h> 39 #include <asm/fixmap.h>
40 #include <asm/e820.h> 40 #include <asm/e820.h>
41 #include <asm/apic.h> 41 #include <asm/apic.h>
42 #include <asm/bugs.h> 42 #include <asm/bugs.h>
43 #include <asm/tlb.h> 43 #include <asm/tlb.h>
44 #include <asm/tlbflush.h> 44 #include <asm/tlbflush.h>
45 #include <asm/pgalloc.h> 45 #include <asm/pgalloc.h>
46 #include <asm/sections.h> 46 #include <asm/sections.h>
47 #include <asm/paravirt.h> 47 #include <asm/paravirt.h>
48 #include <asm/setup.h> 48 #include <asm/setup.h>
49 #include <asm/cacheflush.h> 49 #include <asm/cacheflush.h>
50 50
51 unsigned int __VMALLOC_RESERVE = 128 << 20; 51 unsigned int __VMALLOC_RESERVE = 128 << 20;
52 52
53 unsigned long max_pfn_mapped; 53 unsigned long max_pfn_mapped;
54 54
55 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 55 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
56 unsigned long highstart_pfn, highend_pfn; 56 unsigned long highstart_pfn, highend_pfn;
57 57
58 static noinline int do_test_wp_bit(void); 58 static noinline int do_test_wp_bit(void);
59 59
60 60
61 static unsigned long __initdata table_start; 61 static unsigned long __initdata table_start;
62 static unsigned long __meminitdata table_end; 62 static unsigned long __meminitdata table_end;
63 static unsigned long __meminitdata table_top; 63 static unsigned long __meminitdata table_top;
64 64
65 static int __initdata after_init_bootmem; 65 static int __initdata after_init_bootmem;
66 66
67 static __init void *alloc_low_page(unsigned long *phys) 67 static __init void *alloc_low_page(unsigned long *phys)
68 { 68 {
69 unsigned long pfn = table_end++; 69 unsigned long pfn = table_end++;
70 void *adr; 70 void *adr;
71 71
72 if (pfn >= table_top) 72 if (pfn >= table_top)
73 panic("alloc_low_page: ran out of memory"); 73 panic("alloc_low_page: ran out of memory");
74 74
75 adr = __va(pfn * PAGE_SIZE); 75 adr = __va(pfn * PAGE_SIZE);
76 memset(adr, 0, PAGE_SIZE); 76 memset(adr, 0, PAGE_SIZE);
77 *phys = pfn * PAGE_SIZE; 77 *phys = pfn * PAGE_SIZE;
78 return adr; 78 return adr;
79 } 79 }
80 80
81 /* 81 /*
82 * Creates a middle page table and puts a pointer to it in the 82 * Creates a middle page table and puts a pointer to it in the
83 * given global directory entry. This only returns the gd entry 83 * given global directory entry. This only returns the gd entry
84 * in non-PAE compilation mode, since the middle layer is folded. 84 * in non-PAE compilation mode, since the middle layer is folded.
85 */ 85 */
86 static pmd_t * __init one_md_table_init(pgd_t *pgd) 86 static pmd_t * __init one_md_table_init(pgd_t *pgd)
87 { 87 {
88 pud_t *pud; 88 pud_t *pud;
89 pmd_t *pmd_table; 89 pmd_t *pmd_table;
90 90
91 #ifdef CONFIG_X86_PAE 91 #ifdef CONFIG_X86_PAE
92 unsigned long phys; 92 unsigned long phys;
93 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 93 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
94 if (after_init_bootmem) 94 if (after_init_bootmem)
95 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 95 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
96 else 96 else
97 pmd_table = (pmd_t *)alloc_low_page(&phys); 97 pmd_table = (pmd_t *)alloc_low_page(&phys);
98 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 98 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
99 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 99 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
100 pud = pud_offset(pgd, 0); 100 pud = pud_offset(pgd, 0);
101 BUG_ON(pmd_table != pmd_offset(pud, 0)); 101 BUG_ON(pmd_table != pmd_offset(pud, 0));
102 } 102 }
103 #endif 103 #endif
104 pud = pud_offset(pgd, 0); 104 pud = pud_offset(pgd, 0);
105 pmd_table = pmd_offset(pud, 0); 105 pmd_table = pmd_offset(pud, 0);
106 106
107 return pmd_table; 107 return pmd_table;
108 } 108 }
109 109
110 /* 110 /*
111 * Create a page table and place a pointer to it in a middle page 111 * Create a page table and place a pointer to it in a middle page
112 * directory entry: 112 * directory entry:
113 */ 113 */
114 static pte_t * __init one_page_table_init(pmd_t *pmd) 114 static pte_t * __init one_page_table_init(pmd_t *pmd)
115 { 115 {
116 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 116 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
117 pte_t *page_table = NULL; 117 pte_t *page_table = NULL;
118 118
119 if (after_init_bootmem) { 119 if (after_init_bootmem) {
120 #ifdef CONFIG_DEBUG_PAGEALLOC 120 #ifdef CONFIG_DEBUG_PAGEALLOC
121 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 121 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
122 #endif 122 #endif
123 if (!page_table) 123 if (!page_table)
124 page_table = 124 page_table =
125 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 125 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
126 } else { 126 } else {
127 unsigned long phys; 127 unsigned long phys;
128 page_table = (pte_t *)alloc_low_page(&phys); 128 page_table = (pte_t *)alloc_low_page(&phys);
129 } 129 }
130 130
131 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 131 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
132 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 132 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
133 BUG_ON(page_table != pte_offset_kernel(pmd, 0)); 133 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
134 } 134 }
135 135
136 return pte_offset_kernel(pmd, 0); 136 return pte_offset_kernel(pmd, 0);
137 } 137 }
138 138
139 /* 139 /*
140 * This function initializes a certain range of kernel virtual memory 140 * This function initializes a certain range of kernel virtual memory
141 * with new bootmem page tables, everywhere page tables are missing in 141 * with new bootmem page tables, everywhere page tables are missing in
142 * the given range. 142 * the given range.
143 * 143 *
144 * NOTE: The pagetables are allocated contiguous on the physical space 144 * NOTE: The pagetables are allocated contiguous on the physical space
145 * so we can cache the place of the first one and move around without 145 * so we can cache the place of the first one and move around without
146 * checking the pgd every time. 146 * checking the pgd every time.
147 */ 147 */
148 static void __init 148 static void __init
149 page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) 149 page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
150 { 150 {
151 int pgd_idx, pmd_idx; 151 int pgd_idx, pmd_idx;
152 unsigned long vaddr; 152 unsigned long vaddr;
153 pgd_t *pgd; 153 pgd_t *pgd;
154 pmd_t *pmd; 154 pmd_t *pmd;
155 155
156 vaddr = start; 156 vaddr = start;
157 pgd_idx = pgd_index(vaddr); 157 pgd_idx = pgd_index(vaddr);
158 pmd_idx = pmd_index(vaddr); 158 pmd_idx = pmd_index(vaddr);
159 pgd = pgd_base + pgd_idx; 159 pgd = pgd_base + pgd_idx;
160 160
161 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { 161 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
162 pmd = one_md_table_init(pgd); 162 pmd = one_md_table_init(pgd);
163 pmd = pmd + pmd_index(vaddr); 163 pmd = pmd + pmd_index(vaddr);
164 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); 164 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
165 pmd++, pmd_idx++) { 165 pmd++, pmd_idx++) {
166 one_page_table_init(pmd); 166 one_page_table_init(pmd);
167 167
168 vaddr += PMD_SIZE; 168 vaddr += PMD_SIZE;
169 } 169 }
170 pmd_idx = 0; 170 pmd_idx = 0;
171 } 171 }
172 } 172 }
173 173
174 static inline int is_kernel_text(unsigned long addr) 174 static inline int is_kernel_text(unsigned long addr)
175 { 175 {
176 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 176 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
177 return 1; 177 return 1;
178 return 0; 178 return 0;
179 } 179 }
180 180
181 /* 181 /*
182 * This maps the physical memory to kernel virtual address space, a total 182 * This maps the physical memory to kernel virtual address space, a total
183 * of max_low_pfn pages, by creating page tables starting from address 183 * of max_low_pfn pages, by creating page tables starting from address
184 * PAGE_OFFSET: 184 * PAGE_OFFSET:
185 */ 185 */
186 static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 186 static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
187 unsigned long start, 187 unsigned long start,
188 unsigned long end) 188 unsigned long end)
189 { 189 {
190 int pgd_idx, pmd_idx, pte_ofs; 190 int pgd_idx, pmd_idx, pte_ofs;
191 unsigned long pfn; 191 unsigned long pfn;
192 pgd_t *pgd; 192 pgd_t *pgd;
193 pmd_t *pmd; 193 pmd_t *pmd;
194 pte_t *pte; 194 pte_t *pte;
195 unsigned pages_2m = 0, pages_4k = 0; 195 unsigned pages_2m = 0, pages_4k = 0;
196 unsigned limit_pfn = end >> PAGE_SHIFT; 196 unsigned limit_pfn = end >> PAGE_SHIFT;
197 197
198 pgd_idx = pgd_index(PAGE_OFFSET); 198 pgd_idx = pgd_index(PAGE_OFFSET);
199 pgd = pgd_base + pgd_idx; 199 pgd = pgd_base + pgd_idx;
200 pfn = start >> PAGE_SHIFT; 200 pfn = start >> PAGE_SHIFT;
201 201
202 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 202 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
203 pmd = one_md_table_init(pgd); 203 pmd = one_md_table_init(pgd);
204 if (pfn >= limit_pfn) 204 if (pfn >= limit_pfn)
205 continue; 205 continue;
206 206
207 for (pmd_idx = 0; 207 for (pmd_idx = 0;
208 pmd_idx < PTRS_PER_PMD && pfn < limit_pfn; 208 pmd_idx < PTRS_PER_PMD && pfn < limit_pfn;
209 pmd++, pmd_idx++) { 209 pmd++, pmd_idx++) {
210 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 210 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
211 211
212 /* 212 /*
213 * Map with big pages if possible, otherwise 213 * Map with big pages if possible, otherwise
214 * create normal page tables: 214 * create normal page tables:
215 * 215 *
216 * Don't use a large page for the first 2/4MB of memory 216 * Don't use a large page for the first 2/4MB of memory
217 * because there are often fixed size MTRRs in there 217 * because there are often fixed size MTRRs in there
218 * and overlapping MTRRs into large pages can cause 218 * and overlapping MTRRs into large pages can cause
219 * slowdowns. 219 * slowdowns.
220 */ 220 */
221 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 221 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
222 unsigned int addr2; 222 unsigned int addr2;
223 pgprot_t prot = PAGE_KERNEL_LARGE; 223 pgprot_t prot = PAGE_KERNEL_LARGE;
224 224
225 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 225 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
226 PAGE_OFFSET + PAGE_SIZE-1; 226 PAGE_OFFSET + PAGE_SIZE-1;
227 227
228 if (is_kernel_text(addr) || 228 if (is_kernel_text(addr) ||
229 is_kernel_text(addr2)) 229 is_kernel_text(addr2))
230 prot = PAGE_KERNEL_LARGE_EXEC; 230 prot = PAGE_KERNEL_LARGE_EXEC;
231 231
232 pages_2m++; 232 pages_2m++;
233 set_pmd(pmd, pfn_pmd(pfn, prot)); 233 set_pmd(pmd, pfn_pmd(pfn, prot));
234 234
235 pfn += PTRS_PER_PTE; 235 pfn += PTRS_PER_PTE;
236 max_pfn_mapped = pfn; 236 max_pfn_mapped = pfn;
237 continue; 237 continue;
238 } 238 }
239 pte = one_page_table_init(pmd); 239 pte = one_page_table_init(pmd);
240 240
241 for (pte_ofs = 0; 241 for (pte_ofs = 0;
242 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 242 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
243 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 243 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
244 pgprot_t prot = PAGE_KERNEL; 244 pgprot_t prot = PAGE_KERNEL;
245 245
246 if (is_kernel_text(addr)) 246 if (is_kernel_text(addr))
247 prot = PAGE_KERNEL_EXEC; 247 prot = PAGE_KERNEL_EXEC;
248 248
249 pages_4k++; 249 pages_4k++;
250 set_pte(pte, pfn_pte(pfn, prot)); 250 set_pte(pte, pfn_pte(pfn, prot));
251 } 251 }
252 max_pfn_mapped = pfn; 252 max_pfn_mapped = pfn;
253 } 253 }
254 } 254 }
255 update_page_count(PG_LEVEL_2M, pages_2m); 255 update_page_count(PG_LEVEL_2M, pages_2m);
256 update_page_count(PG_LEVEL_4K, pages_4k); 256 update_page_count(PG_LEVEL_4K, pages_4k);
257 } 257 }
258 258
259 /* 259 /*
260 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 260 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
261 * is valid. The argument is a physical page number. 261 * is valid. The argument is a physical page number.
262 * 262 *
263 * 263 *
264 * On x86, access has to be given to the first megabyte of ram because that area 264 * On x86, access has to be given to the first megabyte of ram because that area
265 * contains bios code and data regions used by X and dosemu and similar apps. 265 * contains bios code and data regions used by X and dosemu and similar apps.
266 * Access has to be given to non-kernel-ram areas as well, these contain the PCI 266 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
267 * mmio resources as well as potential bios/acpi data regions. 267 * mmio resources as well as potential bios/acpi data regions.
268 */ 268 */
269 int devmem_is_allowed(unsigned long pagenr) 269 int devmem_is_allowed(unsigned long pagenr)
270 { 270 {
271 if (pagenr <= 256) 271 if (pagenr <= 256)
272 return 1; 272 return 1;
273 if (!page_is_ram(pagenr)) 273 if (!page_is_ram(pagenr))
274 return 1; 274 return 1;
275 return 0; 275 return 0;
276 } 276 }
277 277
278 #ifdef CONFIG_HIGHMEM 278 #ifdef CONFIG_HIGHMEM
279 pte_t *kmap_pte; 279 pte_t *kmap_pte;
280 pgprot_t kmap_prot; 280 pgprot_t kmap_prot;
281 281
282 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) 282 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
283 { 283 {
284 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), 284 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
285 vaddr), vaddr), vaddr); 285 vaddr), vaddr), vaddr);
286 } 286 }
287 287
288 static void __init kmap_init(void) 288 static void __init kmap_init(void)
289 { 289 {
290 unsigned long kmap_vstart; 290 unsigned long kmap_vstart;
291 291
292 /* 292 /*
293 * Cache the first kmap pte: 293 * Cache the first kmap pte:
294 */ 294 */
295 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); 295 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
296 kmap_pte = kmap_get_fixmap_pte(kmap_vstart); 296 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
297 297
298 kmap_prot = PAGE_KERNEL; 298 kmap_prot = PAGE_KERNEL;
299 } 299 }
300 300
301 static void __init permanent_kmaps_init(pgd_t *pgd_base) 301 static void __init permanent_kmaps_init(pgd_t *pgd_base)
302 { 302 {
303 unsigned long vaddr; 303 unsigned long vaddr;
304 pgd_t *pgd; 304 pgd_t *pgd;
305 pud_t *pud; 305 pud_t *pud;
306 pmd_t *pmd; 306 pmd_t *pmd;
307 pte_t *pte; 307 pte_t *pte;
308 308
309 vaddr = PKMAP_BASE; 309 vaddr = PKMAP_BASE;
310 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); 310 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
311 311
312 pgd = swapper_pg_dir + pgd_index(vaddr); 312 pgd = swapper_pg_dir + pgd_index(vaddr);
313 pud = pud_offset(pgd, vaddr); 313 pud = pud_offset(pgd, vaddr);
314 pmd = pmd_offset(pud, vaddr); 314 pmd = pmd_offset(pud, vaddr);
315 pte = pte_offset_kernel(pmd, vaddr); 315 pte = pte_offset_kernel(pmd, vaddr);
316 pkmap_page_table = pte; 316 pkmap_page_table = pte;
317 } 317 }
318 318
319 static void __init add_one_highpage_init(struct page *page, int pfn) 319 static void __init add_one_highpage_init(struct page *page, int pfn)
320 { 320 {
321 ClearPageReserved(page); 321 ClearPageReserved(page);
322 init_page_count(page); 322 init_page_count(page);
323 __free_page(page); 323 __free_page(page);
324 totalhigh_pages++; 324 totalhigh_pages++;
325 } 325 }
326 326
327 struct add_highpages_data { 327 struct add_highpages_data {
328 unsigned long start_pfn; 328 unsigned long start_pfn;
329 unsigned long end_pfn; 329 unsigned long end_pfn;
330 }; 330 };
331 331
332 static int __init add_highpages_work_fn(unsigned long start_pfn, 332 static int __init add_highpages_work_fn(unsigned long start_pfn,
333 unsigned long end_pfn, void *datax) 333 unsigned long end_pfn, void *datax)
334 { 334 {
335 int node_pfn; 335 int node_pfn;
336 struct page *page; 336 struct page *page;
337 unsigned long final_start_pfn, final_end_pfn; 337 unsigned long final_start_pfn, final_end_pfn;
338 struct add_highpages_data *data; 338 struct add_highpages_data *data;
339 339
340 data = (struct add_highpages_data *)datax; 340 data = (struct add_highpages_data *)datax;
341 341
342 final_start_pfn = max(start_pfn, data->start_pfn); 342 final_start_pfn = max(start_pfn, data->start_pfn);
343 final_end_pfn = min(end_pfn, data->end_pfn); 343 final_end_pfn = min(end_pfn, data->end_pfn);
344 if (final_start_pfn >= final_end_pfn) 344 if (final_start_pfn >= final_end_pfn)
345 return 0; 345 return 0;
346 346
347 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; 347 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
348 node_pfn++) { 348 node_pfn++) {
349 if (!pfn_valid(node_pfn)) 349 if (!pfn_valid(node_pfn))
350 continue; 350 continue;
351 page = pfn_to_page(node_pfn); 351 page = pfn_to_page(node_pfn);
352 add_one_highpage_init(page, node_pfn); 352 add_one_highpage_init(page, node_pfn);
353 } 353 }
354 354
355 return 0; 355 return 0;
356 356
357 } 357 }
358 358
359 void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, 359 void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
360 unsigned long end_pfn) 360 unsigned long end_pfn)
361 { 361 {
362 struct add_highpages_data data; 362 struct add_highpages_data data;
363 363
364 data.start_pfn = start_pfn; 364 data.start_pfn = start_pfn;
365 data.end_pfn = end_pfn; 365 data.end_pfn = end_pfn;
366 366
367 work_with_active_regions(nid, add_highpages_work_fn, &data); 367 work_with_active_regions(nid, add_highpages_work_fn, &data);
368 } 368 }
369 369
370 #ifndef CONFIG_NUMA 370 #ifndef CONFIG_NUMA
371 static void __init set_highmem_pages_init(void) 371 static void __init set_highmem_pages_init(void)
372 { 372 {
373 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); 373 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
374 374
375 totalram_pages += totalhigh_pages; 375 totalram_pages += totalhigh_pages;
376 } 376 }
377 #endif /* !CONFIG_NUMA */ 377 #endif /* !CONFIG_NUMA */
378 378
379 #else 379 #else
380 # define kmap_init() do { } while (0) 380 # define kmap_init() do { } while (0)
381 # define permanent_kmaps_init(pgd_base) do { } while (0) 381 # define permanent_kmaps_init(pgd_base) do { } while (0)
382 # define set_highmem_pages_init() do { } while (0) 382 # define set_highmem_pages_init() do { } while (0)
383 #endif /* CONFIG_HIGHMEM */ 383 #endif /* CONFIG_HIGHMEM */
384 384
385 pteval_t __PAGE_KERNEL = _PAGE_KERNEL; 385 pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
386 EXPORT_SYMBOL(__PAGE_KERNEL); 386 EXPORT_SYMBOL(__PAGE_KERNEL);
387 387
388 pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; 388 pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
389 389
390 void __init native_pagetable_setup_start(pgd_t *base) 390 void __init native_pagetable_setup_start(pgd_t *base)
391 { 391 {
392 unsigned long pfn, va; 392 unsigned long pfn, va;
393 pgd_t *pgd; 393 pgd_t *pgd;
394 pud_t *pud; 394 pud_t *pud;
395 pmd_t *pmd; 395 pmd_t *pmd;
396 pte_t *pte; 396 pte_t *pte;
397 397
398 /* 398 /*
399 * Remove any mappings which extend past the end of physical 399 * Remove any mappings which extend past the end of physical
400 * memory from the boot time page table: 400 * memory from the boot time page table:
401 */ 401 */
402 for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { 402 for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
403 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); 403 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
404 pgd = base + pgd_index(va); 404 pgd = base + pgd_index(va);
405 if (!pgd_present(*pgd)) 405 if (!pgd_present(*pgd))
406 break; 406 break;
407 407
408 pud = pud_offset(pgd, va); 408 pud = pud_offset(pgd, va);
409 pmd = pmd_offset(pud, va); 409 pmd = pmd_offset(pud, va);
410 if (!pmd_present(*pmd)) 410 if (!pmd_present(*pmd))
411 break; 411 break;
412 412
413 pte = pte_offset_kernel(pmd, va); 413 pte = pte_offset_kernel(pmd, va);
414 if (!pte_present(*pte)) 414 if (!pte_present(*pte))
415 break; 415 break;
416 416
417 pte_clear(NULL, va, pte); 417 pte_clear(NULL, va, pte);
418 } 418 }
419 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); 419 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
420 } 420 }
421 421
422 void __init native_pagetable_setup_done(pgd_t *base) 422 void __init native_pagetable_setup_done(pgd_t *base)
423 { 423 {
424 } 424 }
425 425
426 /* 426 /*
427 * Build a proper pagetable for the kernel mappings. Up until this 427 * Build a proper pagetable for the kernel mappings. Up until this
428 * point, we've been running on some set of pagetables constructed by 428 * point, we've been running on some set of pagetables constructed by
429 * the boot process. 429 * the boot process.
430 * 430 *
431 * If we're booting on native hardware, this will be a pagetable 431 * If we're booting on native hardware, this will be a pagetable
432 * constructed in arch/x86/kernel/head_32.S. The root of the 432 * constructed in arch/x86/kernel/head_32.S. The root of the
433 * pagetable will be swapper_pg_dir. 433 * pagetable will be swapper_pg_dir.
434 * 434 *
435 * If we're booting paravirtualized under a hypervisor, then there are 435 * If we're booting paravirtualized under a hypervisor, then there are
436 * more options: we may already be running PAE, and the pagetable may 436 * more options: we may already be running PAE, and the pagetable may
437 * or may not be based in swapper_pg_dir. In any case, 437 * or may not be based in swapper_pg_dir. In any case,
438 * paravirt_pagetable_setup_start() will set up swapper_pg_dir 438 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
439 * appropriately for the rest of the initialization to work. 439 * appropriately for the rest of the initialization to work.
440 * 440 *
441 * In general, pagetable_init() assumes that the pagetable may already 441 * In general, pagetable_init() assumes that the pagetable may already
442 * be partially populated, and so it avoids stomping on any existing 442 * be partially populated, and so it avoids stomping on any existing
443 * mappings. 443 * mappings.
444 */ 444 */
445 static void __init pagetable_init(void) 445 static void __init pagetable_init(void)
446 { 446 {
447 pgd_t *pgd_base = swapper_pg_dir; 447 pgd_t *pgd_base = swapper_pg_dir;
448 unsigned long vaddr, end; 448 unsigned long vaddr, end;
449 449
450 paravirt_pagetable_setup_start(pgd_base); 450 paravirt_pagetable_setup_start(pgd_base);
451 451
452 remap_numa_kva();
453 /* 452 /*
454 * Fixed mappings, only the page table structure has to be 453 * Fixed mappings, only the page table structure has to be
455 * created - mappings will be set by set_fixmap(): 454 * created - mappings will be set by set_fixmap():
456 */ 455 */
457 early_ioremap_clear(); 456 early_ioremap_clear();
458 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; 457 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
459 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 458 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
460 page_table_range_init(vaddr, end, pgd_base); 459 page_table_range_init(vaddr, end, pgd_base);
461 early_ioremap_reset(); 460 early_ioremap_reset();
462 461
463 permanent_kmaps_init(pgd_base); 462 permanent_kmaps_init(pgd_base);
464 463
465 paravirt_pagetable_setup_done(pgd_base); 464 paravirt_pagetable_setup_done(pgd_base);
466 } 465 }
467 466
468 #ifdef CONFIG_ACPI_SLEEP 467 #ifdef CONFIG_ACPI_SLEEP
469 /* 468 /*
470 * ACPI suspend needs this for resume, because things like the intel-agp 469 * ACPI suspend needs this for resume, because things like the intel-agp
471 * driver might have split up a kernel 4MB mapping. 470 * driver might have split up a kernel 4MB mapping.
472 */ 471 */
473 char swsusp_pg_dir[PAGE_SIZE] 472 char swsusp_pg_dir[PAGE_SIZE]
474 __attribute__ ((aligned(PAGE_SIZE))); 473 __attribute__ ((aligned(PAGE_SIZE)));
475 474
476 static inline void save_pg_dir(void) 475 static inline void save_pg_dir(void)
477 { 476 {
478 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); 477 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
479 } 478 }
480 #else /* !CONFIG_ACPI_SLEEP */ 479 #else /* !CONFIG_ACPI_SLEEP */
481 static inline void save_pg_dir(void) 480 static inline void save_pg_dir(void)
482 { 481 {
483 } 482 }
484 #endif /* !CONFIG_ACPI_SLEEP */ 483 #endif /* !CONFIG_ACPI_SLEEP */
485 484
486 void zap_low_mappings(void) 485 void zap_low_mappings(void)
487 { 486 {
488 int i; 487 int i;
489 488
490 /* 489 /*
491 * Zap initial low-memory mappings. 490 * Zap initial low-memory mappings.
492 * 491 *
493 * Note that "pgd_clear()" doesn't do it for 492 * Note that "pgd_clear()" doesn't do it for
494 * us, because pgd_clear() is a no-op on i386. 493 * us, because pgd_clear() is a no-op on i386.
495 */ 494 */
496 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { 495 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
497 #ifdef CONFIG_X86_PAE 496 #ifdef CONFIG_X86_PAE
498 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); 497 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
499 #else 498 #else
500 set_pgd(swapper_pg_dir+i, __pgd(0)); 499 set_pgd(swapper_pg_dir+i, __pgd(0));
501 #endif 500 #endif
502 } 501 }
503 flush_tlb_all(); 502 flush_tlb_all();
504 } 503 }
505 504
506 int nx_enabled; 505 int nx_enabled;
507 506
508 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 507 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
509 EXPORT_SYMBOL_GPL(__supported_pte_mask); 508 EXPORT_SYMBOL_GPL(__supported_pte_mask);
510 509
511 #ifdef CONFIG_X86_PAE 510 #ifdef CONFIG_X86_PAE
512 511
513 static int disable_nx __initdata; 512 static int disable_nx __initdata;
514 513
515 /* 514 /*
516 * noexec = on|off 515 * noexec = on|off
517 * 516 *
518 * Control non executable mappings. 517 * Control non executable mappings.
519 * 518 *
520 * on Enable 519 * on Enable
521 * off Disable 520 * off Disable
522 */ 521 */
523 static int __init noexec_setup(char *str) 522 static int __init noexec_setup(char *str)
524 { 523 {
525 if (!str || !strcmp(str, "on")) { 524 if (!str || !strcmp(str, "on")) {
526 if (cpu_has_nx) { 525 if (cpu_has_nx) {
527 __supported_pte_mask |= _PAGE_NX; 526 __supported_pte_mask |= _PAGE_NX;
528 disable_nx = 0; 527 disable_nx = 0;
529 } 528 }
530 } else { 529 } else {
531 if (!strcmp(str, "off")) { 530 if (!strcmp(str, "off")) {
532 disable_nx = 1; 531 disable_nx = 1;
533 __supported_pte_mask &= ~_PAGE_NX; 532 __supported_pte_mask &= ~_PAGE_NX;
534 } else { 533 } else {
535 return -EINVAL; 534 return -EINVAL;
536 } 535 }
537 } 536 }
538 537
539 return 0; 538 return 0;
540 } 539 }
541 early_param("noexec", noexec_setup); 540 early_param("noexec", noexec_setup);
542 541
543 static void __init set_nx(void) 542 static void __init set_nx(void)
544 { 543 {
545 unsigned int v[4], l, h; 544 unsigned int v[4], l, h;
546 545
547 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 546 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
548 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 547 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
549 548
550 if ((v[3] & (1 << 20)) && !disable_nx) { 549 if ((v[3] & (1 << 20)) && !disable_nx) {
551 rdmsr(MSR_EFER, l, h); 550 rdmsr(MSR_EFER, l, h);
552 l |= EFER_NX; 551 l |= EFER_NX;
553 wrmsr(MSR_EFER, l, h); 552 wrmsr(MSR_EFER, l, h);
554 nx_enabled = 1; 553 nx_enabled = 1;
555 __supported_pte_mask |= _PAGE_NX; 554 __supported_pte_mask |= _PAGE_NX;
556 } 555 }
557 } 556 }
558 } 557 }
559 #endif 558 #endif
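
set_nx() above keys off bit 20 of EDX from CPUID leaf 0x80000001, which advertises the NX (Execute Disable) capability, and only sets EFER.NX when the user has not passed noexec=off. A small userspace sketch of the same feature probe, using GCC's <cpuid.h> purely for illustration:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* __get_cpuid() checks the maximum extended leaf for us */
        if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) &&
            (edx & (1u << 20)))
                printf("CPU advertises NX (Execute Disable)\n");
        else
                printf("no NX support reported\n");
        return 0;
}
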
560 559
561 /* user-defined highmem size */ 560 /* user-defined highmem size */
562 static unsigned int highmem_pages = -1; 561 static unsigned int highmem_pages = -1;
563 562
564 /* 563 /*
565 * highmem=size forces highmem to be exactly 'size' bytes. 564 * highmem=size forces highmem to be exactly 'size' bytes.
566 * This works even on boxes that have no highmem otherwise. 565 * This works even on boxes that have no highmem otherwise.
567 * This also works to reduce highmem size on bigger boxes. 566 * This also works to reduce highmem size on bigger boxes.
568 */ 567 */
569 static int __init parse_highmem(char *arg) 568 static int __init parse_highmem(char *arg)
570 { 569 {
571 if (!arg) 570 if (!arg)
572 return -EINVAL; 571 return -EINVAL;
573 572
574 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; 573 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
575 return 0; 574 return 0;
576 } 575 }
577 early_param("highmem", parse_highmem); 576 early_param("highmem", parse_highmem);
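
For example, booting with highmem=512M makes memparse() return 512 MiB, which parse_highmem() converts to a page count; with 4 KiB pages that is 131072 pages. A hedged userspace restatement of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long bytes = 512UL << 20;      /* what memparse("512M") would yield */
        unsigned int page_shift = 12;           /* PAGE_SHIFT on x86-32 */

        printf("highmem_pages = %lu\n", bytes >> page_shift);  /* 131072 */
        return 0;
}
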
578 577
579 /* 578 /*
580 * Determine low and high memory ranges: 579 * Determine low and high memory ranges:
581 */ 580 */
582 void __init find_low_pfn_range(void) 581 void __init find_low_pfn_range(void)
583 { 582 {
584 /* it could update max_pfn */ 583 /* it could update max_pfn */
585 584
586 /* max_low_pfn is 0, we already have early_res support */ 585 /* max_low_pfn is 0, we already have early_res support */
587 586
588 max_low_pfn = max_pfn; 587 max_low_pfn = max_pfn;
589 if (max_low_pfn > MAXMEM_PFN) { 588 if (max_low_pfn > MAXMEM_PFN) {
590 if (highmem_pages == -1) 589 if (highmem_pages == -1)
591 highmem_pages = max_pfn - MAXMEM_PFN; 590 highmem_pages = max_pfn - MAXMEM_PFN;
592 if (highmem_pages + MAXMEM_PFN < max_pfn) 591 if (highmem_pages + MAXMEM_PFN < max_pfn)
593 max_pfn = MAXMEM_PFN + highmem_pages; 592 max_pfn = MAXMEM_PFN + highmem_pages;
594 if (highmem_pages + MAXMEM_PFN > max_pfn) { 593 if (highmem_pages + MAXMEM_PFN > max_pfn) {
595 printk(KERN_WARNING "only %luMB highmem pages " 594 printk(KERN_WARNING "only %luMB highmem pages "
596 "available, ignoring highmem size of %uMB.\n", 595 "available, ignoring highmem size of %uMB.\n",
597 pages_to_mb(max_pfn - MAXMEM_PFN), 596 pages_to_mb(max_pfn - MAXMEM_PFN),
598 pages_to_mb(highmem_pages)); 597 pages_to_mb(highmem_pages));
599 highmem_pages = 0; 598 highmem_pages = 0;
600 } 599 }
601 max_low_pfn = MAXMEM_PFN; 600 max_low_pfn = MAXMEM_PFN;
602 #ifndef CONFIG_HIGHMEM 601 #ifndef CONFIG_HIGHMEM
603 /* Maximum memory usable is what is directly addressable */ 602 /* Maximum memory usable is what is directly addressable */
604 printk(KERN_WARNING "Warning only %ldMB will be used.\n", 603 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
605 MAXMEM>>20); 604 MAXMEM>>20);
606 if (max_pfn > MAX_NONPAE_PFN) 605 if (max_pfn > MAX_NONPAE_PFN)
607 printk(KERN_WARNING 606 printk(KERN_WARNING
608 "Use a HIGHMEM64G enabled kernel.\n"); 607 "Use a HIGHMEM64G enabled kernel.\n");
609 else 608 else
610 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); 609 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
611 max_pfn = MAXMEM_PFN; 610 max_pfn = MAXMEM_PFN;
612 #else /* !CONFIG_HIGHMEM */ 611 #else /* !CONFIG_HIGHMEM */
613 #ifndef CONFIG_HIGHMEM64G 612 #ifndef CONFIG_HIGHMEM64G
614 if (max_pfn > MAX_NONPAE_PFN) { 613 if (max_pfn > MAX_NONPAE_PFN) {
615 max_pfn = MAX_NONPAE_PFN; 614 max_pfn = MAX_NONPAE_PFN;
616 printk(KERN_WARNING "Warning only 4GB will be used." 615 printk(KERN_WARNING "Warning only 4GB will be used."
617 "Use a HIGHMEM64G enabled kernel.\n"); 616 "Use a HIGHMEM64G enabled kernel.\n");
618 } 617 }
619 #endif /* !CONFIG_HIGHMEM64G */ 618 #endif /* !CONFIG_HIGHMEM64G */
620 #endif /* !CONFIG_HIGHMEM */ 619 #endif /* !CONFIG_HIGHMEM */
621 } else { 620 } else {
622 if (highmem_pages == -1) 621 if (highmem_pages == -1)
623 highmem_pages = 0; 622 highmem_pages = 0;
624 #ifdef CONFIG_HIGHMEM 623 #ifdef CONFIG_HIGHMEM
625 if (highmem_pages >= max_pfn) { 624 if (highmem_pages >= max_pfn) {
626 printk(KERN_ERR "highmem size specified (%uMB) is " 625 printk(KERN_ERR "highmem size specified (%uMB) is "
627 "bigger than pages available (%luMB)!.\n", 626 "bigger than pages available (%luMB)!.\n",
628 pages_to_mb(highmem_pages), 627 pages_to_mb(highmem_pages),
629 pages_to_mb(max_pfn)); 628 pages_to_mb(max_pfn));
630 highmem_pages = 0; 629 highmem_pages = 0;
631 } 630 }
632 if (highmem_pages) { 631 if (highmem_pages) {
633 if (max_low_pfn - highmem_pages < 632 if (max_low_pfn - highmem_pages <
634 64*1024*1024/PAGE_SIZE){ 633 64*1024*1024/PAGE_SIZE){
635 printk(KERN_ERR "highmem size %uMB results in " 634 printk(KERN_ERR "highmem size %uMB results in "
636 "smaller than 64MB lowmem, ignoring it.\n" 635 "smaller than 64MB lowmem, ignoring it.\n"
637 , pages_to_mb(highmem_pages)); 636 , pages_to_mb(highmem_pages));
638 highmem_pages = 0; 637 highmem_pages = 0;
639 } 638 }
640 max_low_pfn -= highmem_pages; 639 max_low_pfn -= highmem_pages;
641 } 640 }
642 #else 641 #else
643 if (highmem_pages) 642 if (highmem_pages)
644 printk(KERN_ERR "ignoring highmem size on non-highmem" 643 printk(KERN_ERR "ignoring highmem size on non-highmem"
645 " kernel!\n"); 644 " kernel!\n");
646 #endif 645 #endif
647 } 646 }
648 } 647 }
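
As a worked example: on a default 32-bit configuration MAXMEM_PFN corresponds to roughly 896 MiB of directly mapped lowmem, so a 2 GiB machine with highmem= unset ends up with about 896 MiB of lowmem and the remaining ~1152 MiB as highmem. A small sketch of that arithmetic (the 896 MiB figure is an assumption tied to the usual 3G/1G split):

#include <stdio.h>

int main(void)
{
        unsigned long max_pfn     = (2UL << 30) >> 12;    /* 2 GiB of RAM, 4 KiB pages */
        unsigned long maxmem_pfn  = (896UL << 20) >> 12;  /* assumed lowmem limit      */
        unsigned long highmem_pfn = max_pfn - maxmem_pfn;

        printf("lowmem  = %lu MB\n", (maxmem_pfn  << 12) >> 20);   /* 896  */
        printf("highmem = %lu MB\n", (highmem_pfn << 12) >> 20);   /* 1152 */
        return 0;
}
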
649 648
650 #ifndef CONFIG_NEED_MULTIPLE_NODES 649 #ifndef CONFIG_NEED_MULTIPLE_NODES
651 void __init initmem_init(unsigned long start_pfn, 650 void __init initmem_init(unsigned long start_pfn,
652 unsigned long end_pfn) 651 unsigned long end_pfn)
653 { 652 {
654 #ifdef CONFIG_HIGHMEM 653 #ifdef CONFIG_HIGHMEM
655 highstart_pfn = highend_pfn = max_pfn; 654 highstart_pfn = highend_pfn = max_pfn;
656 if (max_pfn > max_low_pfn) 655 if (max_pfn > max_low_pfn)
657 highstart_pfn = max_low_pfn; 656 highstart_pfn = max_low_pfn;
658 memory_present(0, 0, highend_pfn); 657 memory_present(0, 0, highend_pfn);
659 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 658 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
660 pages_to_mb(highend_pfn - highstart_pfn)); 659 pages_to_mb(highend_pfn - highstart_pfn));
661 num_physpages = highend_pfn; 660 num_physpages = highend_pfn;
662 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 661 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
663 #else 662 #else
664 memory_present(0, 0, max_low_pfn); 663 memory_present(0, 0, max_low_pfn);
665 num_physpages = max_low_pfn; 664 num_physpages = max_low_pfn;
666 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
667 #endif 666 #endif
668 #ifdef CONFIG_FLATMEM 667 #ifdef CONFIG_FLATMEM
669 max_mapnr = num_physpages; 668 max_mapnr = num_physpages;
670 #endif 669 #endif
671 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 670 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
672 pages_to_mb(max_low_pfn)); 671 pages_to_mb(max_low_pfn));
673 672
674 setup_bootmem_allocator(); 673 setup_bootmem_allocator();
675 } 674 }
676 675
677 void __init zone_sizes_init(void) 676 void __init zone_sizes_init(void)
678 { 677 {
679 unsigned long max_zone_pfns[MAX_NR_ZONES]; 678 unsigned long max_zone_pfns[MAX_NR_ZONES];
680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 679 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681 max_zone_pfns[ZONE_DMA] = 680 max_zone_pfns[ZONE_DMA] =
682 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 681 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
683 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 682 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
684 remove_all_active_ranges(); 683 remove_all_active_ranges();
685 #ifdef CONFIG_HIGHMEM 684 #ifdef CONFIG_HIGHMEM
686 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 685 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
687 e820_register_active_regions(0, 0, highend_pfn); 686 e820_register_active_regions(0, 0, highend_pfn);
688 #else 687 #else
689 e820_register_active_regions(0, 0, max_low_pfn); 688 e820_register_active_regions(0, 0, max_low_pfn);
690 #endif 689 #endif
691 690
692 free_area_init_nodes(max_zone_pfns); 691 free_area_init_nodes(max_zone_pfns);
693 } 692 }
694 #endif /* !CONFIG_NEED_MULTIPLE_NODES */ 693 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
695 694
696 void __init setup_bootmem_allocator(void) 695 void __init setup_bootmem_allocator(void)
697 { 696 {
698 int i; 697 int i;
699 unsigned long bootmap_size, bootmap; 698 unsigned long bootmap_size, bootmap;
700 /* 699 /*
701 * Initialize the boot-time allocator (with low memory only): 700 * Initialize the boot-time allocator (with low memory only):
702 */ 701 */
703 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 702 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
704 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 703 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
705 max_pfn_mapped<<PAGE_SHIFT, bootmap_size, 704 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
706 PAGE_SIZE); 705 PAGE_SIZE);
707 if (bootmap == -1L) 706 if (bootmap == -1L)
708 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 707 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
709 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 708 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
710 709
711 /* don't touch min_low_pfn */ 710 /* don't touch min_low_pfn */
712 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 711 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
713 min_low_pfn, max_low_pfn); 712 min_low_pfn, max_low_pfn);
714 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 713 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
715 max_pfn_mapped<<PAGE_SHIFT); 714 max_pfn_mapped<<PAGE_SHIFT);
716 printk(KERN_INFO " low ram: %08lx - %08lx\n", 715 printk(KERN_INFO " low ram: %08lx - %08lx\n",
717 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); 716 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
718 printk(KERN_INFO " bootmap %08lx - %08lx\n", 717 printk(KERN_INFO " bootmap %08lx - %08lx\n",
719 bootmap, bootmap + bootmap_size); 718 bootmap, bootmap + bootmap_size);
720 for_each_online_node(i) 719 for_each_online_node(i)
721 free_bootmem_with_active_regions(i, max_low_pfn); 720 free_bootmem_with_active_regions(i, max_low_pfn);
722 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); 721 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
723 722
724 after_init_bootmem = 1; 723 after_init_bootmem = 1;
725 } 724 }
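
The bootmap reserved above holds one bit per low-memory page, rounded up to whole pages, which is what bootmem_bootmap_pages() computes. For the ~896 MiB lowmem case (229376 pages) that works out to 7 pages, i.e. 28 KiB. A hedged restatement of the sizing:

#include <stdio.h>

int main(void)
{
        unsigned long max_low_pfn = 229376;               /* ~896 MiB of lowmem, assumed */
        unsigned long bytes = (max_low_pfn + 7) / 8;      /* one bit per low page        */
        unsigned long pages = (bytes + 4095) / 4096;      /* round up to whole pages     */

        printf("bootmap: %lu pages, %lu bytes\n", pages, pages * 4096);   /* 7, 28672 */
        return 0;
}
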
726 725
727 /*
728 * The node 0 pgdat is initialized before all of these because
729 * it's needed for bootmem. node>0 pgdats have their virtual
730 * space allocated before the pagetables are in place to access
731 * them, so they can't be cleared then.
732 *
733 * This should all compile down to nothing when NUMA is off.
734 */
735 static void __init remapped_pgdat_init(void)
736 {
737 int nid;
738
739 for_each_online_node(nid) {
740 if (nid != 0)
741 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
742 }
743 }
744
745 static void __init find_early_table_space(unsigned long end) 726 static void __init find_early_table_space(unsigned long end)
746 { 727 {
747 unsigned long puds, pmds, tables, start; 728 unsigned long puds, pmds, tables, start;
748 729
749 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 730 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
750 tables = PAGE_ALIGN(puds * sizeof(pud_t)); 731 tables = PAGE_ALIGN(puds * sizeof(pud_t));
751 732
752 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 733 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
753 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 734 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
754 735
755 /* 736 /*
756 * RED-PEN putting page tables only on node 0 could 737 * RED-PEN putting page tables only on node 0 could
757 * cause a hotspot and fill up ZONE_DMA. The page tables 738 * cause a hotspot and fill up ZONE_DMA. The page tables
758 * need roughly 0.5KB per GB. 739 * need roughly 0.5KB per GB.
759 */ 740 */
760 start = 0x7000; 741 start = 0x7000;
761 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 742 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
762 tables, PAGE_SIZE); 743 tables, PAGE_SIZE);
763 if (table_start == -1UL) 744 if (table_start == -1UL)
764 panic("Cannot find space for the kernel page tables"); 745 panic("Cannot find space for the kernel page tables");
765 746
766 table_start >>= PAGE_SHIFT; 747 table_start >>= PAGE_SHIFT;
767 table_end = table_start; 748 table_end = table_start;
768 table_top = table_start + (tables>>PAGE_SHIFT); 749 table_top = table_start + (tables>>PAGE_SHIFT);
769 750
770 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 751 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
771 end, table_start << PAGE_SHIFT, 752 end, table_start << PAGE_SHIFT,
772 (table_start << PAGE_SHIFT) + tables); 753 (table_start << PAGE_SHIFT) + tables);
773 } 754 }
774 755
775 unsigned long __init_refok init_memory_mapping(unsigned long start, 756 unsigned long __init_refok init_memory_mapping(unsigned long start,
776 unsigned long end) 757 unsigned long end)
777 { 758 {
778 pgd_t *pgd_base = swapper_pg_dir; 759 pgd_t *pgd_base = swapper_pg_dir;
779 760
780 /* 761 /*
781 * Find space for the kernel direct mapping tables. 762 * Find space for the kernel direct mapping tables.
782 */ 763 */
783 if (!after_init_bootmem) 764 if (!after_init_bootmem)
784 find_early_table_space(end); 765 find_early_table_space(end);
785 766
786 #ifdef CONFIG_X86_PAE 767 #ifdef CONFIG_X86_PAE
787 set_nx(); 768 set_nx();
788 if (nx_enabled) 769 if (nx_enabled)
789 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 770 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
790 #endif 771 #endif
791 772
792 /* Enable PSE if available */ 773 /* Enable PSE if available */
793 if (cpu_has_pse) 774 if (cpu_has_pse)
794 set_in_cr4(X86_CR4_PSE); 775 set_in_cr4(X86_CR4_PSE);
795 776
796 /* Enable PGE if available */ 777 /* Enable PGE if available */
797 if (cpu_has_pge) { 778 if (cpu_has_pge) {
798 set_in_cr4(X86_CR4_PGE); 779 set_in_cr4(X86_CR4_PGE);
799 __PAGE_KERNEL |= _PAGE_GLOBAL; 780 __PAGE_KERNEL |= _PAGE_GLOBAL;
800 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; 781 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
801 } 782 }
802 783
803 kernel_physical_mapping_init(pgd_base, start, end); 784 kernel_physical_mapping_init(pgd_base, start, end);
804 785
805 load_cr3(swapper_pg_dir); 786 load_cr3(swapper_pg_dir);
806 787
807 __flush_tlb_all(); 788 __flush_tlb_all();
808 789
809 if (!after_init_bootmem) 790 if (!after_init_bootmem)
810 reserve_early(table_start << PAGE_SHIFT, 791 reserve_early(table_start << PAGE_SHIFT,
811 table_end << PAGE_SHIFT, "PGTABLE"); 792 table_end << PAGE_SHIFT, "PGTABLE");
812 793
813 return end >> PAGE_SHIFT; 794 return end >> PAGE_SHIFT;
814 } 795 }
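
The PSE and PGE checks in init_memory_mapping() correspond to bits 3 and 13 of EDX from CPUID leaf 1: PSE allows large (4 MiB / 2 MiB) kernel mappings, and PGE lets those mappings be marked global so they survive TLB flushes on context switch. A userspace sketch of the same probe, again only for illustration:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 1;
        printf("PSE (large pages)  : %s\n", (edx & (1u << 3))  ? "yes" : "no");
        printf("PGE (global pages) : %s\n", (edx & (1u << 13)) ? "yes" : "no");
        return 0;
}
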
815 796
816 /* 797 /*
817 * paging_init() sets up the page tables - note that the first 8MB are 798 * paging_init() sets up the page tables - note that the first 8MB are
818 * already mapped by head.S. 799 * already mapped by head.S.
819 * 800 *
820 * This routine also unmaps the page at virtual kernel address 0, so 801 * This routine also unmaps the page at virtual kernel address 0, so
821 * that we can trap those pesky NULL-reference errors in the kernel. 802 * that we can trap those pesky NULL-reference errors in the kernel.
822 */ 803 */
823 void __init paging_init(void) 804 void __init paging_init(void)
824 { 805 {
825 pagetable_init(); 806 pagetable_init();
826 807
827 __flush_tlb_all(); 808 __flush_tlb_all();
828 809
829 kmap_init(); 810 kmap_init();
830 811
831 /* 812 /*
832 * NOTE: at this point the bootmem allocator is fully available. 813 * NOTE: at this point the bootmem allocator is fully available.
833 */ 814 */
834 remapped_pgdat_init();
835 sparse_init(); 815 sparse_init();
836 zone_sizes_init(); 816 zone_sizes_init();
837 817
838 paravirt_post_allocator_init(); 818 paravirt_post_allocator_init();
839 } 819 }
840 820
841 /* 821 /*
842 * Test if the WP bit works in supervisor mode. It isn't supported on 386's 822 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
843 * and also on some strange 486's. All 586+'s are OK. This used to involve 823 * and also on some strange 486's. All 586+'s are OK. This used to involve
844 * black magic jumps to work around some nasty CPU bugs, but fortunately the 824 * black magic jumps to work around some nasty CPU bugs, but fortunately the
845 * switch to using exceptions got rid of all that. 825 * switch to using exceptions got rid of all that.
846 */ 826 */
847 static void __init test_wp_bit(void) 827 static void __init test_wp_bit(void)
848 { 828 {
849 printk(KERN_INFO 829 printk(KERN_INFO
850 "Checking if this processor honours the WP bit even in supervisor mode..."); 830 "Checking if this processor honours the WP bit even in supervisor mode...");
851 831
852 /* Any page-aligned address will do, the test is non-destructive */ 832 /* Any page-aligned address will do, the test is non-destructive */
853 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); 833 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
854 boot_cpu_data.wp_works_ok = do_test_wp_bit(); 834 boot_cpu_data.wp_works_ok = do_test_wp_bit();
855 clear_fixmap(FIX_WP_TEST); 835 clear_fixmap(FIX_WP_TEST);
856 836
857 if (!boot_cpu_data.wp_works_ok) { 837 if (!boot_cpu_data.wp_works_ok) {
858 printk(KERN_CONT "No.\n"); 838 printk(KERN_CONT "No.\n");
859 #ifdef CONFIG_X86_WP_WORKS_OK 839 #ifdef CONFIG_X86_WP_WORKS_OK
860 panic( 840 panic(
861 "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); 841 "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
862 #endif 842 #endif
863 } else { 843 } else {
864 printk(KERN_CONT "Ok.\n"); 844 printk(KERN_CONT "Ok.\n");
865 } 845 }
866 } 846 }
867 847
868 static struct kcore_list kcore_mem, kcore_vmalloc; 848 static struct kcore_list kcore_mem, kcore_vmalloc;
869 849
870 void __init mem_init(void) 850 void __init mem_init(void)
871 { 851 {
872 int codesize, reservedpages, datasize, initsize; 852 int codesize, reservedpages, datasize, initsize;
873 int tmp; 853 int tmp;
874 854
875 #ifdef CONFIG_FLATMEM 855 #ifdef CONFIG_FLATMEM
876 BUG_ON(!mem_map); 856 BUG_ON(!mem_map);
877 #endif 857 #endif
878 /* this will put all low memory onto the freelists */ 858 /* this will put all low memory onto the freelists */
879 totalram_pages += free_all_bootmem(); 859 totalram_pages += free_all_bootmem();
880 860
881 reservedpages = 0; 861 reservedpages = 0;
882 for (tmp = 0; tmp < max_low_pfn; tmp++) 862 for (tmp = 0; tmp < max_low_pfn; tmp++)
883 /* 863 /*
884 * Only count reserved RAM pages: 864 * Only count reserved RAM pages:
885 */ 865 */
886 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 866 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
887 reservedpages++; 867 reservedpages++;
888 868
889 set_highmem_pages_init(); 869 set_highmem_pages_init();
890 870
891 codesize = (unsigned long) &_etext - (unsigned long) &_text; 871 codesize = (unsigned long) &_etext - (unsigned long) &_text;
892 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 872 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
893 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 873 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
894 874
895 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 875 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
896 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 876 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
897 VMALLOC_END-VMALLOC_START); 877 VMALLOC_END-VMALLOC_START);
898 878
899 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 879 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
900 "%dk reserved, %dk data, %dk init, %ldk highmem)\n", 880 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
901 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 881 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
902 num_physpages << (PAGE_SHIFT-10), 882 num_physpages << (PAGE_SHIFT-10),
903 codesize >> 10, 883 codesize >> 10,
904 reservedpages << (PAGE_SHIFT-10), 884 reservedpages << (PAGE_SHIFT-10),
905 datasize >> 10, 885 datasize >> 10,
906 initsize >> 10, 886 initsize >> 10,
907 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 887 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
908 ); 888 );
909 889
910 printk(KERN_INFO "virtual kernel memory layout:\n" 890 printk(KERN_INFO "virtual kernel memory layout:\n"
911 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 891 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
912 #ifdef CONFIG_HIGHMEM 892 #ifdef CONFIG_HIGHMEM
913 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 893 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
914 #endif 894 #endif
915 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" 895 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
916 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" 896 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
917 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" 897 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
918 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" 898 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
919 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", 899 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
920 FIXADDR_START, FIXADDR_TOP, 900 FIXADDR_START, FIXADDR_TOP,
921 (FIXADDR_TOP - FIXADDR_START) >> 10, 901 (FIXADDR_TOP - FIXADDR_START) >> 10,
922 902
923 #ifdef CONFIG_HIGHMEM 903 #ifdef CONFIG_HIGHMEM
924 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, 904 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
925 (LAST_PKMAP*PAGE_SIZE) >> 10, 905 (LAST_PKMAP*PAGE_SIZE) >> 10,
926 #endif 906 #endif
927 907
928 VMALLOC_START, VMALLOC_END, 908 VMALLOC_START, VMALLOC_END,
929 (VMALLOC_END - VMALLOC_START) >> 20, 909 (VMALLOC_END - VMALLOC_START) >> 20,
930 910
931 (unsigned long)__va(0), (unsigned long)high_memory, 911 (unsigned long)__va(0), (unsigned long)high_memory,
932 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, 912 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
933 913
934 (unsigned long)&__init_begin, (unsigned long)&__init_end, 914 (unsigned long)&__init_begin, (unsigned long)&__init_end,
935 ((unsigned long)&__init_end - 915 ((unsigned long)&__init_end -
936 (unsigned long)&__init_begin) >> 10, 916 (unsigned long)&__init_begin) >> 10,
937 917
938 (unsigned long)&_etext, (unsigned long)&_edata, 918 (unsigned long)&_etext, (unsigned long)&_edata,
939 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, 919 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
940 920
941 (unsigned long)&_text, (unsigned long)&_etext, 921 (unsigned long)&_text, (unsigned long)&_etext,
942 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 922 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
943 923
944 #ifdef CONFIG_HIGHMEM 924 #ifdef CONFIG_HIGHMEM
945 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 925 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
946 BUG_ON(VMALLOC_END > PKMAP_BASE); 926 BUG_ON(VMALLOC_END > PKMAP_BASE);
947 #endif 927 #endif
948 BUG_ON(VMALLOC_START > VMALLOC_END); 928 BUG_ON(VMALLOC_START > VMALLOC_END);
949 BUG_ON((unsigned long)high_memory > VMALLOC_START); 929 BUG_ON((unsigned long)high_memory > VMALLOC_START);
950 930
951 if (boot_cpu_data.wp_works_ok < 0) 931 if (boot_cpu_data.wp_works_ok < 0)
952 test_wp_bit(); 932 test_wp_bit();
953 933
954 cpa_init(); 934 cpa_init();
955 save_pg_dir(); 935 save_pg_dir();
956 zap_low_mappings(); 936 zap_low_mappings();
957 } 937 }
958 938
959 #ifdef CONFIG_MEMORY_HOTPLUG 939 #ifdef CONFIG_MEMORY_HOTPLUG
960 int arch_add_memory(int nid, u64 start, u64 size) 940 int arch_add_memory(int nid, u64 start, u64 size)
961 { 941 {
962 struct pglist_data *pgdata = NODE_DATA(nid); 942 struct pglist_data *pgdata = NODE_DATA(nid);
963 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; 943 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
964 unsigned long start_pfn = start >> PAGE_SHIFT; 944 unsigned long start_pfn = start >> PAGE_SHIFT;
965 unsigned long nr_pages = size >> PAGE_SHIFT; 945 unsigned long nr_pages = size >> PAGE_SHIFT;
966 946
967 return __add_pages(zone, start_pfn, nr_pages); 947 return __add_pages(zone, start_pfn, nr_pages);
968 } 948 }
969 #endif 949 #endif
970 950
971 /* 951 /*
972 * This function cannot be __init, since exceptions don't work in that 952 * This function cannot be __init, since exceptions don't work in that
973 * section. Put this after the callers, so that it cannot be inlined. 953 * section. Put this after the callers, so that it cannot be inlined.
974 */ 954 */
975 static noinline int do_test_wp_bit(void) 955 static noinline int do_test_wp_bit(void)
976 { 956 {
977 char tmp_reg; 957 char tmp_reg;
978 int flag; 958 int flag;
979 959
980 __asm__ __volatile__( 960 __asm__ __volatile__(
981 " movb %0, %1 \n" 961 " movb %0, %1 \n"
982 "1: movb %1, %0 \n" 962 "1: movb %1, %0 \n"
983 " xorl %2, %2 \n" 963 " xorl %2, %2 \n"
984 "2: \n" 964 "2: \n"
985 _ASM_EXTABLE(1b,2b) 965 _ASM_EXTABLE(1b,2b)
986 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), 966 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
987 "=q" (tmp_reg), 967 "=q" (tmp_reg),
988 "=r" (flag) 968 "=r" (flag)
989 :"2" (1) 969 :"2" (1)
990 :"memory"); 970 :"memory");
991 971
992 return flag; 972 return flag;
993 } 973 }
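
The inline assembly above reads the read-only FIX_WP_TEST page and writes the byte back: if the CPU honours WP in supervisor mode the write faults, the exception-table entry redirects execution past the xorl, and the function returns 1; if the write silently succeeds, flag is cleared and it returns 0. A rough userspace analogue of that fault-and-recover pattern (user-mode writes are always checked by the MMU, so this only illustrates the mechanism, not the CR0.WP quirk itself):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>

static sigjmp_buf fixup;

static void on_segv(int sig)
{
        (void)sig;
        siglongjmp(fixup, 1);                   /* plays the role of the extable jump to 2: */
}

int main(void)
{
        char *page = mmap(NULL, 4096, PROT_READ,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int wp_works = 1;

        if (page == MAP_FAILED)
                return 1;
        signal(SIGSEGV, on_segv);
        if (sigsetjmp(fixup, 1) == 0) {
                char tmp = page[0];             /* reading is fine */
                page[0] = tmp;                  /* write faults on the read-only mapping */
                wp_works = 0;                   /* only reached if the write went through */
        }
        printf("write protection %s\n", wp_works ? "enforced" : "ignored");
        return 0;
}
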
994 974
995 #ifdef CONFIG_DEBUG_RODATA 975 #ifdef CONFIG_DEBUG_RODATA
996 const int rodata_test_data = 0xC3; 976 const int rodata_test_data = 0xC3;
997 EXPORT_SYMBOL_GPL(rodata_test_data); 977 EXPORT_SYMBOL_GPL(rodata_test_data);
998 978
999 void mark_rodata_ro(void) 979 void mark_rodata_ro(void)
1000 { 980 {
1001 unsigned long start = PFN_ALIGN(_text); 981 unsigned long start = PFN_ALIGN(_text);
1002 unsigned long size = PFN_ALIGN(_etext) - start; 982 unsigned long size = PFN_ALIGN(_etext) - start;
1003 983
1004 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 984 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1005 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 985 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
1006 size >> 10); 986 size >> 10);
1007 987
1008 #ifdef CONFIG_CPA_DEBUG 988 #ifdef CONFIG_CPA_DEBUG
1009 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", 989 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
1010 start, start+size); 990 start, start+size);
1011 set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); 991 set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
1012 992
1013 printk(KERN_INFO "Testing CPA: write protecting again\n"); 993 printk(KERN_INFO "Testing CPA: write protecting again\n");
1014 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 994 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1015 #endif 995 #endif
1016 start += size; 996 start += size;
1017 size = (unsigned long)__end_rodata - start; 997 size = (unsigned long)__end_rodata - start;
1018 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 998 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1019 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 999 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1020 size >> 10); 1000 size >> 10);
1021 rodata_test(); 1001 rodata_test();
1022 1002
1023 #ifdef CONFIG_CPA_DEBUG 1003 #ifdef CONFIG_CPA_DEBUG
1024 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); 1004 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
1025 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); 1005 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
1026 1006
1027 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1007 printk(KERN_INFO "Testing CPA: write protecting again\n");
1028 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1008 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1029 #endif 1009 #endif
1030 } 1010 }
1031 #endif 1011 #endif
1032 1012
1033 void free_init_pages(char *what, unsigned long begin, unsigned long end) 1013 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1034 { 1014 {
1035 #ifdef CONFIG_DEBUG_PAGEALLOC 1015 #ifdef CONFIG_DEBUG_PAGEALLOC
1036 /* 1016 /*
1037 * If debugging page accesses then do not free this memory but 1017 * If debugging page accesses then do not free this memory but
1038 * mark them not present - any buggy init-section access will 1018 * mark them not present - any buggy init-section access will
1039 * create a kernel page fault: 1019 * create a kernel page fault:
1040 */ 1020 */
1041 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", 1021 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1042 begin, PAGE_ALIGN(end)); 1022 begin, PAGE_ALIGN(end));
1043 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 1023 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1044 #else 1024 #else
1045 unsigned long addr; 1025 unsigned long addr;
1046 1026
1047 /* 1027 /*
1048 * We just marked the kernel text read only above, now that 1028 * We just marked the kernel text read only above, now that
1049 * we are going to free part of that, we need to make that 1029 * we are going to free part of that, we need to make that
1050 * writeable first. 1030 * writeable first.
1051 */ 1031 */
1052 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 1032 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1053 1033
1054 for (addr = begin; addr < end; addr += PAGE_SIZE) { 1034 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1055 ClearPageReserved(virt_to_page(addr)); 1035 ClearPageReserved(virt_to_page(addr));
1056 init_page_count(virt_to_page(addr)); 1036 init_page_count(virt_to_page(addr));
1057 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); 1037 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1058 free_page(addr); 1038 free_page(addr);
1059 totalram_pages++; 1039 totalram_pages++;
1060 } 1040 }
1061 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 1041 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1062 #endif 1042 #endif
1063 } 1043 }
1064 1044
1065 void free_initmem(void) 1045 void free_initmem(void)
1066 { 1046 {
1067 free_init_pages("unused kernel memory", 1047 free_init_pages("unused kernel memory",
1068 (unsigned long)(&__init_begin), 1048 (unsigned long)(&__init_begin),
1069 (unsigned long)(&__init_end)); 1049 (unsigned long)(&__init_end));
1070 } 1050 }
1071 1051
1072 #ifdef CONFIG_BLK_DEV_INITRD 1052 #ifdef CONFIG_BLK_DEV_INITRD
1073 void free_initrd_mem(unsigned long start, unsigned long end) 1053 void free_initrd_mem(unsigned long start, unsigned long end)
1074 { 1054 {
1075 free_init_pages("initrd memory", start, end); 1055 free_init_pages("initrd memory", start, end);
1076 } 1056 }
1077 #endif 1057 #endif
1078 1058
1079 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1059 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1080 int flags) 1060 int flags)
1081 { 1061 {
1082 return reserve_bootmem(phys, len, flags); 1062 return reserve_bootmem(phys, len, flags);
1083 } 1063 }
1084 1064
include/asm-x86/numa_32.h
1 #ifndef _ASM_X86_32_NUMA_H 1 #ifndef _ASM_X86_32_NUMA_H
2 #define _ASM_X86_32_NUMA_H 1 2 #define _ASM_X86_32_NUMA_H 1
3 3
4 extern int pxm_to_nid(int pxm); 4 extern int pxm_to_nid(int pxm);
5 extern void numa_remove_cpu(int cpu); 5 extern void numa_remove_cpu(int cpu);
6 6
7 #ifdef CONFIG_NUMA 7 #ifdef CONFIG_NUMA
8 extern void __init remap_numa_kva(void);
9 extern void set_highmem_pages_init(void); 8 extern void set_highmem_pages_init(void);
10 #else
11 static inline void remap_numa_kva(void)
12 {
13 }
14 #endif 9 #endif
15 10
16 #endif /* _ASM_X86_32_NUMA_H */ 11 #endif /* _ASM_X86_32_NUMA_H */
17 12
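
With the call removed from pagetable_init() above, remap_numa_kva() no longer has a user outside the 32-bit NUMA code itself, so the prototype and the !CONFIG_NUMA stub can be dropped from this header: per the commit intent, the KVA remap is now driven from initmem_init() in the numa32 path (discontig_32.c) rather than from the generic 32-bit pagetable setup.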