Commit 208d54e5513c0c02d85af0990901354c74364d5c
Committed by Linus Torvalds
1 parent c6a57e19e4
Exists in master and in 7 other branches
[PATCH] memory hotplug locking: node_size_lock
pgdat->node_size_lock is basically only needed in one place in the normal code: show_mem(), which is the arch-specific sysrq-m printing function.

Strictly speaking, the architectures not doing memory hotplug do not need this locking in show_mem(). However, they are all included for completeness. This should also make any future consolidation of all of the implementations a little more straightforward.

This lock is also held in the sparsemem code during a memory removal, as sections are invalidated. This is the place where pfn_valid() is made false for a memory area that is being removed. The lock is only required when doing pfn_valid() operations on memory for which the caller does not already hold a reference on the page, such as in show_mem().

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
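For reference, the pgdat_resize_lock()/pgdat_resize_unlock() helpers used in the diffs below come from the memory hotplug headers, which are among the nine changed files but fall outside the truncated listing here. A minimal sketch of their shape, assuming the irqsave-spinlock form the description implies (a paraphrase, not the verbatim patch):

        /* Sketch: wrap pgdat->node_size_lock so callers like show_mem()
         * can bracket pfn_valid()/page-scanning loops, preventing a
         * concurrent memory removal from invalidating sections under them. */
        #ifdef CONFIG_MEMORY_HOTPLUG
        static inline void pgdat_resize_lock(struct pglist_data *pgdat,
                                             unsigned long *flags)
        {
                spin_lock_irqsave(&pgdat->node_size_lock, *flags);
        }

        static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
                                               unsigned long *flags)
        {
                spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
        }
        #else
        /* Architectures without memory hotplug pay nothing. */
        static inline void pgdat_resize_lock(struct pglist_data *p,
                                             unsigned long *f) {}
        static inline void pgdat_resize_unlock(struct pglist_data *p,
                                               unsigned long *f) {}
        #endif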
Showing 9 changed files with 76 additions and 2 deletions (inline diff; added lines are marked with +)
arch/alpha/mm/numa.c
/*
 *  linux/arch/alpha/mm/numa.c
 *
 *  DISCONTIGMEM NUMA alpha support.
 *
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/swap.h>
#include <linux/initrd.h>

#include <asm/hwrpb.h>
#include <asm/pgalloc.h>

pg_data_t node_data[MAX_NUMNODES];
bootmem_data_t node_bdata[MAX_NUMNODES];

#undef DEBUG_DISCONTIG
#ifdef DEBUG_DISCONTIG
#define DBGDCONT(args...) printk(args)
#else
#define DBGDCONT(args...)
#endif

#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
#define for_each_mem_cluster(memdesc, cluster, i)               \
        for ((cluster) = (memdesc)->cluster, (i) = 0;           \
             (i) < (memdesc)->numclusters; (i)++, (cluster)++)

static void __init show_mem_layout(void)
{
        struct memclust_struct * cluster;
        struct memdesc_struct * memdesc;
        int i;

        /* Find free clusters, and init and free the bootmem accordingly. */
        memdesc = (struct memdesc_struct *)
                (hwrpb->mddt_offset + (unsigned long) hwrpb);

        printk("Raw memory layout:\n");
        for_each_mem_cluster(memdesc, cluster, i) {
                printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
                       i, cluster->usage, cluster->start_pfn,
                       cluster->start_pfn + cluster->numpages);
        }
}

static void __init
setup_memory_node(int nid, void *kernel_end)
{
        extern unsigned long mem_size_limit;
        struct memclust_struct * cluster;
        struct memdesc_struct * memdesc;
        unsigned long start_kernel_pfn, end_kernel_pfn;
        unsigned long bootmap_size, bootmap_pages, bootmap_start;
        unsigned long start, end;
        unsigned long node_pfn_start, node_pfn_end;
        unsigned long node_min_pfn, node_max_pfn;
        int i;
        unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
        int show_init = 0;

        /* Find the bounds of current node */
        node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT;
        node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT);

        /* Find free clusters, and init and free the bootmem accordingly. */
        memdesc = (struct memdesc_struct *)
                (hwrpb->mddt_offset + (unsigned long) hwrpb);

        /* find the bounds of this node (node_min_pfn/node_max_pfn) */
        node_min_pfn = ~0UL;
        node_max_pfn = 0UL;
        for_each_mem_cluster(memdesc, cluster, i) {
                /* Bit 0 is console/PALcode reserved.  Bit 1 is
                   non-volatile memory -- we might want to mark
                   this for later. */
                if (cluster->usage & 3)
                        continue;

                start = cluster->start_pfn;
                end = start + cluster->numpages;

                if (start >= node_pfn_end || end <= node_pfn_start)
                        continue;

                if (!show_init) {
                        show_init = 1;
                        printk("Initializing bootmem allocator on Node ID %d\n", nid);
                }
                printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
                       i, cluster->usage, cluster->start_pfn,
                       cluster->start_pfn + cluster->numpages);

                if (start < node_pfn_start)
                        start = node_pfn_start;
                if (end > node_pfn_end)
                        end = node_pfn_end;

                if (start < node_min_pfn)
                        node_min_pfn = start;
                if (end > node_max_pfn)
                        node_max_pfn = end;
        }

        if (mem_size_limit && node_max_pfn > mem_size_limit) {
                static int msg_shown = 0;
                if (!msg_shown) {
                        msg_shown = 1;
                        printk("setup: forcing memory size to %ldK (from %ldK).\n",
                               mem_size_limit << (PAGE_SHIFT - 10),
                               node_max_pfn << (PAGE_SHIFT - 10));
                }
                node_max_pfn = mem_size_limit;
        }

        if (node_min_pfn >= node_max_pfn)
                return;

        /* Update global {min,max}_low_pfn from node information. */
        if (node_min_pfn < min_low_pfn)
                min_low_pfn = node_min_pfn;
        if (node_max_pfn > max_low_pfn)
                max_pfn = max_low_pfn = node_max_pfn;

        num_physpages += node_max_pfn - node_min_pfn;

#if 0 /* we'll try this one again in a little while */
        /* Cute trick to make sure our local node data is on local memory */
        node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
#endif
        /* Quasi-mark the pg_data_t as in-use */
        node_min_pfn += node_datasz;
        if (node_min_pfn >= node_max_pfn) {
                printk(" not enough mem to reserve NODE_DATA");
                return;
        }
        NODE_DATA(nid)->bdata = &node_bdata[nid];

        printk(" Detected node memory:   start %8lu, end %8lu\n",
               node_min_pfn, node_max_pfn);

        DBGDCONT(" DISCONTIG: node_data[%d]   is at 0x%p\n", nid, NODE_DATA(nid));
        DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);

        /* Find the bounds of kernel memory. */
        start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
        end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
        bootmap_start = -1;

        if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
                panic("kernel loaded out of ram");

        /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
           Note that we round this down, not up - node memory
           has much larger alignment than 8Mb, so it's safe. */
        node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);

        /* We need to know how many physically contiguous pages
           we'll need for the bootmap. */
        bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);

        /* Now find a good region where to allocate the bootmap. */
        for_each_mem_cluster(memdesc, cluster, i) {
                if (cluster->usage & 3)
                        continue;

                start = cluster->start_pfn;
                end = start + cluster->numpages;

                if (start >= node_max_pfn || end <= node_min_pfn)
                        continue;

                if (end > node_max_pfn)
                        end = node_max_pfn;
                if (start < node_min_pfn)
                        start = node_min_pfn;

                if (start < start_kernel_pfn) {
                        if (end > end_kernel_pfn
                            && end - end_kernel_pfn >= bootmap_pages) {
                                bootmap_start = end_kernel_pfn;
                                break;
                        } else if (end > start_kernel_pfn)
                                end = start_kernel_pfn;
                } else if (start < end_kernel_pfn)
                        start = end_kernel_pfn;
                if (end - start >= bootmap_pages) {
                        bootmap_start = start;
                        break;
                }
        }

        if (bootmap_start == -1)
                panic("couldn't find a contigous place for the bootmap");

        /* Allocate the bootmap and mark the whole MM as reserved. */
        bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
                                         node_min_pfn, node_max_pfn);
        DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
                 bootmap_start, bootmap_size, bootmap_pages);

        /* Mark the free regions. */
        for_each_mem_cluster(memdesc, cluster, i) {
                if (cluster->usage & 3)
                        continue;

                start = cluster->start_pfn;
                end = cluster->start_pfn + cluster->numpages;

                if (start >= node_max_pfn || end <= node_min_pfn)
                        continue;

                if (end > node_max_pfn)
                        end = node_max_pfn;
                if (start < node_min_pfn)
                        start = node_min_pfn;

                if (start < start_kernel_pfn) {
                        if (end > end_kernel_pfn) {
                                free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
                                                  (PFN_PHYS(start_kernel_pfn)
                                                   - PFN_PHYS(start)));
                                printk(" freeing pages %ld:%ld\n",
                                       start, start_kernel_pfn);
                                start = end_kernel_pfn;
                        } else if (end > start_kernel_pfn)
                                end = start_kernel_pfn;
                } else if (start < end_kernel_pfn)
                        start = end_kernel_pfn;
                if (start >= end)
                        continue;

                free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
                printk(" freeing pages %ld:%ld\n", start, end);
        }

        /* Reserve the bootmap memory. */
        reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), bootmap_size);
        printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));

        node_set_online(nid);
}

void __init
setup_memory(void *kernel_end)
{
        int nid;

        show_mem_layout();

        nodes_clear(node_online_map);

        min_low_pfn = ~0UL;
        max_low_pfn = 0UL;
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                setup_memory_node(nid, kernel_end);

#ifdef CONFIG_BLK_DEV_INITRD
        initrd_start = INITRD_START;
        if (initrd_start) {
                extern void *move_initrd(unsigned long);

                initrd_end = initrd_start+INITRD_SIZE;
                printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
                       (void *) initrd_start, INITRD_SIZE);

                if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
                        if (!move_initrd(PFN_PHYS(max_low_pfn)))
                                printk("initrd extends beyond end of memory "
                                       "(0x%08lx > 0x%p)\ndisabling initrd\n",
                                       initrd_end,
                                       phys_to_virt(PFN_PHYS(max_low_pfn)));
                } else {
                        nid = kvaddr_to_nid(initrd_start);
                        reserve_bootmem_node(NODE_DATA(nid),
                                             virt_to_phys((void *)initrd_start),
                                             INITRD_SIZE);
                }
        }
#endif /* CONFIG_BLK_DEV_INITRD */
}

void __init paging_init(void)
{
        unsigned int nid;
        unsigned long zones_size[MAX_NR_ZONES] = {0, };
        unsigned long dma_local_pfn;

        /*
         * The old global MAX_DMA_ADDRESS per-arch API doesn't fit
         * in the NUMA model, for now we convert it to a pfn and
         * we interpret this pfn as a local per-node information.
         * This issue isn't very important since none of these machines
         * have legacy ISA slots anyways.
         */
        dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;

        for_each_online_node(nid) {
                unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT;
                unsigned long end_pfn = node_bdata[nid].node_low_pfn;

                if (dma_local_pfn >= end_pfn - start_pfn)
                        zones_size[ZONE_DMA] = end_pfn - start_pfn;
                else {
                        zones_size[ZONE_DMA] = dma_local_pfn;
                        zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
                }
                free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL);
        }

        /* Initialize the kernel's ZERO_PGE. */
        memset((void *)ZERO_PGE, 0, PAGE_SIZE);
}

void __init mem_init(void)
{
        unsigned long codesize, reservedpages, datasize, initsize, pfn;
        extern int page_is_ram(unsigned long) __init;
        extern char _text, _etext, _data, _edata;
        extern char __init_begin, __init_end;
        unsigned long nid, i;
        high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);

        reservedpages = 0;
        for_each_online_node(nid) {
                /*
                 * This will free up the bootmem, ie, slot 0 memory
                 */
                totalram_pages += free_all_bootmem_node(NODE_DATA(nid));

                pfn = NODE_DATA(nid)->node_start_pfn;
                for (i = 0; i < node_spanned_pages(nid); i++, pfn++)
                        if (page_is_ram(pfn) &&
                            PageReserved(nid_page_nr(nid, i)))
                                reservedpages++;
        }

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_data;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, "
               "%luk data, %luk init)\n",
               (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
               num_physpages << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);
#if 0
        mem_stress();
#endif
}

void
show_mem(void)
{
        long i,free = 0,total = 0,reserved = 0;
        long shared = 0, cached = 0;
        int nid;

        printk("\nMem-info:\n");
        show_free_areas();
        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_online_node(nid) {
+               unsigned long flags;
+               pgdat_resize_lock(NODE_DATA(nid), &flags);
                i = node_spanned_pages(nid);
                while (i-- > 0) {
                        struct page *page = nid_page_nr(nid, i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (!page_count(page))
                                free++;
                        else
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(NODE_DATA(nid), &flags);
        }
        printk("%ld pages of RAM\n",total);
        printk("%ld free pages\n",free);
        printk("%ld reserved pages\n",reserved);
        printk("%ld pages shared\n",shared);
        printk("%ld pages swap cached\n",cached);
}
arch/i386/mm/pgtable.c
/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

void show_mem(void)
{
        int total = 0, reserved = 0;
        int shared = 0, cached = 0;
        int highmem = 0;
        struct page *page;
        pg_data_t *pgdat;
        unsigned long i;
        struct page_state ps;
+       unsigned long flags;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
+               pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pgdat_page_nr(pgdat, i);
                        total++;
                        if (PageHighMem(page))
                                highmem++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(pgdat, &flags);
        }
        printk(KERN_INFO "%d pages of RAM\n", total);
        printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
        printk(KERN_INFO "%d reserved pages\n", reserved);
        printk(KERN_INFO "%d pages shared\n", shared);
        printk(KERN_INFO "%d pages swap cached\n", cached);

        get_page_state(&ps);
        printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
        printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
        printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
        printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
        printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <pfn,flags> stored as-is, to permit clearing entries */
        set_pte(pte, pfn_pte(pfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
                printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
                return; /* BUG(); */
        }
        if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
                printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
                return; /* BUG(); */
        }
        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
                return; /* BUG(); */
        }
        pud = pud_offset(pgd, vaddr);
        pmd = pmd_offset(pud, vaddr);
        set_pmd(pmd, pfn_pmd(pfn, flags));
        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
        return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pte;

#ifdef CONFIG_HIGHPTE
        pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
        return pte;
}

void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
        memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);
        page->index = (unsigned long)pgd_list;
        if (pgd_list)
                set_page_private(pgd_list, (unsigned long)&page->index);
        pgd_list = page;
        set_page_private(page, (unsigned long)&pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *)page->index;
        pprev = (struct page **)page_private(page);
        *pprev = next;
        if (next)
                set_page_private(next, (unsigned long)pprev);
}

void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
        unsigned long flags;

        if (PTRS_PER_PMD == 1) {
                memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
                spin_lock_irqsave(&pgd_lock, flags);
        }

        clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
                        swapper_pg_dir + USER_PTRS_PER_PGD,
                        KERNEL_PGD_PTRS);
        if (PTRS_PER_PMD > 1)
                return;

        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
        unsigned long flags; /* can be called from interrupt context */

        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        int i;
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);

        if (PTRS_PER_PMD == 1 || !pgd)
                return pgd;

        for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
                pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
                if (!pmd)
                        goto out_oom;
                set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
        }
        return pgd;

out_oom:
        for (i--; i >= 0; i--)
                kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
        kmem_cache_free(pgd_cache, pgd);
        return NULL;
}

void pgd_free(pgd_t *pgd)
{
        int i;

        /* in the PAE case user pgd entries are overwritten before usage */
        if (PTRS_PER_PMD > 1)
                for (i = 0; i < USER_PTRS_PER_PGD; ++i)
                        kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
        /* in the non-PAE case, free_pgtables() clears user pgd entries */
        kmem_cache_free(pgd_cache, pgd);
}
arch/ia64/mm/discontig.c
/*
 * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
 * Copyright (c) 2002 NEC Corp.
 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
 * Copyright (c) 2004 Silicon Graphics, Inc
 *      Russ Anderson <rja@sgi.com>
 *      Jesse Barnes <jbarnes@sgi.com>
 *      Jack Steiner <steiner@sgi.com>
 */

/*
 * Platform initialization for Discontig Memory
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/meminit.h>
#include <asm/numa.h>
#include <asm/sections.h>

/*
 * Track per-node information needed to setup the boot memory allocator, the
 * per-node areas, and the real VM.
 */
struct early_node_data {
        struct ia64_node_data *node_data;
        pg_data_t *pgdat;
        unsigned long pernode_addr;
        unsigned long pernode_size;
        struct bootmem_data bootmem_data;
        unsigned long num_physpages;
        unsigned long num_dma_physpages;
        unsigned long min_pfn;
        unsigned long max_pfn;
};

static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
static nodemask_t memory_less_mask __initdata;

/*
 * To prevent cache aliasing effects, align per-node structures so that they
 * start at addresses that are strided by node number.
 */
#define NODEDATA_ALIGN(addr, node)                                      \
        ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)

/**
 * build_node_maps - callback to setup bootmem structs for each node
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * We allocate a struct bootmem_data for each piece of memory that we wish to
 * treat as a virtually contiguous block (i.e. each node). Each such block
 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
 * if necessary.  Any non-existent pages will simply be part of the virtual
 * memmap.  We also update min_low_pfn and max_low_pfn here as we receive
 * memory ranges from the caller.
 */
static int __init build_node_maps(unsigned long start, unsigned long len,
                                  int node)
{
        unsigned long cstart, epfn, end = start + len;
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;

        epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
        cstart = GRANULEROUNDDOWN(start);

        if (!bdp->node_low_pfn) {
                bdp->node_boot_start = cstart;
                bdp->node_low_pfn = epfn;
        } else {
                bdp->node_boot_start = min(cstart, bdp->node_boot_start);
                bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
        }

        min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
        max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);

        return 0;
}

/**
 * early_nr_cpus_node - return number of cpus on a given node
 * @node: node to check
 *
 * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
 * called yet.  Note that node 0 will also count all non-existent cpus.
 */
static int __init early_nr_cpus_node(int node)
{
        int cpu, n = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (node == node_cpuid[cpu].nid)
                        n++;

        return n;
}

/**
 * compute_pernodesize - compute size of pernode data
 * @node: the node id.
 */
static unsigned long __init compute_pernodesize(int node)
{
        unsigned long pernodesize = 0, cpus;

        cpus = early_nr_cpus_node(node);
        pernodesize += PERCPU_PAGE_SIZE * cpus;
        pernodesize += node * L1_CACHE_BYTES;
        pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
        pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
        pernodesize = PAGE_ALIGN(pernodesize);
        return pernodesize;
}

/**
 * per_cpu_node_setup - setup per-cpu areas on each node
 * @cpu_data: per-cpu area on this node
 * @node: node to setup
 *
 * Copy the static per-cpu data into the region we just set aside and then
 * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
 * the end of the area.
 */
static void *per_cpu_node_setup(void *cpu_data, int node)
{
#ifdef CONFIG_SMP
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (node == node_cpuid[cpu].nid) {
                        memcpy(__va(cpu_data), __phys_per_cpu_start,
                               __per_cpu_end - __per_cpu_start);
                        __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
                                __per_cpu_start;
                        cpu_data += PERCPU_PAGE_SIZE;
                }
        }
#endif
        return cpu_data;
}

/**
 * fill_pernode - initialize pernode data.
 * @node: the node id.
 * @pernode: physical address of pernode data
 * @pernodesize: size of the pernode data
 */
static void __init fill_pernode(int node, unsigned long pernode,
                                unsigned long pernodesize)
{
        void *cpu_data;
        int cpus = early_nr_cpus_node(node);
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;

        mem_data[node].pernode_addr = pernode;
        mem_data[node].pernode_size = pernodesize;
        memset(__va(pernode), 0, pernodesize);

        cpu_data = (void *)pernode;
        pernode += PERCPU_PAGE_SIZE * cpus;
        pernode += node * L1_CACHE_BYTES;

        mem_data[node].pgdat = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

        mem_data[node].node_data = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));

        mem_data[node].pgdat->bdata = bdp;
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

        cpu_data = per_cpu_node_setup(cpu_data, node);

        return;
}

/**
 * find_pernode_space - allocate memory for memory map and per-node structures
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * This routine reserves space for the per-cpu data struct, the list of
 * pg_data_ts and the per-node data struct.  Each node will have something like
 * the following in the first chunk of addr. space large enough to hold it.
 *
 *   ________________________
 *  |                        |
 *  |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
 *  |    PERCPU_PAGE_SIZE *  |     start and length big enough
204 | * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus. | 204 | * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus. |
205 | * |------------------------| | 205 | * |------------------------| |
206 | * | local pg_data_t * | | 206 | * | local pg_data_t * | |
207 | * |------------------------| | 207 | * |------------------------| |
208 | * | local ia64_node_data | | 208 | * | local ia64_node_data | |
209 | * |------------------------| | 209 | * |------------------------| |
210 | * | ??? | | 210 | * | ??? | |
211 | * |________________________| | 211 | * |________________________| |
212 | * | 212 | * |
213 | * Once this space has been set aside, the bootmem maps are initialized. We | 213 | * Once this space has been set aside, the bootmem maps are initialized. We |
214 | * could probably move the allocation of the per-cpu and ia64_node_data space | 214 | * could probably move the allocation of the per-cpu and ia64_node_data space |
215 | * outside of this function and use alloc_bootmem_node(), but doing it here | 215 | * outside of this function and use alloc_bootmem_node(), but doing it here |
216 | * is straightforward and we get the alignments we want so... | 216 | * is straightforward and we get the alignments we want so... |
217 | */ | 217 | */ |
218 | static int __init find_pernode_space(unsigned long start, unsigned long len, | 218 | static int __init find_pernode_space(unsigned long start, unsigned long len, |
219 | int node) | 219 | int node) |
220 | { | 220 | { |
221 | unsigned long epfn; | 221 | unsigned long epfn; |
222 | unsigned long pernodesize = 0, pernode, pages, mapsize; | 222 | unsigned long pernodesize = 0, pernode, pages, mapsize; |
223 | struct bootmem_data *bdp = &mem_data[node].bootmem_data; | 223 | struct bootmem_data *bdp = &mem_data[node].bootmem_data; |
224 | 224 | ||
225 | epfn = (start + len) >> PAGE_SHIFT; | 225 | epfn = (start + len) >> PAGE_SHIFT; |
226 | 226 | ||
227 | pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT); | 227 | pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT); |
228 | mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; | 228 | mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; |
229 | 229 | ||
230 | /* | 230 | /* |
231 | * Make sure this memory falls within this node's usable memory | 231 | * Make sure this memory falls within this node's usable memory |
232 | * since we may have thrown some away in build_maps(). | 232 | * since we may have thrown some away in build_maps(). |
233 | */ | 233 | */ |
234 | if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn) | 234 | if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn) |
235 | return 0; | 235 | return 0; |
236 | 236 | ||
237 | /* Don't set up this node's local space twice... */ | 237 | /* Don't set up this node's local space twice... */ |
238 | if (mem_data[node].pernode_addr) | 238 | if (mem_data[node].pernode_addr) |
239 | return 0; | 239 | return 0; |
240 | 240 | ||
241 | /* | 241 | /* |
242 | * Calculate total size needed, incl. what's necessary | 242 | * Calculate total size needed, incl. what's necessary |
243 | * for good alignment and alias prevention. | 243 | * for good alignment and alias prevention. |
244 | */ | 244 | */ |
245 | pernodesize = compute_pernodesize(node); | 245 | pernodesize = compute_pernodesize(node); |
246 | pernode = NODEDATA_ALIGN(start, node); | 246 | pernode = NODEDATA_ALIGN(start, node); |
247 | 247 | ||
248 | /* Is this range big enough for what we want to store here? */ | 248 | /* Is this range big enough for what we want to store here? */ |
249 | if (start + len > (pernode + pernodesize + mapsize)) | 249 | if (start + len > (pernode + pernodesize + mapsize)) |
250 | fill_pernode(node, pernode, pernodesize); | 250 | fill_pernode(node, pernode, pernodesize); |
251 | 251 | ||
252 | return 0; | 252 | return 0; |
253 | } | 253 | } |
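The carve-up fixed by the diagram in find_pernode_space()'s comment is easiest to follow as plain offset arithmetic. A minimal sketch, using only names that appear in this file and summing the pieces in the same order as compute_pernodesize():

    unsigned long off = 0;
    off += PERCPU_PAGE_SIZE * early_nr_cpus_node(node); /* per-cpu copies           */
    off += node * L1_CACHE_BYTES;                       /* per-node skew vs. aliases */
    /* mem_data[node].pgdat     == __va(pernode + off) */
    off += L1_CACHE_ALIGN(sizeof(pg_data_t));
    /* mem_data[node].node_data == __va(pernode + off) */
    off += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
    /* PAGE_ALIGN(off) == compute_pernodesize(node) */

fill_pernode() walks these same offsets with a running pointer instead of recomputing them.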
254 | 254 | ||
255 | /** | 255 | /** |
256 | * free_node_bootmem - free bootmem allocator memory for use | 256 | * free_node_bootmem - free bootmem allocator memory for use |
257 | * @start: physical start of range | 257 | * @start: physical start of range |
258 | * @len: length of range | 258 | * @len: length of range |
259 | * @node: node where this range resides | 259 | * @node: node where this range resides |
260 | * | 260 | * |
261 | * Simply calls the bootmem allocator to free the specified range from | 261 | * Simply calls the bootmem allocator to free the specified range from |
262 | * the given pg_data_t's bdata struct. After this function has been called | 262 | * the given pg_data_t's bdata struct. After this function has been called |
263 | * for all the entries in the EFI memory map, the bootmem allocator will | 263 | * for all the entries in the EFI memory map, the bootmem allocator will |
264 | * be ready to service allocation requests. | 264 | * be ready to service allocation requests. |
265 | */ | 265 | */ |
266 | static int __init free_node_bootmem(unsigned long start, unsigned long len, | 266 | static int __init free_node_bootmem(unsigned long start, unsigned long len, |
267 | int node) | 267 | int node) |
268 | { | 268 | { |
269 | free_bootmem_node(mem_data[node].pgdat, start, len); | 269 | free_bootmem_node(mem_data[node].pgdat, start, len); |
270 | 270 | ||
271 | return 0; | 271 | return 0; |
272 | } | 272 | } |
273 | 273 | ||
274 | /** | 274 | /** |
275 | * reserve_pernode_space - reserve memory for per-node space | 275 | * reserve_pernode_space - reserve memory for per-node space |
276 | * | 276 | * |
277 | * Reserve the space used by the bootmem maps & per-node space in the boot | 277 | * Reserve the space used by the bootmem maps & per-node space in the boot |
278 | * allocator so that when we actually create the real mem maps we don't | 278 | * allocator so that when we actually create the real mem maps we don't |
279 | * use their memory. | 279 | * use their memory. |
280 | */ | 280 | */ |
281 | static void __init reserve_pernode_space(void) | 281 | static void __init reserve_pernode_space(void) |
282 | { | 282 | { |
283 | unsigned long base, size, pages; | 283 | unsigned long base, size, pages; |
284 | struct bootmem_data *bdp; | 284 | struct bootmem_data *bdp; |
285 | int node; | 285 | int node; |
286 | 286 | ||
287 | for_each_online_node(node) { | 287 | for_each_online_node(node) { |
288 | pg_data_t *pdp = mem_data[node].pgdat; | 288 | pg_data_t *pdp = mem_data[node].pgdat; |
289 | 289 | ||
290 | if (node_isset(node, memory_less_mask)) | 290 | if (node_isset(node, memory_less_mask)) |
291 | continue; | 291 | continue; |
292 | 292 | ||
293 | bdp = pdp->bdata; | 293 | bdp = pdp->bdata; |
294 | 294 | ||
295 | /* First the bootmem_map itself */ | 295 | /* First the bootmem_map itself */ |
296 | pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); | 296 | pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); |
297 | size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; | 297 | size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; |
298 | base = __pa(bdp->node_bootmem_map); | 298 | base = __pa(bdp->node_bootmem_map); |
299 | reserve_bootmem_node(pdp, base, size); | 299 | reserve_bootmem_node(pdp, base, size); |
300 | 300 | ||
301 | /* Now the per-node space */ | 301 | /* Now the per-node space */ |
302 | size = mem_data[node].pernode_size; | 302 | size = mem_data[node].pernode_size; |
303 | base = __pa(mem_data[node].pernode_addr); | 303 | base = __pa(mem_data[node].pernode_addr); |
304 | reserve_bootmem_node(pdp, base, size); | 304 | reserve_bootmem_node(pdp, base, size); |
305 | } | 305 | } |
306 | } | 306 | } |
307 | 307 | ||
308 | /** | 308 | /** |
309 | * initialize_pernode_data - fixup per-cpu & per-node pointers | 309 | * initialize_pernode_data - fixup per-cpu & per-node pointers |
310 | * | 310 | * |
311 | * Each node's per-node area has a copy of the global pg_data_t list, so | 311 | * Each node's per-node area has a copy of the global pg_data_t list, so |
312 | * we copy that to each node here, as well as setting the per-cpu pointer | 312 | * we copy that to each node here, as well as setting the per-cpu pointer |
313 | * to the local node data structure. The active_cpus field of the per-node | 313 | * to the local node data structure. The active_cpus field of the per-node |
314 | * structure gets set up by the platform_cpu_init() function later. | 314 | * structure gets set up by the platform_cpu_init() function later. |
315 | */ | 315 | */ |
316 | static void __init initialize_pernode_data(void) | 316 | static void __init initialize_pernode_data(void) |
317 | { | 317 | { |
318 | pg_data_t *pgdat_list[MAX_NUMNODES]; | 318 | pg_data_t *pgdat_list[MAX_NUMNODES]; |
319 | int cpu, node; | 319 | int cpu, node; |
320 | 320 | ||
321 | for_each_online_node(node) | 321 | for_each_online_node(node) |
322 | pgdat_list[node] = mem_data[node].pgdat; | 322 | pgdat_list[node] = mem_data[node].pgdat; |
323 | 323 | ||
324 | /* Copy the pg_data_t list to each node and init the node field */ | 324 | /* Copy the pg_data_t list to each node and init the node field */ |
325 | for_each_online_node(node) { | 325 | for_each_online_node(node) { |
326 | memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, | 326 | memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, |
327 | sizeof(pgdat_list)); | 327 | sizeof(pgdat_list)); |
328 | } | 328 | } |
329 | #ifdef CONFIG_SMP | 329 | #ifdef CONFIG_SMP |
330 | /* Set the node_data pointer for each per-cpu struct */ | 330 | /* Set the node_data pointer for each per-cpu struct */ |
331 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 331 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
332 | node = node_cpuid[cpu].nid; | 332 | node = node_cpuid[cpu].nid; |
333 | per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; | 333 | per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; |
334 | } | 334 | } |
335 | #else | 335 | #else |
336 | { | 336 | { |
337 | struct cpuinfo_ia64 *cpu0_cpu_info; | 337 | struct cpuinfo_ia64 *cpu0_cpu_info; |
338 | cpu = 0; | 338 | cpu = 0; |
339 | node = node_cpuid[cpu].nid; | 339 | node = node_cpuid[cpu].nid; |
340 | cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start + | 340 | cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start + |
341 | ((char *)&per_cpu__cpu_info - __per_cpu_start)); | 341 | ((char *)&per_cpu__cpu_info - __per_cpu_start)); |
342 | cpu0_cpu_info->node_data = mem_data[node].node_data; | 342 | cpu0_cpu_info->node_data = mem_data[node].node_data; |
343 | } | 343 | } |
344 | #endif /* CONFIG_SMP */ | 344 | #endif /* CONFIG_SMP */ |
345 | } | 345 | } |
346 | 346 | ||
347 | /** | 347 | /** |
348 | * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT | 348 | * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT |
349 | * node, but fall back to any other node when __alloc_bootmem_node fails | 349 | * node, but fall back to any other node when __alloc_bootmem_node fails |
350 | * for the best node. | 350 | * for the best node. |
351 | * @nid: node id | 351 | * @nid: node id |
352 | * @pernodesize: size of this node's pernode data | 352 | * @pernodesize: size of this node's pernode data |
353 | * @align: alignment to use for this node's pernode data | 353 | * @align: alignment to use for this node's pernode data |
354 | */ | 354 | */ |
355 | static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, | 355 | static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, |
356 | unsigned long align) | 356 | unsigned long align) |
357 | { | 357 | { |
358 | void *ptr = NULL; | 358 | void *ptr = NULL; |
359 | u8 best = 0xff; | 359 | u8 best = 0xff; |
360 | int bestnode = -1, node; | 360 | int bestnode = -1, node; |
361 | 361 | ||
362 | for_each_online_node(node) { | 362 | for_each_online_node(node) { |
363 | if (node_isset(node, memory_less_mask)) | 363 | if (node_isset(node, memory_less_mask)) |
364 | continue; | 364 | continue; |
365 | else if (node_distance(nid, node) < best) { | 365 | else if (node_distance(nid, node) < best) { |
366 | best = node_distance(nid, node); | 366 | best = node_distance(nid, node); |
367 | bestnode = node; | 367 | bestnode = node; |
368 | } | 368 | } |
369 | } | 369 | } |
370 | 370 | ||
371 | ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, | 371 | ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, |
372 | pernodesize, align, __pa(MAX_DMA_ADDRESS)); | 372 | pernodesize, align, __pa(MAX_DMA_ADDRESS)); |
373 | 373 | ||
374 | if (!ptr) | 374 | if (!ptr) |
375 | panic("NO memory for memory less node\n"); | 375 | panic("NO memory for memory less node\n"); |
376 | return ptr; | 376 | return ptr; |
377 | } | 377 | } |
378 | 378 | ||
379 | /** | 379 | /** |
380 | * pgdat_insert - insert the pgdat into global pgdat_list | 380 | * pgdat_insert - insert the pgdat into global pgdat_list |
381 | * @pgdat: the pgdat for a node. | 381 | * @pgdat: the pgdat for a node. |
382 | */ | 382 | */ |
383 | static void __init pgdat_insert(pg_data_t *pgdat) | 383 | static void __init pgdat_insert(pg_data_t *pgdat) |
384 | { | 384 | { |
385 | pg_data_t *prev = NULL, *next; | 385 | pg_data_t *prev = NULL, *next; |
386 | 386 | ||
387 | for_each_pgdat(next) | 387 | for_each_pgdat(next) |
388 | if (pgdat->node_id < next->node_id) | 388 | if (pgdat->node_id < next->node_id) |
389 | break; | 389 | break; |
390 | else | 390 | else |
391 | prev = next; | 391 | prev = next; |
392 | 392 | ||
393 | if (prev) { | 393 | if (prev) { |
394 | prev->pgdat_next = pgdat; | 394 | prev->pgdat_next = pgdat; |
395 | pgdat->pgdat_next = next; | 395 | pgdat->pgdat_next = next; |
396 | } else { | 396 | } else { |
397 | pgdat->pgdat_next = pgdat_list; | 397 | pgdat->pgdat_next = pgdat_list; |
398 | pgdat_list = pgdat; | 398 | pgdat_list = pgdat; |
399 | } | 399 | } |
400 | 400 | ||
401 | return; | 401 | return; |
402 | } | 402 | } |
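A quick worked example of the insertion above, with hypothetical node ids: inserting node 2 into a pgdat_list currently ordered 0 -> 1 -> 3 stops the scan at node 3, so prev is node 1 and the result is 0 -> 1 -> 2 -> 3. Inserting node 0 into 1 -> 3 leaves prev == NULL, so the new pgdat becomes the new head of pgdat_list.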
403 | 403 | ||
404 | /** | 404 | /** |
405 | * memory_less_nodes - allocate and initialize pernode information for | 405 | * memory_less_nodes - allocate and initialize pernode information for |
406 | * CPU-only nodes. | 406 | * CPU-only nodes. |
407 | */ | 407 | */ |
408 | static void __init memory_less_nodes(void) | 408 | static void __init memory_less_nodes(void) |
409 | { | 409 | { |
410 | unsigned long pernodesize; | 410 | unsigned long pernodesize; |
411 | void *pernode; | 411 | void *pernode; |
412 | int node; | 412 | int node; |
413 | 413 | ||
414 | for_each_node_mask(node, memory_less_mask) { | 414 | for_each_node_mask(node, memory_less_mask) { |
415 | pernodesize = compute_pernodesize(node); | 415 | pernodesize = compute_pernodesize(node); |
416 | pernode = memory_less_node_alloc(node, pernodesize, | 416 | pernode = memory_less_node_alloc(node, pernodesize, |
417 | (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024)); | 417 | (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024)); |
418 | fill_pernode(node, __pa(pernode), pernodesize); | 418 | fill_pernode(node, __pa(pernode), pernodesize); |
419 | } | 419 | } |
420 | 420 | ||
421 | return; | 421 | return; |
422 | } | 422 | } |
423 | 423 | ||
424 | #ifdef CONFIG_SPARSEMEM | 424 | #ifdef CONFIG_SPARSEMEM |
425 | /** | 425 | /** |
426 | * register_sparse_mem - notify SPARSEMEM that this memory range exists. | 426 | * register_sparse_mem - notify SPARSEMEM that this memory range exists. |
427 | * @start: physical start of range | 427 | * @start: physical start of range |
428 | * @end: physical end of range | 428 | * @end: physical end of range |
429 | * @arg: unused | 429 | * @arg: unused |
430 | * | 430 | * |
431 | * Simply calls SPARSEMEM to register memory section(s). | 431 | * Simply calls SPARSEMEM to register memory section(s). |
432 | */ | 432 | */ |
433 | static int __init register_sparse_mem(unsigned long start, unsigned long end, | 433 | static int __init register_sparse_mem(unsigned long start, unsigned long end, |
434 | void *arg) | 434 | void *arg) |
435 | { | 435 | { |
436 | int nid; | 436 | int nid; |
437 | 437 | ||
438 | start = __pa(start) >> PAGE_SHIFT; | 438 | start = __pa(start) >> PAGE_SHIFT; |
439 | end = __pa(end) >> PAGE_SHIFT; | 439 | end = __pa(end) >> PAGE_SHIFT; |
440 | nid = early_pfn_to_nid(start); | 440 | nid = early_pfn_to_nid(start); |
441 | memory_present(nid, start, end); | 441 | memory_present(nid, start, end); |
442 | 442 | ||
443 | return 0; | 443 | return 0; |
444 | } | 444 | } |
445 | 445 | ||
446 | static void __init arch_sparse_init(void) | 446 | static void __init arch_sparse_init(void) |
447 | { | 447 | { |
448 | efi_memmap_walk(register_sparse_mem, NULL); | 448 | efi_memmap_walk(register_sparse_mem, NULL); |
449 | sparse_init(); | 449 | sparse_init(); |
450 | } | 450 | } |
451 | #else | 451 | #else |
452 | #define arch_sparse_init() do {} while (0) | 452 | #define arch_sparse_init() do {} while (0) |
453 | #endif | 453 | #endif |
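register_sparse_mem() only converts each EFI range to page frame numbers before handing it to memory_present(); the bookkeeping itself lives in mm/sparse.c, outside this hunk. A rough, paraphrased sketch of the effect (the exact body in mm/sparse.c may differ; PAGES_PER_SECTION, pfn_to_section_nr() and SECTION_MARKED_PRESENT are SPARSEMEM names assumed from that file):

    /* Mark every section-sized piece of [start, end) present so that
     * sparse_init() can allocate a mem_map chunk for each such section. */
    for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
        mem_section[pfn_to_section_nr(pfn)].section_mem_map
                |= SECTION_MARKED_PRESENT;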
454 | 454 | ||
455 | /** | 455 | /** |
456 | * find_memory - walk the EFI memory map and set up the bootmem allocator | 456 | * find_memory - walk the EFI memory map and set up the bootmem allocator |
457 | * | 457 | * |
458 | * Called early in boot to set up the bootmem allocator, and to | 458 | * Called early in boot to set up the bootmem allocator, and to |
459 | * allocate the per-cpu and per-node structures. | 459 | * allocate the per-cpu and per-node structures. |
460 | */ | 460 | */ |
461 | void __init find_memory(void) | 461 | void __init find_memory(void) |
462 | { | 462 | { |
463 | int node; | 463 | int node; |
464 | 464 | ||
465 | reserve_memory(); | 465 | reserve_memory(); |
466 | 466 | ||
467 | if (num_online_nodes() == 0) { | 467 | if (num_online_nodes() == 0) { |
468 | printk(KERN_ERR "node info missing!\n"); | 468 | printk(KERN_ERR "node info missing!\n"); |
469 | node_set_online(0); | 469 | node_set_online(0); |
470 | } | 470 | } |
471 | 471 | ||
472 | nodes_or(memory_less_mask, memory_less_mask, node_online_map); | 472 | nodes_or(memory_less_mask, memory_less_mask, node_online_map); |
473 | min_low_pfn = -1; | 473 | min_low_pfn = -1; |
474 | max_low_pfn = 0; | 474 | max_low_pfn = 0; |
475 | 475 | ||
476 | /* These actually end up getting called by call_pernode_memory() */ | 476 | /* These actually end up getting called by call_pernode_memory() */ |
477 | efi_memmap_walk(filter_rsvd_memory, build_node_maps); | 477 | efi_memmap_walk(filter_rsvd_memory, build_node_maps); |
478 | efi_memmap_walk(filter_rsvd_memory, find_pernode_space); | 478 | efi_memmap_walk(filter_rsvd_memory, find_pernode_space); |
479 | 479 | ||
480 | for_each_online_node(node) | 480 | for_each_online_node(node) |
481 | if (mem_data[node].bootmem_data.node_low_pfn) { | 481 | if (mem_data[node].bootmem_data.node_low_pfn) { |
482 | node_clear(node, memory_less_mask); | 482 | node_clear(node, memory_less_mask); |
483 | mem_data[node].min_pfn = ~0UL; | 483 | mem_data[node].min_pfn = ~0UL; |
484 | } | 484 | } |
485 | /* | 485 | /* |
486 | * Initialize the boot memory maps in reverse order since that's | 486 | * Initialize the boot memory maps in reverse order since that's |
487 | * what the bootmem allocator expects | 487 | * what the bootmem allocator expects |
488 | */ | 488 | */ |
489 | for (node = MAX_NUMNODES - 1; node >= 0; node--) { | 489 | for (node = MAX_NUMNODES - 1; node >= 0; node--) { |
490 | unsigned long pernode, pernodesize, map; | 490 | unsigned long pernode, pernodesize, map; |
491 | struct bootmem_data *bdp; | 491 | struct bootmem_data *bdp; |
492 | 492 | ||
493 | if (!node_online(node)) | 493 | if (!node_online(node)) |
494 | continue; | 494 | continue; |
495 | else if (node_isset(node, memory_less_mask)) | 495 | else if (node_isset(node, memory_less_mask)) |
496 | continue; | 496 | continue; |
497 | 497 | ||
498 | bdp = &mem_data[node].bootmem_data; | 498 | bdp = &mem_data[node].bootmem_data; |
499 | pernode = mem_data[node].pernode_addr; | 499 | pernode = mem_data[node].pernode_addr; |
500 | pernodesize = mem_data[node].pernode_size; | 500 | pernodesize = mem_data[node].pernode_size; |
501 | map = pernode + pernodesize; | 501 | map = pernode + pernodesize; |
502 | 502 | ||
503 | init_bootmem_node(mem_data[node].pgdat, | 503 | init_bootmem_node(mem_data[node].pgdat, |
504 | map>>PAGE_SHIFT, | 504 | map>>PAGE_SHIFT, |
505 | bdp->node_boot_start>>PAGE_SHIFT, | 505 | bdp->node_boot_start>>PAGE_SHIFT, |
506 | bdp->node_low_pfn); | 506 | bdp->node_low_pfn); |
507 | } | 507 | } |
508 | 508 | ||
509 | efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); | 509 | efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); |
510 | 510 | ||
511 | reserve_pernode_space(); | 511 | reserve_pernode_space(); |
512 | memory_less_nodes(); | 512 | memory_less_nodes(); |
513 | initialize_pernode_data(); | 513 | initialize_pernode_data(); |
514 | 514 | ||
515 | max_pfn = max_low_pfn; | 515 | max_pfn = max_low_pfn; |
516 | 516 | ||
517 | find_initrd(); | 517 | find_initrd(); |
518 | } | 518 | } |
519 | 519 | ||
520 | #ifdef CONFIG_SMP | 520 | #ifdef CONFIG_SMP |
521 | /** | 521 | /** |
522 | * per_cpu_init - set up per-cpu variables | 522 | * per_cpu_init - set up per-cpu variables |
523 | * | 523 | * |
524 | * find_pernode_space() does most of this already; we just need to set | 524 | * find_pernode_space() does most of this already; we just need to set |
525 | * local_per_cpu_offset. | 525 | * local_per_cpu_offset. |
526 | */ | 526 | */ |
527 | void *per_cpu_init(void) | 527 | void *per_cpu_init(void) |
528 | { | 528 | { |
529 | int cpu; | 529 | int cpu; |
530 | 530 | ||
531 | if (smp_processor_id() != 0) | 531 | if (smp_processor_id() != 0) |
532 | return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; | 532 | return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; |
533 | 533 | ||
534 | for (cpu = 0; cpu < NR_CPUS; cpu++) | 534 | for (cpu = 0; cpu < NR_CPUS; cpu++) |
535 | per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; | 535 | per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; |
536 | 536 | ||
537 | return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; | 537 | return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; |
538 | } | 538 | } |
539 | #endif /* CONFIG_SMP */ | 539 | #endif /* CONFIG_SMP */ |
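With __per_cpu_offset[] and local_per_cpu_offset filled in as above, reaching a CPU's private copy of a per-cpu variable is a single addition. A minimal sketch (var_addr and n are hypothetical placeholders; the real accessors are the per_cpu() macro family rather than open-coded arithmetic):

    /* var_addr: a per-cpu variable's link address, i.e. somewhere in
     * [__per_cpu_start, __per_cpu_end).  n: a CPU number whose offset
     * was set up by per_cpu_node_setup(). */
    void *cpu_copy = (char *)var_addr + __per_cpu_offset[n];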
540 | 540 | ||
541 | /** | 541 | /** |
542 | * show_mem - give short summary of memory stats | 542 | * show_mem - give short summary of memory stats |
543 | * | 543 | * |
544 | * Shows a simple page count of reserved and used pages in the system. | 544 | * Shows a simple page count of reserved and used pages in the system. |
545 | * For discontig machines, it does this on a per-pgdat basis. | 545 | * For discontig machines, it does this on a per-pgdat basis. |
546 | */ | 546 | */ |
547 | void show_mem(void) | 547 | void show_mem(void) |
548 | { | 548 | { |
549 | int i, total_reserved = 0; | 549 | int i, total_reserved = 0; |
550 | int total_shared = 0, total_cached = 0; | 550 | int total_shared = 0, total_cached = 0; |
551 | unsigned long total_present = 0; | 551 | unsigned long total_present = 0; |
552 | pg_data_t *pgdat; | 552 | pg_data_t *pgdat; |
553 | 553 | ||
554 | printk("Mem-info:\n"); | 554 | printk("Mem-info:\n"); |
555 | show_free_areas(); | 555 | show_free_areas(); |
556 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 556 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
557 | for_each_pgdat(pgdat) { | 557 | for_each_pgdat(pgdat) { |
558 | unsigned long present = pgdat->node_present_pages; | 558 | unsigned long present; |
559 | unsigned long flags; | ||
559 | int shared = 0, cached = 0, reserved = 0; | 560 | int shared = 0, cached = 0, reserved = 0; |
561 | |||
560 | printk("Node ID: %d\n", pgdat->node_id); | 562 | printk("Node ID: %d\n", pgdat->node_id); |
563 | pgdat_resize_lock(pgdat, &flags); | ||
564 | present = pgdat->node_present_pages; | ||
561 | for(i = 0; i < pgdat->node_spanned_pages; i++) { | 565 | for(i = 0; i < pgdat->node_spanned_pages; i++) { |
562 | struct page *page; | 566 | struct page *page; |
563 | if (pfn_valid(pgdat->node_start_pfn + i)) | 567 | if (pfn_valid(pgdat->node_start_pfn + i)) |
564 | page = pfn_to_page(pgdat->node_start_pfn + i); | 568 | page = pfn_to_page(pgdat->node_start_pfn + i); |
565 | else | 569 | else |
566 | continue; | 570 | continue; |
567 | if (PageReserved(page)) | 571 | if (PageReserved(page)) |
568 | reserved++; | 572 | reserved++; |
569 | else if (PageSwapCache(page)) | 573 | else if (PageSwapCache(page)) |
570 | cached++; | 574 | cached++; |
571 | else if (page_count(page)) | 575 | else if (page_count(page)) |
572 | shared += page_count(page)-1; | 576 | shared += page_count(page)-1; |
573 | } | 577 | } |
578 | pgdat_resize_unlock(pgdat, &flags); | ||
574 | total_present += present; | 579 | total_present += present; |
575 | total_reserved += reserved; | 580 | total_reserved += reserved; |
576 | total_cached += cached; | 581 | total_cached += cached; |
577 | total_shared += shared; | 582 | total_shared += shared; |
578 | printk("\t%ld pages of RAM\n", present); | 583 | printk("\t%ld pages of RAM\n", present); |
579 | printk("\t%d reserved pages\n", reserved); | 584 | printk("\t%d reserved pages\n", reserved); |
580 | printk("\t%d pages shared\n", shared); | 585 | printk("\t%d pages shared\n", shared); |
581 | printk("\t%d pages swap cached\n", cached); | 586 | printk("\t%d pages swap cached\n", cached); |
582 | } | 587 | } |
583 | printk("%ld pages of RAM\n", total_present); | 588 | printk("%ld pages of RAM\n", total_present); |
584 | printk("%d reserved pages\n", total_reserved); | 589 | printk("%d reserved pages\n", total_reserved); |
585 | printk("%d pages shared\n", total_shared); | 590 | printk("%d pages shared\n", total_shared); |
586 | printk("%d pages swap cached\n", total_cached); | 591 | printk("%d pages swap cached\n", total_cached); |
587 | printk("Total of %ld pages in page table cache\n", | 592 | printk("Total of %ld pages in page table cache\n", |
588 | pgtable_quicklist_total_size()); | 593 | pgtable_quicklist_total_size()); |
589 | printk("%d free buffer pages\n", nr_free_buffer_pages()); | 594 | printk("%d free buffer pages\n", nr_free_buffer_pages()); |
590 | } | 595 | } |
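The pgdat_resize_lock()/pgdat_resize_unlock() pair now bracketing the pfn_valid() scan is introduced elsewhere in this commit, not in this hunk. Judging from the flags-pointer calling convention visible above, the wrappers presumably reduce to something like the following on hotplug-capable configurations (a sketch, not the hunk that defines them):

    static inline void
    pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
    {
            spin_lock_irqsave(&pgdat->node_size_lock, *flags);
    }

    static inline void
    pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
    {
            spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
    }

Hence each converted caller declares a local unsigned long flags to receive the saved interrupt state.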
591 | 596 | ||
592 | /** | 597 | /** |
593 | * call_pernode_memory - use SRAT to call callback functions with node info | 598 | * call_pernode_memory - use SRAT to call callback functions with node info |
594 | * @start: physical start of range | 599 | * @start: physical start of range |
595 | * @len: length of range | 600 | * @len: length of range |
596 | * @arg: function to call for each range | 601 | * @arg: function to call for each range |
597 | * | 602 | * |
598 | * efi_memmap_walk() knows nothing about the layout of memory across nodes. Find | 603 | * efi_memmap_walk() knows nothing about the layout of memory across nodes. Find |
599 | * out to which node a block of memory belongs. Ignore memory that we cannot | 604 | * out to which node a block of memory belongs. Ignore memory that we cannot |
600 | * identify, and split blocks that run across multiple nodes. | 605 | * identify, and split blocks that run across multiple nodes. |
601 | * | 606 | * |
602 | * Take this opportunity to round the start address up and the end address | 607 | * Take this opportunity to round the start address up and the end address |
603 | * down to page boundaries. | 608 | * down to page boundaries. |
604 | */ | 609 | */ |
605 | void call_pernode_memory(unsigned long start, unsigned long len, void *arg) | 610 | void call_pernode_memory(unsigned long start, unsigned long len, void *arg) |
606 | { | 611 | { |
607 | unsigned long rs, re, end = start + len; | 612 | unsigned long rs, re, end = start + len; |
608 | void (*func)(unsigned long, unsigned long, int); | 613 | void (*func)(unsigned long, unsigned long, int); |
609 | int i; | 614 | int i; |
610 | 615 | ||
611 | start = PAGE_ALIGN(start); | 616 | start = PAGE_ALIGN(start); |
612 | end &= PAGE_MASK; | 617 | end &= PAGE_MASK; |
613 | if (start >= end) | 618 | if (start >= end) |
614 | return; | 619 | return; |
615 | 620 | ||
616 | func = arg; | 621 | func = arg; |
617 | 622 | ||
618 | if (!num_node_memblks) { | 623 | if (!num_node_memblks) { |
619 | /* No SRAT table, so assume one node (node 0) */ | 624 | /* No SRAT table, so assume one node (node 0) */ |
620 | if (start < end) | 625 | if (start < end) |
621 | (*func)(start, end - start, 0); | 626 | (*func)(start, end - start, 0); |
622 | return; | 627 | return; |
623 | } | 628 | } |
624 | 629 | ||
625 | for (i = 0; i < num_node_memblks; i++) { | 630 | for (i = 0; i < num_node_memblks; i++) { |
626 | rs = max(start, node_memblk[i].start_paddr); | 631 | rs = max(start, node_memblk[i].start_paddr); |
627 | re = min(end, node_memblk[i].start_paddr + | 632 | re = min(end, node_memblk[i].start_paddr + |
628 | node_memblk[i].size); | 633 | node_memblk[i].size); |
629 | 634 | ||
630 | if (rs < re) | 635 | if (rs < re) |
631 | (*func)(rs, re - rs, node_memblk[i].nid); | 636 | (*func)(rs, re - rs, node_memblk[i].nid); |
632 | 637 | ||
633 | if (re == end) | 638 | if (re == end) |
634 | break; | 639 | break; |
635 | } | 640 | } |
636 | } | 641 | } |
637 | 642 | ||
638 | /** | 643 | /** |
639 | * count_node_pages - callback to build per-node memory info structures | 644 | * count_node_pages - callback to build per-node memory info structures |
640 | * @start: physical start of range | 645 | * @start: physical start of range |
641 | * @len: length of range | 646 | * @len: length of range |
642 | * @node: node where this range resides | 647 | * @node: node where this range resides |
643 | * | 648 | * |
644 | * Each node has its own number of physical pages, DMAable pages, start, and | 649 | * Each node has its own number of physical pages, DMAable pages, start, and |
645 | * end page frame number. This routine will be called by call_pernode_memory() | 650 | * end page frame number. This routine will be called by call_pernode_memory() |
646 | * for each piece of usable memory and will set up these values for each node. | 651 | * for each piece of usable memory and will set up these values for each node. |
647 | * Very similar to build_maps(). | 652 | * Very similar to build_maps(). |
648 | */ | 653 | */ |
649 | static __init int count_node_pages(unsigned long start, unsigned long len, int node) | 654 | static __init int count_node_pages(unsigned long start, unsigned long len, int node) |
650 | { | 655 | { |
651 | unsigned long end = start + len; | 656 | unsigned long end = start + len; |
652 | 657 | ||
653 | mem_data[node].num_physpages += len >> PAGE_SHIFT; | 658 | mem_data[node].num_physpages += len >> PAGE_SHIFT; |
654 | if (start <= __pa(MAX_DMA_ADDRESS)) | 659 | if (start <= __pa(MAX_DMA_ADDRESS)) |
655 | mem_data[node].num_dma_physpages += | 660 | mem_data[node].num_dma_physpages += |
656 | (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; | 661 | (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; |
657 | start = GRANULEROUNDDOWN(start); | 662 | start = GRANULEROUNDDOWN(start); |
658 | start = ORDERROUNDDOWN(start); | 663 | start = ORDERROUNDDOWN(start); |
659 | end = GRANULEROUNDUP(end); | 664 | end = GRANULEROUNDUP(end); |
660 | mem_data[node].max_pfn = max(mem_data[node].max_pfn, | 665 | mem_data[node].max_pfn = max(mem_data[node].max_pfn, |
661 | end >> PAGE_SHIFT); | 666 | end >> PAGE_SHIFT); |
662 | mem_data[node].min_pfn = min(mem_data[node].min_pfn, | 667 | mem_data[node].min_pfn = min(mem_data[node].min_pfn, |
663 | start >> PAGE_SHIFT); | 668 | start >> PAGE_SHIFT); |
664 | 669 | ||
665 | return 0; | 670 | return 0; |
666 | } | 671 | } |
667 | 672 | ||
668 | /** | 673 | /** |
669 | * paging_init - setup page tables | 674 | * paging_init - setup page tables |
670 | * | 675 | * |
671 | * paging_init() sets up the page tables for each node of the system and frees | 676 | * paging_init() sets up the page tables for each node of the system and frees |
672 | * the bootmem allocator memory for general use. | 677 | * the bootmem allocator memory for general use. |
673 | */ | 678 | */ |
674 | void __init paging_init(void) | 679 | void __init paging_init(void) |
675 | { | 680 | { |
676 | unsigned long max_dma; | 681 | unsigned long max_dma; |
677 | unsigned long zones_size[MAX_NR_ZONES]; | 682 | unsigned long zones_size[MAX_NR_ZONES]; |
678 | unsigned long zholes_size[MAX_NR_ZONES]; | 683 | unsigned long zholes_size[MAX_NR_ZONES]; |
679 | unsigned long pfn_offset = 0; | 684 | unsigned long pfn_offset = 0; |
680 | int node; | 685 | int node; |
681 | 686 | ||
682 | max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; | 687 | max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
683 | 688 | ||
684 | arch_sparse_init(); | 689 | arch_sparse_init(); |
685 | 690 | ||
686 | efi_memmap_walk(filter_rsvd_memory, count_node_pages); | 691 | efi_memmap_walk(filter_rsvd_memory, count_node_pages); |
687 | 692 | ||
688 | #ifdef CONFIG_VIRTUAL_MEM_MAP | 693 | #ifdef CONFIG_VIRTUAL_MEM_MAP |
689 | vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); | 694 | vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); |
690 | vmem_map = (struct page *) vmalloc_end; | 695 | vmem_map = (struct page *) vmalloc_end; |
691 | efi_memmap_walk(create_mem_map_page_table, NULL); | 696 | efi_memmap_walk(create_mem_map_page_table, NULL); |
692 | printk("Virtual mem_map starts at 0x%p\n", vmem_map); | 697 | printk("Virtual mem_map starts at 0x%p\n", vmem_map); |
693 | #endif | 698 | #endif |
694 | 699 | ||
695 | for_each_online_node(node) { | 700 | for_each_online_node(node) { |
696 | memset(zones_size, 0, sizeof(zones_size)); | 701 | memset(zones_size, 0, sizeof(zones_size)); |
697 | memset(zholes_size, 0, sizeof(zholes_size)); | 702 | memset(zholes_size, 0, sizeof(zholes_size)); |
698 | 703 | ||
699 | num_physpages += mem_data[node].num_physpages; | 704 | num_physpages += mem_data[node].num_physpages; |
700 | 705 | ||
701 | if (mem_data[node].min_pfn >= max_dma) { | 706 | if (mem_data[node].min_pfn >= max_dma) { |
702 | /* All of this node's memory is above ZONE_DMA */ | 707 | /* All of this node's memory is above ZONE_DMA */ |
703 | zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - | 708 | zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - |
704 | mem_data[node].min_pfn; | 709 | mem_data[node].min_pfn; |
705 | zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - | 710 | zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - |
706 | mem_data[node].min_pfn - | 711 | mem_data[node].min_pfn - |
707 | mem_data[node].num_physpages; | 712 | mem_data[node].num_physpages; |
708 | } else if (mem_data[node].max_pfn < max_dma) { | 713 | } else if (mem_data[node].max_pfn < max_dma) { |
709 | /* All of this node's memory is in ZONE_DMA */ | 714 | /* All of this node's memory is in ZONE_DMA */ |
710 | zones_size[ZONE_DMA] = mem_data[node].max_pfn - | 715 | zones_size[ZONE_DMA] = mem_data[node].max_pfn - |
711 | mem_data[node].min_pfn; | 716 | mem_data[node].min_pfn; |
712 | zholes_size[ZONE_DMA] = mem_data[node].max_pfn - | 717 | zholes_size[ZONE_DMA] = mem_data[node].max_pfn - |
713 | mem_data[node].min_pfn - | 718 | mem_data[node].min_pfn - |
714 | mem_data[node].num_dma_physpages; | 719 | mem_data[node].num_dma_physpages; |
715 | } else { | 720 | } else { |
716 | /* This node has memory in both zones */ | 721 | /* This node has memory in both zones */ |
717 | zones_size[ZONE_DMA] = max_dma - | 722 | zones_size[ZONE_DMA] = max_dma - |
718 | mem_data[node].min_pfn; | 723 | mem_data[node].min_pfn; |
719 | zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - | 724 | zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - |
720 | mem_data[node].num_dma_physpages; | 725 | mem_data[node].num_dma_physpages; |
721 | zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - | 726 | zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - |
722 | max_dma; | 727 | max_dma; |
723 | zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - | 728 | zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - |
724 | (mem_data[node].num_physpages - | 729 | (mem_data[node].num_physpages - |
725 | mem_data[node].num_dma_physpages); | 730 | mem_data[node].num_dma_physpages); |
726 | } | 731 | } |
727 | 732 | ||
728 | pfn_offset = mem_data[node].min_pfn; | 733 | pfn_offset = mem_data[node].min_pfn; |
729 | 734 | ||
730 | #ifdef CONFIG_VIRTUAL_MEM_MAP | 735 | #ifdef CONFIG_VIRTUAL_MEM_MAP |
731 | NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; | 736 | NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; |
732 | #endif | 737 | #endif |
733 | free_area_init_node(node, NODE_DATA(node), zones_size, | 738 | free_area_init_node(node, NODE_DATA(node), zones_size, |
734 | pfn_offset, zholes_size); | 739 | pfn_offset, zholes_size); |
735 | } | 740 | } |
736 | 741 | ||
737 | /* | 742 | /* |
738 | * Make memory-less nodes members of the set of known nodes. | 743 | * Make memory-less nodes members of the set of known nodes. |
739 | */ | 744 | */ |
740 | for_each_node_mask(node, memory_less_mask) | 745 | for_each_node_mask(node, memory_less_mask) |
741 | pgdat_insert(mem_data[node].pgdat); | 746 | pgdat_insert(mem_data[node].pgdat); |
742 | 747 | ||
743 | zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); | 748 | zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); |
744 | } | 749 | } |
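The three-way zone split above is easiest to check with concrete numbers. A made-up node straddling max_dma (all values are hypothetical PFN counts, not taken from any real machine):

    /* min_pfn = 0x0800, max_dma = 0x1000, max_pfn = 0x2000,
     * num_physpages = 0x1700, num_dma_physpages = 0x0700.
     * The "memory in both zones" branch then yields:
     *   zones_size[ZONE_DMA]     = 0x1000 - 0x0800            = 0x0800
     *   zholes_size[ZONE_DMA]    = 0x0800 - 0x0700            = 0x0100
     *   zones_size[ZONE_NORMAL]  = 0x2000 - 0x1000            = 0x1000
     *   zholes_size[ZONE_NORMAL] = 0x1000 - (0x1700 - 0x0700) = 0x0000
     * i.e. each zone's holes are whatever its spanned range contains
     * beyond the pages actually counted for that zone. */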
745 | 750 |
arch/m32r/mm/init.c
1 | /* | 1 | /* |
2 | * linux/arch/m32r/mm/init.c | 2 | * linux/arch/m32r/mm/init.c |
3 | * | 3 | * |
4 | * Copyright (c) 2001, 2002 Hitoshi Yamamoto | 4 | * Copyright (c) 2001, 2002 Hitoshi Yamamoto |
5 | * | 5 | * |
6 | * Some code taken from sh version. | 6 | * Some code taken from sh version. |
7 | * Copyright (C) 1999 Niibe Yutaka | 7 | * Copyright (C) 1999 Niibe Yutaka |
8 | * Based on linux/arch/i386/mm/init.c: | 8 | * Based on linux/arch/i386/mm/init.c: |
9 | * Copyright (C) 1995 Linus Torvalds | 9 | * Copyright (C) 1995 Linus Torvalds |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/bootmem.h> | 16 | #include <linux/bootmem.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
19 | #include <linux/bitops.h> | 19 | #include <linux/bitops.h> |
20 | #include <linux/nodemask.h> | 20 | #include <linux/nodemask.h> |
21 | #include <asm/types.h> | 21 | #include <asm/types.h> |
22 | #include <asm/processor.h> | 22 | #include <asm/processor.h> |
23 | #include <asm/page.h> | 23 | #include <asm/page.h> |
24 | #include <asm/pgtable.h> | 24 | #include <asm/pgtable.h> |
25 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
26 | #include <asm/mmu_context.h> | 26 | #include <asm/mmu_context.h> |
27 | #include <asm/setup.h> | 27 | #include <asm/setup.h> |
28 | #include <asm/tlb.h> | 28 | #include <asm/tlb.h> |
29 | 29 | ||
30 | /* References to section boundaries */ | 30 | /* References to section boundaries */ |
31 | extern char _text, _etext, _edata; | 31 | extern char _text, _etext, _edata; |
32 | extern char __init_begin, __init_end; | 32 | extern char __init_begin, __init_end; |
33 | 33 | ||
34 | pgd_t swapper_pg_dir[1024]; | 34 | pgd_t swapper_pg_dir[1024]; |
35 | 35 | ||
36 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 36 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
37 | 37 | ||
38 | void show_mem(void) | 38 | void show_mem(void) |
39 | { | 39 | { |
40 | int total = 0, reserved = 0; | 40 | int total = 0, reserved = 0; |
41 | int shared = 0, cached = 0; | 41 | int shared = 0, cached = 0; |
42 | int highmem = 0; | 42 | int highmem = 0; |
43 | struct page *page; | 43 | struct page *page; |
44 | pg_data_t *pgdat; | 44 | pg_data_t *pgdat; |
45 | unsigned long i; | 45 | unsigned long i; |
46 | 46 | ||
47 | printk("Mem-info:\n"); | 47 | printk("Mem-info:\n"); |
48 | show_free_areas(); | 48 | show_free_areas(); |
49 | printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); | 49 | printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); |
50 | for_each_pgdat(pgdat) { | 50 | for_each_pgdat(pgdat) { |
51 | unsigned long flags; | ||
52 | pgdat_resize_lock(pgdat, &flags); | ||
51 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | 53 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { |
52 | page = pgdat_page_nr(pgdat, i); | 54 | page = pgdat_page_nr(pgdat, i); |
53 | total++; | 55 | total++; |
54 | if (PageHighMem(page)) | 56 | if (PageHighMem(page)) |
55 | highmem++; | 57 | highmem++; |
56 | if (PageReserved(page)) | 58 | if (PageReserved(page)) |
57 | reserved++; | 59 | reserved++; |
58 | else if (PageSwapCache(page)) | 60 | else if (PageSwapCache(page)) |
59 | cached++; | 61 | cached++; |
60 | else if (page_count(page)) | 62 | else if (page_count(page)) |
61 | shared += page_count(page) - 1; | 63 | shared += page_count(page) - 1; |
62 | } | 64 | } |
65 | pgdat_resize_unlock(pgdat, &flags); | ||
63 | } | 66 | } |
64 | printk("%d pages of RAM\n", total); | 67 | printk("%d pages of RAM\n", total); |
65 | printk("%d pages of HIGHMEM\n",highmem); | 68 | printk("%d pages of HIGHMEM\n",highmem); |
66 | printk("%d reserved pages\n",reserved); | 69 | printk("%d reserved pages\n",reserved); |
67 | printk("%d pages shared\n",shared); | 70 | printk("%d pages shared\n",shared); |
68 | printk("%d pages swap cached\n",cached); | 71 | printk("%d pages swap cached\n",cached); |
69 | } | 72 | } |
70 | 73 | ||
71 | /* | 74 | /* |
72 | * Cache of MMU context last used. | 75 | * Cache of MMU context last used. |
73 | */ | 76 | */ |
74 | #ifndef CONFIG_SMP | 77 | #ifndef CONFIG_SMP |
75 | unsigned long mmu_context_cache_dat; | 78 | unsigned long mmu_context_cache_dat; |
76 | #else | 79 | #else |
77 | unsigned long mmu_context_cache_dat[NR_CPUS]; | 80 | unsigned long mmu_context_cache_dat[NR_CPUS]; |
78 | #endif | 81 | #endif |
79 | static unsigned long hole_pages; | 82 | static unsigned long hole_pages; |
80 | 83 | ||
81 | /* | 84 | /* |
82 | * function prototype | 85 | * function prototype |
83 | */ | 86 | */ |
84 | void __init paging_init(void); | 87 | void __init paging_init(void); |
85 | void __init mem_init(void); | 88 | void __init mem_init(void); |
86 | void free_initmem(void); | 89 | void free_initmem(void); |
87 | #ifdef CONFIG_BLK_DEV_INITRD | 90 | #ifdef CONFIG_BLK_DEV_INITRD |
88 | void free_initrd_mem(unsigned long, unsigned long); | 91 | void free_initrd_mem(unsigned long, unsigned long); |
89 | #endif | 92 | #endif |
90 | 93 | ||
91 | /* It'd be good if these lines were in the standard header file. */ | 94 | /* It'd be good if these lines were in the standard header file. */ |
92 | #define START_PFN(nid) \ | 95 | #define START_PFN(nid) \ |
93 | (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT) | 96 | (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT) |
94 | #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn) | 97 | #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn) |
95 | 98 | ||
96 | #ifndef CONFIG_DISCONTIGMEM | 99 | #ifndef CONFIG_DISCONTIGMEM |
97 | unsigned long __init zone_sizes_init(void) | 100 | unsigned long __init zone_sizes_init(void) |
98 | { | 101 | { |
99 | unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; | 102 | unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; |
100 | unsigned long max_dma; | 103 | unsigned long max_dma; |
101 | unsigned long low; | 104 | unsigned long low; |
102 | unsigned long start_pfn; | 105 | unsigned long start_pfn; |
103 | 106 | ||
104 | #ifdef CONFIG_MMU | 107 | #ifdef CONFIG_MMU |
105 | start_pfn = START_PFN(0); | 108 | start_pfn = START_PFN(0); |
106 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | 109 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
107 | low = MAX_LOW_PFN(0); | 110 | low = MAX_LOW_PFN(0); |
108 | 111 | ||
109 | if (low < max_dma){ | 112 | if (low < max_dma){ |
110 | zones_size[ZONE_DMA] = low - start_pfn; | 113 | zones_size[ZONE_DMA] = low - start_pfn; |
111 | zones_size[ZONE_NORMAL] = 0; | 114 | zones_size[ZONE_NORMAL] = 0; |
112 | } else { | 115 | } else { |
113 | zones_size[ZONE_DMA] = low - start_pfn; | 116 | zones_size[ZONE_DMA] = low - start_pfn; |
114 | zones_size[ZONE_NORMAL] = low - max_dma; | 117 | zones_size[ZONE_NORMAL] = low - max_dma; |
115 | } | 118 | } |
116 | #else | 119 | #else |
117 | zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; | 120 | zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; |
118 | zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT; | 121 | zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT; |
119 | start_pfn = __MEMORY_START >> PAGE_SHIFT; | 122 | start_pfn = __MEMORY_START >> PAGE_SHIFT; |
120 | #endif /* CONFIG_MMU */ | 123 | #endif /* CONFIG_MMU */ |
121 | 124 | ||
122 | free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0); | 125 | free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0); |
123 | 126 | ||
124 | return 0; | 127 | return 0; |
125 | } | 128 | } |
126 | #else /* CONFIG_DISCONTIGMEM */ | 129 | #else /* CONFIG_DISCONTIGMEM */ |
127 | extern unsigned long zone_sizes_init(void); | 130 | extern unsigned long zone_sizes_init(void); |
128 | #endif /* CONFIG_DISCONTIGMEM */ | 131 | #endif /* CONFIG_DISCONTIGMEM */ |
129 | 132 | ||
130 | /*======================================================================* | 133 | /*======================================================================* |
131 | * paging_init() : sets up the page tables | 134 | * paging_init() : sets up the page tables |
132 | *======================================================================*/ | 135 | *======================================================================*/ |
133 | void __init paging_init(void) | 136 | void __init paging_init(void) |
134 | { | 137 | { |
135 | #ifdef CONFIG_MMU | 138 | #ifdef CONFIG_MMU |
136 | int i; | 139 | int i; |
137 | pgd_t *pg_dir; | 140 | pgd_t *pg_dir; |
138 | 141 | ||
139 | /* We don't need a kernel mapping, as the hardware supports that. */ | 142 | /* We don't need a kernel mapping, as the hardware supports that. */ |
140 | pg_dir = swapper_pg_dir; | 143 | pg_dir = swapper_pg_dir; |
141 | 144 | ||
142 | for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++) | 145 | for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++) |
143 | pgd_val(pg_dir[i]) = 0; | 146 | pgd_val(pg_dir[i]) = 0; |
144 | #endif /* CONFIG_MMU */ | 147 | #endif /* CONFIG_MMU */ |
145 | hole_pages = zone_sizes_init(); | 148 | hole_pages = zone_sizes_init(); |
146 | } | 149 | } |
147 | 150 | ||
148 | int __init reservedpages_count(void) | 151 | int __init reservedpages_count(void) |
149 | { | 152 | { |
150 | int reservedpages, nid, i; | 153 | int reservedpages, nid, i; |
151 | 154 | ||
152 | reservedpages = 0; | 155 | reservedpages = 0; |
153 | for_each_online_node(nid) | 156 | for_each_online_node(nid) { |
157 | unsigned long flags; | ||
158 | pgdat_resize_lock(NODE_DATA(nid), &flags); | ||
154 | for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) | 159 | for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) |
155 | if (PageReserved(nid_page_nr(nid, i))) | 160 | if (PageReserved(nid_page_nr(nid, i))) |
156 | reservedpages++; | 161 | reservedpages++; |
162 | pgdat_resize_unlock(NODE_DATA(nid), &flags); | ||
163 | } | ||
157 | 164 | ||
158 | return reservedpages; | 165 | return reservedpages; |
159 | } | 166 | } |
160 | 167 | ||
161 | /*======================================================================* | 168 | /*======================================================================* |
162 | * mem_init() : | 169 | * mem_init() : |
163 | * orig : arch/sh/mm/init.c | 170 | * orig : arch/sh/mm/init.c |
164 | *======================================================================*/ | 171 | *======================================================================*/ |
165 | void __init mem_init(void) | 172 | void __init mem_init(void) |
166 | { | 173 | { |
167 | int codesize, reservedpages, datasize, initsize; | 174 | int codesize, reservedpages, datasize, initsize; |
168 | int nid; | 175 | int nid; |
169 | #ifndef CONFIG_MMU | 176 | #ifndef CONFIG_MMU |
170 | extern unsigned long memory_end; | 177 | extern unsigned long memory_end; |
171 | #endif | 178 | #endif |
172 | 179 | ||
173 | num_physpages = 0; | 180 | num_physpages = 0; |
174 | for_each_online_node(nid) | 181 | for_each_online_node(nid) |
175 | num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1; | 182 | num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1; |
176 | 183 | ||
177 | num_physpages -= hole_pages; | 184 | num_physpages -= hole_pages; |
178 | 185 | ||
179 | #ifndef CONFIG_DISCONTIGMEM | 186 | #ifndef CONFIG_DISCONTIGMEM |
180 | max_mapnr = num_physpages; | 187 | max_mapnr = num_physpages; |
181 | #endif /* CONFIG_DISCONTIGMEM */ | 188 | #endif /* CONFIG_DISCONTIGMEM */ |
182 | 189 | ||
183 | #ifdef CONFIG_MMU | 190 | #ifdef CONFIG_MMU |
184 | high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0))); | 191 | high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0))); |
185 | #else | 192 | #else |
186 | high_memory = (void *)(memory_end & PAGE_MASK); | 193 | high_memory = (void *)(memory_end & PAGE_MASK); |
187 | #endif /* CONFIG_MMU */ | 194 | #endif /* CONFIG_MMU */ |
188 | 195 | ||
189 | /* clear the zero-page */ | 196 | /* clear the zero-page */ |
190 | memset(empty_zero_page, 0, PAGE_SIZE); | 197 | memset(empty_zero_page, 0, PAGE_SIZE); |
191 | 198 | ||
192 | /* this will put all low memory onto the freelists */ | 199 | /* this will put all low memory onto the freelists */ |
193 | for_each_online_node(nid) | 200 | for_each_online_node(nid) |
194 | totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); | 201 | totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); |
195 | 202 | ||
196 | reservedpages = reservedpages_count() - hole_pages; | 203 | reservedpages = reservedpages_count() - hole_pages; |
197 | codesize = (unsigned long) &_etext - (unsigned long)&_text; | 204 | codesize = (unsigned long) &_etext - (unsigned long)&_text; |
198 | datasize = (unsigned long) &_edata - (unsigned long)&_etext; | 205 | datasize = (unsigned long) &_edata - (unsigned long)&_etext; |
199 | initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin; | 206 | initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin; |
200 | 207 | ||
201 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " | 208 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " |
202 | "%dk reserved, %dk data, %dk init)\n", | 209 | "%dk reserved, %dk data, %dk init)\n", |
203 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | 210 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), |
204 | num_physpages << (PAGE_SHIFT-10), | 211 | num_physpages << (PAGE_SHIFT-10), |
205 | codesize >> 10, | 212 | codesize >> 10, |
206 | reservedpages << (PAGE_SHIFT-10), | 213 | reservedpages << (PAGE_SHIFT-10), |
207 | datasize >> 10, | 214 | datasize >> 10, |
208 | initsize >> 10); | 215 | initsize >> 10); |
209 | } | 216 | } |
210 | 217 | ||
211 | /*======================================================================* | 218 | /*======================================================================* |
212 | * free_initmem() : | 219 | * free_initmem() : |
213 | * orig : arch/sh/mm/init.c | 220 | * orig : arch/sh/mm/init.c |
214 | *======================================================================*/ | 221 | *======================================================================*/ |
215 | void free_initmem(void) | 222 | void free_initmem(void) |
216 | { | 223 | { |
217 | unsigned long addr; | 224 | unsigned long addr; |
218 | 225 | ||
219 | addr = (unsigned long)(&__init_begin); | 226 | addr = (unsigned long)(&__init_begin); |
220 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | 227 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { |
221 | ClearPageReserved(virt_to_page(addr)); | 228 | ClearPageReserved(virt_to_page(addr)); |
222 | set_page_count(virt_to_page(addr), 1); | 229 | set_page_count(virt_to_page(addr), 1); |
223 | free_page(addr); | 230 | free_page(addr); |
224 | totalram_pages++; | 231 | totalram_pages++; |
225 | } | 232 | } |
226 | printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \ | 233 | printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \ |
227 | (int)(&__init_end - &__init_begin) >> 10); | 234 | (int)(&__init_end - &__init_begin) >> 10); |
228 | } | 235 | } |
229 | 236 | ||
230 | #ifdef CONFIG_BLK_DEV_INITRD | 237 | #ifdef CONFIG_BLK_DEV_INITRD |
231 | /*======================================================================* | 238 | /*======================================================================* |
232 | * free_initrd_mem() : | 239 | * free_initrd_mem() : |
233 | * orig : arch/sh/mm/init.c | 240 | * orig : arch/sh/mm/init.c |
234 | *======================================================================*/ | 241 | *======================================================================*/ |
235 | void free_initrd_mem(unsigned long start, unsigned long end) | 242 | void free_initrd_mem(unsigned long start, unsigned long end) |
236 | { | 243 | { |
237 | unsigned long p; | 244 | unsigned long p; |
238 | for (p = start; p < end; p += PAGE_SIZE) { | 245 | for (p = start; p < end; p += PAGE_SIZE) { |
239 | ClearPageReserved(virt_to_page(p)); | 246 | ClearPageReserved(virt_to_page(p)); |
240 | set_page_count(virt_to_page(p), 1); | 247 | set_page_count(virt_to_page(p), 1); |
241 | free_page(p); | 248 | free_page(p); |
242 | totalram_pages++; | 249 | totalram_pages++; |
243 | } | 250 | } |
244 | printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | 251 | printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); |
245 | } | 252 | } |
246 | #endif | 253 | #endif |
247 | 254 | ||
248 | 255 |
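Both loops above use the page-freeing idiom common to arch init code of this era: clear PG_reserved so the allocator will accept the page, force the refcount to one, then free_page() hands it to the buddy allocator while totalram_pages is bumped to match. A minimal sketch of that pattern, with hypothetical page-aligned start/end bounds:

#include <linux/mm.h>
#include <asm/page.h>

/* Sketch only: give the pages in [start, end) back to the allocator,
 * mirroring free_initmem()/free_initrd_mem() above. Assumes start and
 * end are page-aligned kernel virtual addresses with valid mem_map
 * entries, and that the caller accounts totalram_pages. */
static unsigned long free_region_pages(unsigned long start, unsigned long end)
{
	unsigned long addr, freed = 0;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));	/* allow freeing */
		set_page_count(virt_to_page(addr), 1);	/* exactly one ref */
		free_page(addr);			/* into the buddy lists */
		freed++;
	}
	return freed;
}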
arch/parisc/mm/init.c
1 | /* | 1 | /* |
2 | * linux/arch/parisc/mm/init.c | 2 | * linux/arch/parisc/mm/init.c |
3 | * | 3 | * |
4 | * Copyright (C) 1995 Linus Torvalds | 4 | * Copyright (C) 1995 Linus Torvalds |
5 | * Copyright 1999 SuSE GmbH | 5 | * Copyright 1999 SuSE GmbH |
6 | * changed by Philipp Rumpf | 6 | * changed by Philipp Rumpf |
7 | * Copyright 1999 Philipp Rumpf (prumpf@tux.org) | 7 | * Copyright 1999 Philipp Rumpf (prumpf@tux.org) |
8 | * Copyright 2004 Randolph Chung (tausq@debian.org) | 8 | * Copyright 2004 Randolph Chung (tausq@debian.org) |
9 | * | 9 | * |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/config.h> | 12 | #include <linux/config.h> |
13 | 13 | ||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/bootmem.h> | 16 | #include <linux/bootmem.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/init.h> | 18 | #include <linux/init.h> |
19 | #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */ | 19 | #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */ |
20 | #include <linux/initrd.h> | 20 | #include <linux/initrd.h> |
21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
22 | #include <linux/unistd.h> | 22 | #include <linux/unistd.h> |
23 | #include <linux/nodemask.h> /* for node_online_map */ | 23 | #include <linux/nodemask.h> /* for node_online_map */ |
24 | #include <linux/pagemap.h> /* for release_pages and page_cache_release */ | 24 | #include <linux/pagemap.h> /* for release_pages and page_cache_release */ |
25 | 25 | ||
26 | #include <asm/pgalloc.h> | 26 | #include <asm/pgalloc.h> |
27 | #include <asm/tlb.h> | 27 | #include <asm/tlb.h> |
28 | #include <asm/pdc_chassis.h> | 28 | #include <asm/pdc_chassis.h> |
29 | #include <asm/mmzone.h> | 29 | #include <asm/mmzone.h> |
30 | 30 | ||
31 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 31 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
32 | 32 | ||
33 | extern char _text; /* start of kernel code, defined by linker */ | 33 | extern char _text; /* start of kernel code, defined by linker */ |
34 | extern int data_start; | 34 | extern int data_start; |
35 | extern char _end; /* end of BSS, defined by linker */ | 35 | extern char _end; /* end of BSS, defined by linker */ |
36 | extern char __init_begin, __init_end; | 36 | extern char __init_begin, __init_end; |
37 | 37 | ||
38 | #ifdef CONFIG_DISCONTIGMEM | 38 | #ifdef CONFIG_DISCONTIGMEM |
39 | struct node_map_data node_data[MAX_NUMNODES]; | 39 | struct node_map_data node_data[MAX_NUMNODES]; |
40 | bootmem_data_t bmem_data[MAX_NUMNODES]; | 40 | bootmem_data_t bmem_data[MAX_NUMNODES]; |
41 | unsigned char pfnnid_map[PFNNID_MAP_MAX]; | 41 | unsigned char pfnnid_map[PFNNID_MAP_MAX]; |
42 | #endif | 42 | #endif |
43 | 43 | ||
44 | static struct resource data_resource = { | 44 | static struct resource data_resource = { |
45 | .name = "Kernel data", | 45 | .name = "Kernel data", |
46 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, | 46 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, |
47 | }; | 47 | }; |
48 | 48 | ||
49 | static struct resource code_resource = { | 49 | static struct resource code_resource = { |
50 | .name = "Kernel code", | 50 | .name = "Kernel code", |
51 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, | 51 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, |
52 | }; | 52 | }; |
53 | 53 | ||
54 | static struct resource pdcdata_resource = { | 54 | static struct resource pdcdata_resource = { |
55 | .name = "PDC data (Page Zero)", | 55 | .name = "PDC data (Page Zero)", |
56 | .start = 0, | 56 | .start = 0, |
57 | .end = 0x9ff, | 57 | .end = 0x9ff, |
58 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, | 58 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM, |
59 | }; | 59 | }; |
60 | 60 | ||
61 | static struct resource sysram_resources[MAX_PHYSMEM_RANGES]; | 61 | static struct resource sysram_resources[MAX_PHYSMEM_RANGES]; |
62 | 62 | ||
63 | /* The following array is initialized from the firmware specific | 63 | /* The following array is initialized from the firmware specific |
64 | * information retrieved in kernel/inventory.c. | 64 | * information retrieved in kernel/inventory.c. |
65 | */ | 65 | */ |
66 | 66 | ||
67 | physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES]; | 67 | physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES]; |
68 | int npmem_ranges; | 68 | int npmem_ranges; |
69 | 69 | ||
70 | #ifdef __LP64__ | 70 | #ifdef __LP64__ |
71 | #define MAX_MEM (~0UL) | 71 | #define MAX_MEM (~0UL) |
72 | #else /* !__LP64__ */ | 72 | #else /* !__LP64__ */ |
73 | #define MAX_MEM (3584U*1024U*1024U) | 73 | #define MAX_MEM (3584U*1024U*1024U) |
74 | #endif /* !__LP64__ */ | 74 | #endif /* !__LP64__ */ |
75 | 75 | ||
76 | static unsigned long mem_limit = MAX_MEM; | 76 | static unsigned long mem_limit = MAX_MEM; |
77 | 77 | ||
78 | static void __init mem_limit_func(void) | 78 | static void __init mem_limit_func(void) |
79 | { | 79 | { |
80 | char *cp, *end; | 80 | char *cp, *end; |
81 | unsigned long limit; | 81 | unsigned long limit; |
82 | extern char saved_command_line[]; | 82 | extern char saved_command_line[]; |
83 | 83 | ||
84 | /* We need this before __setup() functions are called */ | 84 | /* We need this before __setup() functions are called */ |
85 | 85 | ||
86 | limit = MAX_MEM; | 86 | limit = MAX_MEM; |
87 | for (cp = saved_command_line; *cp; ) { | 87 | for (cp = saved_command_line; *cp; ) { |
88 | if (memcmp(cp, "mem=", 4) == 0) { | 88 | if (memcmp(cp, "mem=", 4) == 0) { |
89 | cp += 4; | 89 | cp += 4; |
90 | limit = memparse(cp, &end); | 90 | limit = memparse(cp, &end); |
91 | if (end != cp) | 91 | if (end != cp) |
92 | break; | 92 | break; |
93 | cp = end; | 93 | cp = end; |
94 | } else { | 94 | } else { |
95 | while (*cp != ' ' && *cp) | 95 | while (*cp != ' ' && *cp) |
96 | ++cp; | 96 | ++cp; |
97 | while (*cp == ' ') | 97 | while (*cp == ' ') |
98 | ++cp; | 98 | ++cp; |
99 | } | 99 | } |
100 | } | 100 | } |
101 | 101 | ||
102 | if (limit < mem_limit) | 102 | if (limit < mem_limit) |
103 | mem_limit = limit; | 103 | mem_limit = limit; |
104 | } | 104 | } |
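mem_limit_func() runs before the __setup() machinery, so it scans saved_command_line by hand: skip whitespace-delimited tokens until one starts with "mem=", hand the rest to memparse() (which understands k/m/g suffixes), and accept the first token that actually parses. A standalone sketch of the same scan, with a hypothetical fallback parameter:

#include <linux/kernel.h>	/* memparse() */
#include <linux/string.h>

/* Sketch: return the byte count of the first parsable "mem=" token in
 * cmdline, or fallback if there is none. The tokenizing mirrors
 * mem_limit_func() above. */
static unsigned long parse_mem_limit(char *cmdline, unsigned long fallback)
{
	char *cp = cmdline, *end;

	while (*cp) {
		if (memcmp(cp, "mem=", 4) == 0) {
			unsigned long v = memparse(cp + 4, &end);

			if (end != cp + 4)	/* digits were consumed */
				return v;
			cp = end;		/* malformed; keep scanning */
		} else {
			while (*cp != ' ' && *cp)	/* skip this token */
				++cp;
			while (*cp == ' ')		/* and its spaces */
				++cp;
		}
	}
	return fallback;
}

So "mem=768M" yields 768 << 20, which setup_bootmem() then applies as mem_limit when truncating the range list.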
105 | 105 | ||
106 | #define MAX_GAP (0x40000000UL >> PAGE_SHIFT) | 106 | #define MAX_GAP (0x40000000UL >> PAGE_SHIFT) |
107 | 107 | ||
108 | static void __init setup_bootmem(void) | 108 | static void __init setup_bootmem(void) |
109 | { | 109 | { |
110 | unsigned long bootmap_size; | 110 | unsigned long bootmap_size; |
111 | unsigned long mem_max; | 111 | unsigned long mem_max; |
112 | unsigned long bootmap_pages; | 112 | unsigned long bootmap_pages; |
113 | unsigned long bootmap_start_pfn; | 113 | unsigned long bootmap_start_pfn; |
114 | unsigned long bootmap_pfn; | 114 | unsigned long bootmap_pfn; |
115 | #ifndef CONFIG_DISCONTIGMEM | 115 | #ifndef CONFIG_DISCONTIGMEM |
116 | physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1]; | 116 | physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1]; |
117 | int npmem_holes; | 117 | int npmem_holes; |
118 | #endif | 118 | #endif |
119 | int i, sysram_resource_count; | 119 | int i, sysram_resource_count; |
120 | 120 | ||
121 | disable_sr_hashing(); /* Turn off space register hashing */ | 121 | disable_sr_hashing(); /* Turn off space register hashing */ |
122 | 122 | ||
123 | /* | 123 | /* |
124 | * Sort the ranges. Since the number of ranges is typically | 124 | * Sort the ranges. Since the number of ranges is typically |
125 | * small, and performance is not an issue here, just do | 125 | * small, and performance is not an issue here, just do |
126 | * a simple insertion sort. | 126 | * a simple insertion sort. |
127 | */ | 127 | */ |
128 | 128 | ||
129 | for (i = 1; i < npmem_ranges; i++) { | 129 | for (i = 1; i < npmem_ranges; i++) { |
130 | int j; | 130 | int j; |
131 | 131 | ||
132 | for (j = i; j > 0; j--) { | 132 | for (j = i; j > 0; j--) { |
133 | unsigned long tmp; | 133 | unsigned long tmp; |
134 | 134 | ||
135 | if (pmem_ranges[j-1].start_pfn < | 135 | if (pmem_ranges[j-1].start_pfn < |
136 | pmem_ranges[j].start_pfn) { | 136 | pmem_ranges[j].start_pfn) { |
137 | 137 | ||
138 | break; | 138 | break; |
139 | } | 139 | } |
140 | tmp = pmem_ranges[j-1].start_pfn; | 140 | tmp = pmem_ranges[j-1].start_pfn; |
141 | pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn; | 141 | pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn; |
142 | pmem_ranges[j].start_pfn = tmp; | 142 | pmem_ranges[j].start_pfn = tmp; |
143 | tmp = pmem_ranges[j-1].pages; | 143 | tmp = pmem_ranges[j-1].pages; |
144 | pmem_ranges[j-1].pages = pmem_ranges[j].pages; | 144 | pmem_ranges[j-1].pages = pmem_ranges[j].pages; |
145 | pmem_ranges[j].pages = tmp; | 145 | pmem_ranges[j].pages = tmp; |
146 | } | 146 | } |
147 | } | 147 | } |
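(The insertion sort above swaps start_pfn and pages separately rather than swapping whole physmem_range_t entries; with at most MAX_PHYSMEM_RANGES firmware ranges, the quadratic cost is irrelevant here.)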
148 | 148 | ||
149 | #ifndef CONFIG_DISCONTIGMEM | 149 | #ifndef CONFIG_DISCONTIGMEM |
150 | /* | 150 | /* |
151 | * Throw out ranges that are too far apart (controlled by | 151 | * Throw out ranges that are too far apart (controlled by |
152 | * MAX_GAP). | 152 | * MAX_GAP). |
153 | */ | 153 | */ |
154 | 154 | ||
155 | for (i = 1; i < npmem_ranges; i++) { | 155 | for (i = 1; i < npmem_ranges; i++) { |
156 | if (pmem_ranges[i].start_pfn - | 156 | if (pmem_ranges[i].start_pfn - |
157 | (pmem_ranges[i-1].start_pfn + | 157 | (pmem_ranges[i-1].start_pfn + |
158 | pmem_ranges[i-1].pages) > MAX_GAP) { | 158 | pmem_ranges[i-1].pages) > MAX_GAP) { |
159 | npmem_ranges = i; | 159 | npmem_ranges = i; |
160 | printk("Large gap in memory detected (%ld pages). " | 160 | printk("Large gap in memory detected (%ld pages). " |
161 | "Consider turning on CONFIG_DISCONTIGMEM\n", | 161 | "Consider turning on CONFIG_DISCONTIGMEM\n", |
162 | pmem_ranges[i].start_pfn - | 162 | pmem_ranges[i].start_pfn - |
163 | (pmem_ranges[i-1].start_pfn + | 163 | (pmem_ranges[i-1].start_pfn + |
164 | pmem_ranges[i-1].pages)); | 164 | pmem_ranges[i-1].pages)); |
165 | break; | 165 | break; |
166 | } | 166 | } |
167 | } | 167 | } |
168 | #endif | 168 | #endif |
169 | 169 | ||
170 | if (npmem_ranges > 1) { | 170 | if (npmem_ranges > 1) { |
171 | 171 | ||
172 | /* Print the memory ranges */ | 172 | /* Print the memory ranges */ |
173 | 173 | ||
174 | printk(KERN_INFO "Memory Ranges:\n"); | 174 | printk(KERN_INFO "Memory Ranges:\n"); |
175 | 175 | ||
176 | for (i = 0; i < npmem_ranges; i++) { | 176 | for (i = 0; i < npmem_ranges; i++) { |
177 | unsigned long start; | 177 | unsigned long start; |
178 | unsigned long size; | 178 | unsigned long size; |
179 | 179 | ||
180 | size = (pmem_ranges[i].pages << PAGE_SHIFT); | 180 | size = (pmem_ranges[i].pages << PAGE_SHIFT); |
181 | start = (pmem_ranges[i].start_pfn << PAGE_SHIFT); | 181 | start = (pmem_ranges[i].start_pfn << PAGE_SHIFT); |
182 | printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n", | 182 | printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n", |
183 | i,start, start + (size - 1), size >> 20); | 183 | i,start, start + (size - 1), size >> 20); |
184 | } | 184 | } |
185 | } | 185 | } |
186 | 186 | ||
187 | sysram_resource_count = npmem_ranges; | 187 | sysram_resource_count = npmem_ranges; |
188 | for (i = 0; i < sysram_resource_count; i++) { | 188 | for (i = 0; i < sysram_resource_count; i++) { |
189 | struct resource *res = &sysram_resources[i]; | 189 | struct resource *res = &sysram_resources[i]; |
190 | res->name = "System RAM"; | 190 | res->name = "System RAM"; |
191 | res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT; | 191 | res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT; |
192 | res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1; | 192 | res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1; |
193 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 193 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
194 | request_resource(&iomem_resource, res); | 194 | request_resource(&iomem_resource, res); |
195 | } | 195 | } |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * For 32 bit kernels we limit the amount of memory we can | 198 | * For 32 bit kernels we limit the amount of memory we can |
199 | * support, in order to preserve enough kernel address space | 199 | * support, in order to preserve enough kernel address space |
200 | * for other purposes. For 64 bit kernels we don't normally | 200 | * for other purposes. For 64 bit kernels we don't normally |
201 | * limit the memory, but this mechanism can be used to | 201 | * limit the memory, but this mechanism can be used to |
202 | * artificially limit the amount of memory (and it is written | 202 | * artificially limit the amount of memory (and it is written |
203 | * to work with multiple memory ranges). | 203 | * to work with multiple memory ranges). |
204 | */ | 204 | */ |
205 | 205 | ||
206 | mem_limit_func(); /* check for "mem=" argument */ | 206 | mem_limit_func(); /* check for "mem=" argument */ |
207 | 207 | ||
208 | mem_max = 0; | 208 | mem_max = 0; |
209 | num_physpages = 0; | 209 | num_physpages = 0; |
210 | for (i = 0; i < npmem_ranges; i++) { | 210 | for (i = 0; i < npmem_ranges; i++) { |
211 | unsigned long rsize; | 211 | unsigned long rsize; |
212 | 212 | ||
213 | rsize = pmem_ranges[i].pages << PAGE_SHIFT; | 213 | rsize = pmem_ranges[i].pages << PAGE_SHIFT; |
214 | if ((mem_max + rsize) > mem_limit) { | 214 | if ((mem_max + rsize) > mem_limit) { |
215 | printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20); | 215 | printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20); |
216 | if (mem_max == mem_limit) | 216 | if (mem_max == mem_limit) |
217 | npmem_ranges = i; | 217 | npmem_ranges = i; |
218 | else { | 218 | else { |
219 | pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT) | 219 | pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT) |
220 | - (mem_max >> PAGE_SHIFT); | 220 | - (mem_max >> PAGE_SHIFT); |
221 | npmem_ranges = i + 1; | 221 | npmem_ranges = i + 1; |
222 | mem_max = mem_limit; | 222 | mem_max = mem_limit; |
223 | } | 223 | } |
224 | num_physpages += pmem_ranges[i].pages; | 224 | num_physpages += pmem_ranges[i].pages; |
225 | break; | 225 | break; |
226 | } | 226 | } |
227 | num_physpages += pmem_ranges[i].pages; | 227 | num_physpages += pmem_ranges[i].pages; |
228 | mem_max += rsize; | 228 | mem_max += rsize; |
229 | } | 229 | } |
230 | 230 | ||
231 | printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20); | 231 | printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20); |
232 | 232 | ||
233 | #ifndef CONFIG_DISCONTIGMEM | 233 | #ifndef CONFIG_DISCONTIGMEM |
234 | /* Merge the ranges, keeping track of the holes */ | 234 | /* Merge the ranges, keeping track of the holes */ |
235 | 235 | ||
236 | { | 236 | { |
237 | unsigned long end_pfn; | 237 | unsigned long end_pfn; |
238 | unsigned long hole_pages; | 238 | unsigned long hole_pages; |
239 | 239 | ||
240 | npmem_holes = 0; | 240 | npmem_holes = 0; |
241 | end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages; | 241 | end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages; |
242 | for (i = 1; i < npmem_ranges; i++) { | 242 | for (i = 1; i < npmem_ranges; i++) { |
243 | 243 | ||
244 | hole_pages = pmem_ranges[i].start_pfn - end_pfn; | 244 | hole_pages = pmem_ranges[i].start_pfn - end_pfn; |
245 | if (hole_pages) { | 245 | if (hole_pages) { |
246 | pmem_holes[npmem_holes].start_pfn = end_pfn; | 246 | pmem_holes[npmem_holes].start_pfn = end_pfn; |
247 | pmem_holes[npmem_holes++].pages = hole_pages; | 247 | pmem_holes[npmem_holes++].pages = hole_pages; |
248 | end_pfn += hole_pages; | 248 | end_pfn += hole_pages; |
249 | } | 249 | } |
250 | end_pfn += pmem_ranges[i].pages; | 250 | end_pfn += pmem_ranges[i].pages; |
251 | } | 251 | } |
252 | 252 | ||
253 | pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn; | 253 | pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn; |
254 | npmem_ranges = 1; | 254 | npmem_ranges = 1; |
255 | } | 255 | } |
256 | #endif | 256 | #endif |
257 | 257 | ||
258 | bootmap_pages = 0; | 258 | bootmap_pages = 0; |
259 | for (i = 0; i < npmem_ranges; i++) | 259 | for (i = 0; i < npmem_ranges; i++) |
260 | bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages); | 260 | bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages); |
261 | 261 | ||
262 | bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT; | 262 | bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT; |
263 | 263 | ||
264 | #ifdef CONFIG_DISCONTIGMEM | 264 | #ifdef CONFIG_DISCONTIGMEM |
265 | for (i = 0; i < MAX_PHYSMEM_RANGES; i++) { | 265 | for (i = 0; i < MAX_PHYSMEM_RANGES; i++) { |
266 | memset(NODE_DATA(i), 0, sizeof(pg_data_t)); | 266 | memset(NODE_DATA(i), 0, sizeof(pg_data_t)); |
267 | NODE_DATA(i)->bdata = &bmem_data[i]; | 267 | NODE_DATA(i)->bdata = &bmem_data[i]; |
268 | } | 268 | } |
269 | memset(pfnnid_map, 0xff, sizeof(pfnnid_map)); | 269 | memset(pfnnid_map, 0xff, sizeof(pfnnid_map)); |
270 | 270 | ||
271 | for (i = 0; i < npmem_ranges; i++) | 271 | for (i = 0; i < npmem_ranges; i++) |
272 | node_set_online(i); | 272 | node_set_online(i); |
273 | #endif | 273 | #endif |
274 | 274 | ||
275 | /* | 275 | /* |
276 | * Initialize and free the full range of memory in each range. | 276 | * Initialize and free the full range of memory in each range. |
277 | * Note that the only writes these routines do are to the bootmap, | 277 | * Note that the only writes these routines do are to the bootmap, |
278 | * and we've made sure to locate the bootmap properly so that they | 278 | * and we've made sure to locate the bootmap properly so that they |
279 | * won't be writing over anything important. | 279 | * won't be writing over anything important. |
280 | */ | 280 | */ |
281 | 281 | ||
282 | bootmap_pfn = bootmap_start_pfn; | 282 | bootmap_pfn = bootmap_start_pfn; |
283 | max_pfn = 0; | 283 | max_pfn = 0; |
284 | for (i = 0; i < npmem_ranges; i++) { | 284 | for (i = 0; i < npmem_ranges; i++) { |
285 | unsigned long start_pfn; | 285 | unsigned long start_pfn; |
286 | unsigned long npages; | 286 | unsigned long npages; |
287 | 287 | ||
288 | start_pfn = pmem_ranges[i].start_pfn; | 288 | start_pfn = pmem_ranges[i].start_pfn; |
289 | npages = pmem_ranges[i].pages; | 289 | npages = pmem_ranges[i].pages; |
290 | 290 | ||
291 | bootmap_size = init_bootmem_node(NODE_DATA(i), | 291 | bootmap_size = init_bootmem_node(NODE_DATA(i), |
292 | bootmap_pfn, | 292 | bootmap_pfn, |
293 | start_pfn, | 293 | start_pfn, |
294 | (start_pfn + npages) ); | 294 | (start_pfn + npages) ); |
295 | free_bootmem_node(NODE_DATA(i), | 295 | free_bootmem_node(NODE_DATA(i), |
296 | (start_pfn << PAGE_SHIFT), | 296 | (start_pfn << PAGE_SHIFT), |
297 | (npages << PAGE_SHIFT) ); | 297 | (npages << PAGE_SHIFT) ); |
298 | bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 298 | bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
299 | if ((start_pfn + npages) > max_pfn) | 299 | if ((start_pfn + npages) > max_pfn) |
300 | max_pfn = start_pfn + npages; | 300 | max_pfn = start_pfn + npages; |
301 | } | 301 | } |
302 | 302 | ||
303 | if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) { | 303 | if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) { |
304 | printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n"); | 304 | printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n"); |
305 | BUG(); | 305 | BUG(); |
306 | } | 306 | } |
307 | 307 | ||
308 | /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */ | 308 | /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */ |
309 | 309 | ||
310 | #define PDC_CONSOLE_IO_IODC_SIZE 32768 | 310 | #define PDC_CONSOLE_IO_IODC_SIZE 32768 |
311 | 311 | ||
312 | reserve_bootmem_node(NODE_DATA(0), 0UL, | 312 | reserve_bootmem_node(NODE_DATA(0), 0UL, |
313 | (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE)); | 313 | (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE)); |
314 | reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text), | 314 | reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text), |
315 | (unsigned long)(&_end - &_text)); | 315 | (unsigned long)(&_end - &_text)); |
316 | reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT), | 316 | reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT), |
317 | ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT)); | 317 | ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT)); |
318 | 318 | ||
319 | #ifndef CONFIG_DISCONTIGMEM | 319 | #ifndef CONFIG_DISCONTIGMEM |
320 | 320 | ||
321 | /* reserve the holes */ | 321 | /* reserve the holes */ |
322 | 322 | ||
323 | for (i = 0; i < npmem_holes; i++) { | 323 | for (i = 0; i < npmem_holes; i++) { |
324 | reserve_bootmem_node(NODE_DATA(0), | 324 | reserve_bootmem_node(NODE_DATA(0), |
325 | (pmem_holes[i].start_pfn << PAGE_SHIFT), | 325 | (pmem_holes[i].start_pfn << PAGE_SHIFT), |
326 | (pmem_holes[i].pages << PAGE_SHIFT)); | 326 | (pmem_holes[i].pages << PAGE_SHIFT)); |
327 | } | 327 | } |
328 | #endif | 328 | #endif |
329 | 329 | ||
330 | #ifdef CONFIG_BLK_DEV_INITRD | 330 | #ifdef CONFIG_BLK_DEV_INITRD |
331 | if (initrd_start) { | 331 | if (initrd_start) { |
332 | printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end); | 332 | printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end); |
333 | if (__pa(initrd_start) < mem_max) { | 333 | if (__pa(initrd_start) < mem_max) { |
334 | unsigned long initrd_reserve; | 334 | unsigned long initrd_reserve; |
335 | 335 | ||
336 | if (__pa(initrd_end) > mem_max) { | 336 | if (__pa(initrd_end) > mem_max) { |
337 | initrd_reserve = mem_max - __pa(initrd_start); | 337 | initrd_reserve = mem_max - __pa(initrd_start); |
338 | } else { | 338 | } else { |
339 | initrd_reserve = initrd_end - initrd_start; | 339 | initrd_reserve = initrd_end - initrd_start; |
340 | } | 340 | } |
341 | initrd_below_start_ok = 1; | 341 | initrd_below_start_ok = 1; |
342 | printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max); | 342 | printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max); |
343 | 343 | ||
344 | reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve); | 344 | reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve); |
345 | } | 345 | } |
346 | } | 346 | } |
347 | #endif | 347 | #endif |
348 | 348 | ||
349 | data_resource.start = virt_to_phys(&data_start); | 349 | data_resource.start = virt_to_phys(&data_start); |
350 | data_resource.end = virt_to_phys(&_end)-1; | 350 | data_resource.end = virt_to_phys(&_end)-1; |
351 | code_resource.start = virt_to_phys(&_text); | 351 | code_resource.start = virt_to_phys(&_text); |
352 | code_resource.end = virt_to_phys(&data_start)-1; | 352 | code_resource.end = virt_to_phys(&data_start)-1; |
353 | 353 | ||
354 | /* We don't know which region the kernel will be in, so try | 354 | /* We don't know which region the kernel will be in, so try |
355 | * all of them. | 355 | * all of them. |
356 | */ | 356 | */ |
357 | for (i = 0; i < sysram_resource_count; i++) { | 357 | for (i = 0; i < sysram_resource_count; i++) { |
358 | struct resource *res = &sysram_resources[i]; | 358 | struct resource *res = &sysram_resources[i]; |
359 | request_resource(res, &code_resource); | 359 | request_resource(res, &code_resource); |
360 | request_resource(res, &data_resource); | 360 | request_resource(res, &data_resource); |
361 | } | 361 | } |
362 | request_resource(&sysram_resources[0], &pdcdata_resource); | 362 | request_resource(&sysram_resources[0], &pdcdata_resource); |
363 | } | 363 | } |
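setup_bootmem() follows the standard bootmem bring-up order: sort the firmware ranges, (in the contiguous case) merge them while remembering the holes, size a bitmap for each node, then for every node run init_bootmem_node() / free_bootmem_node(), and finally re-reserve everything that must survive: PDC page zero, the kernel image, the bootmap bitmaps themselves, the inter-range holes, and the initrd. A condensed sketch of that per-node contract, with hypothetical node/map_pfn/start_pfn/pages inputs:

#include <linux/bootmem.h>
#include <asm/page.h>

/* Sketch: the init/free/reserve sequence used per node above. */
static void __init bring_up_node_bootmem(int node, unsigned long map_pfn,
					 unsigned long start_pfn,
					 unsigned long pages)
{
	unsigned long mapsize;

	/* 1) place the bootmap bitmap at map_pfn, covering the range */
	mapsize = init_bootmem_node(NODE_DATA(node), map_pfn,
				    start_pfn, start_pfn + pages);

	/* 2) mark the whole range free ... */
	free_bootmem_node(NODE_DATA(node), start_pfn << PAGE_SHIFT,
			  pages << PAGE_SHIFT);

	/* 3) ... then pull back anything that must not be handed out,
	 * starting with the bitmap itself. */
	reserve_bootmem_node(NODE_DATA(node), map_pfn << PAGE_SHIFT, mapsize);
}

The BUG() above on a bootmap-size mismatch is cheap insurance that the bootmem_bootmap_pages() estimate and what init_bootmem_node() actually consumed agree.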
364 | 364 | ||
365 | void free_initmem(void) | 365 | void free_initmem(void) |
366 | { | 366 | { |
367 | /* FIXME: */ | 367 | /* FIXME: */ |
368 | #if 0 | 368 | #if 0 |
369 | printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n", | 369 | printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n", |
370 | (&__init_end - &__init_begin) >> 10); | 370 | (&__init_end - &__init_begin) >> 10); |
371 | return; | 371 | return; |
372 | #else | 372 | #else |
373 | unsigned long addr; | 373 | unsigned long addr; |
374 | 374 | ||
375 | printk(KERN_INFO "Freeing unused kernel memory: "); | 375 | printk(KERN_INFO "Freeing unused kernel memory: "); |
376 | 376 | ||
377 | #if 1 | 377 | #if 1 |
378 | /* Attempt to catch anyone trying to execute code here | 378 | /* Attempt to catch anyone trying to execute code here |
379 | * by filling the page with BRK insns. | 379 | * by filling the page with BRK insns. |
380 | * | 380 | * |
381 | * If we disable interrupts for all CPUs, then IPI stops working. | 381 | * If we disable interrupts for all CPUs, then IPI stops working. |
382 | * Kinda breaks the global cache flushing. | 382 | * Kinda breaks the global cache flushing. |
383 | */ | 383 | */ |
384 | local_irq_disable(); | 384 | local_irq_disable(); |
385 | 385 | ||
386 | memset(&__init_begin, 0x00, | 386 | memset(&__init_begin, 0x00, |
387 | (unsigned long)&__init_end - (unsigned long)&__init_begin); | 387 | (unsigned long)&__init_end - (unsigned long)&__init_begin); |
388 | 388 | ||
389 | flush_data_cache(); | 389 | flush_data_cache(); |
390 | asm volatile("sync" : : ); | 390 | asm volatile("sync" : : ); |
391 | flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end); | 391 | flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end); |
392 | asm volatile("sync" : : ); | 392 | asm volatile("sync" : : ); |
393 | 393 | ||
394 | local_irq_enable(); | 394 | local_irq_enable(); |
395 | #endif | 395 | #endif |
396 | 396 | ||
397 | addr = (unsigned long)(&__init_begin); | 397 | addr = (unsigned long)(&__init_begin); |
398 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | 398 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { |
399 | ClearPageReserved(virt_to_page(addr)); | 399 | ClearPageReserved(virt_to_page(addr)); |
400 | set_page_count(virt_to_page(addr), 1); | 400 | set_page_count(virt_to_page(addr), 1); |
401 | free_page(addr); | 401 | free_page(addr); |
402 | num_physpages++; | 402 | num_physpages++; |
403 | totalram_pages++; | 403 | totalram_pages++; |
404 | } | 404 | } |
405 | 405 | ||
406 | /* set up a new led state on systems shipped with an LED State panel */ | 406 | /* set up a new led state on systems shipped with an LED State panel */ |
407 | pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); | 407 | pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); |
408 | 408 | ||
409 | printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10); | 409 | printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10); |
410 | #endif | 410 | #endif |
411 | } | 411 | } |
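The zero fill doubles as instruction poisoning: on PA-RISC an all-zero instruction word decodes as a break instruction, so the "filling the page with BRK insns" comment and memset(..., 0x00, ...) describe the same operation, and a stray jump into freed init text traps immediately. Interrupts are disabled only locally because, as the comment notes, stopping all CPUs would also stop the IPIs that the global cache-flush helpers depend on.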
412 | 412 | ||
413 | /* | 413 | /* |
414 | * Just an arbitrary offset to serve as a "hole" between mapping areas | 414 | * Just an arbitrary offset to serve as a "hole" between mapping areas |
415 | * (between top of physical memory and a potential pcxl dma mapping | 415 | * (between top of physical memory and a potential pcxl dma mapping |
416 | * area, and below the vmalloc mapping area). | 416 | * area, and below the vmalloc mapping area). |
417 | * | 417 | * |
418 | * The current 32K value just means that there will be a 32K "hole" | 418 | * The current 32K value just means that there will be a 32K "hole" |
419 | * between mapping areas. That means that any out-of-bounds memory | 419 | * between mapping areas. That means that any out-of-bounds memory |
420 | * accesses will hopefully be caught. The vmalloc() routines leave | 420 | * accesses will hopefully be caught. The vmalloc() routines leave |
421 | * a hole of 4kB between each vmalloced area for the same reason. | 421 | * a hole of 4kB between each vmalloced area for the same reason. |
422 | */ | 422 | */ |
423 | 423 | ||
424 | /* Leave room for gateway page expansion */ | 424 | /* Leave room for gateway page expansion */ |
425 | #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE | 425 | #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE |
426 | #error KERNEL_MAP_START is in gateway reserved region | 426 | #error KERNEL_MAP_START is in gateway reserved region |
427 | #endif | 427 | #endif |
428 | #define MAP_START (KERNEL_MAP_START) | 428 | #define MAP_START (KERNEL_MAP_START) |
429 | 429 | ||
430 | #define VM_MAP_OFFSET (32*1024) | 430 | #define VM_MAP_OFFSET (32*1024) |
431 | #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \ | 431 | #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \ |
432 | & ~(VM_MAP_OFFSET-1))) | 432 | & ~(VM_MAP_OFFSET-1))) |
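SET_MAP_OFFSET() adds VM_MAP_OFFSET and then rounds down to a VM_MAP_OFFSET boundary, so the result is always 32K-aligned and strictly above its argument; the guard gap this creates is between 1 byte and the full 32K. Worked arithmetic, with VM_MAP_OFFSET = 0x8000:

/* SET_MAP_OFFSET(x) == (x + 0x8000) & ~0x7fffUL, for example:
 *
 *	SET_MAP_OFFSET(0x00010000) == 0x00018000  (aligned input: full 32K gap)
 *	SET_MAP_OFFSET(0x00012345) == 0x00018000  (typical: partial gap)
 *	SET_MAP_OFFSET(0x00017fff) == 0x00018000  (worst case: 1-byte gap)
 */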
433 | 433 | ||
434 | void *vmalloc_start; | 434 | void *vmalloc_start; |
435 | EXPORT_SYMBOL(vmalloc_start); | 435 | EXPORT_SYMBOL(vmalloc_start); |
436 | 436 | ||
437 | #ifdef CONFIG_PA11 | 437 | #ifdef CONFIG_PA11 |
438 | unsigned long pcxl_dma_start; | 438 | unsigned long pcxl_dma_start; |
439 | #endif | 439 | #endif |
440 | 440 | ||
441 | void __init mem_init(void) | 441 | void __init mem_init(void) |
442 | { | 442 | { |
443 | high_memory = __va((max_pfn << PAGE_SHIFT)); | 443 | high_memory = __va((max_pfn << PAGE_SHIFT)); |
444 | 444 | ||
445 | #ifndef CONFIG_DISCONTIGMEM | 445 | #ifndef CONFIG_DISCONTIGMEM |
446 | max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1; | 446 | max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1; |
447 | totalram_pages += free_all_bootmem(); | 447 | totalram_pages += free_all_bootmem(); |
448 | #else | 448 | #else |
449 | { | 449 | { |
450 | int i; | 450 | int i; |
451 | 451 | ||
452 | for (i = 0; i < npmem_ranges; i++) | 452 | for (i = 0; i < npmem_ranges; i++) |
453 | totalram_pages += free_all_bootmem_node(NODE_DATA(i)); | 453 | totalram_pages += free_all_bootmem_node(NODE_DATA(i)); |
454 | } | 454 | } |
455 | #endif | 455 | #endif |
456 | 456 | ||
457 | printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10)); | 457 | printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10)); |
458 | 458 | ||
459 | #ifdef CONFIG_PA11 | 459 | #ifdef CONFIG_PA11 |
460 | if (hppa_dma_ops == &pcxl_dma_ops) { | 460 | if (hppa_dma_ops == &pcxl_dma_ops) { |
461 | pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START); | 461 | pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START); |
462 | vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE); | 462 | vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE); |
463 | } else { | 463 | } else { |
464 | pcxl_dma_start = 0; | 464 | pcxl_dma_start = 0; |
465 | vmalloc_start = SET_MAP_OFFSET(MAP_START); | 465 | vmalloc_start = SET_MAP_OFFSET(MAP_START); |
466 | } | 466 | } |
467 | #else | 467 | #else |
468 | vmalloc_start = SET_MAP_OFFSET(MAP_START); | 468 | vmalloc_start = SET_MAP_OFFSET(MAP_START); |
469 | #endif | 469 | #endif |
470 | 470 | ||
471 | } | 471 | } |
472 | 472 | ||
473 | int do_check_pgt_cache(int low, int high) | 473 | int do_check_pgt_cache(int low, int high) |
474 | { | 474 | { |
475 | return 0; | 475 | return 0; |
476 | } | 476 | } |
477 | 477 | ||
478 | unsigned long *empty_zero_page; | 478 | unsigned long *empty_zero_page; |
479 | 479 | ||
480 | void show_mem(void) | 480 | void show_mem(void) |
481 | { | 481 | { |
482 | int i,free = 0,total = 0,reserved = 0; | 482 | int i,free = 0,total = 0,reserved = 0; |
483 | int shared = 0, cached = 0; | 483 | int shared = 0, cached = 0; |
484 | 484 | ||
485 | printk(KERN_INFO "Mem-info:\n"); | 485 | printk(KERN_INFO "Mem-info:\n"); |
486 | show_free_areas(); | 486 | show_free_areas(); |
487 | printk(KERN_INFO "Free swap: %6ldkB\n", | 487 | printk(KERN_INFO "Free swap: %6ldkB\n", |
488 | nr_swap_pages<<(PAGE_SHIFT-10)); | 488 | nr_swap_pages<<(PAGE_SHIFT-10)); |
489 | #ifndef CONFIG_DISCONTIGMEM | 489 | #ifndef CONFIG_DISCONTIGMEM |
490 | i = max_mapnr; | 490 | i = max_mapnr; |
491 | while (i-- > 0) { | 491 | while (i-- > 0) { |
492 | total++; | 492 | total++; |
493 | if (PageReserved(mem_map+i)) | 493 | if (PageReserved(mem_map+i)) |
494 | reserved++; | 494 | reserved++; |
495 | else if (PageSwapCache(mem_map+i)) | 495 | else if (PageSwapCache(mem_map+i)) |
496 | cached++; | 496 | cached++; |
497 | else if (!page_count(&mem_map[i])) | 497 | else if (!page_count(&mem_map[i])) |
498 | free++; | 498 | free++; |
499 | else | 499 | else |
500 | shared += page_count(&mem_map[i]) - 1; | 500 | shared += page_count(&mem_map[i]) - 1; |
501 | } | 501 | } |
502 | #else | 502 | #else |
503 | for (i = 0; i < npmem_ranges; i++) { | 503 | for (i = 0; i < npmem_ranges; i++) { |
504 | int j; | 504 | int j; |
505 | 505 | ||
506 | for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { | 506 | for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { |
507 | struct page *p; | 507 | struct page *p; |
508 | unsigned long flags; | ||
508 | 509 | ||
510 | pgdat_resize_lock(NODE_DATA(i), &flags); | ||
509 | p = nid_page_nr(i, j) - node_start_pfn(i); | 511 | p = nid_page_nr(i, j) - node_start_pfn(i); |
510 | 512 | ||
511 | total++; | 513 | total++; |
512 | if (PageReserved(p)) | 514 | if (PageReserved(p)) |
513 | reserved++; | 515 | reserved++; |
514 | else if (PageSwapCache(p)) | 516 | else if (PageSwapCache(p)) |
515 | cached++; | 517 | cached++; |
516 | else if (!page_count(p)) | 518 | else if (!page_count(p)) |
517 | free++; | 519 | free++; |
518 | else | 520 | else |
519 | shared += page_count(p) - 1; | 521 | shared += page_count(p) - 1; |
522 | pgdat_resize_unlock(NODE_DATA(i), &flags); | ||
520 | } | 523 | } |
521 | } | 524 | } |
522 | #endif | 525 | #endif |
523 | printk(KERN_INFO "%d pages of RAM\n", total); | 526 | printk(KERN_INFO "%d pages of RAM\n", total); |
524 | printk(KERN_INFO "%d reserved pages\n", reserved); | 527 | printk(KERN_INFO "%d reserved pages\n", reserved); |
525 | printk(KERN_INFO "%d pages shared\n", shared); | 528 | printk(KERN_INFO "%d pages shared\n", shared); |
526 | printk(KERN_INFO "%d pages swap cached\n", cached); | 529 | printk(KERN_INFO "%d pages swap cached\n", cached); |
527 | 530 | ||
528 | 531 | ||
529 | #ifdef CONFIG_DISCONTIGMEM | 532 | #ifdef CONFIG_DISCONTIGMEM |
530 | { | 533 | { |
531 | struct zonelist *zl; | 534 | struct zonelist *zl; |
532 | int i, j, k; | 535 | int i, j, k; |
533 | 536 | ||
534 | for (i = 0; i < npmem_ranges; i++) { | 537 | for (i = 0; i < npmem_ranges; i++) { |
535 | for (j = 0; j < MAX_NR_ZONES; j++) { | 538 | for (j = 0; j < MAX_NR_ZONES; j++) { |
536 | zl = NODE_DATA(i)->node_zonelists + j; | 539 | zl = NODE_DATA(i)->node_zonelists + j; |
537 | 540 | ||
538 | printk("Zone list for zone %d on node %d: ", j, i); | 541 | printk("Zone list for zone %d on node %d: ", j, i); |
539 | for (k = 0; zl->zones[k] != NULL; k++) | 542 | for (k = 0; zl->zones[k] != NULL; k++) |
540 | printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name); | 543 | printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name); |
541 | printk("\n"); | 544 | printk("\n"); |
542 | } | 545 | } |
543 | } | 546 | } |
544 | } | 547 | } |
545 | #endif | 548 | #endif |
546 | } | 549 | } |
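This hunk is the parisc half of the patch: the DISCONTIGMEM walk in show_mem() now brackets each struct page inspection with pgdat_resize_lock()/pgdat_resize_unlock(), the wrappers this series adds around pgdat->node_size_lock. show_mem() holds no reference on the pages it examines, so without the lock a concurrent section removal could invalidate the memory under the walk. A minimal sketch of the locking pattern, assuming the pgdat_resize_* helpers from this series and the parisc node_start_pfn()/node_end_pfn() range macros:

#include <linux/mm.h>
#include <linux/mmzone.h>

/* Sketch: count reserved pages on one node while holding its
 * node_size_lock, as the reworked show_mem() does. Per-pfn locking
 * keeps the critical section short; per-node would also be valid as
 * long as nothing inside can sleep. */
static unsigned long count_reserved_pages(int nid)
{
	unsigned long pfn, reserved = 0;
	unsigned long flags;

	for (pfn = node_start_pfn(nid); pfn < node_end_pfn(nid); pfn++) {
		pgdat_resize_lock(NODE_DATA(nid), &flags);
		if (pfn_valid(pfn) && PageReserved(pfn_to_page(pfn)))
			reserved++;
		pgdat_resize_unlock(NODE_DATA(nid), &flags);
	}
	return reserved;
}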
547 | 550 | ||
548 | 551 | ||
549 | static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot) | 552 | static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot) |
550 | { | 553 | { |
551 | pgd_t *pg_dir; | 554 | pgd_t *pg_dir; |
552 | pmd_t *pmd; | 555 | pmd_t *pmd; |
553 | pte_t *pg_table; | 556 | pte_t *pg_table; |
554 | unsigned long end_paddr; | 557 | unsigned long end_paddr; |
555 | unsigned long start_pmd; | 558 | unsigned long start_pmd; |
556 | unsigned long start_pte; | 559 | unsigned long start_pte; |
557 | unsigned long tmp1; | 560 | unsigned long tmp1; |
558 | unsigned long tmp2; | 561 | unsigned long tmp2; |
559 | unsigned long address; | 562 | unsigned long address; |
560 | unsigned long ro_start; | 563 | unsigned long ro_start; |
561 | unsigned long ro_end; | 564 | unsigned long ro_end; |
562 | unsigned long fv_addr; | 565 | unsigned long fv_addr; |
563 | unsigned long gw_addr; | 566 | unsigned long gw_addr; |
564 | extern const unsigned long fault_vector_20; | 567 | extern const unsigned long fault_vector_20; |
565 | extern void * const linux_gateway_page; | 568 | extern void * const linux_gateway_page; |
566 | 569 | ||
567 | ro_start = __pa((unsigned long)&_text); | 570 | ro_start = __pa((unsigned long)&_text); |
568 | ro_end = __pa((unsigned long)&data_start); | 571 | ro_end = __pa((unsigned long)&data_start); |
569 | fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; | 572 | fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; |
570 | gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; | 573 | gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; |
571 | 574 | ||
572 | end_paddr = start_paddr + size; | 575 | end_paddr = start_paddr + size; |
573 | 576 | ||
574 | pg_dir = pgd_offset_k(start_vaddr); | 577 | pg_dir = pgd_offset_k(start_vaddr); |
575 | 578 | ||
576 | #if PTRS_PER_PMD == 1 | 579 | #if PTRS_PER_PMD == 1 |
577 | start_pmd = 0; | 580 | start_pmd = 0; |
578 | #else | 581 | #else |
579 | start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); | 582 | start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); |
580 | #endif | 583 | #endif |
581 | start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); | 584 | start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); |
582 | 585 | ||
583 | address = start_paddr; | 586 | address = start_paddr; |
584 | while (address < end_paddr) { | 587 | while (address < end_paddr) { |
585 | #if PTRS_PER_PMD == 1 | 588 | #if PTRS_PER_PMD == 1 |
586 | pmd = (pmd_t *)__pa(pg_dir); | 589 | pmd = (pmd_t *)__pa(pg_dir); |
587 | #else | 590 | #else |
588 | pmd = (pmd_t *)pgd_address(*pg_dir); | 591 | pmd = (pmd_t *)pgd_address(*pg_dir); |
589 | 592 | ||
590 | /* | 593 | /* |
591 | * pmd is physical at this point | 594 | * pmd is physical at this point |
592 | */ | 595 | */ |
593 | 596 | ||
594 | if (!pmd) { | 597 | if (!pmd) { |
595 | pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER); | 598 | pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER); |
596 | pmd = (pmd_t *) __pa(pmd); | 599 | pmd = (pmd_t *) __pa(pmd); |
597 | } | 600 | } |
598 | 601 | ||
599 | pgd_populate(NULL, pg_dir, __va(pmd)); | 602 | pgd_populate(NULL, pg_dir, __va(pmd)); |
600 | #endif | 603 | #endif |
601 | pg_dir++; | 604 | pg_dir++; |
602 | 605 | ||
603 | /* now change pmd to kernel virtual addresses */ | 606 | /* now change pmd to kernel virtual addresses */ |
604 | 607 | ||
605 | pmd = (pmd_t *)__va(pmd) + start_pmd; | 608 | pmd = (pmd_t *)__va(pmd) + start_pmd; |
606 | for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) { | 609 | for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) { |
607 | 610 | ||
608 | /* | 611 | /* |
609 | * pg_table is physical at this point | 612 | * pg_table is physical at this point |
610 | */ | 613 | */ |
611 | 614 | ||
612 | pg_table = (pte_t *)pmd_address(*pmd); | 615 | pg_table = (pte_t *)pmd_address(*pmd); |
613 | if (!pg_table) { | 616 | if (!pg_table) { |
614 | pg_table = (pte_t *) | 617 | pg_table = (pte_t *) |
615 | alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE); | 618 | alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE); |
616 | pg_table = (pte_t *) __pa(pg_table); | 619 | pg_table = (pte_t *) __pa(pg_table); |
617 | } | 620 | } |
618 | 621 | ||
619 | pmd_populate_kernel(NULL, pmd, __va(pg_table)); | 622 | pmd_populate_kernel(NULL, pmd, __va(pg_table)); |
620 | 623 | ||
621 | /* now change pg_table to kernel virtual addresses */ | 624 | /* now change pg_table to kernel virtual addresses */ |
622 | 625 | ||
623 | pg_table = (pte_t *) __va(pg_table) + start_pte; | 626 | pg_table = (pte_t *) __va(pg_table) + start_pte; |
624 | for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) { | 627 | for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) { |
625 | pte_t pte; | 628 | pte_t pte; |
626 | 629 | ||
627 | /* | 630 | /* |
628 | * Map the fault vector writable so we can | 631 | * Map the fault vector writable so we can |
629 | * write the HPMC checksum. | 632 | * write the HPMC checksum. |
630 | */ | 633 | */ |
631 | if (address >= ro_start && address < ro_end | 634 | if (address >= ro_start && address < ro_end |
632 | && address != fv_addr | 635 | && address != fv_addr |
633 | && address != gw_addr) | 636 | && address != gw_addr) |
634 | pte = __mk_pte(address, PAGE_KERNEL_RO); | 637 | pte = __mk_pte(address, PAGE_KERNEL_RO); |
635 | else | 638 | else |
636 | pte = __mk_pte(address, pgprot); | 639 | pte = __mk_pte(address, pgprot); |
637 | 640 | ||
638 | if (address >= end_paddr) | 641 | if (address >= end_paddr) |
639 | pte_val(pte) = 0; | 642 | pte_val(pte) = 0; |
640 | 643 | ||
641 | set_pte(pg_table, pte); | 644 | set_pte(pg_table, pte); |
642 | 645 | ||
643 | address += PAGE_SIZE; | 646 | address += PAGE_SIZE; |
644 | } | 647 | } |
645 | start_pte = 0; | 648 | start_pte = 0; |
646 | 649 | ||
647 | if (address >= end_paddr) | 650 | if (address >= end_paddr) |
648 | break; | 651 | break; |
649 | } | 652 | } |
650 | start_pmd = 0; | 653 | start_pmd = 0; |
651 | } | 654 | } |
652 | } | 655 | } |
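map_pages() is a hand-rolled three-level walk: for each pgd slot it finds (or bootmem-allocates) a pmd table, for each pmd slot a pte table, and then fills ptes with __mk_pte(), demoting the kernel text range to PAGE_KERNEL_RO except for the fault vector and gateway pages, which must stay writable for the HPMC checksum and syscall entry. The starting slot at each level is just a shift-and-mask of the virtual address, which is worth seeing in isolation:

#include <asm/page.h>
#include <asm/pgtable.h>

/* Sketch: the start_pmd/start_pte computations from map_pages().
 * Masking with PTRS_PER_* - 1 keeps only the bits that index within a
 * single table at that level; on two-level configurations
 * (PTRS_PER_PMD == 1) the pmd index collapses to zero, matching the
 * #if in the source. */
static inline unsigned long pmd_slot(unsigned long vaddr)
{
	return (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

static inline unsigned long pte_slot(unsigned long vaddr)
{
	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}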
653 | 656 | ||
654 | /* | 657 | /* |
655 | * pagetable_init() sets up the page tables | 658 | * pagetable_init() sets up the page tables |
656 | * | 659 | * |
657 | * Note that gateway_init() places the Linux gateway page at page 0. | 660 | * Note that gateway_init() places the Linux gateway page at page 0. |
658 | * Since gateway pages cannot be dereferenced this has the desirable | 661 | * Since gateway pages cannot be dereferenced this has the desirable |
659 | * side effect of trapping those pesky NULL-reference errors in the | 662 | * side effect of trapping those pesky NULL-reference errors in the |
660 | * kernel. | 663 | * kernel. |
661 | */ | 664 | */ |
662 | static void __init pagetable_init(void) | 665 | static void __init pagetable_init(void) |
663 | { | 666 | { |
664 | int range; | 667 | int range; |
665 | 668 | ||
666 | /* Map each physical memory range to its kernel vaddr */ | 669 | /* Map each physical memory range to its kernel vaddr */ |
667 | 670 | ||
668 | for (range = 0; range < npmem_ranges; range++) { | 671 | for (range = 0; range < npmem_ranges; range++) { |
669 | unsigned long start_paddr; | 672 | unsigned long start_paddr; |
670 | unsigned long end_paddr; | 673 | unsigned long end_paddr; |
671 | unsigned long size; | 674 | unsigned long size; |
672 | 675 | ||
673 | start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT; | 676 | start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT; |
674 | end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT); | 677 | end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT); |
675 | size = pmem_ranges[range].pages << PAGE_SHIFT; | 678 | size = pmem_ranges[range].pages << PAGE_SHIFT; |
676 | 679 | ||
677 | map_pages((unsigned long)__va(start_paddr), start_paddr, | 680 | map_pages((unsigned long)__va(start_paddr), start_paddr, |
678 | size, PAGE_KERNEL); | 681 | size, PAGE_KERNEL); |
679 | } | 682 | } |
680 | 683 | ||
681 | #ifdef CONFIG_BLK_DEV_INITRD | 684 | #ifdef CONFIG_BLK_DEV_INITRD |
682 | if (initrd_end && initrd_end > mem_limit) { | 685 | if (initrd_end && initrd_end > mem_limit) { |
683 | printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end); | 686 | printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end); |
684 | map_pages(initrd_start, __pa(initrd_start), | 687 | map_pages(initrd_start, __pa(initrd_start), |
685 | initrd_end - initrd_start, PAGE_KERNEL); | 688 | initrd_end - initrd_start, PAGE_KERNEL); |
686 | } | 689 | } |
687 | #endif | 690 | #endif |
688 | 691 | ||
689 | empty_zero_page = alloc_bootmem_pages(PAGE_SIZE); | 692 | empty_zero_page = alloc_bootmem_pages(PAGE_SIZE); |
690 | memset(empty_zero_page, 0, PAGE_SIZE); | 693 | memset(empty_zero_page, 0, PAGE_SIZE); |
691 | } | 694 | } |
692 | 695 | ||
693 | static void __init gateway_init(void) | 696 | static void __init gateway_init(void) |
694 | { | 697 | { |
695 | unsigned long linux_gateway_page_addr; | 698 | unsigned long linux_gateway_page_addr; |
696 | /* FIXME: This is 'const' in order to trick the compiler | 699 | /* FIXME: This is 'const' in order to trick the compiler |
697 | into not treating it as DP-relative data. */ | 700 | into not treating it as DP-relative data. */ |
698 | extern void * const linux_gateway_page; | 701 | extern void * const linux_gateway_page; |
699 | 702 | ||
700 | linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK; | 703 | linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK; |
701 | 704 | ||
702 | /* | 705 | /* |
703 | * Setup Linux Gateway page. | 706 | * Setup Linux Gateway page. |
704 | * | 707 | * |
705 | * The Linux gateway page will reside in kernel space (on virtual | 708 | * The Linux gateway page will reside in kernel space (on virtual |
706 | * page 0), so it doesn't need to be aliased into user space. | 709 | * page 0), so it doesn't need to be aliased into user space. |
707 | */ | 710 | */ |
708 | 711 | ||
709 | map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page), | 712 | map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page), |
710 | PAGE_SIZE, PAGE_GATEWAY); | 713 | PAGE_SIZE, PAGE_GATEWAY); |
711 | } | 714 | } |
712 | 715 | ||
713 | #ifdef CONFIG_HPUX | 716 | #ifdef CONFIG_HPUX |
714 | void | 717 | void |
715 | map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm) | 718 | map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm) |
716 | { | 719 | { |
717 | pgd_t *pg_dir; | 720 | pgd_t *pg_dir; |
718 | pmd_t *pmd; | 721 | pmd_t *pmd; |
719 | pte_t *pg_table; | 722 | pte_t *pg_table; |
720 | unsigned long start_pmd; | 723 | unsigned long start_pmd; |
721 | unsigned long start_pte; | 724 | unsigned long start_pte; |
722 | unsigned long address; | 725 | unsigned long address; |
723 | unsigned long hpux_gw_page_addr; | 726 | unsigned long hpux_gw_page_addr; |
724 | /* FIXME: This is 'const' in order to trick the compiler | 727 | /* FIXME: This is 'const' in order to trick the compiler |
725 | into not treating it as DP-relative data. */ | 728 | into not treating it as DP-relative data. */ |
726 | extern void * const hpux_gateway_page; | 729 | extern void * const hpux_gateway_page; |
727 | 730 | ||
728 | hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK; | 731 | hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK; |
729 | 732 | ||
730 | /* | 733 | /* |
731 | * Setup HP-UX Gateway page. | 734 | * Setup HP-UX Gateway page. |
732 | * | 735 | * |
733 | * The HP-UX gateway page resides in the user address space, | 736 | * The HP-UX gateway page resides in the user address space, |
734 | * so it needs to be aliased into each process. | 737 | * so it needs to be aliased into each process. |
735 | */ | 738 | */ |
736 | 739 | ||
737 | pg_dir = pgd_offset(mm,hpux_gw_page_addr); | 740 | pg_dir = pgd_offset(mm,hpux_gw_page_addr); |
738 | 741 | ||
739 | #if PTRS_PER_PMD == 1 | 742 | #if PTRS_PER_PMD == 1 |
740 | start_pmd = 0; | 743 | start_pmd = 0; |
741 | #else | 744 | #else |
742 | start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); | 745 | start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); |
743 | #endif | 746 | #endif |
744 | start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); | 747 | start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); |
745 | 748 | ||
746 | address = __pa(&hpux_gateway_page); | 749 | address = __pa(&hpux_gateway_page); |
747 | #if PTRS_PER_PMD == 1 | 750 | #if PTRS_PER_PMD == 1 |
748 | pmd = (pmd_t *)__pa(pg_dir); | 751 | pmd = (pmd_t *)__pa(pg_dir); |
749 | #else | 752 | #else |
750 | pmd = (pmd_t *) pgd_address(*pg_dir); | 753 | pmd = (pmd_t *) pgd_address(*pg_dir); |
751 | 754 | ||
752 | /* | 755 | /* |
753 | * pmd is physical at this point | 756 | * pmd is physical at this point |
754 | */ | 757 | */ |
755 | 758 | ||
756 | if (!pmd) { | 759 | if (!pmd) { |
757 | pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL); | 760 | pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL); |
758 | pmd = (pmd_t *) __pa(pmd); | 761 | pmd = (pmd_t *) __pa(pmd); |
759 | } | 762 | } |
760 | 763 | ||
761 | __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd); | 764 | __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd); |
762 | #endif | 765 | #endif |
763 | /* now change pmd to kernel virtual addresses */ | 766 | /* now change pmd to kernel virtual addresses */ |
764 | 767 | ||
765 | pmd = (pmd_t *)__va(pmd) + start_pmd; | 768 | pmd = (pmd_t *)__va(pmd) + start_pmd; |
766 | 769 | ||
767 | /* | 770 | /* |
768 | * pg_table is physical at this point | 771 | * pg_table is physical at this point |
769 | */ | 772 | */ |
770 | 773 | ||
771 | pg_table = (pte_t *) pmd_address(*pmd); | 774 | pg_table = (pte_t *) pmd_address(*pmd); |
772 | if (!pg_table) | 775 | if (!pg_table) |
773 | pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL)); | 776 | pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL)); |
774 | 777 | ||
775 | __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table); | 778 | __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table); |
776 | 779 | ||
777 | /* now change pg_table to kernel virtual addresses */ | 780 | /* now change pg_table to kernel virtual addresses */ |
778 | 781 | ||
779 | pg_table = (pte_t *) __va(pg_table) + start_pte; | 782 | pg_table = (pte_t *) __va(pg_table) + start_pte; |
780 | set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY)); | 783 | set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY)); |
781 | } | 784 | } |
782 | EXPORT_SYMBOL(map_hpux_gateway_page); | 785 | EXPORT_SYMBOL(map_hpux_gateway_page); |
783 | #endif | 786 | #endif |
784 | 787 | ||
785 | extern void flush_tlb_all_local(void); | 788 | extern void flush_tlb_all_local(void); |
786 | 789 | ||
787 | void __init paging_init(void) | 790 | void __init paging_init(void) |
788 | { | 791 | { |
789 | int i; | 792 | int i; |
790 | 793 | ||
791 | setup_bootmem(); | 794 | setup_bootmem(); |
792 | pagetable_init(); | 795 | pagetable_init(); |
793 | gateway_init(); | 796 | gateway_init(); |
794 | flush_cache_all_local(); /* start with known state */ | 797 | flush_cache_all_local(); /* start with known state */ |
795 | flush_tlb_all_local(); | 798 | flush_tlb_all_local(); |
796 | 799 | ||
797 | for (i = 0; i < npmem_ranges; i++) { | 800 | for (i = 0; i < npmem_ranges; i++) { |
798 | unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 }; | 801 | unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 }; |
799 | 802 | ||
800 | /* We have an IOMMU, so all memory can go into a single | 803 | /* We have an IOMMU, so all memory can go into a single |
801 | ZONE_DMA zone. */ | 804 | ZONE_DMA zone. */ |
802 | zones_size[ZONE_DMA] = pmem_ranges[i].pages; | 805 | zones_size[ZONE_DMA] = pmem_ranges[i].pages; |
803 | 806 | ||
804 | #ifdef CONFIG_DISCONTIGMEM | 807 | #ifdef CONFIG_DISCONTIGMEM |
805 | /* Need to initialize the pfnnid_map before we can initialize | 808 | /* Need to initialize the pfnnid_map before we can initialize |
806 | the zone */ | 809 | the zone */ |
807 | { | 810 | { |
808 | int j; | 811 | int j; |
809 | for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT); | 812 | for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT); |
810 | j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT); | 813 | j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT); |
811 | j++) { | 814 | j++) { |
812 | pfnnid_map[j] = i; | 815 | pfnnid_map[j] = i; |
813 | } | 816 | } |
814 | } | 817 | } |
815 | #endif | 818 | #endif |
816 | 819 | ||
817 | free_area_init_node(i, NODE_DATA(i), zones_size, | 820 | free_area_init_node(i, NODE_DATA(i), zones_size, |
818 | pmem_ranges[i].start_pfn, NULL); | 821 | pmem_ranges[i].start_pfn, NULL); |
819 | } | 822 | } |
820 | } | 823 | } |
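On DISCONTIGMEM, pfnnid_map turns pfn-to-node lookup into simple table indexing: physical memory is viewed in PFNNID_SHIFT-sized chunks, each byte of the map names the owning node, and 0xff (the memset fill from setup_bootmem()) means unassigned. The fill loop above is inclusive at both ends so every chunk a range touches is covered. A sketch of the lookup this enables, assuming that table layout:

#include <asm/mmzone.h>

/* Sketch: resolve a pfn to its node via pfnnid_map, returning -1 for a
 * pfn outside every registered range. */
static inline int sketch_pfn_to_nid(unsigned long pfn)
{
	unsigned int i = pfn >> PFNNID_SHIFT;

	if (i >= PFNNID_MAP_MAX || pfnnid_map[i] == 0xff)
		return -1;
	return pfnnid_map[i];
}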
821 | 824 | ||
822 | #ifdef CONFIG_PA20 | 825 | #ifdef CONFIG_PA20 |
823 | 826 | ||
824 | /* | 827 | /* |
825 | * Currently, all PA20 chips have 18 bit protection id's, which is the | 828 | * Currently, all PA20 chips have 18 bit protection id's, which is the |
826 | * limiting factor (space ids are 32 bits). | 829 | * limiting factor (space ids are 32 bits). |
827 | */ | 830 | */ |
828 | 831 | ||
829 | #define NR_SPACE_IDS 262144 | 832 | #define NR_SPACE_IDS 262144 |
830 | 833 | ||
831 | #else | 834 | #else |
832 | 835 | ||
833 | /* | 836 | /* |
834 | * Currently we have a one-to-one relationship between space id's and | 837 | * Currently we have a one-to-one relationship between space id's and |
835 | * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only | 838 | * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only |
836 | * support 15 bit protection id's, so that is the limiting factor. | 839 | * support 15 bit protection id's, so that is the limiting factor. |
837 | * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's | 840 | * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's |
838 | * probably not worth the effort for a special case here. | 841 | * probably not worth the effort for a special case here. |
839 | */ | 842 | */ |
840 | 843 | ||
841 | #define NR_SPACE_IDS 32768 | 844 | #define NR_SPACE_IDS 32768 |
842 | 845 | ||
843 | #endif /* !CONFIG_PA20 */ | 846 | #endif /* !CONFIG_PA20 */ |
844 | 847 | ||
845 | #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2) | 848 | #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2) |
846 | #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long))) | 849 | #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long))) |
847 | 850 | ||
848 | static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */ | 851 | static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */ |
849 | static unsigned long dirty_space_id[SID_ARRAY_SIZE]; | 852 | static unsigned long dirty_space_id[SID_ARRAY_SIZE]; |
850 | static unsigned long space_id_index; | 853 | static unsigned long space_id_index; |
851 | static unsigned long free_space_ids = NR_SPACE_IDS - 1; | 854 | static unsigned long free_space_ids = NR_SPACE_IDS - 1; |
852 | static unsigned long dirty_space_ids = 0; | 855 | static unsigned long dirty_space_ids = 0; |
853 | 856 | ||
854 | static DEFINE_SPINLOCK(sid_lock); | 857 | static DEFINE_SPINLOCK(sid_lock); |
855 | 858 | ||
856 | unsigned long alloc_sid(void) | 859 | unsigned long alloc_sid(void) |
857 | { | 860 | { |
858 | unsigned long index; | 861 | unsigned long index; |
859 | 862 | ||
860 | spin_lock(&sid_lock); | 863 | spin_lock(&sid_lock); |
861 | 864 | ||
862 | if (free_space_ids == 0) { | 865 | if (free_space_ids == 0) { |
863 | if (dirty_space_ids != 0) { | 866 | if (dirty_space_ids != 0) { |
864 | spin_unlock(&sid_lock); | 867 | spin_unlock(&sid_lock); |
865 | flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */ | 868 | flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */ |
866 | spin_lock(&sid_lock); | 869 | spin_lock(&sid_lock); |
867 | } | 870 | } |
868 | if (free_space_ids == 0) | 871 | if (free_space_ids == 0) |
869 | BUG(); | 872 | BUG(); |
870 | } | 873 | } |
871 | 874 | ||
872 | free_space_ids--; | 875 | free_space_ids--; |
873 | 876 | ||
874 | index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index); | 877 | index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index); |
875 | space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1))); | 878 | space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1))); |
876 | space_id_index = index; | 879 | space_id_index = index; |
877 | 880 | ||
878 | spin_unlock(&sid_lock); | 881 | spin_unlock(&sid_lock); |
879 | 882 | ||
880 | return index << SPACEID_SHIFT; | 883 | return index << SPACEID_SHIFT; |
881 | } | 884 | } |
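alloc_sid() is a bitmap ID allocator: under sid_lock it finds the next zero bit at or after space_id_index, sets it, remembers the position as the new search hint, and returns the index shifted into the space-id field; on exhaustion it first forces flush_tlb_all(), which recycles any freed-but-dirty ids. A standalone sketch of the allocate side with a toy 64-entry map (unlike the original, which relies on recycling to reset the hint, this sketch wraps the search explicitly):

#include <linux/bitops.h>
#include <linux/spinlock.h>

#define NIDS 64		/* toy size; the real map holds NR_SPACE_IDS bits */

static DEFINE_SPINLOCK(id_lock);
static unsigned long id_map[NIDS / BITS_PER_LONG];
static unsigned long id_hint;	/* search start, like space_id_index */

/* Sketch: return an id in [0, NIDS), or -1 when the map is full. */
static long alloc_id(void)
{
	long id;

	spin_lock(&id_lock);
	id = find_next_zero_bit(id_map, NIDS, id_hint);
	if (id >= NIDS)					/* wrap once */
		id = find_next_zero_bit(id_map, NIDS, 0);
	if (id >= NIDS) {
		spin_unlock(&id_lock);
		return -1;				/* exhausted */
	}
	__set_bit(id, id_map);	/* non-atomic is fine: lock is held */
	id_hint = id;
	spin_unlock(&id_lock);
	return id;
}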
882 | 885 | ||
883 | void free_sid(unsigned long spaceid) | 886 | void free_sid(unsigned long spaceid) |
884 | { | 887 | { |
885 | unsigned long index = spaceid >> SPACEID_SHIFT; | 888 | unsigned long index = spaceid >> SPACEID_SHIFT; |
886 | unsigned long *dirty_space_offset; | 889 | unsigned long *dirty_space_offset; |
887 | 890 | ||
888 | dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG); | 891 | dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG); |
889 | index &= (BITS_PER_LONG - 1); | 892 | index &= (BITS_PER_LONG - 1); |
890 | 893 | ||
891 | spin_lock(&sid_lock); | 894 | spin_lock(&sid_lock); |
892 | 895 | ||
893 | if (*dirty_space_offset & (1L << index)) | 896 | if (*dirty_space_offset & (1L << index)) |
894 | BUG(); /* attempt to free space id twice */ | 897 | BUG(); /* attempt to free space id twice */ |
895 | 898 | ||
896 | *dirty_space_offset |= (1L << index); | 899 | *dirty_space_offset |= (1L << index); |
897 | dirty_space_ids++; | 900 | dirty_space_ids++; |
898 | 901 | ||
899 | spin_unlock(&sid_lock); | 902 | spin_unlock(&sid_lock); |
900 | } | 903 | } |
901 | 904 | ||
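alloc_sid()/free_sid() above implement a deferred-free bitmap allocator: allocation finds and sets the next clear bit starting from space_id_index, while freeing only marks the id in dirty_space_id, leaving its bit set in space_id until recycle_sids() runs after a full TLB flush. A minimal userspace sketch of the find-and-set half (find_next_zero() is a linear stand-in for the kernel's find_next_zero_bit()):

    #include <assert.h>
    #include <limits.h>
    #include <stddef.h>

    #define NR_IDS 32768
    #define BPL    (CHAR_BIT * sizeof(unsigned long))   /* bits per long */

    static unsigned long ids[NR_IDS / BPL] = { 1 };     /* id 0 disallowed */

    static size_t find_next_zero(const unsigned long *map, size_t size,
                                 size_t from)
    {
        for (size_t i = from; i < size; i++)
            if (!(map[i / BPL] & (1UL << (i % BPL))))
                return i;
        return size;
    }

    static size_t alloc_id(void)
    {
        size_t idx = find_next_zero(ids, NR_IDS, 0);
        assert(idx < NR_IDS);            /* caller guarantees a free id */
        ids[idx / BPL] |= 1UL << (idx % BPL);
        return idx;
    }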
902 | 905 | ||
903 | #ifdef CONFIG_SMP | 906 | #ifdef CONFIG_SMP |
904 | static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array) | 907 | static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array) |
905 | { | 908 | { |
906 | int i; | 909 | int i; |
907 | 910 | ||
908 | /* NOTE: sid_lock must be held upon entry */ | 911 | /* NOTE: sid_lock must be held upon entry */ |
909 | 912 | ||
910 | *ndirtyptr = dirty_space_ids; | 913 | *ndirtyptr = dirty_space_ids; |
911 | if (dirty_space_ids != 0) { | 914 | if (dirty_space_ids != 0) { |
912 | for (i = 0; i < SID_ARRAY_SIZE; i++) { | 915 | for (i = 0; i < SID_ARRAY_SIZE; i++) { |
913 | dirty_array[i] = dirty_space_id[i]; | 916 | dirty_array[i] = dirty_space_id[i]; |
914 | dirty_space_id[i] = 0; | 917 | dirty_space_id[i] = 0; |
915 | } | 918 | } |
916 | dirty_space_ids = 0; | 919 | dirty_space_ids = 0; |
917 | } | 920 | } |
918 | 921 | ||
919 | return; | 922 | return; |
920 | } | 923 | } |
921 | 924 | ||
922 | static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array) | 925 | static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array) |
923 | { | 926 | { |
924 | int i; | 927 | int i; |
925 | 928 | ||
926 | /* NOTE: sid_lock must be held upon entry */ | 929 | /* NOTE: sid_lock must be held upon entry */ |
927 | 930 | ||
928 | if (ndirty != 0) { | 931 | if (ndirty != 0) { |
929 | for (i = 0; i < SID_ARRAY_SIZE; i++) { | 932 | for (i = 0; i < SID_ARRAY_SIZE; i++) { |
930 | space_id[i] ^= dirty_array[i]; | 933 | space_id[i] ^= dirty_array[i]; |
931 | } | 934 | } |
932 | 935 | ||
933 | free_space_ids += ndirty; | 936 | free_space_ids += ndirty; |
934 | space_id_index = 0; | 937 | space_id_index = 0; |
935 | } | 938 | } |
936 | } | 939 | } |
937 | 940 | ||
938 | #else /* CONFIG_SMP */ | 941 | #else /* CONFIG_SMP */ |
939 | 942 | ||
940 | static void recycle_sids(void) | 943 | static void recycle_sids(void) |
941 | { | 944 | { |
942 | int i; | 945 | int i; |
943 | 946 | ||
944 | /* NOTE: sid_lock must be held upon entry */ | 947 | /* NOTE: sid_lock must be held upon entry */ |
945 | 948 | ||
946 | if (dirty_space_ids != 0) { | 949 | if (dirty_space_ids != 0) { |
947 | for (i = 0; i < SID_ARRAY_SIZE; i++) { | 950 | for (i = 0; i < SID_ARRAY_SIZE; i++) { |
948 | space_id[i] ^= dirty_space_id[i]; | 951 | space_id[i] ^= dirty_space_id[i]; |
949 | dirty_space_id[i] = 0; | 952 | dirty_space_id[i] = 0; |
950 | } | 953 | } |
951 | 954 | ||
952 | free_space_ids += dirty_space_ids; | 955 | free_space_ids += dirty_space_ids; |
953 | dirty_space_ids = 0; | 956 | dirty_space_ids = 0; |
954 | space_id_index = 0; | 957 | space_id_index = 0; |
955 | } | 958 | } |
956 | } | 959 | } |
957 | #endif | 960 | #endif |
958 | 961 | ||
959 | /* | 962 | /* |
960 | * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is | 963 | * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is |
961 | * purged, we can safely reuse the space ids that were released but | 964 | * purged, we can safely reuse the space ids that were released but |
962 | * not flushed from the tlb. | 965 | * not flushed from the tlb. |
963 | */ | 966 | */ |
964 | 967 | ||
965 | #ifdef CONFIG_SMP | 968 | #ifdef CONFIG_SMP |
966 | 969 | ||
967 | static unsigned long recycle_ndirty; | 970 | static unsigned long recycle_ndirty; |
968 | static unsigned long recycle_dirty_array[SID_ARRAY_SIZE]; | 971 | static unsigned long recycle_dirty_array[SID_ARRAY_SIZE]; |
969 | static unsigned int recycle_inuse = 0; | 972 | static unsigned int recycle_inuse = 0; |
970 | 973 | ||
971 | void flush_tlb_all(void) | 974 | void flush_tlb_all(void) |
972 | { | 975 | { |
973 | int do_recycle; | 976 | int do_recycle; |
974 | 977 | ||
975 | do_recycle = 0; | 978 | do_recycle = 0; |
976 | spin_lock(&sid_lock); | 979 | spin_lock(&sid_lock); |
977 | if (dirty_space_ids > RECYCLE_THRESHOLD) { | 980 | if (dirty_space_ids > RECYCLE_THRESHOLD) { |
978 | if (recycle_inuse) { | 981 | if (recycle_inuse) { |
979 | BUG(); /* FIXME: Use a semaphore/wait queue here */ | 982 | BUG(); /* FIXME: Use a semaphore/wait queue here */ |
980 | } | 983 | } |
981 | get_dirty_sids(&recycle_ndirty,recycle_dirty_array); | 984 | get_dirty_sids(&recycle_ndirty,recycle_dirty_array); |
982 | recycle_inuse++; | 985 | recycle_inuse++; |
983 | do_recycle++; | 986 | do_recycle++; |
984 | } | 987 | } |
985 | spin_unlock(&sid_lock); | 988 | spin_unlock(&sid_lock); |
986 | on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1); | 989 | on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1); |
987 | if (do_recycle) { | 990 | if (do_recycle) { |
988 | spin_lock(&sid_lock); | 991 | spin_lock(&sid_lock); |
989 | recycle_sids(recycle_ndirty,recycle_dirty_array); | 992 | recycle_sids(recycle_ndirty,recycle_dirty_array); |
990 | recycle_inuse = 0; | 993 | recycle_inuse = 0; |
991 | spin_unlock(&sid_lock); | 994 | spin_unlock(&sid_lock); |
992 | } | 995 | } |
993 | } | 996 | } |
994 | #else | 997 | #else |
995 | void flush_tlb_all(void) | 998 | void flush_tlb_all(void) |
996 | { | 999 | { |
997 | spin_lock(&sid_lock); | 1000 | spin_lock(&sid_lock); |
998 | flush_tlb_all_local(); | 1001 | flush_tlb_all_local(); |
999 | recycle_sids(); | 1002 | recycle_sids(); |
1000 | spin_unlock(&sid_lock); | 1003 | spin_unlock(&sid_lock); |
1001 | } | 1004 | } |
1002 | #endif | 1005 | #endif |
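Note the shape of the SMP flush_tlb_all() above: sid_lock is a spinlock, so it cannot be held across on_each_cpu(), and the dirty set is therefore snapshotted into recycle_ndirty/recycle_dirty_array first, flushed without the lock, and only then folded back into space_id. A hedged userspace sketch of that snapshot / unlocked-flush / merge shape, with a pthread mutex standing in for the spinlock and do_flush() a stand-in for the cross-CPU TLB flush:

    #include <pthread.h>
    #include <string.h>

    #define WORDS 512

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long dirty[WORDS], freemap[WORDS], ndirty;

    static void do_flush(void) { /* stand-in for the slow, unlocked phase */ }

    static void flush_and_recycle(void)
    {
        unsigned long snap[WORDS], n;

        pthread_mutex_lock(&lock);           /* phase 1: snapshot */
        n = ndirty;
        memcpy(snap, dirty, sizeof(snap));
        memset(dirty, 0, sizeof(dirty));
        ndirty = 0;
        pthread_mutex_unlock(&lock);

        do_flush();                          /* phase 2: no lock held */

        pthread_mutex_lock(&lock);           /* phase 3: merge back */
        if (n)
            for (int i = 0; i < WORDS; i++)
                freemap[i] ^= snap[i];       /* release the flushed ids */
        pthread_mutex_unlock(&lock);
    }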
1003 | 1006 | ||
1004 | #ifdef CONFIG_BLK_DEV_INITRD | 1007 | #ifdef CONFIG_BLK_DEV_INITRD |
1005 | void free_initrd_mem(unsigned long start, unsigned long end) | 1008 | void free_initrd_mem(unsigned long start, unsigned long end) |
1006 | { | 1009 | { |
1007 | #if 0 | 1010 | #if 0 |
1008 | if (start < end) | 1011 | if (start < end) |
1009 | printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | 1012 | printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); |
1010 | for (; start < end; start += PAGE_SIZE) { | 1013 | for (; start < end; start += PAGE_SIZE) { |
1011 | ClearPageReserved(virt_to_page(start)); | 1014 | ClearPageReserved(virt_to_page(start)); |
1012 | set_page_count(virt_to_page(start), 1); | 1015 | set_page_count(virt_to_page(start), 1); |
1013 | free_page(start); | 1016 | free_page(start); |
1014 | num_physpages++; | 1017 | num_physpages++; |
1015 | totalram_pages++; | 1018 | totalram_pages++; |
1016 | } | 1019 | } |
1017 | #endif | 1020 | #endif |
1018 | } | 1021 | } |
1019 | #endif | 1022 | #endif |
1020 | 1023 |
arch/ppc64/mm/init.c
1 | /* | 1 | /* |
2 | * PowerPC version | 2 | * PowerPC version |
3 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | 3 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
4 | * | 4 | * |
5 | * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) | 5 | * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) |
6 | * and Cort Dougan (PReP) (cort@cs.nmt.edu) | 6 | * and Cort Dougan (PReP) (cort@cs.nmt.edu) |
7 | * Copyright (C) 1996 Paul Mackerras | 7 | * Copyright (C) 1996 Paul Mackerras |
8 | * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). | 8 | * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). |
9 | * | 9 | * |
10 | * Derived from "arch/i386/mm/init.c" | 10 | * Derived from "arch/i386/mm/init.c" |
11 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 11 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
12 | * | 12 | * |
13 | * Dave Engebretsen <engebret@us.ibm.com> | 13 | * Dave Engebretsen <engebret@us.ibm.com> |
14 | * Rework for PPC64 port. | 14 | * Rework for PPC64 port. |
15 | * | 15 | * |
16 | * This program is free software; you can redistribute it and/or | 16 | * This program is free software; you can redistribute it and/or |
17 | * modify it under the terms of the GNU General Public License | 17 | * modify it under the terms of the GNU General Public License |
18 | * as published by the Free Software Foundation; either version | 18 | * as published by the Free Software Foundation; either version |
19 | * 2 of the License, or (at your option) any later version. | 19 | * 2 of the License, or (at your option) any later version. |
20 | * | 20 | * |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/config.h> | 23 | #include <linux/config.h> |
24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
25 | #include <linux/sched.h> | 25 | #include <linux/sched.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/errno.h> | 27 | #include <linux/errno.h> |
28 | #include <linux/string.h> | 28 | #include <linux/string.h> |
29 | #include <linux/types.h> | 29 | #include <linux/types.h> |
30 | #include <linux/mman.h> | 30 | #include <linux/mman.h> |
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | #include <linux/stddef.h> | 33 | #include <linux/stddef.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | #include <linux/bootmem.h> | 37 | #include <linux/bootmem.h> |
38 | #include <linux/highmem.h> | 38 | #include <linux/highmem.h> |
39 | #include <linux/idr.h> | 39 | #include <linux/idr.h> |
40 | #include <linux/nodemask.h> | 40 | #include <linux/nodemask.h> |
41 | #include <linux/module.h> | 41 | #include <linux/module.h> |
42 | 42 | ||
43 | #include <asm/pgalloc.h> | 43 | #include <asm/pgalloc.h> |
44 | #include <asm/page.h> | 44 | #include <asm/page.h> |
45 | #include <asm/prom.h> | 45 | #include <asm/prom.h> |
46 | #include <asm/lmb.h> | 46 | #include <asm/lmb.h> |
47 | #include <asm/rtas.h> | 47 | #include <asm/rtas.h> |
48 | #include <asm/io.h> | 48 | #include <asm/io.h> |
49 | #include <asm/mmu_context.h> | 49 | #include <asm/mmu_context.h> |
50 | #include <asm/pgtable.h> | 50 | #include <asm/pgtable.h> |
51 | #include <asm/mmu.h> | 51 | #include <asm/mmu.h> |
52 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
53 | #include <asm/smp.h> | 53 | #include <asm/smp.h> |
54 | #include <asm/machdep.h> | 54 | #include <asm/machdep.h> |
55 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
56 | #include <asm/eeh.h> | 56 | #include <asm/eeh.h> |
57 | #include <asm/processor.h> | 57 | #include <asm/processor.h> |
58 | #include <asm/mmzone.h> | 58 | #include <asm/mmzone.h> |
59 | #include <asm/cputable.h> | 59 | #include <asm/cputable.h> |
60 | #include <asm/ppcdebug.h> | 60 | #include <asm/ppcdebug.h> |
61 | #include <asm/sections.h> | 61 | #include <asm/sections.h> |
62 | #include <asm/system.h> | 62 | #include <asm/system.h> |
63 | #include <asm/iommu.h> | 63 | #include <asm/iommu.h> |
64 | #include <asm/abs_addr.h> | 64 | #include <asm/abs_addr.h> |
65 | #include <asm/vdso.h> | 65 | #include <asm/vdso.h> |
66 | #include <asm/imalloc.h> | 66 | #include <asm/imalloc.h> |
67 | 67 | ||
68 | #if PGTABLE_RANGE > USER_VSID_RANGE | 68 | #if PGTABLE_RANGE > USER_VSID_RANGE |
69 | #warning Limited user VSID range means pagetable space is wasted | 69 | #warning Limited user VSID range means pagetable space is wasted |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) | 72 | #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) |
73 | #warning TASK_SIZE is smaller than it needs to be. | 73 | #warning TASK_SIZE is smaller than it needs to be. |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | int mem_init_done; | 76 | int mem_init_done; |
77 | unsigned long ioremap_bot = IMALLOC_BASE; | 77 | unsigned long ioremap_bot = IMALLOC_BASE; |
78 | static unsigned long phbs_io_bot = PHBS_IO_BASE; | 78 | static unsigned long phbs_io_bot = PHBS_IO_BASE; |
79 | 79 | ||
80 | extern pgd_t swapper_pg_dir[]; | 80 | extern pgd_t swapper_pg_dir[]; |
81 | extern struct task_struct *current_set[NR_CPUS]; | 81 | extern struct task_struct *current_set[NR_CPUS]; |
82 | 82 | ||
83 | unsigned long klimit = (unsigned long)_end; | 83 | unsigned long klimit = (unsigned long)_end; |
84 | 84 | ||
85 | unsigned long _SDR1=0; | 85 | unsigned long _SDR1=0; |
86 | unsigned long _ASR=0; | 86 | unsigned long _ASR=0; |
87 | 87 | ||
88 | /* max amount of RAM to use */ | 88 | /* max amount of RAM to use */ |
89 | unsigned long __max_memory; | 89 | unsigned long __max_memory; |
90 | 90 | ||
91 | /* info on what we think the IO hole is */ | 91 | /* info on what we think the IO hole is */ |
92 | unsigned long io_hole_start; | 92 | unsigned long io_hole_start; |
93 | unsigned long io_hole_size; | 93 | unsigned long io_hole_size; |
94 | 94 | ||
95 | void show_mem(void) | 95 | void show_mem(void) |
96 | { | 96 | { |
97 | unsigned long total = 0, reserved = 0; | 97 | unsigned long total = 0, reserved = 0; |
98 | unsigned long shared = 0, cached = 0; | 98 | unsigned long shared = 0, cached = 0; |
99 | struct page *page; | 99 | struct page *page; |
100 | pg_data_t *pgdat; | 100 | pg_data_t *pgdat; |
101 | unsigned long i; | 101 | unsigned long i; |
102 | 102 | ||
103 | printk("Mem-info:\n"); | 103 | printk("Mem-info:\n"); |
104 | show_free_areas(); | 104 | show_free_areas(); |
105 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 105 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
106 | for_each_pgdat(pgdat) { | 106 | for_each_pgdat(pgdat) { |
107 | unsigned long flags; | ||
108 | pgdat_resize_lock(pgdat, &flags); | ||
107 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | 109 | for (i = 0; i < pgdat->node_spanned_pages; i++) { |
108 | page = pgdat_page_nr(pgdat, i); | 110 | page = pgdat_page_nr(pgdat, i); |
109 | total++; | 111 | total++; |
110 | if (PageReserved(page)) | 112 | if (PageReserved(page)) |
111 | reserved++; | 113 | reserved++; |
112 | else if (PageSwapCache(page)) | 114 | else if (PageSwapCache(page)) |
113 | cached++; | 115 | cached++; |
114 | else if (page_count(page)) | 116 | else if (page_count(page)) |
115 | shared += page_count(page) - 1; | 117 | shared += page_count(page) - 1; |
116 | } | 118 | } |
119 | pgdat_resize_unlock(pgdat, &flags); | ||
117 | } | 120 | } |
118 | printk("%ld pages of RAM\n", total); | 121 | printk("%ld pages of RAM\n", total); |
119 | printk("%ld reserved pages\n", reserved); | 122 | printk("%ld reserved pages\n", reserved); |
120 | printk("%ld pages shared\n", shared); | 123 | printk("%ld pages shared\n", shared); |
121 | printk("%ld pages swap cached\n", cached); | 124 | printk("%ld pages swap cached\n", cached); |
122 | } | 125 | } |
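This hunk is the commit's actual change to ppc64 (new-side lines 107-108 and 119): the pfn walk in show_mem() touches struct pages it holds no reference on, so it is now bracketed by pgdat_resize_lock()/pgdat_resize_unlock(). The wrappers themselves are defined elsewhere in this patch; a sketch of their presumed shape, taking the node_size_lock named in the commit message with interrupts disabled:

    /* Presumed shape of the wrappers introduced by this patch. */
    static inline void pgdat_resize_lock(struct pglist_data *pgdat,
                                         unsigned long *flags)
    {
        spin_lock_irqsave(&pgdat->node_size_lock, *flags);
    }

    static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
                                           unsigned long *flags)
    {
        spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
    }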
123 | 126 | ||
124 | #ifdef CONFIG_PPC_ISERIES | 127 | #ifdef CONFIG_PPC_ISERIES |
125 | 128 | ||
126 | void __iomem *ioremap(unsigned long addr, unsigned long size) | 129 | void __iomem *ioremap(unsigned long addr, unsigned long size) |
127 | { | 130 | { |
128 | return (void __iomem *)addr; | 131 | return (void __iomem *)addr; |
129 | } | 132 | } |
130 | 133 | ||
131 | extern void __iomem *__ioremap(unsigned long addr, unsigned long size, | 134 | extern void __iomem *__ioremap(unsigned long addr, unsigned long size, |
132 | unsigned long flags) | 135 | unsigned long flags) |
133 | { | 136 | { |
134 | return (void __iomem *)addr; | 137 | return (void __iomem *)addr; |
135 | } | 138 | } |
136 | 139 | ||
137 | void iounmap(volatile void __iomem *addr) | 140 | void iounmap(volatile void __iomem *addr) |
138 | { | 141 | { |
139 | return; | 142 | return; |
140 | } | 143 | } |
141 | 144 | ||
142 | #else | 145 | #else |
143 | 146 | ||
144 | /* | 147 | /* |
145 | * map_io_page is currently only called by __ioremap | 148 | * map_io_page is currently only called by __ioremap |
146 | * map_io_page adds an entry to the ioremap page table | 149 | * map_io_page adds an entry to the ioremap page table |
147 | * and adds an entry to the HPT, possibly bolting it | 150 | * and adds an entry to the HPT, possibly bolting it |
148 | */ | 151 | */ |
149 | static int map_io_page(unsigned long ea, unsigned long pa, int flags) | 152 | static int map_io_page(unsigned long ea, unsigned long pa, int flags) |
150 | { | 153 | { |
151 | pgd_t *pgdp; | 154 | pgd_t *pgdp; |
152 | pud_t *pudp; | 155 | pud_t *pudp; |
153 | pmd_t *pmdp; | 156 | pmd_t *pmdp; |
154 | pte_t *ptep; | 157 | pte_t *ptep; |
155 | unsigned long vsid; | 158 | unsigned long vsid; |
156 | 159 | ||
157 | if (mem_init_done) { | 160 | if (mem_init_done) { |
158 | pgdp = pgd_offset_k(ea); | 161 | pgdp = pgd_offset_k(ea); |
159 | pudp = pud_alloc(&init_mm, pgdp, ea); | 162 | pudp = pud_alloc(&init_mm, pgdp, ea); |
160 | if (!pudp) | 163 | if (!pudp) |
161 | return -ENOMEM; | 164 | return -ENOMEM; |
162 | pmdp = pmd_alloc(&init_mm, pudp, ea); | 165 | pmdp = pmd_alloc(&init_mm, pudp, ea); |
163 | if (!pmdp) | 166 | if (!pmdp) |
164 | return -ENOMEM; | 167 | return -ENOMEM; |
165 | ptep = pte_alloc_kernel(pmdp, ea); | 168 | ptep = pte_alloc_kernel(pmdp, ea); |
166 | if (!ptep) | 169 | if (!ptep) |
167 | return -ENOMEM; | 170 | return -ENOMEM; |
168 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, | 171 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, |
169 | __pgprot(flags))); | 172 | __pgprot(flags))); |
170 | } else { | 173 | } else { |
171 | unsigned long va, vpn, hash, hpteg; | 174 | unsigned long va, vpn, hash, hpteg; |
172 | 175 | ||
173 | /* | 176 | /* |
174 | * If the mm subsystem is not fully up, we cannot create a | 177 | * If the mm subsystem is not fully up, we cannot create a |
175 | * linux page table entry for this mapping. Simply bolt an | 178 | * linux page table entry for this mapping. Simply bolt an |
176 | * entry in the hardware page table. | 179 | * entry in the hardware page table. |
177 | */ | 180 | */ |
178 | vsid = get_kernel_vsid(ea); | 181 | vsid = get_kernel_vsid(ea); |
179 | va = (vsid << 28) | (ea & 0xFFFFFFF); | 182 | va = (vsid << 28) | (ea & 0xFFFFFFF); |
180 | vpn = va >> PAGE_SHIFT; | 183 | vpn = va >> PAGE_SHIFT; |
181 | 184 | ||
182 | hash = hpt_hash(vpn, 0); | 185 | hash = hpt_hash(vpn, 0); |
183 | 186 | ||
184 | hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); | 187 | hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); |
185 | 188 | ||
186 | /* Panic if a pte group is full */ | 189 | /* Panic if a pte group is full */ |
187 | if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, | 190 | if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, |
188 | HPTE_V_BOLTED, | 191 | HPTE_V_BOLTED, |
189 | _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) | 192 | _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) |
190 | == -1) { | 193 | == -1) { |
191 | panic("map_io_page: could not insert mapping"); | 194 | panic("map_io_page: could not insert mapping"); |
192 | } | 195 | } |
193 | } | 196 | } |
194 | return 0; | 197 | return 0; |
195 | } | 198 | } |
196 | 199 | ||
197 | 200 | ||
198 | static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, | 201 | static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, |
199 | unsigned long ea, unsigned long size, | 202 | unsigned long ea, unsigned long size, |
200 | unsigned long flags) | 203 | unsigned long flags) |
201 | { | 204 | { |
202 | unsigned long i; | 205 | unsigned long i; |
203 | 206 | ||
204 | if ((flags & _PAGE_PRESENT) == 0) | 207 | if ((flags & _PAGE_PRESENT) == 0) |
205 | flags |= pgprot_val(PAGE_KERNEL); | 208 | flags |= pgprot_val(PAGE_KERNEL); |
206 | 209 | ||
207 | for (i = 0; i < size; i += PAGE_SIZE) | 210 | for (i = 0; i < size; i += PAGE_SIZE) |
208 | if (map_io_page(ea+i, pa+i, flags)) | 211 | if (map_io_page(ea+i, pa+i, flags)) |
209 | return NULL; | 212 | return NULL; |
210 | 213 | ||
211 | return (void __iomem *) (ea + (addr & ~PAGE_MASK)); | 214 | return (void __iomem *) (ea + (addr & ~PAGE_MASK)); |
212 | } | 215 | } |
213 | 216 | ||
214 | 217 | ||
215 | void __iomem * | 218 | void __iomem * |
216 | ioremap(unsigned long addr, unsigned long size) | 219 | ioremap(unsigned long addr, unsigned long size) |
217 | { | 220 | { |
218 | return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); | 221 | return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); |
219 | } | 222 | } |
220 | 223 | ||
221 | void __iomem * __ioremap(unsigned long addr, unsigned long size, | 224 | void __iomem * __ioremap(unsigned long addr, unsigned long size, |
222 | unsigned long flags) | 225 | unsigned long flags) |
223 | { | 226 | { |
224 | unsigned long pa, ea; | 227 | unsigned long pa, ea; |
225 | void __iomem *ret; | 228 | void __iomem *ret; |
226 | 229 | ||
227 | /* | 230 | /* |
228 | * Choose an address to map it to. | 231 | * Choose an address to map it to. |
229 | * Once the imalloc system is running, we use it. | 232 | * Once the imalloc system is running, we use it. |
230 | * Before that, we map using addresses going | 233 | * Before that, we map using addresses going |
231 | * up from ioremap_bot. imalloc will use | 234 | * up from ioremap_bot. imalloc will use |
232 | * the addresses from ioremap_bot through | 235 | * the addresses from ioremap_bot through |
233 | * IMALLOC_END | 236 | * IMALLOC_END |
234 | * | 237 | * |
235 | */ | 238 | */ |
236 | pa = addr & PAGE_MASK; | 239 | pa = addr & PAGE_MASK; |
237 | size = PAGE_ALIGN(addr + size) - pa; | 240 | size = PAGE_ALIGN(addr + size) - pa; |
238 | 241 | ||
239 | if (size == 0) | 242 | if (size == 0) |
240 | return NULL; | 243 | return NULL; |
241 | 244 | ||
242 | if (mem_init_done) { | 245 | if (mem_init_done) { |
243 | struct vm_struct *area; | 246 | struct vm_struct *area; |
244 | area = im_get_free_area(size); | 247 | area = im_get_free_area(size); |
245 | if (area == NULL) | 248 | if (area == NULL) |
246 | return NULL; | 249 | return NULL; |
247 | ea = (unsigned long)(area->addr); | 250 | ea = (unsigned long)(area->addr); |
248 | ret = __ioremap_com(addr, pa, ea, size, flags); | 251 | ret = __ioremap_com(addr, pa, ea, size, flags); |
249 | if (!ret) | 252 | if (!ret) |
250 | im_free(area->addr); | 253 | im_free(area->addr); |
251 | } else { | 254 | } else { |
252 | ea = ioremap_bot; | 255 | ea = ioremap_bot; |
253 | ret = __ioremap_com(addr, pa, ea, size, flags); | 256 | ret = __ioremap_com(addr, pa, ea, size, flags); |
254 | if (ret) | 257 | if (ret) |
255 | ioremap_bot += size; | 258 | ioremap_bot += size; |
256 | } | 259 | } |
257 | return ret; | 260 | return ret; |
258 | } | 261 | } |
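For reference, the usual driver-side pairing of the two functions above, as a sketch (the physical address and register offset are invented):

    /* Usage sketch: map a device, read one register, unmap. */
    #define DEV_PHYS   0xf2000000UL    /* invented MMIO base */
    #define REG_STATUS 0x10            /* invented register offset */

    static int probe_sketch(void)
    {
        void __iomem *regs = ioremap(DEV_PHYS, 0x1000);
        unsigned int status;

        if (!regs)
            return -ENOMEM;
        status = readl(regs + REG_STATUS);
        iounmap(regs);
        return status ? 0 : -EIO;
    }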
259 | 262 | ||
260 | #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) | 263 | #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) |
261 | 264 | ||
262 | int __ioremap_explicit(unsigned long pa, unsigned long ea, | 265 | int __ioremap_explicit(unsigned long pa, unsigned long ea, |
263 | unsigned long size, unsigned long flags) | 266 | unsigned long size, unsigned long flags) |
264 | { | 267 | { |
265 | struct vm_struct *area; | 268 | struct vm_struct *area; |
266 | void __iomem *ret; | 269 | void __iomem *ret; |
267 | 270 | ||
268 | /* For now, require page-aligned values for pa, ea, and size */ | 271 | /* For now, require page-aligned values for pa, ea, and size */ |
269 | if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || | 272 | if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || |
270 | !IS_PAGE_ALIGNED(size)) { | 273 | !IS_PAGE_ALIGNED(size)) { |
271 | printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); | 274 | printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); |
272 | return 1; | 275 | return 1; |
273 | } | 276 | } |
274 | 277 | ||
275 | if (!mem_init_done) { | 278 | if (!mem_init_done) { |
276 | /* Two things to consider in this case: | 279 | /* Two things to consider in this case: |
277 | * 1) No records will be kept (imalloc, etc) that the region | 280 | * 1) No records will be kept (imalloc, etc) that the region |
278 | * has been remapped | 281 | * has been remapped |
279 | * 2) It won't be easy to iounmap() the region later (because | 282 | * 2) It won't be easy to iounmap() the region later (because |
280 | * of 1) | 283 | * of 1) |
281 | */ | 284 | */ |
282 | ; | 285 | ; |
283 | } else { | 286 | } else { |
284 | area = im_get_area(ea, size, | 287 | area = im_get_area(ea, size, |
285 | IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); | 288 | IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); |
286 | if (area == NULL) { | 289 | if (area == NULL) { |
287 | /* Expected when PHB-dlpar is in play */ | 290 | /* Expected when PHB-dlpar is in play */ |
288 | return 1; | 291 | return 1; |
289 | } | 292 | } |
290 | if (ea != (unsigned long) area->addr) { | 293 | if (ea != (unsigned long) area->addr) { |
291 | printk(KERN_ERR "unexpected addr return from " | 294 | printk(KERN_ERR "unexpected addr return from " |
292 | "im_get_area\n"); | 295 | "im_get_area\n"); |
293 | return 1; | 296 | return 1; |
294 | } | 297 | } |
295 | } | 298 | } |
296 | 299 | ||
297 | ret = __ioremap_com(pa, pa, ea, size, flags); | 300 | ret = __ioremap_com(pa, pa, ea, size, flags); |
298 | if (ret == NULL) { | 301 | if (ret == NULL) { |
299 | printk(KERN_ERR "ioremap_explicit() allocation failure!\n"); | 302 | printk(KERN_ERR "ioremap_explicit() allocation failure!\n"); |
300 | return 1; | 303 | return 1; |
301 | } | 304 | } |
302 | if (ret != (void *) ea) { | 305 | if (ret != (void *) ea) { |
303 | printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); | 306 | printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); |
304 | return 1; | 307 | return 1; |
305 | } | 308 | } |
306 | 309 | ||
307 | return 0; | 310 | return 0; |
308 | } | 311 | } |
309 | 312 | ||
310 | /* | 313 | /* |
311 | * Unmap an IO region and remove it from imalloc'd list. | 314 | * Unmap an IO region and remove it from imalloc'd list. |
312 | * Access to IO memory should be serialized by driver. | 315 | * Access to IO memory should be serialized by driver. |
313 | * This code is modeled after vmalloc code - unmap_vm_area() | 316 | * This code is modeled after vmalloc code - unmap_vm_area() |
314 | * | 317 | * |
315 | * XXX what about calls before mem_init_done (ie python_countermeasures()) | 318 | * XXX what about calls before mem_init_done (ie python_countermeasures()) |
316 | */ | 319 | */ |
317 | void iounmap(volatile void __iomem *token) | 320 | void iounmap(volatile void __iomem *token) |
318 | { | 321 | { |
319 | void *addr; | 322 | void *addr; |
320 | 323 | ||
321 | if (!mem_init_done) | 324 | if (!mem_init_done) |
322 | return; | 325 | return; |
323 | 326 | ||
324 | addr = (void *) ((unsigned long __force) token & PAGE_MASK); | 327 | addr = (void *) ((unsigned long __force) token & PAGE_MASK); |
325 | 328 | ||
326 | im_free(addr); | 329 | im_free(addr); |
327 | } | 330 | } |
328 | 331 | ||
329 | static int iounmap_subset_regions(unsigned long addr, unsigned long size) | 332 | static int iounmap_subset_regions(unsigned long addr, unsigned long size) |
330 | { | 333 | { |
331 | struct vm_struct *area; | 334 | struct vm_struct *area; |
332 | 335 | ||
333 | /* Check whether subsets of this region exist */ | 336 | /* Check whether subsets of this region exist */ |
334 | area = im_get_area(addr, size, IM_REGION_SUPERSET); | 337 | area = im_get_area(addr, size, IM_REGION_SUPERSET); |
335 | if (area == NULL) | 338 | if (area == NULL) |
336 | return 1; | 339 | return 1; |
337 | 340 | ||
338 | while (area) { | 341 | while (area) { |
339 | iounmap((void __iomem *) area->addr); | 342 | iounmap((void __iomem *) area->addr); |
340 | area = im_get_area(addr, size, | 343 | area = im_get_area(addr, size, |
341 | IM_REGION_SUPERSET); | 344 | IM_REGION_SUPERSET); |
342 | } | 345 | } |
343 | 346 | ||
344 | return 0; | 347 | return 0; |
345 | } | 348 | } |
346 | 349 | ||
347 | int iounmap_explicit(volatile void __iomem *start, unsigned long size) | 350 | int iounmap_explicit(volatile void __iomem *start, unsigned long size) |
348 | { | 351 | { |
349 | struct vm_struct *area; | 352 | struct vm_struct *area; |
350 | unsigned long addr; | 353 | unsigned long addr; |
351 | int rc; | 354 | int rc; |
352 | 355 | ||
353 | addr = (unsigned long __force) start & PAGE_MASK; | 356 | addr = (unsigned long __force) start & PAGE_MASK; |
354 | 357 | ||
355 | /* Verify that the region either exists or is a subset of an existing | 358 | /* Verify that the region either exists or is a subset of an existing |
356 | * region. In the latter case, split the parent region to create | 359 | * region. In the latter case, split the parent region to create |
357 | * the exact region | 360 | * the exact region |
358 | */ | 361 | */ |
359 | area = im_get_area(addr, size, | 362 | area = im_get_area(addr, size, |
360 | IM_REGION_EXISTS | IM_REGION_SUBSET); | 363 | IM_REGION_EXISTS | IM_REGION_SUBSET); |
361 | if (area == NULL) { | 364 | if (area == NULL) { |
362 | /* Determine whether subset regions exist. If so, unmap */ | 365 | /* Determine whether subset regions exist. If so, unmap */ |
363 | rc = iounmap_subset_regions(addr, size); | 366 | rc = iounmap_subset_regions(addr, size); |
364 | if (rc) { | 367 | if (rc) { |
365 | printk(KERN_ERR | 368 | printk(KERN_ERR |
366 | "%s() cannot unmap nonexistent range 0x%lx\n", | 369 | "%s() cannot unmap nonexistent range 0x%lx\n", |
367 | __FUNCTION__, addr); | 370 | __FUNCTION__, addr); |
368 | return 1; | 371 | return 1; |
369 | } | 372 | } |
370 | } else { | 373 | } else { |
371 | iounmap((void __iomem *) area->addr); | 374 | iounmap((void __iomem *) area->addr); |
372 | } | 375 | } |
373 | /* | 376 | /* |
374 | * FIXME! This can't be right: | 377 | * FIXME! This can't be right: |
375 | iounmap(area->addr); | 378 | iounmap(area->addr); |
376 | * Maybe it should be "iounmap(area);" | 379 | * Maybe it should be "iounmap(area);" |
377 | */ | 380 | */ |
378 | return 0; | 381 | return 0; |
379 | } | 382 | } |
380 | 383 | ||
381 | #endif | 384 | #endif |
382 | 385 | ||
383 | EXPORT_SYMBOL(ioremap); | 386 | EXPORT_SYMBOL(ioremap); |
384 | EXPORT_SYMBOL(__ioremap); | 387 | EXPORT_SYMBOL(__ioremap); |
385 | EXPORT_SYMBOL(iounmap); | 388 | EXPORT_SYMBOL(iounmap); |
386 | 389 | ||
387 | void free_initmem(void) | 390 | void free_initmem(void) |
388 | { | 391 | { |
389 | unsigned long addr; | 392 | unsigned long addr; |
390 | 393 | ||
391 | addr = (unsigned long)__init_begin; | 394 | addr = (unsigned long)__init_begin; |
392 | for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { | 395 | for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { |
393 | memset((void *)addr, 0xcc, PAGE_SIZE); | 396 | memset((void *)addr, 0xcc, PAGE_SIZE); |
394 | ClearPageReserved(virt_to_page(addr)); | 397 | ClearPageReserved(virt_to_page(addr)); |
395 | set_page_count(virt_to_page(addr), 1); | 398 | set_page_count(virt_to_page(addr), 1); |
396 | free_page(addr); | 399 | free_page(addr); |
397 | totalram_pages++; | 400 | totalram_pages++; |
398 | } | 401 | } |
399 | printk ("Freeing unused kernel memory: %luk freed\n", | 402 | printk ("Freeing unused kernel memory: %luk freed\n", |
400 | ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); | 403 | ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); |
401 | } | 404 | } |
402 | 405 | ||
403 | #ifdef CONFIG_BLK_DEV_INITRD | 406 | #ifdef CONFIG_BLK_DEV_INITRD |
404 | void free_initrd_mem(unsigned long start, unsigned long end) | 407 | void free_initrd_mem(unsigned long start, unsigned long end) |
405 | { | 408 | { |
406 | if (start < end) | 409 | if (start < end) |
407 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | 410 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); |
408 | for (; start < end; start += PAGE_SIZE) { | 411 | for (; start < end; start += PAGE_SIZE) { |
409 | ClearPageReserved(virt_to_page(start)); | 412 | ClearPageReserved(virt_to_page(start)); |
410 | set_page_count(virt_to_page(start), 1); | 413 | set_page_count(virt_to_page(start), 1); |
411 | free_page(start); | 414 | free_page(start); |
412 | totalram_pages++; | 415 | totalram_pages++; |
413 | } | 416 | } |
414 | } | 417 | } |
415 | #endif | 418 | #endif |
416 | 419 | ||
417 | static DEFINE_SPINLOCK(mmu_context_lock); | 420 | static DEFINE_SPINLOCK(mmu_context_lock); |
418 | static DEFINE_IDR(mmu_context_idr); | 421 | static DEFINE_IDR(mmu_context_idr); |
419 | 422 | ||
420 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | 423 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) |
421 | { | 424 | { |
422 | int index; | 425 | int index; |
423 | int err; | 426 | int err; |
424 | 427 | ||
425 | again: | 428 | again: |
426 | if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) | 429 | if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) |
427 | return -ENOMEM; | 430 | return -ENOMEM; |
428 | 431 | ||
429 | spin_lock(&mmu_context_lock); | 432 | spin_lock(&mmu_context_lock); |
430 | err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); | 433 | err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); |
431 | spin_unlock(&mmu_context_lock); | 434 | spin_unlock(&mmu_context_lock); |
432 | 435 | ||
433 | if (err == -EAGAIN) | 436 | if (err == -EAGAIN) |
434 | goto again; | 437 | goto again; |
435 | else if (err) | 438 | else if (err) |
436 | return err; | 439 | return err; |
437 | 440 | ||
438 | if (index > MAX_CONTEXT) { | 441 | if (index > MAX_CONTEXT) { |
439 | idr_remove(&mmu_context_idr, index); | 442 | idr_remove(&mmu_context_idr, index); |
440 | return -ENOMEM; | 443 | return -ENOMEM; |
441 | } | 444 | } |
442 | 445 | ||
443 | mm->context.id = index; | 446 | mm->context.id = index; |
444 | 447 | ||
445 | return 0; | 448 | return 0; |
446 | } | 449 | } |
447 | 450 | ||
448 | void destroy_context(struct mm_struct *mm) | 451 | void destroy_context(struct mm_struct *mm) |
449 | { | 452 | { |
450 | spin_lock(&mmu_context_lock); | 453 | spin_lock(&mmu_context_lock); |
451 | idr_remove(&mmu_context_idr, mm->context.id); | 454 | idr_remove(&mmu_context_idr, mm->context.id); |
452 | spin_unlock(&mmu_context_lock); | 455 | spin_unlock(&mmu_context_lock); |
453 | 456 | ||
454 | mm->context.id = NO_CONTEXT; | 457 | mm->context.id = NO_CONTEXT; |
455 | } | 458 | } |
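init_new_context()/destroy_context() above follow the two-step IDR idiom of this kernel era: idr_pre_get() preallocates with GFP_KERNEL outside the spinlock (it may sleep), idr_get_new_above() then runs under the lock, and -EAGAIN means another CPU consumed the preallocated node, so the caller loops. The same pattern condensed for any id-keyed object (struct my_object and register_object() are hypothetical):

    static DEFINE_SPINLOCK(obj_lock);
    static DEFINE_IDR(obj_idr);

    int register_object(struct my_object *obj)
    {
        int id, err;

    again:
        if (!idr_pre_get(&obj_idr, GFP_KERNEL))  /* may sleep; no lock */
            return -ENOMEM;

        spin_lock(&obj_lock);
        err = idr_get_new_above(&obj_idr, obj, 1, &id);
        spin_unlock(&obj_lock);

        if (err == -EAGAIN)       /* preallocation raced away; retry */
            goto again;
        if (err)
            return err;

        obj->id = id;
        return 0;
    }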
456 | 459 | ||
457 | /* | 460 | /* |
458 | * Do very early mm setup. | 461 | * Do very early mm setup. |
459 | */ | 462 | */ |
460 | void __init mm_init_ppc64(void) | 463 | void __init mm_init_ppc64(void) |
461 | { | 464 | { |
462 | #ifndef CONFIG_PPC_ISERIES | 465 | #ifndef CONFIG_PPC_ISERIES |
463 | unsigned long i; | 466 | unsigned long i; |
464 | #endif | 467 | #endif |
465 | 468 | ||
466 | ppc64_boot_msg(0x100, "MM Init"); | 469 | ppc64_boot_msg(0x100, "MM Init"); |
467 | 470 | ||
468 | /* This is the story of the IO hole... please, keep seated, | 471 | /* This is the story of the IO hole... please, keep seated, |
469 | * unfortunately, we are out of oxygen masks at the moment. | 472 | * unfortunately, we are out of oxygen masks at the moment. |
470 | * So we need some rough way to tell where your big IO hole | 473 | * So we need some rough way to tell where your big IO hole |
471 | * is. On pmac, it's between 2G and 4G, on POWER3, it's around | 474 | * is. On pmac, it's between 2G and 4G, on POWER3, it's around |
472 | * that area as well, on POWER4 we don't have one, etc... | 475 | * that area as well, on POWER4 we don't have one, etc... |
473 | * We need that as a "hint" when sizing the TCE table on POWER3 | 476 | * We need that as a "hint" when sizing the TCE table on POWER3 |
474 | * So far, the simplest way that seems to work well enough for us is | 477 | * So far, the simplest way that seems to work well enough for us is |
475 | * to just assume that the first discontinuity in our physical | 478 | * to just assume that the first discontinuity in our physical |
476 | * RAM layout is the IO hole. That may not be correct in the future | 479 | * RAM layout is the IO hole. That may not be correct in the future |
477 | * (and isn't on iSeries but then we don't care ;) | 480 | * (and isn't on iSeries but then we don't care ;) |
478 | */ | 481 | */ |
479 | 482 | ||
480 | #ifndef CONFIG_PPC_ISERIES | 483 | #ifndef CONFIG_PPC_ISERIES |
481 | for (i = 1; i < lmb.memory.cnt; i++) { | 484 | for (i = 1; i < lmb.memory.cnt; i++) { |
482 | unsigned long base, prevbase, prevsize; | 485 | unsigned long base, prevbase, prevsize; |
483 | 486 | ||
484 | prevbase = lmb.memory.region[i-1].base; | 487 | prevbase = lmb.memory.region[i-1].base; |
485 | prevsize = lmb.memory.region[i-1].size; | 488 | prevsize = lmb.memory.region[i-1].size; |
486 | base = lmb.memory.region[i].base; | 489 | base = lmb.memory.region[i].base; |
487 | if (base > (prevbase + prevsize)) { | 490 | if (base > (prevbase + prevsize)) { |
488 | io_hole_start = prevbase + prevsize; | 491 | io_hole_start = prevbase + prevsize; |
489 | io_hole_size = base - (prevbase + prevsize); | 492 | io_hole_size = base - (prevbase + prevsize); |
490 | break; | 493 | break; |
491 | } | 494 | } |
492 | } | 495 | } |
493 | #endif /* CONFIG_PPC_ISERIES */ | 496 | #endif /* CONFIG_PPC_ISERIES */ |
494 | if (io_hole_start) | 497 | if (io_hole_start) |
495 | printk("IO Hole assumed to be %lx -> %lx\n", | 498 | printk("IO Hole assumed to be %lx -> %lx\n", |
496 | io_hole_start, io_hole_start + io_hole_size - 1); | 499 | io_hole_start, io_hole_start + io_hole_size - 1); |
497 | 500 | ||
498 | ppc64_boot_msg(0x100, "MM Init Done"); | 501 | ppc64_boot_msg(0x100, "MM Init Done"); |
499 | } | 502 | } |
500 | 503 | ||
501 | /* | 504 | /* |
502 | * This is called by /dev/mem to know if a given address has to | 505 | * This is called by /dev/mem to know if a given address has to |
503 | * be mapped non-cacheable or not | 506 | * be mapped non-cacheable or not |
504 | */ | 507 | */ |
505 | int page_is_ram(unsigned long pfn) | 508 | int page_is_ram(unsigned long pfn) |
506 | { | 509 | { |
507 | int i; | 510 | int i; |
508 | unsigned long paddr = (pfn << PAGE_SHIFT); | 511 | unsigned long paddr = (pfn << PAGE_SHIFT); |
509 | 512 | ||
510 | for (i=0; i < lmb.memory.cnt; i++) { | 513 | for (i=0; i < lmb.memory.cnt; i++) { |
511 | unsigned long base; | 514 | unsigned long base; |
512 | 515 | ||
513 | base = lmb.memory.region[i].base; | 516 | base = lmb.memory.region[i].base; |
514 | 517 | ||
515 | if ((paddr >= base) && | 518 | if ((paddr >= base) && |
516 | (paddr < (base + lmb.memory.region[i].size))) { | 519 | (paddr < (base + lmb.memory.region[i].size))) { |
517 | return 1; | 520 | return 1; |
518 | } | 521 | } |
519 | } | 522 | } |
520 | 523 | ||
521 | return 0; | 524 | return 0; |
522 | } | 525 | } |
523 | EXPORT_SYMBOL(page_is_ram); | 526 | EXPORT_SYMBOL(page_is_ram); |
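page_is_ram() above is a plain linear containment scan over the LMB region list; the same check in a self-contained userspace form (the region table is invented, and 64-bit longs are assumed):

    #include <stdio.h>

    struct region { unsigned long base, size; };

    /* Invented layout: 2 GB at 0, a hole, then 1 GB at 4 GB. */
    static const struct region mem[] = {
        { 0x00000000UL,  0x80000000UL },
        { 0x100000000UL, 0x40000000UL },
    };

    static int paddr_is_ram(unsigned long paddr)
    {
        for (size_t i = 0; i < sizeof(mem) / sizeof(mem[0]); i++)
            if (paddr >= mem[i].base &&
                paddr < mem[i].base + mem[i].size)
                return 1;
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", paddr_is_ram(0x1000UL),
               paddr_is_ram(0xc0000000UL));   /* prints "1 0": second is in the hole */
        return 0;
    }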
524 | 527 | ||
525 | /* | 528 | /* |
526 | * Initialize the bootmem system and give it all the memory we | 529 | * Initialize the bootmem system and give it all the memory we |
527 | * have available. | 530 | * have available. |
528 | */ | 531 | */ |
529 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 532 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
530 | void __init do_init_bootmem(void) | 533 | void __init do_init_bootmem(void) |
531 | { | 534 | { |
532 | unsigned long i; | 535 | unsigned long i; |
533 | unsigned long start, bootmap_pages; | 536 | unsigned long start, bootmap_pages; |
534 | unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; | 537 | unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; |
535 | int boot_mapsize; | 538 | int boot_mapsize; |
536 | 539 | ||
537 | /* | 540 | /* |
538 | * Find an area to use for the bootmem bitmap. Calculate the size of | 541 | * Find an area to use for the bootmem bitmap. Calculate the size of |
539 | * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. | 542 | * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. |
540 | * Add 1 additional page in case the address isn't page-aligned. | 543 | * Add 1 additional page in case the address isn't page-aligned. |
541 | */ | 544 | */ |
542 | bootmap_pages = bootmem_bootmap_pages(total_pages); | 545 | bootmap_pages = bootmem_bootmap_pages(total_pages); |
543 | 546 | ||
544 | start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); | 547 | start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); |
545 | BUG_ON(!start); | 548 | BUG_ON(!start); |
546 | 549 | ||
547 | boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); | 550 | boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); |
548 | 551 | ||
549 | max_pfn = max_low_pfn; | 552 | max_pfn = max_low_pfn; |
550 | 553 | ||
551 | /* Add all physical memory to the bootmem map, mark each area | 554 | /* Add all physical memory to the bootmem map, mark each area |
552 | * present. | 555 | * present. |
553 | */ | 556 | */ |
554 | for (i=0; i < lmb.memory.cnt; i++) | 557 | for (i=0; i < lmb.memory.cnt; i++) |
555 | free_bootmem(lmb.memory.region[i].base, | 558 | free_bootmem(lmb.memory.region[i].base, |
556 | lmb_size_bytes(&lmb.memory, i)); | 559 | lmb_size_bytes(&lmb.memory, i)); |
557 | 560 | ||
558 | /* reserve the sections we're already using */ | 561 | /* reserve the sections we're already using */ |
559 | for (i=0; i < lmb.reserved.cnt; i++) | 562 | for (i=0; i < lmb.reserved.cnt; i++) |
560 | reserve_bootmem(lmb.reserved.region[i].base, | 563 | reserve_bootmem(lmb.reserved.region[i].base, |
561 | lmb_size_bytes(&lmb.reserved, i)); | 564 | lmb_size_bytes(&lmb.reserved, i)); |
562 | 565 | ||
563 | for (i=0; i < lmb.memory.cnt; i++) | 566 | for (i=0; i < lmb.memory.cnt; i++) |
564 | memory_present(0, lmb_start_pfn(&lmb.memory, i), | 567 | memory_present(0, lmb_start_pfn(&lmb.memory, i), |
565 | lmb_end_pfn(&lmb.memory, i)); | 568 | lmb_end_pfn(&lmb.memory, i)); |
566 | } | 569 | } |
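To make the bitmap-sizing comment in do_init_bootmem() concrete, take 1 GB of RAM with 4 KB pages:

    /* Worked example of the bootmem bitmap sizing:
     *   total_pages  = 1 GB >> PAGE_SHIFT(12)    = 262144 pages
     *   bitmap bytes = 262144 / BITS_PER_BYTE(8) = 32768 bytes
     *   bitmap pages = 32768 >> 12               = 8 pages,
     *                  +1 if the start address is not page-aligned
     */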
567 | 570 | ||
568 | /* | 571 | /* |
569 | * paging_init() sets up the page tables - in fact we've already done this. | 572 | * paging_init() sets up the page tables - in fact we've already done this. |
570 | */ | 573 | */ |
571 | void __init paging_init(void) | 574 | void __init paging_init(void) |
572 | { | 575 | { |
573 | unsigned long zones_size[MAX_NR_ZONES]; | 576 | unsigned long zones_size[MAX_NR_ZONES]; |
574 | unsigned long zholes_size[MAX_NR_ZONES]; | 577 | unsigned long zholes_size[MAX_NR_ZONES]; |
575 | unsigned long total_ram = lmb_phys_mem_size(); | 578 | unsigned long total_ram = lmb_phys_mem_size(); |
576 | unsigned long top_of_ram = lmb_end_of_DRAM(); | 579 | unsigned long top_of_ram = lmb_end_of_DRAM(); |
577 | 580 | ||
578 | printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", | 581 | printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", |
579 | top_of_ram, total_ram); | 582 | top_of_ram, total_ram); |
580 | printk(KERN_INFO "Memory hole size: %ldMB\n", | 583 | printk(KERN_INFO "Memory hole size: %ldMB\n", |
581 | (top_of_ram - total_ram) >> 20); | 584 | (top_of_ram - total_ram) >> 20); |
582 | /* | 585 | /* |
583 | * All pages are DMA-able so we put them all in the DMA zone. | 586 | * All pages are DMA-able so we put them all in the DMA zone. |
584 | */ | 587 | */ |
585 | memset(zones_size, 0, sizeof(zones_size)); | 588 | memset(zones_size, 0, sizeof(zones_size)); |
586 | memset(zholes_size, 0, sizeof(zholes_size)); | 589 | memset(zholes_size, 0, sizeof(zholes_size)); |
587 | 590 | ||
588 | zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; | 591 | zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; |
589 | zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; | 592 | zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; |
590 | 593 | ||
591 | free_area_init_node(0, NODE_DATA(0), zones_size, | 594 | free_area_init_node(0, NODE_DATA(0), zones_size, |
592 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); | 595 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); |
593 | } | 596 | } |
594 | #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ | 597 | #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ |
595 | 598 | ||
596 | static struct kcore_list kcore_vmem; | 599 | static struct kcore_list kcore_vmem; |
597 | 600 | ||
598 | static int __init setup_kcore(void) | 601 | static int __init setup_kcore(void) |
599 | { | 602 | { |
600 | int i; | 603 | int i; |
601 | 604 | ||
602 | for (i=0; i < lmb.memory.cnt; i++) { | 605 | for (i=0; i < lmb.memory.cnt; i++) { |
603 | unsigned long base, size; | 606 | unsigned long base, size; |
604 | struct kcore_list *kcore_mem; | 607 | struct kcore_list *kcore_mem; |
605 | 608 | ||
606 | base = lmb.memory.region[i].base; | 609 | base = lmb.memory.region[i].base; |
607 | size = lmb.memory.region[i].size; | 610 | size = lmb.memory.region[i].size; |
608 | 611 | ||
609 | /* GFP_ATOMIC to avoid might_sleep warnings during boot */ | 612 | /* GFP_ATOMIC to avoid might_sleep warnings during boot */ |
610 | kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); | 613 | kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); |
611 | if (!kcore_mem) | 614 | if (!kcore_mem) |
612 | panic("mem_init: kmalloc failed\n"); | 615 | panic("mem_init: kmalloc failed\n"); |
613 | 616 | ||
614 | kclist_add(kcore_mem, __va(base), size); | 617 | kclist_add(kcore_mem, __va(base), size); |
615 | } | 618 | } |
616 | 619 | ||
617 | kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); | 620 | kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); |
618 | 621 | ||
619 | return 0; | 622 | return 0; |
620 | } | 623 | } |
621 | module_init(setup_kcore); | 624 | module_init(setup_kcore); |
622 | 625 | ||
623 | void __init mem_init(void) | 626 | void __init mem_init(void) |
624 | { | 627 | { |
625 | #ifdef CONFIG_NEED_MULTIPLE_NODES | 628 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
626 | int nid; | 629 | int nid; |
627 | #endif | 630 | #endif |
628 | pg_data_t *pgdat; | 631 | pg_data_t *pgdat; |
629 | unsigned long i; | 632 | unsigned long i; |
630 | struct page *page; | 633 | struct page *page; |
631 | unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; | 634 | unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; |
632 | 635 | ||
633 | num_physpages = max_low_pfn; /* RAM is assumed contiguous */ | 636 | num_physpages = max_low_pfn; /* RAM is assumed contiguous */ |
634 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); | 637 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); |
635 | 638 | ||
636 | #ifdef CONFIG_NEED_MULTIPLE_NODES | 639 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
637 | for_each_online_node(nid) { | 640 | for_each_online_node(nid) { |
638 | if (NODE_DATA(nid)->node_spanned_pages != 0) { | 641 | if (NODE_DATA(nid)->node_spanned_pages != 0) { |
639 | printk("freeing bootmem node %x\n", nid); | 642 | printk("freeing bootmem node %x\n", nid); |
640 | totalram_pages += | 643 | totalram_pages += |
641 | free_all_bootmem_node(NODE_DATA(nid)); | 644 | free_all_bootmem_node(NODE_DATA(nid)); |
642 | } | 645 | } |
643 | } | 646 | } |
644 | #else | 647 | #else |
645 | max_mapnr = num_physpages; | 648 | max_mapnr = num_physpages; |
646 | totalram_pages += free_all_bootmem(); | 649 | totalram_pages += free_all_bootmem(); |
647 | #endif | 650 | #endif |
648 | 651 | ||
649 | for_each_pgdat(pgdat) { | 652 | for_each_pgdat(pgdat) { |
653 | unsigned long flags; | ||
654 | pgdat_resize_lock(pgdat, &flags); | ||
650 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | 655 | for (i = 0; i < pgdat->node_spanned_pages; i++) { |
651 | page = pgdat_page_nr(pgdat, i); | 656 | page = pgdat_page_nr(pgdat, i); |
652 | if (PageReserved(page)) | 657 | if (PageReserved(page)) |
653 | reservedpages++; | 658 | reservedpages++; |
654 | } | 659 | } |
660 | pgdat_resize_unlock(pgdat, &flags); | ||
655 | } | 661 | } |
656 | 662 | ||
657 | codesize = (unsigned long)&_etext - (unsigned long)&_stext; | 663 | codesize = (unsigned long)&_etext - (unsigned long)&_stext; |
658 | initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; | 664 | initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; |
659 | datasize = (unsigned long)&_edata - (unsigned long)&__init_end; | 665 | datasize = (unsigned long)&_edata - (unsigned long)&__init_end; |
660 | bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; | 666 | bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; |
661 | 667 | ||
662 | printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " | 668 | printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " |
663 | "%luk reserved, %luk data, %luk bss, %luk init)\n", | 669 | "%luk reserved, %luk data, %luk bss, %luk init)\n", |
664 | (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), | 670 | (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), |
665 | num_physpages << (PAGE_SHIFT-10), | 671 | num_physpages << (PAGE_SHIFT-10), |
666 | codesize >> 10, | 672 | codesize >> 10, |
667 | reservedpages << (PAGE_SHIFT-10), | 673 | reservedpages << (PAGE_SHIFT-10), |
668 | datasize >> 10, | 674 | datasize >> 10, |
669 | bsssize >> 10, | 675 | bsssize >> 10, |
670 | initsize >> 10); | 676 | initsize >> 10); |
671 | 677 | ||
672 | mem_init_done = 1; | 678 | mem_init_done = 1; |
673 | 679 | ||
674 | /* Initialize the vDSO */ | 680 | /* Initialize the vDSO */ |
675 | vdso_init(); | 681 | vdso_init(); |
676 | } | 682 | } |
677 | 683 | ||
678 | /* | 684 | /* |
679 | * This is called when a page has been modified by the kernel. | 685 | * This is called when a page has been modified by the kernel. |
680 | * It just marks the page as not i-cache clean. We do the i-cache | 686 | * It just marks the page as not i-cache clean. We do the i-cache |
681 | * flush later when the page is given to a user process, if necessary. | 687 | * flush later when the page is given to a user process, if necessary. |
682 | */ | 688 | */ |
683 | void flush_dcache_page(struct page *page) | 689 | void flush_dcache_page(struct page *page) |
684 | { | 690 | { |
685 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | 691 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) |
686 | return; | 692 | return; |
687 | /* avoid an atomic op if possible */ | 693 | /* avoid an atomic op if possible */ |
688 | if (test_bit(PG_arch_1, &page->flags)) | 694 | if (test_bit(PG_arch_1, &page->flags)) |
689 | clear_bit(PG_arch_1, &page->flags); | 695 | clear_bit(PG_arch_1, &page->flags); |
690 | } | 696 | } |
691 | EXPORT_SYMBOL(flush_dcache_page); | 697 | EXPORT_SYMBOL(flush_dcache_page); |
692 | 698 | ||
693 | void clear_user_page(void *page, unsigned long vaddr, struct page *pg) | 699 | void clear_user_page(void *page, unsigned long vaddr, struct page *pg) |
694 | { | 700 | { |
695 | clear_page(page); | 701 | clear_page(page); |
696 | 702 | ||
697 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | 703 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) |
698 | return; | 704 | return; |
699 | /* | 705 | /* |
700 | * We shouldn't have to do this, but some versions of glibc | 706 | * We shouldn't have to do this, but some versions of glibc |
701 | * require it (ld.so assumes zero filled pages are icache clean) | 707 | * require it (ld.so assumes zero filled pages are icache clean) |
702 | * - Anton | 708 | * - Anton |
703 | */ | 709 | */ |
704 | 710 | ||
705 | /* avoid an atomic op if possible */ | 711 | /* avoid an atomic op if possible */ |
706 | if (test_bit(PG_arch_1, &pg->flags)) | 712 | if (test_bit(PG_arch_1, &pg->flags)) |
707 | clear_bit(PG_arch_1, &pg->flags); | 713 | clear_bit(PG_arch_1, &pg->flags); |
708 | } | 714 | } |
709 | EXPORT_SYMBOL(clear_user_page); | 715 | EXPORT_SYMBOL(clear_user_page); |
710 | 716 | ||
711 | void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, | 717 | void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, |
712 | struct page *pg) | 718 | struct page *pg) |
713 | { | 719 | { |
714 | copy_page(vto, vfrom); | 720 | copy_page(vto, vfrom); |
715 | 721 | ||
716 | /* | 722 | /* |
717 | * We should be able to use the following optimisation, however | 723 | * We should be able to use the following optimisation, however |
718 | * there are two problems. | 724 | * there are two problems. |
719 | * Firstly a bug in some versions of binutils meant PLT sections | 725 | * Firstly a bug in some versions of binutils meant PLT sections |
720 | * were not marked executable. | 726 | * were not marked executable. |
721 | * Secondly the first word in the GOT section is blrl, used | 727 | * Secondly the first word in the GOT section is blrl, used |
722 | * to establish the GOT address. Until recently the GOT was | 728 | * to establish the GOT address. Until recently the GOT was |
723 | * not marked executable. | 729 | * not marked executable. |
724 | * - Anton | 730 | * - Anton |
725 | */ | 731 | */ |
726 | #if 0 | 732 | #if 0 |
727 | if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) | 733 | if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) |
728 | return; | 734 | return; |
729 | #endif | 735 | #endif |
730 | 736 | ||
731 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | 737 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) |
732 | return; | 738 | return; |
733 | 739 | ||
734 | /* avoid an atomic op if possible */ | 740 | /* avoid an atomic op if possible */ |
735 | if (test_bit(PG_arch_1, &pg->flags)) | 741 | if (test_bit(PG_arch_1, &pg->flags)) |
736 | clear_bit(PG_arch_1, &pg->flags); | 742 | clear_bit(PG_arch_1, &pg->flags); |
737 | } | 743 | } |
738 | 744 | ||
739 | void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, | 745 | void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, |
740 | unsigned long addr, int len) | 746 | unsigned long addr, int len) |
741 | { | 747 | { |
742 | unsigned long maddr; | 748 | unsigned long maddr; |
743 | 749 | ||
744 | maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); | 750 | maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); |
745 | flush_icache_range(maddr, maddr + len); | 751 | flush_icache_range(maddr, maddr + len); |
746 | } | 752 | } |
747 | EXPORT_SYMBOL(flush_icache_user_range); | 753 | EXPORT_SYMBOL(flush_icache_user_range); |
748 | 754 | ||
749 | /* | 755 | /* |
750 | * This is called at the end of handling a user page fault, when the | 756 | * This is called at the end of handling a user page fault, when the |
751 | * fault has been handled by updating a PTE in the linux page tables. | 757 | * fault has been handled by updating a PTE in the linux page tables. |
752 | * We use it to preload an HPTE into the hash table corresponding to | 758 | * We use it to preload an HPTE into the hash table corresponding to |
753 | * the updated linux PTE. | 759 | * the updated linux PTE. |
754 | * | 760 | * |
755 | * This must always be called with the mm->page_table_lock held | 761 | * This must always be called with the mm->page_table_lock held |
756 | */ | 762 | */ |
757 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, | 763 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, |
758 | pte_t pte) | 764 | pte_t pte) |
759 | { | 765 | { |
760 | unsigned long vsid; | 766 | unsigned long vsid; |
761 | void *pgdir; | 767 | void *pgdir; |
762 | pte_t *ptep; | 768 | pte_t *ptep; |
763 | int local = 0; | 769 | int local = 0; |
764 | cpumask_t tmp; | 770 | cpumask_t tmp; |
765 | unsigned long flags; | 771 | unsigned long flags; |
766 | 772 | ||
767 | /* handle i-cache coherency */ | 773 | /* handle i-cache coherency */ |
768 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && | 774 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && |
769 | !cpu_has_feature(CPU_FTR_NOEXECUTE)) { | 775 | !cpu_has_feature(CPU_FTR_NOEXECUTE)) { |
770 | unsigned long pfn = pte_pfn(pte); | 776 | unsigned long pfn = pte_pfn(pte); |
771 | if (pfn_valid(pfn)) { | 777 | if (pfn_valid(pfn)) { |
772 | struct page *page = pfn_to_page(pfn); | 778 | struct page *page = pfn_to_page(pfn); |
773 | if (!PageReserved(page) | 779 | if (!PageReserved(page) |
774 | && !test_bit(PG_arch_1, &page->flags)) { | 780 | && !test_bit(PG_arch_1, &page->flags)) { |
775 | __flush_dcache_icache(page_address(page)); | 781 | __flush_dcache_icache(page_address(page)); |
776 | set_bit(PG_arch_1, &page->flags); | 782 | set_bit(PG_arch_1, &page->flags); |
777 | } | 783 | } |
778 | } | 784 | } |
779 | } | 785 | } |
780 | 786 | ||
781 | /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ | 787 | /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ |
782 | if (!pte_young(pte)) | 788 | if (!pte_young(pte)) |
783 | return; | 789 | return; |
784 | 790 | ||
785 | pgdir = vma->vm_mm->pgd; | 791 | pgdir = vma->vm_mm->pgd; |
786 | if (pgdir == NULL) | 792 | if (pgdir == NULL) |
787 | return; | 793 | return; |
788 | 794 | ||
789 | ptep = find_linux_pte(pgdir, ea); | 795 | ptep = find_linux_pte(pgdir, ea); |
790 | if (!ptep) | 796 | if (!ptep) |
791 | return; | 797 | return; |
792 | 798 | ||
793 | vsid = get_vsid(vma->vm_mm->context.id, ea); | 799 | vsid = get_vsid(vma->vm_mm->context.id, ea); |
794 | 800 | ||
795 | local_irq_save(flags); | 801 | local_irq_save(flags); |
796 | tmp = cpumask_of_cpu(smp_processor_id()); | 802 | tmp = cpumask_of_cpu(smp_processor_id()); |
797 | if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) | 803 | if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) |
798 | local = 1; | 804 | local = 1; |
799 | 805 | ||
800 | __hash_page(ea, 0, vsid, ptep, 0x300, local); | 806 | __hash_page(ea, 0, vsid, ptep, 0x300, local); |
801 | local_irq_restore(flags); | 807 | local_irq_restore(flags); |
802 | } | 808 | } |
803 | 809 | ||
804 | void __iomem * reserve_phb_iospace(unsigned long size) | 810 | void __iomem * reserve_phb_iospace(unsigned long size) |
805 | { | 811 | { |
806 | void __iomem *virt_addr; | 812 | void __iomem *virt_addr; |
807 | 813 | ||
808 | if (phbs_io_bot >= IMALLOC_BASE) | 814 | if (phbs_io_bot >= IMALLOC_BASE) |
809 | panic("reserve_phb_iospace(): phb io space overflow\n"); | 815 | panic("reserve_phb_iospace(): phb io space overflow\n"); |
810 | 816 | ||
811 | virt_addr = (void __iomem *) phbs_io_bot; | 817 | virt_addr = (void __iomem *) phbs_io_bot; |
812 | phbs_io_bot += size; | 818 | phbs_io_bot += size; |
813 | 819 | ||
814 | return virt_addr; | 820 | return virt_addr; |
815 | } | 821 | } |
816 | 822 | ||
817 | static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) | 823 | static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) |
818 | { | 824 | { |
819 | memset(addr, 0, kmem_cache_size(cache)); | 825 | memset(addr, 0, kmem_cache_size(cache)); |
820 | } | 826 | } |
821 | 827 | ||
822 | static const int pgtable_cache_size[2] = { | 828 | static const int pgtable_cache_size[2] = { |
823 | PTE_TABLE_SIZE, PMD_TABLE_SIZE | 829 | PTE_TABLE_SIZE, PMD_TABLE_SIZE |
824 | }; | 830 | }; |
825 | static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { | 831 | static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { |
826 | "pgd_pte_cache", "pud_pmd_cache", | 832 | "pgd_pte_cache", "pud_pmd_cache", |
827 | }; | 833 | }; |
828 | 834 | ||
829 | kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; | 835 | kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; |
830 | 836 | ||
831 | void pgtable_cache_init(void) | 837 | void pgtable_cache_init(void) |
832 | { | 838 | { |
833 | int i; | 839 | int i; |
834 | 840 | ||
835 | BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); | 841 | BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); |
836 | BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); | 842 | BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); |
837 | BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); | 843 | BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); |
838 | BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); | 844 | BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); |
839 | 845 | ||
840 | for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { | 846 | for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { |
841 | int size = pgtable_cache_size[i]; | 847 | int size = pgtable_cache_size[i]; |
842 | const char *name = pgtable_cache_name[i]; | 848 | const char *name = pgtable_cache_name[i]; |
843 | 849 | ||
844 | pgtable_cache[i] = kmem_cache_create(name, | 850 | pgtable_cache[i] = kmem_cache_create(name, |
845 | size, size, | 851 | size, size, |
846 | SLAB_HWCACHE_ALIGN | 852 | SLAB_HWCACHE_ALIGN |
847 | | SLAB_MUST_HWCACHE_ALIGN, | 853 | | SLAB_MUST_HWCACHE_ALIGN, |
848 | zero_ctor, | 854 | zero_ctor, |
849 | NULL); | 855 | NULL); |
850 | if (! pgtable_cache[i]) | 856 | if (! pgtable_cache[i]) |
851 | panic("pgtable_cache_init(): could not create %s!\n", | 857 | panic("pgtable_cache_init(): could not create %s!\n", |
852 | name); | 858 | name); |
853 | } | 859 | } |
854 | } | 860 | } |
855 | 861 | ||
856 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, | 862 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, |
857 | unsigned long size, pgprot_t vma_prot) | 863 | unsigned long size, pgprot_t vma_prot) |
858 | { | 864 | { |
859 | if (ppc_md.phys_mem_access_prot) | 865 | if (ppc_md.phys_mem_access_prot) |
860 | return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); | 866 | return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); |
861 | 867 | ||
862 | if (!page_is_ram(addr >> PAGE_SHIFT)) | 868 | if (!page_is_ram(addr >> PAGE_SHIFT)) |
863 | vma_prot = __pgprot(pgprot_val(vma_prot) | 869 | vma_prot = __pgprot(pgprot_val(vma_prot) |
864 | | _PAGE_GUARDED | _PAGE_NO_CACHE); | 870 | | _PAGE_GUARDED | _PAGE_NO_CACHE); |
865 | return vma_prot; | 871 | return vma_prot; |
866 | } | 872 | } |
867 | EXPORT_SYMBOL(phys_mem_access_prot); | 873 | EXPORT_SYMBOL(phys_mem_access_prot); |
868 | 874 | ||
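
The update_mmu_cache() path above may call pfn_valid() and pfn_to_page() without extra locking because the PTE it was invoked for already guarantees a reference on the page. The pgdat resizing lock introduced in the new header below exists for the other case: walking a node's pfns while holding no reference on any page. A minimal sketch of that consumer pattern, assuming only this patch's API; count_free_in_node() is an illustrative name, not code from this commit:

    #include <linux/mm.h>
    #include <linux/memory_hotplug.h>

    /* Walk every pfn in a node without taking page references. The
     * resize lock guarantees pfn_valid() cannot change mid-walk. */
    static unsigned long count_free_in_node(pg_data_t *pgdat)
    {
            unsigned long pfn, end, free = 0, flags;

            pgdat_resize_lock(pgdat, &flags);
            end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
            for (pfn = pgdat->node_start_pfn; pfn < end; pfn++) {
                    if (!pfn_valid(pfn))    /* stable while lock is held */
                            continue;
                    if (!page_count(pfn_to_page(pfn)))
                            free++;
            }
            pgdat_resize_unlock(pgdat, &flags);
            return free;
    }
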
include/linux/memory_hotplug.h
File was created | 1 | #ifndef __LINUX_MEMORY_HOTPLUG_H | |
2 | #define __LINUX_MEMORY_HOTPLUG_H | ||
3 | |||
4 | #include <linux/mmzone.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | |||
7 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
8 | /* | ||
9 | * pgdat resizing functions | ||
10 | */ | ||
11 | static inline | ||
12 | void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) | ||
13 | { | ||
14 | spin_lock_irqsave(&pgdat->node_size_lock, *flags); | ||
15 | } | ||
16 | static inline | ||
17 | void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) | ||
18 | { | ||
19 | spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); | ||
20 | } | ||
21 | static inline | ||
22 | void pgdat_resize_init(struct pglist_data *pgdat) | ||
23 | { | ||
24 | spin_lock_init(&pgdat->node_size_lock); | ||
25 | } | ||
26 | #else /* ! CONFIG_MEMORY_HOTPLUG */ | ||
27 | /* | ||
28 | * Stub functions for when hotplug is off | ||
29 | */ | ||
30 | static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} | ||
31 | static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} | ||
32 | static inline void pgdat_resize_init(struct pglist_data *pgdat) {} | ||
33 | #endif | ||
34 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ | ||
35 |
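
A minimal usage sketch of the new pair, assuming only what the header above defines (`pgdat` is any pg_data_t the caller already holds): `flags` is passed by address because the lock is taken with spin_lock_irqsave(), and all three helpers become empty inlines when CONFIG_MEMORY_HOTPLUG is off, so call sites need no #ifdefs.

    unsigned long flags;

    pgdat_resize_lock(pgdat, &flags);
    /* node_start_pfn, node_present_pages and node_spanned_pages are
     * stable here, and no pfn_valid() answer can change, until the
     * matching unlock. */
    pgdat_resize_unlock(pgdat, &flags);
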
include/linux/mmzone.h
1 | #ifndef _LINUX_MMZONE_H | 1 | #ifndef _LINUX_MMZONE_H |
2 | #define _LINUX_MMZONE_H | 2 | #define _LINUX_MMZONE_H |
3 | 3 | ||
4 | #ifdef __KERNEL__ | 4 | #ifdef __KERNEL__ |
5 | #ifndef __ASSEMBLY__ | 5 | #ifndef __ASSEMBLY__ |
6 | 6 | ||
7 | #include <linux/config.h> | 7 | #include <linux/config.h> |
8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
9 | #include <linux/list.h> | 9 | #include <linux/list.h> |
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | #include <linux/cache.h> | 11 | #include <linux/cache.h> |
12 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <asm/atomic.h> | 15 | #include <asm/atomic.h> |
16 | 16 | ||
17 | /* Free memory management - zoned buddy allocator. */ | 17 | /* Free memory management - zoned buddy allocator. */ |
18 | #ifndef CONFIG_FORCE_MAX_ZONEORDER | 18 | #ifndef CONFIG_FORCE_MAX_ZONEORDER |
19 | #define MAX_ORDER 11 | 19 | #define MAX_ORDER 11 |
20 | #else | 20 | #else |
21 | #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER | 21 | #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER |
22 | #endif | 22 | #endif |
23 | 23 | ||
24 | struct free_area { | 24 | struct free_area { |
25 | struct list_head free_list; | 25 | struct list_head free_list; |
26 | unsigned long nr_free; | 26 | unsigned long nr_free; |
27 | }; | 27 | }; |
28 | 28 | ||
29 | struct pglist_data; | 29 | struct pglist_data; |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. | 32 | * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. |
33 | * So add a wild amount of padding here to ensure that they fall into separate | 33 | * So add a wild amount of padding here to ensure that they fall into separate |
34 | * cachelines. There are very few zone structures in the machine, so space | 34 | * cachelines. There are very few zone structures in the machine, so space |
35 | * consumption is not a concern here. | 35 | * consumption is not a concern here. |
36 | */ | 36 | */ |
37 | #if defined(CONFIG_SMP) | 37 | #if defined(CONFIG_SMP) |
38 | struct zone_padding { | 38 | struct zone_padding { |
39 | char x[0]; | 39 | char x[0]; |
40 | } ____cacheline_maxaligned_in_smp; | 40 | } ____cacheline_maxaligned_in_smp; |
41 | #define ZONE_PADDING(name) struct zone_padding name; | 41 | #define ZONE_PADDING(name) struct zone_padding name; |
42 | #else | 42 | #else |
43 | #define ZONE_PADDING(name) | 43 | #define ZONE_PADDING(name) |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | struct per_cpu_pages { | 46 | struct per_cpu_pages { |
47 | int count; /* number of pages in the list */ | 47 | int count; /* number of pages in the list */ |
48 | int low; /* low watermark, refill needed */ | 48 | int low; /* low watermark, refill needed */ |
49 | int high; /* high watermark, emptying needed */ | 49 | int high; /* high watermark, emptying needed */ |
50 | int batch; /* chunk size for buddy add/remove */ | 50 | int batch; /* chunk size for buddy add/remove */ |
51 | struct list_head list; /* the list of pages */ | 51 | struct list_head list; /* the list of pages */ |
52 | }; | 52 | }; |
53 | 53 | ||
54 | struct per_cpu_pageset { | 54 | struct per_cpu_pageset { |
55 | struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ | 55 | struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ |
56 | #ifdef CONFIG_NUMA | 56 | #ifdef CONFIG_NUMA |
57 | unsigned long numa_hit; /* allocated in intended node */ | 57 | unsigned long numa_hit; /* allocated in intended node */ |
58 | unsigned long numa_miss; /* allocated in non intended node */ | 58 | unsigned long numa_miss; /* allocated in non intended node */ |
59 | unsigned long numa_foreign; /* was intended here, hit elsewhere */ | 59 | unsigned long numa_foreign; /* was intended here, hit elsewhere */ |
60 | unsigned long interleave_hit; /* interleaver preferred this zone */ | 60 | unsigned long interleave_hit; /* interleaver preferred this zone */ |
61 | unsigned long local_node; /* allocation from local node */ | 61 | unsigned long local_node; /* allocation from local node */ |
62 | unsigned long other_node; /* allocation from other node */ | 62 | unsigned long other_node; /* allocation from other node */ |
63 | #endif | 63 | #endif |
64 | } ____cacheline_aligned_in_smp; | 64 | } ____cacheline_aligned_in_smp; |
65 | 65 | ||
66 | #ifdef CONFIG_NUMA | 66 | #ifdef CONFIG_NUMA |
67 | #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) | 67 | #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) |
68 | #else | 68 | #else |
69 | #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) | 69 | #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #define ZONE_DMA 0 | 72 | #define ZONE_DMA 0 |
73 | #define ZONE_NORMAL 1 | 73 | #define ZONE_NORMAL 1 |
74 | #define ZONE_HIGHMEM 2 | 74 | #define ZONE_HIGHMEM 2 |
75 | 75 | ||
76 | #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ | 76 | #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ |
77 | #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ | 77 | #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ |
78 | 78 | ||
79 | 79 | ||
80 | /* | 80 | /* |
81 | * When a memory allocation must conform to specific limitations (such | 81 | * When a memory allocation must conform to specific limitations (such |
82 | * as being suitable for DMA) the caller will pass in hints to the | 82 | * as being suitable for DMA) the caller will pass in hints to the |
83 | * allocator in the gfp_mask, in the zone modifier bits. These bits | 83 | * allocator in the gfp_mask, in the zone modifier bits. These bits |
84 | * are used to select a priority ordered list of memory zones which | 84 | * are used to select a priority ordered list of memory zones which |
85 | * match the requested limits. GFP_ZONEMASK defines which bits within | 85 | * match the requested limits. GFP_ZONEMASK defines which bits within |
86 | * the gfp_mask should be considered as zone modifiers. Each valid | 86 | * the gfp_mask should be considered as zone modifiers. Each valid |
87 | * combination of the zone modifier bits has a corresponding list | 87 | * combination of the zone modifier bits has a corresponding list |
88 | * of zones (in node_zonelists). Thus for two zone modifiers there | 88 | * of zones (in node_zonelists). Thus for two zone modifiers there |
89 | * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will | 89 | * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will |
90 | * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible | 90 | * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible |
91 | * combinations of zone modifiers in "zone modifier space". | 91 | * combinations of zone modifiers in "zone modifier space". |
92 | */ | 92 | */ |
93 | #define GFP_ZONEMASK 0x03 | 93 | #define GFP_ZONEMASK 0x03 |
94 | /* | 94 | /* |
95 | * As an optimisation any zone modifier bits which are only valid when | 95 | * As an optimisation any zone modifier bits which are only valid when |
96 | * no other zone modifier bits are set (loners) should be placed in | 96 | * no other zone modifier bits are set (loners) should be placed in |
97 | * the highest order bits of this field. This allows us to reduce the | 97 | * the highest order bits of this field. This allows us to reduce the |
98 | * extent of the zonelists thus saving space. For example in the case | 98 | * extent of the zonelists thus saving space. For example in the case |
99 | * of three zone modifier bits, we could require up to eight zonelists. | 99 | * of three zone modifier bits, we could require up to eight zonelists. |
100 | * If the leftmost zone modifier is a "loner" then the highest valid | 100 | * If the leftmost zone modifier is a "loner" then the highest valid |
101 | * zonelist would be four, allowing us to allocate only five zonelists. | 101 | * zonelist would be four, allowing us to allocate only five zonelists. |
102 | * Use the first form when the leftmost bit is not a "loner", otherwise | 102 | * Use the first form when the leftmost bit is not a "loner", otherwise |
103 | * use the second. | 103 | * use the second. |
104 | */ | 104 | */ |
105 | /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ | 105 | /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ |
106 | #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ | 106 | #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ |
107 | 107 | ||
108 | /* | 108 | /* |
109 | * On machines where it is needed (eg PCs) we divide physical memory | 109 | * On machines where it is needed (eg PCs) we divide physical memory |
110 | * into multiple physical zones. On a PC we have 3 zones: | 110 | * into multiple physical zones. On a PC we have 3 zones: |
111 | * | 111 | * |
112 | * ZONE_DMA < 16 MB ISA DMA capable memory | 112 | * ZONE_DMA < 16 MB ISA DMA capable memory |
113 | * ZONE_NORMAL 16-896 MB direct mapped by the kernel | 113 | * ZONE_NORMAL 16-896 MB direct mapped by the kernel |
114 | * ZONE_HIGHMEM > 896 MB only page cache and user processes | 114 | * ZONE_HIGHMEM > 896 MB only page cache and user processes |
115 | */ | 115 | */ |
116 | 116 | ||
117 | struct zone { | 117 | struct zone { |
118 | /* Fields commonly accessed by the page allocator */ | 118 | /* Fields commonly accessed by the page allocator */ |
119 | unsigned long free_pages; | 119 | unsigned long free_pages; |
120 | unsigned long pages_min, pages_low, pages_high; | 120 | unsigned long pages_min, pages_low, pages_high; |
121 | /* | 121 | /* |
122 | * We don't know if the memory that we're going to allocate will be freeable | 122 | * We don't know if the memory that we're going to allocate will be freeable |
123 | * and/or that it will be released eventually, so to avoid totally wasting several | 123 | * and/or that it will be released eventually, so to avoid totally wasting several |
124 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk | 124 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk |
125 | * running OOM on the lower zones even though there is plenty of freeable ram | 125 | * running OOM on the lower zones even though there is plenty of freeable ram |
126 | * on the higher zones). This array is recalculated at runtime if the | 126 | * on the higher zones). This array is recalculated at runtime if the |
127 | * sysctl_lowmem_reserve_ratio sysctl changes. | 127 | * sysctl_lowmem_reserve_ratio sysctl changes. |
128 | */ | 128 | */ |
129 | unsigned long lowmem_reserve[MAX_NR_ZONES]; | 129 | unsigned long lowmem_reserve[MAX_NR_ZONES]; |
130 | 130 | ||
131 | #ifdef CONFIG_NUMA | 131 | #ifdef CONFIG_NUMA |
132 | struct per_cpu_pageset *pageset[NR_CPUS]; | 132 | struct per_cpu_pageset *pageset[NR_CPUS]; |
133 | #else | 133 | #else |
134 | struct per_cpu_pageset pageset[NR_CPUS]; | 134 | struct per_cpu_pageset pageset[NR_CPUS]; |
135 | #endif | 135 | #endif |
136 | /* | 136 | /* |
137 | * free areas of different sizes | 137 | * free areas of different sizes |
138 | */ | 138 | */ |
139 | spinlock_t lock; | 139 | spinlock_t lock; |
140 | struct free_area free_area[MAX_ORDER]; | 140 | struct free_area free_area[MAX_ORDER]; |
141 | 141 | ||
142 | 142 | ||
143 | ZONE_PADDING(_pad1_) | 143 | ZONE_PADDING(_pad1_) |
144 | 144 | ||
145 | /* Fields commonly accessed by the page reclaim scanner */ | 145 | /* Fields commonly accessed by the page reclaim scanner */ |
146 | spinlock_t lru_lock; | 146 | spinlock_t lru_lock; |
147 | struct list_head active_list; | 147 | struct list_head active_list; |
148 | struct list_head inactive_list; | 148 | struct list_head inactive_list; |
149 | unsigned long nr_scan_active; | 149 | unsigned long nr_scan_active; |
150 | unsigned long nr_scan_inactive; | 150 | unsigned long nr_scan_inactive; |
151 | unsigned long nr_active; | 151 | unsigned long nr_active; |
152 | unsigned long nr_inactive; | 152 | unsigned long nr_inactive; |
153 | unsigned long pages_scanned; /* since last reclaim */ | 153 | unsigned long pages_scanned; /* since last reclaim */ |
154 | int all_unreclaimable; /* All pages pinned */ | 154 | int all_unreclaimable; /* All pages pinned */ |
155 | 155 | ||
156 | /* | 156 | /* |
157 | * Does the allocator try to reclaim pages from the zone as soon | 157 | * Does the allocator try to reclaim pages from the zone as soon |
158 | * as it fails a watermark_ok() in __alloc_pages? | 158 | * as it fails a watermark_ok() in __alloc_pages? |
159 | */ | 159 | */ |
160 | int reclaim_pages; | 160 | int reclaim_pages; |
161 | /* A count of how many reclaimers are scanning this zone */ | 161 | /* A count of how many reclaimers are scanning this zone */ |
162 | atomic_t reclaim_in_progress; | 162 | atomic_t reclaim_in_progress; |
163 | 163 | ||
164 | /* | 164 | /* |
165 | * prev_priority holds the scanning priority for this zone. It is | 165 | * prev_priority holds the scanning priority for this zone. It is |
166 | * defined as the scanning priority at which we achieved our reclaim | 166 | * defined as the scanning priority at which we achieved our reclaim |
167 | * target at the previous try_to_free_pages() or balance_pgdat() | 167 | * target at the previous try_to_free_pages() or balance_pgdat() |
168 | * invocation. | 168 | * invocation. |
169 | * | 169 | * |
170 | * We use prev_priority as a measure of how much stress page reclaim is | 170 | * We use prev_priority as a measure of how much stress page reclaim is |
171 | * under - it drives the swappiness decision: whether to unmap mapped | 171 | * under - it drives the swappiness decision: whether to unmap mapped |
172 | * pages. | 172 | * pages. |
173 | * | 173 | * |
174 | * temp_priority is used to remember the scanning priority at which | 174 | * temp_priority is used to remember the scanning priority at which |
175 | * this zone was successfully refilled to free_pages == pages_high. | 175 | * this zone was successfully refilled to free_pages == pages_high. |
176 | * | 176 | * |
177 | * Access to both these fields is quite racy even on uniprocessor. But | 177 | * Access to both these fields is quite racy even on uniprocessor. But |
178 | * it is expected to average out OK. | 178 | * it is expected to average out OK. |
179 | */ | 179 | */ |
180 | int temp_priority; | 180 | int temp_priority; |
181 | int prev_priority; | 181 | int prev_priority; |
182 | 182 | ||
183 | 183 | ||
184 | ZONE_PADDING(_pad2_) | 184 | ZONE_PADDING(_pad2_) |
185 | /* Rarely used or read-mostly fields */ | 185 | /* Rarely used or read-mostly fields */ |
186 | 186 | ||
187 | /* | 187 | /* |
188 | * wait_table -- the array holding the hash table | 188 | * wait_table -- the array holding the hash table |
189 | * wait_table_size -- the size of the hash table array | 189 | * wait_table_size -- the size of the hash table array |
190 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) | 190 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) |
191 | * | 191 | * |
192 | * The purpose of all these is to keep track of the people | 192 | * The purpose of all these is to keep track of the people |
193 | * waiting for a page to become available and make them | 193 | * waiting for a page to become available and make them |
194 | * runnable again when possible. The trouble is that this | 194 | * runnable again when possible. The trouble is that this |
195 | * consumes a lot of space, especially when so few things | 195 | * consumes a lot of space, especially when so few things |
196 | * wait on pages at a given time. So instead of using | 196 | * wait on pages at a given time. So instead of using |
197 | * per-page waitqueues, we use a waitqueue hash table. | 197 | * per-page waitqueues, we use a waitqueue hash table. |
198 | * | 198 | * |
199 | * The bucket discipline is to sleep on the same queue when | 199 | * The bucket discipline is to sleep on the same queue when |
200 | * colliding and wake all in that wait queue when removing. | 200 | * colliding and wake all in that wait queue when removing. |
201 | * When something wakes, it must check to be sure its page is | 201 | * When something wakes, it must check to be sure its page is |
202 | * truly available, a la thundering herd. The cost of a | 202 | * truly available, a la thundering herd. The cost of a |
203 | * collision is great, but given the expected load of the | 203 | * collision is great, but given the expected load of the |
204 | * table, they should be so rare as to be outweighed by the | 204 | * table, they should be so rare as to be outweighed by the |
205 | * benefits from the saved space. | 205 | * benefits from the saved space. |
206 | * | 206 | * |
207 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the | 207 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the |
208 | * primary users of these fields, and in mm/page_alloc.c | 208 | * primary users of these fields, and in mm/page_alloc.c |
209 | * free_area_init_core() performs the initialization of them. | 209 | * free_area_init_core() performs the initialization of them. |
210 | */ | 210 | */ |
211 | wait_queue_head_t * wait_table; | 211 | wait_queue_head_t * wait_table; |
212 | unsigned long wait_table_size; | 212 | unsigned long wait_table_size; |
213 | unsigned long wait_table_bits; | 213 | unsigned long wait_table_bits; |
214 | 214 | ||
215 | /* | 215 | /* |
216 | * Discontig memory support fields. | 216 | * Discontig memory support fields. |
217 | */ | 217 | */ |
218 | struct pglist_data *zone_pgdat; | 218 | struct pglist_data *zone_pgdat; |
219 | struct page *zone_mem_map; | 219 | struct page *zone_mem_map; |
220 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ | 220 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ |
221 | unsigned long zone_start_pfn; | 221 | unsigned long zone_start_pfn; |
222 | 222 | ||
223 | unsigned long spanned_pages; /* total size, including holes */ | 223 | unsigned long spanned_pages; /* total size, including holes */ |
224 | unsigned long present_pages; /* amount of memory (excluding holes) */ | 224 | unsigned long present_pages; /* amount of memory (excluding holes) */ |
225 | 225 | ||
226 | /* | 226 | /* |
227 | * rarely used fields: | 227 | * rarely used fields: |
228 | */ | 228 | */ |
229 | char *name; | 229 | char *name; |
230 | } ____cacheline_maxaligned_in_smp; | 230 | } ____cacheline_maxaligned_in_smp; |
231 | 231 | ||
232 | 232 | ||
233 | /* | 233 | /* |
234 | * The "priority" of VM scanning is how much of the queues we will scan in one | 234 | * The "priority" of VM scanning is how much of the queues we will scan in one |
235 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the | 235 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the |
236 | * queues ("queue_length >> 12") during an aging round. | 236 | * queues ("queue_length >> 12") during an aging round. |
237 | */ | 237 | */ |
238 | #define DEF_PRIORITY 12 | 238 | #define DEF_PRIORITY 12 |
239 | 239 | ||
240 | /* | 240 | /* |
241 | * One allocation request operates on a zonelist. A zonelist | 241 | * One allocation request operates on a zonelist. A zonelist |
242 | * is a list of zones, the first one is the 'goal' of the | 242 | * is a list of zones, the first one is the 'goal' of the |
243 | * allocation, the other zones are fallback zones, in decreasing | 243 | * allocation, the other zones are fallback zones, in decreasing |
244 | * priority. | 244 | * priority. |
245 | * | 245 | * |
246 | * Right now a zonelist takes up less than a cacheline. We never | 246 | * Right now a zonelist takes up less than a cacheline. We never |
247 | * modify it apart from boot-up, and only a few indices are used, | 247 | * modify it apart from boot-up, and only a few indices are used, |
248 | * so despite the zonelist table being relatively big, the cache | 248 | * so despite the zonelist table being relatively big, the cache |
249 | * footprint of this construct is very small. | 249 | * footprint of this construct is very small. |
250 | */ | 250 | */ |
251 | struct zonelist { | 251 | struct zonelist { |
252 | struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited | 252 | struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited |
253 | }; | 253 | }; |
254 | 254 | ||
255 | 255 | ||
256 | /* | 256 | /* |
257 | * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM | 257 | * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM |
258 | * (mostly NUMA machines?) to denote a higher-level grouping of memory | 258 | * (mostly NUMA machines?) to denote a higher-level grouping of memory |
259 | * than a single zone denotes. | 259 | * than a single zone denotes. |
260 | * | 260 | * |
261 | * On NUMA machines, each NUMA node would have a pg_data_t to describe | 261 | * On NUMA machines, each NUMA node would have a pg_data_t to describe |
262 | * its memory layout. | 262 | * its memory layout. |
263 | * | 263 | * |
264 | * Memory statistics and page replacement data structures are maintained on a | 264 | * Memory statistics and page replacement data structures are maintained on a |
265 | * per-zone basis. | 265 | * per-zone basis. |
266 | */ | 266 | */ |
267 | struct bootmem_data; | 267 | struct bootmem_data; |
268 | typedef struct pglist_data { | 268 | typedef struct pglist_data { |
269 | struct zone node_zones[MAX_NR_ZONES]; | 269 | struct zone node_zones[MAX_NR_ZONES]; |
270 | struct zonelist node_zonelists[GFP_ZONETYPES]; | 270 | struct zonelist node_zonelists[GFP_ZONETYPES]; |
271 | int nr_zones; | 271 | int nr_zones; |
272 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 272 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
273 | struct page *node_mem_map; | 273 | struct page *node_mem_map; |
274 | #endif | 274 | #endif |
275 | struct bootmem_data *bdata; | 275 | struct bootmem_data *bdata; |
276 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
277 | /* | ||
278 | * Must be held any time you expect node_start_pfn, node_present_pages | ||
279 | * or node_spanned_pages to stay constant. Holding this will also | ||
280 | * guarantee that any pfn_valid() stays that way. | ||
281 | * | ||
282 | * Nests above zone->lock and zone->size_seqlock. | ||
283 | */ | ||
284 | spinlock_t node_size_lock; | ||
285 | #endif | ||
276 | unsigned long node_start_pfn; | 286 | unsigned long node_start_pfn; |
277 | unsigned long node_present_pages; /* total number of physical pages */ | 287 | unsigned long node_present_pages; /* total number of physical pages */ |
278 | unsigned long node_spanned_pages; /* total size of physical page | 288 | unsigned long node_spanned_pages; /* total size of physical page |
279 | range, including holes */ | 289 | range, including holes */ |
280 | int node_id; | 290 | int node_id; |
281 | struct pglist_data *pgdat_next; | 291 | struct pglist_data *pgdat_next; |
282 | wait_queue_head_t kswapd_wait; | 292 | wait_queue_head_t kswapd_wait; |
283 | struct task_struct *kswapd; | 293 | struct task_struct *kswapd; |
284 | int kswapd_max_order; | 294 | int kswapd_max_order; |
285 | } pg_data_t; | 295 | } pg_data_t; |
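
The comment added to node_size_lock above also pins the lock ordering: it nests above zone->lock, i.e. it is the outer lock. A sketch of the legal nesting on a resize path, assuming nothing beyond the fields shown in this header (since pgdat_resize_lock() already disables interrupts, the inner zone->lock can be taken with plain spin_lock()):

    unsigned long flags;

    pgdat_resize_lock(pgdat, &flags);       /* outer: node geometry */
    spin_lock(&zone->lock);                 /* inner: buddy free lists */
    /* ... adjust spanned/present pages, touch free_area, etc ... */
    spin_unlock(&zone->lock);
    pgdat_resize_unlock(pgdat, &flags);
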
286 | 296 | ||
287 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) | 297 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) |
288 | #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) | 298 | #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) |
289 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 299 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
290 | #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) | 300 | #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) |
291 | #else | 301 | #else |
292 | #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) | 302 | #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) |
293 | #endif | 303 | #endif |
294 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) | 304 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) |
305 | |||
306 | #include <linux/memory_hotplug.h> | ||
295 | 307 | ||
296 | extern struct pglist_data *pgdat_list; | 308 | extern struct pglist_data *pgdat_list; |
297 | 309 | ||
298 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 310 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
299 | unsigned long *free, struct pglist_data *pgdat); | 311 | unsigned long *free, struct pglist_data *pgdat); |
300 | void get_zone_counts(unsigned long *active, unsigned long *inactive, | 312 | void get_zone_counts(unsigned long *active, unsigned long *inactive, |
301 | unsigned long *free); | 313 | unsigned long *free); |
302 | void build_all_zonelists(void); | 314 | void build_all_zonelists(void); |
303 | void wakeup_kswapd(struct zone *zone, int order); | 315 | void wakeup_kswapd(struct zone *zone, int order); |
304 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 316 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
305 | int alloc_type, int can_try_harder, gfp_t gfp_high); | 317 | int alloc_type, int can_try_harder, gfp_t gfp_high); |
306 | 318 | ||
307 | #ifdef CONFIG_HAVE_MEMORY_PRESENT | 319 | #ifdef CONFIG_HAVE_MEMORY_PRESENT |
308 | void memory_present(int nid, unsigned long start, unsigned long end); | 320 | void memory_present(int nid, unsigned long start, unsigned long end); |
309 | #else | 321 | #else |
310 | static inline void memory_present(int nid, unsigned long start, unsigned long end) {} | 322 | static inline void memory_present(int nid, unsigned long start, unsigned long end) {} |
311 | #endif | 323 | #endif |
312 | 324 | ||
313 | #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE | 325 | #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE |
314 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); | 326 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); |
315 | #endif | 327 | #endif |
316 | 328 | ||
317 | /* | 329 | /* |
318 | * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. | 330 | * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. |
319 | */ | 331 | */ |
320 | #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) | 332 | #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) |
321 | 333 | ||
322 | /** | 334 | /** |
323 | * for_each_pgdat - helper macro to iterate over all nodes | 335 | * for_each_pgdat - helper macro to iterate over all nodes |
324 | * @pgdat - pointer to a pg_data_t variable | 336 | * @pgdat - pointer to a pg_data_t variable |
325 | * | 337 | * |
326 | * Meant to help with common loops of the form | 338 | * Meant to help with common loops of the form |
327 | * pgdat = pgdat_list; | 339 | * pgdat = pgdat_list; |
328 | * while(pgdat) { | 340 | * while(pgdat) { |
329 | * ... | 341 | * ... |
330 | * pgdat = pgdat->pgdat_next; | 342 | * pgdat = pgdat->pgdat_next; |
331 | * } | 343 | * } |
332 | */ | 344 | */ |
333 | #define for_each_pgdat(pgdat) \ | 345 | #define for_each_pgdat(pgdat) \ |
334 | for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) | 346 | for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) |
335 | 347 | ||
336 | /* | 348 | /* |
337 | * next_zone - helper magic for for_each_zone() | 349 | * next_zone - helper magic for for_each_zone() |
338 | * Thanks to William Lee Irwin III for this piece of ingenuity. | 350 | * Thanks to William Lee Irwin III for this piece of ingenuity. |
339 | */ | 351 | */ |
340 | static inline struct zone *next_zone(struct zone *zone) | 352 | static inline struct zone *next_zone(struct zone *zone) |
341 | { | 353 | { |
342 | pg_data_t *pgdat = zone->zone_pgdat; | 354 | pg_data_t *pgdat = zone->zone_pgdat; |
343 | 355 | ||
344 | if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) | 356 | if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) |
345 | zone++; | 357 | zone++; |
346 | else if (pgdat->pgdat_next) { | 358 | else if (pgdat->pgdat_next) { |
347 | pgdat = pgdat->pgdat_next; | 359 | pgdat = pgdat->pgdat_next; |
348 | zone = pgdat->node_zones; | 360 | zone = pgdat->node_zones; |
349 | } else | 361 | } else |
350 | zone = NULL; | 362 | zone = NULL; |
351 | 363 | ||
352 | return zone; | 364 | return zone; |
353 | } | 365 | } |
354 | 366 | ||
355 | /** | 367 | /** |
356 | * for_each_zone - helper macro to iterate over all memory zones | 368 | * for_each_zone - helper macro to iterate over all memory zones |
357 | * @zone - pointer to struct zone variable | 369 | * @zone - pointer to struct zone variable |
358 | * | 370 | * |
359 | * The user only needs to declare the zone variable, for_each_zone | 371 | * The user only needs to declare the zone variable, for_each_zone |
360 | * fills it in. This basically means for_each_zone() is an | 372 | * fills it in. This basically means for_each_zone() is an |
361 | * easier to read version of this piece of code: | 373 | * easier to read version of this piece of code: |
362 | * | 374 | * |
363 | * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) | 375 | * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) |
364 | * for (i = 0; i < MAX_NR_ZONES; ++i) { | 376 | * for (i = 0; i < MAX_NR_ZONES; ++i) { |
365 | * struct zone * z = pgdat->node_zones + i; | 377 | * struct zone * z = pgdat->node_zones + i; |
366 | * ... | 378 | * ... |
367 | * } | 379 | * } |
368 | * } | 380 | * } |
369 | */ | 381 | */ |
370 | #define for_each_zone(zone) \ | 382 | #define for_each_zone(zone) \ |
371 | for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) | 383 | for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) |
372 | 384 | ||
373 | static inline int is_highmem_idx(int idx) | 385 | static inline int is_highmem_idx(int idx) |
374 | { | 386 | { |
375 | return (idx == ZONE_HIGHMEM); | 387 | return (idx == ZONE_HIGHMEM); |
376 | } | 388 | } |
377 | 389 | ||
378 | static inline int is_normal_idx(int idx) | 390 | static inline int is_normal_idx(int idx) |
379 | { | 391 | { |
380 | return (idx == ZONE_NORMAL); | 392 | return (idx == ZONE_NORMAL); |
381 | } | 393 | } |
382 | /** | 394 | /** |
383 | * is_highmem - helper function to quickly check if a struct zone is a | 395 | * is_highmem - helper function to quickly check if a struct zone is a |
384 | * highmem zone or not. This is an attempt to keep references | 396 | * highmem zone or not. This is an attempt to keep references |
385 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. | 397 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. |
386 | * @zone - pointer to struct zone variable | 398 | * @zone - pointer to struct zone variable |
387 | */ | 399 | */ |
388 | static inline int is_highmem(struct zone *zone) | 400 | static inline int is_highmem(struct zone *zone) |
389 | { | 401 | { |
390 | return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; | 402 | return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; |
391 | } | 403 | } |
392 | 404 | ||
393 | static inline int is_normal(struct zone *zone) | 405 | static inline int is_normal(struct zone *zone) |
394 | { | 406 | { |
395 | return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; | 407 | return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; |
396 | } | 408 | } |
397 | 409 | ||
398 | /* These two functions are used to setup the per zone pages min values */ | 410 | /* These two functions are used to setup the per zone pages min values */ |
399 | struct ctl_table; | 411 | struct ctl_table; |
400 | struct file; | 412 | struct file; |
401 | int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, | 413 | int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, |
402 | void __user *, size_t *, loff_t *); | 414 | void __user *, size_t *, loff_t *); |
403 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; | 415 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; |
404 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, | 416 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, |
405 | void __user *, size_t *, loff_t *); | 417 | void __user *, size_t *, loff_t *); |
406 | 418 | ||
407 | #include <linux/topology.h> | 419 | #include <linux/topology.h> |
408 | /* Returns the number of the current Node. */ | 420 | /* Returns the number of the current Node. */ |
409 | #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) | 421 | #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) |
410 | 422 | ||
411 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 423 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
412 | 424 | ||
413 | extern struct pglist_data contig_page_data; | 425 | extern struct pglist_data contig_page_data; |
414 | #define NODE_DATA(nid) (&contig_page_data) | 426 | #define NODE_DATA(nid) (&contig_page_data) |
415 | #define NODE_MEM_MAP(nid) mem_map | 427 | #define NODE_MEM_MAP(nid) mem_map |
416 | #define MAX_NODES_SHIFT 1 | 428 | #define MAX_NODES_SHIFT 1 |
417 | #define pfn_to_nid(pfn) (0) | 429 | #define pfn_to_nid(pfn) (0) |
418 | 430 | ||
419 | #else /* CONFIG_NEED_MULTIPLE_NODES */ | 431 | #else /* CONFIG_NEED_MULTIPLE_NODES */ |
420 | 432 | ||
421 | #include <asm/mmzone.h> | 433 | #include <asm/mmzone.h> |
422 | 434 | ||
423 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ | 435 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ |
424 | 436 | ||
425 | #ifdef CONFIG_SPARSEMEM | 437 | #ifdef CONFIG_SPARSEMEM |
426 | #include <asm/sparsemem.h> | 438 | #include <asm/sparsemem.h> |
427 | #endif | 439 | #endif |
428 | 440 | ||
429 | #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) | 441 | #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) |
430 | /* | 442 | /* |
431 | * with 32 bit page->flags field, we reserve 8 bits for node/zone info. | 443 | * with 32 bit page->flags field, we reserve 8 bits for node/zone info. |
432 | * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. | 444 | * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. |
433 | */ | 445 | */ |
434 | #define FLAGS_RESERVED 8 | 446 | #define FLAGS_RESERVED 8 |
435 | 447 | ||
436 | #elif BITS_PER_LONG == 64 | 448 | #elif BITS_PER_LONG == 64 |
437 | /* | 449 | /* |
438 | * with 64 bit flags field, there's plenty of room. | 450 | * with 64 bit flags field, there's plenty of room. |
439 | */ | 451 | */ |
440 | #define FLAGS_RESERVED 32 | 452 | #define FLAGS_RESERVED 32 |
441 | 453 | ||
442 | #else | 454 | #else |
443 | 455 | ||
444 | #error BITS_PER_LONG not defined | 456 | #error BITS_PER_LONG not defined |
445 | 457 | ||
446 | #endif | 458 | #endif |
447 | 459 | ||
448 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 460 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
449 | #define early_pfn_to_nid(nid) (0UL) | 461 | #define early_pfn_to_nid(nid) (0UL) |
450 | #endif | 462 | #endif |
451 | 463 | ||
452 | #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) | 464 | #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) |
453 | #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) | 465 | #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) |
454 | 466 | ||
455 | #ifdef CONFIG_SPARSEMEM | 467 | #ifdef CONFIG_SPARSEMEM |
456 | 468 | ||
457 | /* | 469 | /* |
458 | * SECTION_SHIFT #bits space required to store a section # | 470 | * SECTION_SHIFT #bits space required to store a section # |
459 | * | 471 | * |
460 | * PA_SECTION_SHIFT physical address to/from section number | 472 | * PA_SECTION_SHIFT physical address to/from section number |
461 | * PFN_SECTION_SHIFT pfn to/from section number | 473 | * PFN_SECTION_SHIFT pfn to/from section number |
462 | */ | 474 | */ |
463 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | 475 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) |
464 | 476 | ||
465 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) | 477 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) |
466 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) | 478 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) |
467 | 479 | ||
468 | #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) | 480 | #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) |
469 | 481 | ||
470 | #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) | 482 | #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) |
471 | #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) | 483 | #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) |
472 | 484 | ||
473 | #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS | 485 | #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS |
474 | #error Allocator MAX_ORDER exceeds SECTION_SIZE | 486 | #error Allocator MAX_ORDER exceeds SECTION_SIZE |
475 | #endif | 487 | #endif |
476 | 488 | ||
477 | struct page; | 489 | struct page; |
478 | struct mem_section { | 490 | struct mem_section { |
479 | /* | 491 | /* |
480 | * This is, logically, a pointer to an array of struct | 492 | * This is, logically, a pointer to an array of struct |
481 | * pages. However, it is stored with some other magic. | 493 | * pages. However, it is stored with some other magic. |
482 | * (see sparse.c::sparse_init_one_section()) | 494 | * (see sparse.c::sparse_init_one_section()) |
483 | * | 495 | * |
484 | * Making it a UL at least makes someone do a cast | 496 | * Making it a UL at least makes someone do a cast |
485 | * before using it wrong. | 497 | * before using it wrong. |
486 | */ | 498 | */ |
487 | unsigned long section_mem_map; | 499 | unsigned long section_mem_map; |
488 | }; | 500 | }; |
489 | 501 | ||
490 | #ifdef CONFIG_SPARSEMEM_EXTREME | 502 | #ifdef CONFIG_SPARSEMEM_EXTREME |
491 | #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) | 503 | #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) |
492 | #else | 504 | #else |
493 | #define SECTIONS_PER_ROOT 1 | 505 | #define SECTIONS_PER_ROOT 1 |
494 | #endif | 506 | #endif |
495 | 507 | ||
496 | #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) | 508 | #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) |
497 | #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) | 509 | #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) |
498 | #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) | 510 | #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) |
499 | 511 | ||
500 | #ifdef CONFIG_SPARSEMEM_EXTREME | 512 | #ifdef CONFIG_SPARSEMEM_EXTREME |
501 | extern struct mem_section *mem_section[NR_SECTION_ROOTS]; | 513 | extern struct mem_section *mem_section[NR_SECTION_ROOTS]; |
502 | #else | 514 | #else |
503 | extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; | 515 | extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; |
504 | #endif | 516 | #endif |
505 | 517 | ||
506 | static inline struct mem_section *__nr_to_section(unsigned long nr) | 518 | static inline struct mem_section *__nr_to_section(unsigned long nr) |
507 | { | 519 | { |
508 | if (!mem_section[SECTION_NR_TO_ROOT(nr)]) | 520 | if (!mem_section[SECTION_NR_TO_ROOT(nr)]) |
509 | return NULL; | 521 | return NULL; |
510 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; | 522 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; |
511 | } | 523 | } |
512 | extern int __section_nr(struct mem_section* ms); | 524 | extern int __section_nr(struct mem_section* ms); |
513 | 525 | ||
514 | /* | 526 | /* |
515 | * We use the lower bits of the mem_map pointer to store | 527 | * We use the lower bits of the mem_map pointer to store |
516 | * a little bit of information. There should be at least | 528 | * a little bit of information. There should be at least |
517 | * 3 bits here due to 32-bit alignment. | 529 | * 3 bits here due to 32-bit alignment. |
518 | */ | 530 | */ |
519 | #define SECTION_MARKED_PRESENT (1UL<<0) | 531 | #define SECTION_MARKED_PRESENT (1UL<<0) |
520 | #define SECTION_HAS_MEM_MAP (1UL<<1) | 532 | #define SECTION_HAS_MEM_MAP (1UL<<1) |
521 | #define SECTION_MAP_LAST_BIT (1UL<<2) | 533 | #define SECTION_MAP_LAST_BIT (1UL<<2) |
522 | #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) | 534 | #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) |
523 | 535 | ||
524 | static inline struct page *__section_mem_map_addr(struct mem_section *section) | 536 | static inline struct page *__section_mem_map_addr(struct mem_section *section) |
525 | { | 537 | { |
526 | unsigned long map = section->section_mem_map; | 538 | unsigned long map = section->section_mem_map; |
527 | map &= SECTION_MAP_MASK; | 539 | map &= SECTION_MAP_MASK; |
528 | return (struct page *)map; | 540 | return (struct page *)map; |
529 | } | 541 | } |
530 | 542 | ||
531 | static inline int valid_section(struct mem_section *section) | 543 | static inline int valid_section(struct mem_section *section) |
532 | { | 544 | { |
533 | return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); | 545 | return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); |
534 | } | 546 | } |
535 | 547 | ||
536 | static inline int section_has_mem_map(struct mem_section *section) | 548 | static inline int section_has_mem_map(struct mem_section *section) |
537 | { | 549 | { |
538 | return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); | 550 | return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); |
539 | } | 551 | } |
540 | 552 | ||
541 | static inline int valid_section_nr(unsigned long nr) | 553 | static inline int valid_section_nr(unsigned long nr) |
542 | { | 554 | { |
543 | return valid_section(__nr_to_section(nr)); | 555 | return valid_section(__nr_to_section(nr)); |
544 | } | 556 | } |
545 | 557 | ||
546 | /* | 558 | /* |
547 | * Given a kernel address, find the home node of the underlying memory. | 559 | * Given a kernel address, find the home node of the underlying memory. |
548 | */ | 560 | */ |
549 | #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) | 561 | #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) |
550 | 562 | ||
551 | static inline struct mem_section *__pfn_to_section(unsigned long pfn) | 563 | static inline struct mem_section *__pfn_to_section(unsigned long pfn) |
552 | { | 564 | { |
553 | return __nr_to_section(pfn_to_section_nr(pfn)); | 565 | return __nr_to_section(pfn_to_section_nr(pfn)); |
554 | } | 566 | } |
555 | 567 | ||
556 | #define pfn_to_page(pfn) \ | 568 | #define pfn_to_page(pfn) \ |
557 | ({ \ | 569 | ({ \ |
558 | unsigned long __pfn = (pfn); \ | 570 | unsigned long __pfn = (pfn); \ |
559 | __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ | 571 | __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ |
560 | }) | 572 | }) |
561 | #define page_to_pfn(page) \ | 573 | #define page_to_pfn(page) \ |
562 | ({ \ | 574 | ({ \ |
563 | page - __section_mem_map_addr(__nr_to_section( \ | 575 | page - __section_mem_map_addr(__nr_to_section( \ |
564 | page_to_section(page))); \ | 576 | page_to_section(page))); \ |
565 | }) | 577 | }) |
566 | 578 | ||
567 | static inline int pfn_valid(unsigned long pfn) | 579 | static inline int pfn_valid(unsigned long pfn) |
568 | { | 580 | { |
569 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) | 581 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) |
570 | return 0; | 582 | return 0; |
571 | return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); | 583 | return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); |
572 | } | 584 | } |
573 | 585 | ||
574 | /* | 586 | /* |
575 | * These are _only_ used during initialisation, therefore they | 587 | * These are _only_ used during initialisation, therefore they |
576 | * can use __initdata ... They could have names to indicate | 588 | * can use __initdata ... They could have names to indicate |
577 | * this restriction. | 589 | * this restriction. |
578 | */ | 590 | */ |
579 | #ifdef CONFIG_NUMA | 591 | #ifdef CONFIG_NUMA |
580 | #define pfn_to_nid early_pfn_to_nid | 592 | #define pfn_to_nid early_pfn_to_nid |
581 | #endif | 593 | #endif |
582 | 594 | ||
583 | #define pfn_to_pgdat(pfn) \ | 595 | #define pfn_to_pgdat(pfn) \ |
584 | ({ \ | 596 | ({ \ |
585 | NODE_DATA(pfn_to_nid(pfn)); \ | 597 | NODE_DATA(pfn_to_nid(pfn)); \ |
586 | }) | 598 | }) |
587 | 599 | ||
588 | #define early_pfn_valid(pfn) pfn_valid(pfn) | 600 | #define early_pfn_valid(pfn) pfn_valid(pfn) |
589 | void sparse_init(void); | 601 | void sparse_init(void); |
590 | #else | 602 | #else |
591 | #define sparse_init() do {} while (0) | 603 | #define sparse_init() do {} while (0) |
592 | #define sparse_index_init(_sec, _nid) do {} while (0) | 604 | #define sparse_index_init(_sec, _nid) do {} while (0) |
593 | #endif /* CONFIG_SPARSEMEM */ | 605 | #endif /* CONFIG_SPARSEMEM */ |
594 | 606 | ||
595 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES | 607 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES |
596 | #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) | 608 | #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) |
597 | #else | 609 | #else |
598 | #define early_pfn_in_nid(pfn, nid) (1) | 610 | #define early_pfn_in_nid(pfn, nid) (1) |
599 | #endif | 611 | #endif |
600 | 612 | ||
601 | #ifndef early_pfn_valid | 613 | #ifndef early_pfn_valid |
602 | #define early_pfn_valid(pfn) (1) | 614 | #define early_pfn_valid(pfn) (1) |
603 | #endif | 615 | #endif |
604 | 616 | ||
605 | void memory_present(int nid, unsigned long start, unsigned long end); | 617 | void memory_present(int nid, unsigned long start, unsigned long end); |
606 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); | 618 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); |
607 | 619 | ||
608 | #endif /* !__ASSEMBLY__ */ | 620 | #endif /* !__ASSEMBLY__ */ |
609 | #endif /* __KERNEL__ */ | 621 | #endif /* __KERNEL__ */ |
610 | #endif /* _LINUX_MMZONE_H */ | 622 | #endif /* _LINUX_MMZONE_H */ |
611 | 623 |
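
On SPARSEMEM, the pfn_valid() defined above reduces to a test of SECTION_MARKED_PRESENT, so memory removal makes a pfn range invalid by clearing that bit. A sketch of why the clear must happen under the pgdat lock; invalidate_section() is an illustrative helper, not code from this commit:

    static void invalidate_section(pg_data_t *pgdat, unsigned long section_nr)
    {
            struct mem_section *ms = __nr_to_section(section_nr);
            unsigned long flags;

            if (!ms)
                    return;

            pgdat_resize_lock(pgdat, &flags);
            /* After this store, pfn_valid() is false for the section's
             * pfns. A reader that wrapped its own pfn_valid()/
             * pfn_to_page() sequence in pgdat_resize_lock() can never
             * see the flip mid-walk; readers already holding a page
             * reference never needed the lock at all. */
            ms->section_mem_map &= ~SECTION_MARKED_PRESENT;
            pgdat_resize_unlock(pgdat, &flags);
    }
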
mm/page_alloc.c
1 | /* | 1 | /* |
2 | * linux/mm/page_alloc.c | 2 | * linux/mm/page_alloc.c |
3 | * | 3 | * |
4 | * Manages the free list, the system allocates free pages here. | 4 | * Manages the free list, the system allocates free pages here. |
5 | * Note that kmalloc() lives in slab.c | 5 | * Note that kmalloc() lives in slab.c |
6 | * | 6 | * |
7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
8 | * Swap reorganised 29.12.95, Stephen Tweedie | 8 | * Swap reorganised 29.12.95, Stephen Tweedie |
9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/config.h> | 17 | #include <linux/config.h> |
18 | #include <linux/stddef.h> | 18 | #include <linux/stddef.h> |
19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
21 | #include <linux/interrupt.h> | 21 | #include <linux/interrupt.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/suspend.h> | 27 | #include <linux/suspend.h> |
28 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
29 | #include <linux/blkdev.h> | 29 | #include <linux/blkdev.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/notifier.h> | 31 | #include <linux/notifier.h> |
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/cpuset.h> | 35 | #include <linux/cpuset.h> |
36 | #include <linux/nodemask.h> | 36 | #include <linux/nodemask.h> |
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | 38 | ||
39 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
40 | #include "internal.h" | 40 | #include "internal.h" |
41 | 41 | ||
42 | /* | 42 | /* |
43 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 43 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this |
44 | * initializer cleaner | 44 | * initializer cleaner |
45 | */ | 45 | */ |
46 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 46 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; |
47 | EXPORT_SYMBOL(node_online_map); | 47 | EXPORT_SYMBOL(node_online_map); |
48 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 48 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; |
49 | EXPORT_SYMBOL(node_possible_map); | 49 | EXPORT_SYMBOL(node_possible_map); |
50 | struct pglist_data *pgdat_list __read_mostly; | 50 | struct pglist_data *pgdat_list __read_mostly; |
51 | unsigned long totalram_pages __read_mostly; | 51 | unsigned long totalram_pages __read_mostly; |
52 | unsigned long totalhigh_pages __read_mostly; | 52 | unsigned long totalhigh_pages __read_mostly; |
53 | long nr_swap_pages; | 53 | long nr_swap_pages; |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * results with 256, 32 in the lowmem_reserve sysctl: | 56 | * results with 256, 32 in the lowmem_reserve sysctl: |
57 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 57 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
58 | * 1G machine -> (16M dma, 784M normal, 224M high) | 58 | * 1G machine -> (16M dma, 784M normal, 224M high) |
59 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 59 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
60 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 60 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
61 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA | 61 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
62 | */ | 62 | */ |
63 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; | 63 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; |
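Reading the ratios above literally, a lower zone's reserve against allocations intended for a higher zone is higher_zone_pages / ratio. Worked through on the 1G split quoted in the comment (784M normal, 224M high; figures illustrative): NORMAL allocations leave 784M/256 ~= 3M of ZONE_DMA reserved; HIGHMEM allocations leave 224M/32 = 7M of ZONE_NORMAL reserved and (224M+784M)/256 ~= 4M of ZONE_DMA reserved.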
64 | 64 | ||
65 | EXPORT_SYMBOL(totalram_pages); | 65 | EXPORT_SYMBOL(totalram_pages); |
66 | EXPORT_SYMBOL(nr_swap_pages); | 66 | EXPORT_SYMBOL(nr_swap_pages); |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Used by page_zone() to look up the address of the struct zone whose | 69 | * Used by page_zone() to look up the address of the struct zone whose |
70 | * id is encoded in the upper bits of page->flags | 70 | * id is encoded in the upper bits of page->flags |
71 | */ | 71 | */ |
72 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | 72 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; |
73 | EXPORT_SYMBOL(zone_table); | 73 | EXPORT_SYMBOL(zone_table); |
74 | 74 | ||
75 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; | 75 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; |
76 | int min_free_kbytes = 1024; | 76 | int min_free_kbytes = 1024; |
77 | 77 | ||
78 | unsigned long __initdata nr_kernel_pages; | 78 | unsigned long __initdata nr_kernel_pages; |
79 | unsigned long __initdata nr_all_pages; | 79 | unsigned long __initdata nr_all_pages; |
80 | 80 | ||
81 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 81 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
82 | { | 82 | { |
83 | if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) | 83 | if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) |
84 | return 1; | 84 | return 1; |
85 | if (page_to_pfn(page) < zone->zone_start_pfn) | 85 | if (page_to_pfn(page) < zone->zone_start_pfn) |
86 | return 1; | 86 | return 1; |
87 | 87 | ||
88 | return 0; | 88 | return 0; |
89 | } | 89 | } |
90 | 90 | ||
91 | static int page_is_consistent(struct zone *zone, struct page *page) | 91 | static int page_is_consistent(struct zone *zone, struct page *page) |
92 | { | 92 | { |
93 | #ifdef CONFIG_HOLES_IN_ZONE | 93 | #ifdef CONFIG_HOLES_IN_ZONE |
94 | if (!pfn_valid(page_to_pfn(page))) | 94 | if (!pfn_valid(page_to_pfn(page))) |
95 | return 0; | 95 | return 0; |
96 | #endif | 96 | #endif |
97 | if (zone != page_zone(page)) | 97 | if (zone != page_zone(page)) |
98 | return 0; | 98 | return 0; |
99 | 99 | ||
100 | return 1; | 100 | return 1; |
101 | } | 101 | } |
102 | /* | 102 | /* |
103 | * Temporary debugging check for pages not lying within a given zone. | 103 | * Temporary debugging check for pages not lying within a given zone. |
104 | */ | 104 | */ |
105 | static int bad_range(struct zone *zone, struct page *page) | 105 | static int bad_range(struct zone *zone, struct page *page) |
106 | { | 106 | { |
107 | if (page_outside_zone_boundaries(zone, page)) | 107 | if (page_outside_zone_boundaries(zone, page)) |
108 | return 1; | 108 | return 1; |
109 | if (!page_is_consistent(zone, page)) | 109 | if (!page_is_consistent(zone, page)) |
110 | return 1; | 110 | return 1; |
111 | 111 | ||
112 | return 0; | 112 | return 0; |
113 | } | 113 | } |
114 | 114 | ||
115 | static void bad_page(const char *function, struct page *page) | 115 | static void bad_page(const char *function, struct page *page) |
116 | { | 116 | { |
117 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | 117 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", |
118 | function, current->comm, page); | 118 | function, current->comm, page); |
119 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 119 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
120 | (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, | 120 | (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, |
121 | page->mapping, page_mapcount(page), page_count(page)); | 121 | page->mapping, page_mapcount(page), page_count(page)); |
122 | printk(KERN_EMERG "Backtrace:\n"); | 122 | printk(KERN_EMERG "Backtrace:\n"); |
123 | dump_stack(); | 123 | dump_stack(); |
124 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | 124 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); |
125 | page->flags &= ~(1 << PG_lru | | 125 | page->flags &= ~(1 << PG_lru | |
126 | 1 << PG_private | | 126 | 1 << PG_private | |
127 | 1 << PG_locked | | 127 | 1 << PG_locked | |
128 | 1 << PG_active | | 128 | 1 << PG_active | |
129 | 1 << PG_dirty | | 129 | 1 << PG_dirty | |
130 | 1 << PG_reclaim | | 130 | 1 << PG_reclaim | |
131 | 1 << PG_slab | | 131 | 1 << PG_slab | |
132 | 1 << PG_swapcache | | 132 | 1 << PG_swapcache | |
133 | 1 << PG_writeback | | 133 | 1 << PG_writeback | |
134 | 1 << PG_reserved ); | 134 | 1 << PG_reserved ); |
135 | set_page_count(page, 0); | 135 | set_page_count(page, 0); |
136 | reset_page_mapcount(page); | 136 | reset_page_mapcount(page); |
137 | page->mapping = NULL; | 137 | page->mapping = NULL; |
138 | add_taint(TAINT_BAD_PAGE); | 138 | add_taint(TAINT_BAD_PAGE); |
139 | } | 139 | } |
140 | 140 | ||
141 | #ifndef CONFIG_HUGETLB_PAGE | 141 | #ifndef CONFIG_HUGETLB_PAGE |
142 | #define prep_compound_page(page, order) do { } while (0) | 142 | #define prep_compound_page(page, order) do { } while (0) |
143 | #define destroy_compound_page(page, order) do { } while (0) | 143 | #define destroy_compound_page(page, order) do { } while (0) |
144 | #else | 144 | #else |
145 | /* | 145 | /* |
146 | * Higher-order pages are called "compound pages". They are structured thusly: | 146 | * Higher-order pages are called "compound pages". They are structured thusly: |
147 | * | 147 | * |
148 | * The first PAGE_SIZE page is called the "head page". | 148 | * The first PAGE_SIZE page is called the "head page". |
149 | * | 149 | * |
150 | * The remaining PAGE_SIZE pages are called "tail pages". | 150 | * The remaining PAGE_SIZE pages are called "tail pages". |
151 | * | 151 | * |
152 | * All pages have PG_compound set. All pages have their ->private pointing at | 152 | * All pages have PG_compound set. All pages have their ->private pointing at |
153 | * the head page (even the head page has this). | 153 | * the head page (even the head page has this). |
154 | * | 154 | * |
155 | * The first tail page's ->mapping, if non-zero, holds the address of the | 155 | * The first tail page's ->mapping, if non-zero, holds the address of the |
156 | * compound page's put_page() function. | 156 | * compound page's put_page() function. |
157 | * | 157 | * |
158 | * The order of the allocation is stored in the first tail page's ->index | 158 | * The order of the allocation is stored in the first tail page's ->index |
159 | * This is only for debug at present. This usage means that zero-order pages | 159 | * This is only for debug at present. This usage means that zero-order pages |
160 | * may not be compound. | 160 | * may not be compound. |
161 | */ | 161 | */ |
162 | static void prep_compound_page(struct page *page, unsigned long order) | 162 | static void prep_compound_page(struct page *page, unsigned long order) |
163 | { | 163 | { |
164 | int i; | 164 | int i; |
165 | int nr_pages = 1 << order; | 165 | int nr_pages = 1 << order; |
166 | 166 | ||
167 | page[1].mapping = NULL; | 167 | page[1].mapping = NULL; |
168 | page[1].index = order; | 168 | page[1].index = order; |
169 | for (i = 0; i < nr_pages; i++) { | 169 | for (i = 0; i < nr_pages; i++) { |
170 | struct page *p = page + i; | 170 | struct page *p = page + i; |
171 | 171 | ||
172 | SetPageCompound(p); | 172 | SetPageCompound(p); |
173 | set_page_private(p, (unsigned long)page); | 173 | set_page_private(p, (unsigned long)page); |
174 | } | 174 | } |
175 | } | 175 | } |
176 | 176 | ||
177 | static void destroy_compound_page(struct page *page, unsigned long order) | 177 | static void destroy_compound_page(struct page *page, unsigned long order) |
178 | { | 178 | { |
179 | int i; | 179 | int i; |
180 | int nr_pages = 1 << order; | 180 | int nr_pages = 1 << order; |
181 | 181 | ||
182 | if (!PageCompound(page)) | 182 | if (!PageCompound(page)) |
183 | return; | 183 | return; |
184 | 184 | ||
185 | if (page[1].index != order) | 185 | if (page[1].index != order) |
186 | bad_page(__FUNCTION__, page); | 186 | bad_page(__FUNCTION__, page); |
187 | 187 | ||
188 | for (i = 0; i < nr_pages; i++) { | 188 | for (i = 0; i < nr_pages; i++) { |
189 | struct page *p = page + i; | 189 | struct page *p = page + i; |
190 | 190 | ||
191 | if (!PageCompound(p)) | 191 | if (!PageCompound(p)) |
192 | bad_page(__FUNCTION__, page); | 192 | bad_page(__FUNCTION__, page); |
193 | if (page_private(p) != (unsigned long)page) | 193 | if (page_private(p) != (unsigned long)page) |
194 | bad_page(__FUNCTION__, page); | 194 | bad_page(__FUNCTION__, page); |
195 | ClearPageCompound(p); | 195 | ClearPageCompound(p); |
196 | } | 196 | } |
197 | } | 197 | } |
198 | #endif /* CONFIG_HUGETLB_PAGE */ | 198 | #endif /* CONFIG_HUGETLB_PAGE */ |
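The head/tail layout described in the comment above can be modelled with a few lines of userspace C. This is only a sketch of the invariants that prep_compound_page() establishes and destroy_compound_page() verifies; struct fake_page and its fields are stand-ins for struct page, not kernel types:

        #include <assert.h>
        #include <stdio.h>

        struct fake_page {
                int compound;          /* stands in for PG_compound        */
                unsigned long private; /* ->private: points at head page   */
                unsigned long index;   /* ->index of first tail: the order */
        };

        int main(void)
        {
                enum { ORDER = 2, NR = 1 << ORDER };
                struct fake_page pages[NR] = { { 0 } };
                int i;

                /* mirror prep_compound_page() */
                pages[1].index = ORDER;
                for (i = 0; i < NR; i++) {
                        pages[i].compound = 1;
                        pages[i].private = (unsigned long)&pages[0];
                }

                /* the checks destroy_compound_page() performs */
                assert(pages[1].index == ORDER);
                for (i = 0; i < NR; i++)
                        assert(pages[i].compound &&
                               pages[i].private == (unsigned long)&pages[0]);
                printf("compound layout holds for order %d (%d pages)\n",
                       ORDER, NR);
                return 0;
        }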
199 | 199 | ||
200 | /* | 200 | /* |
201 | * Functions for dealing with a page's order in the buddy system. | 201 | * Functions for dealing with a page's order in the buddy system. |
202 | * zone->lock is already acquired when we use these. | 202 | * zone->lock is already acquired when we use these. |
203 | * So, we don't need atomic page->flags operations here. | 203 | * So, we don't need atomic page->flags operations here. |
204 | */ | 204 | */ |
205 | static inline unsigned long page_order(struct page *page) { | 205 | static inline unsigned long page_order(struct page *page) { |
206 | return page_private(page); | 206 | return page_private(page); |
207 | } | 207 | } |
208 | 208 | ||
209 | static inline void set_page_order(struct page *page, int order) { | 209 | static inline void set_page_order(struct page *page, int order) { |
210 | set_page_private(page, order); | 210 | set_page_private(page, order); |
211 | __SetPagePrivate(page); | 211 | __SetPagePrivate(page); |
212 | } | 212 | } |
213 | 213 | ||
214 | static inline void rmv_page_order(struct page *page) | 214 | static inline void rmv_page_order(struct page *page) |
215 | { | 215 | { |
216 | __ClearPagePrivate(page); | 216 | __ClearPagePrivate(page); |
217 | set_page_private(page, 0); | 217 | set_page_private(page, 0); |
218 | } | 218 | } |
219 | 219 | ||
220 | /* | 220 | /* |
221 | * Locate the struct page for both the matching buddy in our | 221 | * Locate the struct page for both the matching buddy in our |
222 | * pair (buddy1) and the combined order O+1 page they form (page). | 222 | * pair (buddy1) and the combined order O+1 page they form (page). |
223 | * | 223 | * |
224 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | 224 | * 1) Any buddy B1 will have an order O twin B2 which satisfies |
225 | * the following equation: | 225 | * the following equation: |
226 | * B2 = B1 ^ (1 << O) | 226 | * B2 = B1 ^ (1 << O) |
227 | * For example, if the starting buddy (buddy1) is #8, its order | 227 | * For example, if the starting buddy (buddy1) is #8, its order |
228 | * 1 buddy is #10: | 228 | * 1 buddy is #10: |
229 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | 229 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
230 | * | 230 | * |
231 | * 2) Any buddy B will have an order O+1 parent P which | 231 | * 2) Any buddy B will have an order O+1 parent P which |
232 | * satisfies the following equation: | 232 | * satisfies the following equation: |
233 | * P = B & ~(1 << O) | 233 | * P = B & ~(1 << O) |
234 | * | 234 | * |
235 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 235 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
236 | */ | 236 | */ |
237 | static inline struct page * | 237 | static inline struct page * |
238 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 238 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) |
239 | { | 239 | { |
240 | unsigned long buddy_idx = page_idx ^ (1 << order); | 240 | unsigned long buddy_idx = page_idx ^ (1 << order); |
241 | 241 | ||
242 | return page + (buddy_idx - page_idx); | 242 | return page + (buddy_idx - page_idx); |
243 | } | 243 | } |
244 | 244 | ||
245 | static inline unsigned long | 245 | static inline unsigned long |
246 | __find_combined_index(unsigned long page_idx, unsigned int order) | 246 | __find_combined_index(unsigned long page_idx, unsigned int order) |
247 | { | 247 | { |
248 | return (page_idx & ~(1 << order)); | 248 | return (page_idx & ~(1 << order)); |
249 | } | 249 | } |
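The two equations in the comment above are easy to sanity-check in userspace; a minimal sketch (illustration only, mirroring __page_find_buddy() and __find_combined_index() on raw indices):

        #include <stdio.h>

        int main(void)
        {
                unsigned long b1 = 8, order = 1;
                unsigned long b2 = b1 ^ (1UL << order);      /* buddy:  8 ^ 2 = 10 */
                unsigned long parent = b1 & ~(1UL << order); /* parent: 8 & ~2 = 8 */

                printf("order-%lu buddy of #%lu is #%lu, combined index is #%lu\n",
                       order, b1, b2, parent);
                return 0;
        }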
250 | 250 | ||
251 | /* | 251 | /* |
252 | * This function checks whether a page is free && is the buddy | 252 | * This function checks whether a page is free && is the buddy |
253 | * we can coalesce with. A page and its buddy can be coalesced if | 253 | * we can coalesce with. A page and its buddy can be coalesced if |
254 | * (a) the buddy is free && | 254 | * (a) the buddy is free && |
255 | * (b) the buddy is on the buddy system && | 255 | * (b) the buddy is on the buddy system && |
256 | * (c) a page and its buddy have the same order. | 256 | * (c) a page and its buddy have the same order. |
257 | * For recording a page's order, we use page_private(page) and PG_private. | 257 | * For recording a page's order, we use page_private(page) and PG_private. |
258 | * | 258 | * |
259 | */ | 259 | */ |
260 | static inline int page_is_buddy(struct page *page, int order) | 260 | static inline int page_is_buddy(struct page *page, int order) |
261 | { | 261 | { |
262 | if (PagePrivate(page) && | 262 | if (PagePrivate(page) && |
263 | (page_order(page) == order) && | 263 | (page_order(page) == order) && |
264 | page_count(page) == 0) | 264 | page_count(page) == 0) |
265 | return 1; | 265 | return 1; |
266 | return 0; | 266 | return 0; |
267 | } | 267 | } |
268 | 268 | ||
269 | /* | 269 | /* |
270 | * Freeing function for a buddy system allocator. | 270 | * Freeing function for a buddy system allocator. |
271 | * | 271 | * |
272 | * The concept of a buddy system is to maintain a direct-mapped table | 272 | * The concept of a buddy system is to maintain a direct-mapped table |
273 | * (containing bit values) for memory blocks of various "orders". | 273 | * (containing bit values) for memory blocks of various "orders". |
274 | * The bottom level table contains the map for the smallest allocatable | 274 | * The bottom level table contains the map for the smallest allocatable |
275 | * units of memory (here, pages), and each level above it describes | 275 | * units of memory (here, pages), and each level above it describes |
276 | * pairs of units from the levels below, hence, "buddies". | 276 | * pairs of units from the levels below, hence, "buddies". |
277 | * At a high level, all that happens here is marking the table entry | 277 | * At a high level, all that happens here is marking the table entry |
278 | * at the bottom level available, and propagating the changes upward | 278 | * at the bottom level available, and propagating the changes upward |
279 | * as necessary, plus some accounting needed to play nicely with other | 279 | * as necessary, plus some accounting needed to play nicely with other |
280 | * parts of the VM system. | 280 | * parts of the VM system. |
281 | * At each level, we keep a list of pages, which are heads of contiguous | 281 | * At each level, we keep a list of pages, which are heads of contiguous |
282 | * runs of free pages of length (1 << order), marked with PG_private. A | 282 | * runs of free pages of length (1 << order), marked with PG_private. A |
283 | * page's order is recorded in the page_private(page) field. | 283 | * page's order is recorded in the page_private(page) field. |
284 | * So when we are allocating or freeing one, we can derive the state of the | 284 | * So when we are allocating or freeing one, we can derive the state of the |
285 | * other. That is, if we allocate a small block, and both were | 285 | * other. That is, if we allocate a small block, and both were |
286 | * free, the remainder of the region must be split into blocks. | 286 | * free, the remainder of the region must be split into blocks. |
287 | * If a block is freed, and its buddy is also free, then this | 287 | * If a block is freed, and its buddy is also free, then this |
288 | * triggers coalescing into a block of larger size. | 288 | * triggers coalescing into a block of larger size. |
289 | * | 289 | * |
290 | * -- wli | 290 | * -- wli |
291 | */ | 291 | */ |
292 | 292 | ||
293 | static inline void __free_pages_bulk (struct page *page, | 293 | static inline void __free_pages_bulk (struct page *page, |
294 | struct zone *zone, unsigned int order) | 294 | struct zone *zone, unsigned int order) |
295 | { | 295 | { |
296 | unsigned long page_idx; | 296 | unsigned long page_idx; |
297 | int order_size = 1 << order; | 297 | int order_size = 1 << order; |
298 | 298 | ||
299 | if (unlikely(order)) | 299 | if (unlikely(order)) |
300 | destroy_compound_page(page, order); | 300 | destroy_compound_page(page, order); |
301 | 301 | ||
302 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 302 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
303 | 303 | ||
304 | BUG_ON(page_idx & (order_size - 1)); | 304 | BUG_ON(page_idx & (order_size - 1)); |
305 | BUG_ON(bad_range(zone, page)); | 305 | BUG_ON(bad_range(zone, page)); |
306 | 306 | ||
307 | zone->free_pages += order_size; | 307 | zone->free_pages += order_size; |
308 | while (order < MAX_ORDER-1) { | 308 | while (order < MAX_ORDER-1) { |
309 | unsigned long combined_idx; | 309 | unsigned long combined_idx; |
310 | struct free_area *area; | 310 | struct free_area *area; |
311 | struct page *buddy; | 311 | struct page *buddy; |
312 | 312 | ||
313 | combined_idx = __find_combined_index(page_idx, order); | 313 | combined_idx = __find_combined_index(page_idx, order); |
314 | buddy = __page_find_buddy(page, page_idx, order); | 314 | buddy = __page_find_buddy(page, page_idx, order); |
315 | 315 | ||
316 | if (bad_range(zone, buddy)) | 316 | if (bad_range(zone, buddy)) |
317 | break; | 317 | break; |
318 | if (!page_is_buddy(buddy, order)) | 318 | if (!page_is_buddy(buddy, order)) |
319 | break; /* Move the buddy up one level. */ | 319 | break; /* Move the buddy up one level. */ |
320 | list_del(&buddy->lru); | 320 | list_del(&buddy->lru); |
321 | area = zone->free_area + order; | 321 | area = zone->free_area + order; |
322 | area->nr_free--; | 322 | area->nr_free--; |
323 | rmv_page_order(buddy); | 323 | rmv_page_order(buddy); |
324 | page = page + (combined_idx - page_idx); | 324 | page = page + (combined_idx - page_idx); |
325 | page_idx = combined_idx; | 325 | page_idx = combined_idx; |
326 | order++; | 326 | order++; |
327 | } | 327 | } |
328 | set_page_order(page, order); | 328 | set_page_order(page, order); |
329 | list_add(&page->lru, &zone->free_area[order].free_list); | 329 | list_add(&page->lru, &zone->free_area[order].free_list); |
330 | zone->free_area[order].nr_free++; | 330 | zone->free_area[order].nr_free++; |
331 | } | 331 | } |
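To make the coalescing loop above concrete, take an invented state: page_idx 8 is freed at order 0 while pages 9, 10 and 11 already sit on the free lists (9 at order 0, 10-11 as an order-1 block). The first pass finds buddy 8 ^ 1 = 9 free at order 0, unlinks it, and combines to index 8 & ~1 = 8 at order 1. The second pass finds buddy 8 ^ 2 = 10 free at order 1, unlinks it, and combines to index 8 at order 2. If the order-2 buddy at 8 ^ 4 = 12 is not free, the loop stops and the order-2 block at index 8 is placed on zone->free_area[2].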
332 | 332 | ||
333 | static inline void free_pages_check(const char *function, struct page *page) | 333 | static inline void free_pages_check(const char *function, struct page *page) |
334 | { | 334 | { |
335 | if ( page_mapcount(page) || | 335 | if ( page_mapcount(page) || |
336 | page->mapping != NULL || | 336 | page->mapping != NULL || |
337 | page_count(page) != 0 || | 337 | page_count(page) != 0 || |
338 | (page->flags & ( | 338 | (page->flags & ( |
339 | 1 << PG_lru | | 339 | 1 << PG_lru | |
340 | 1 << PG_private | | 340 | 1 << PG_private | |
341 | 1 << PG_locked | | 341 | 1 << PG_locked | |
342 | 1 << PG_active | | 342 | 1 << PG_active | |
343 | 1 << PG_reclaim | | 343 | 1 << PG_reclaim | |
344 | 1 << PG_slab | | 344 | 1 << PG_slab | |
345 | 1 << PG_swapcache | | 345 | 1 << PG_swapcache | |
346 | 1 << PG_writeback | | 346 | 1 << PG_writeback | |
347 | 1 << PG_reserved ))) | 347 | 1 << PG_reserved ))) |
348 | bad_page(function, page); | 348 | bad_page(function, page); |
349 | if (PageDirty(page)) | 349 | if (PageDirty(page)) |
350 | __ClearPageDirty(page); | 350 | __ClearPageDirty(page); |
351 | } | 351 | } |
352 | 352 | ||
353 | /* | 353 | /* |
354 | * Frees a list of pages. | 354 | * Frees a list of pages. |
355 | * Assumes all pages on the list are in the same zone, and of the same order. | 355 | * Assumes all pages on the list are in the same zone, and of the same order. |
356 | * count is the number of pages to free. | 356 | * count is the number of pages to free. |
357 | * | 357 | * |
358 | * If the zone was previously in an "all pages pinned" state then look to | 358 | * If the zone was previously in an "all pages pinned" state then look to |
359 | * see if this freeing clears that state. | 359 | * see if this freeing clears that state. |
360 | * | 360 | * |
361 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 361 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
362 | * pinned" detection logic. | 362 | * pinned" detection logic. |
363 | */ | 363 | */ |
364 | static int | 364 | static int |
365 | free_pages_bulk(struct zone *zone, int count, | 365 | free_pages_bulk(struct zone *zone, int count, |
366 | struct list_head *list, unsigned int order) | 366 | struct list_head *list, unsigned int order) |
367 | { | 367 | { |
368 | unsigned long flags; | 368 | unsigned long flags; |
369 | struct page *page = NULL; | 369 | struct page *page = NULL; |
370 | int ret = 0; | 370 | int ret = 0; |
371 | 371 | ||
372 | spin_lock_irqsave(&zone->lock, flags); | 372 | spin_lock_irqsave(&zone->lock, flags); |
373 | zone->all_unreclaimable = 0; | 373 | zone->all_unreclaimable = 0; |
374 | zone->pages_scanned = 0; | 374 | zone->pages_scanned = 0; |
375 | while (!list_empty(list) && count--) { | 375 | while (!list_empty(list) && count--) { |
376 | page = list_entry(list->prev, struct page, lru); | 376 | page = list_entry(list->prev, struct page, lru); |
377 | /* have to delete it, as __free_pages_bulk() manipulates the list */ | 377 | /* have to delete it, as __free_pages_bulk() manipulates the list */ |
378 | list_del(&page->lru); | 378 | list_del(&page->lru); |
379 | __free_pages_bulk(page, zone, order); | 379 | __free_pages_bulk(page, zone, order); |
380 | ret++; | 380 | ret++; |
381 | } | 381 | } |
382 | spin_unlock_irqrestore(&zone->lock, flags); | 382 | spin_unlock_irqrestore(&zone->lock, flags); |
383 | return ret; | 383 | return ret; |
384 | } | 384 | } |
385 | 385 | ||
386 | void __free_pages_ok(struct page *page, unsigned int order) | 386 | void __free_pages_ok(struct page *page, unsigned int order) |
387 | { | 387 | { |
388 | LIST_HEAD(list); | 388 | LIST_HEAD(list); |
389 | int i; | 389 | int i; |
390 | 390 | ||
391 | arch_free_page(page, order); | 391 | arch_free_page(page, order); |
392 | 392 | ||
393 | mod_page_state(pgfree, 1 << order); | 393 | mod_page_state(pgfree, 1 << order); |
394 | 394 | ||
395 | #ifndef CONFIG_MMU | 395 | #ifndef CONFIG_MMU |
396 | if (order > 0) | 396 | if (order > 0) |
397 | for (i = 1 ; i < (1 << order) ; ++i) | 397 | for (i = 1 ; i < (1 << order) ; ++i) |
398 | __put_page(page + i); | 398 | __put_page(page + i); |
399 | #endif | 399 | #endif |
400 | 400 | ||
401 | for (i = 0 ; i < (1 << order) ; ++i) | 401 | for (i = 0 ; i < (1 << order) ; ++i) |
402 | free_pages_check(__FUNCTION__, page + i); | 402 | free_pages_check(__FUNCTION__, page + i); |
403 | list_add(&page->lru, &list); | 403 | list_add(&page->lru, &list); |
404 | kernel_map_pages(page, 1<<order, 0); | 404 | kernel_map_pages(page, 1<<order, 0); |
405 | free_pages_bulk(page_zone(page), 1, &list, order); | 405 | free_pages_bulk(page_zone(page), 1, &list, order); |
406 | } | 406 | } |
407 | 407 | ||
408 | 408 | ||
409 | /* | 409 | /* |
410 | * The order of subdivision here is critical for the IO subsystem. | 410 | * The order of subdivision here is critical for the IO subsystem. |
411 | * Please do not alter this order without good reasons and regression | 411 | * Please do not alter this order without good reasons and regression |
412 | * testing. Specifically, as large blocks of memory are subdivided, | 412 | * testing. Specifically, as large blocks of memory are subdivided, |
413 | * the order in which smaller blocks are delivered depends on the order | 413 | * the order in which smaller blocks are delivered depends on the order |
414 | * they're subdivided in this function. This is the primary factor | 414 | * they're subdivided in this function. This is the primary factor |
415 | * influencing the order in which pages are delivered to the IO | 415 | * influencing the order in which pages are delivered to the IO |
416 | * subsystem according to empirical testing, and this is also justified | 416 | * subsystem according to empirical testing, and this is also justified |
417 | * by considering the behavior of a buddy system containing a single | 417 | * by considering the behavior of a buddy system containing a single |
418 | * large block of memory acted on by a series of small allocations. | 418 | * large block of memory acted on by a series of small allocations. |
419 | * This behavior is a critical factor in sglist merging's success. | 419 | * This behavior is a critical factor in sglist merging's success. |
420 | * | 420 | * |
421 | * -- wli | 421 | * -- wli |
422 | */ | 422 | */ |
423 | static inline struct page * | 423 | static inline struct page * |
424 | expand(struct zone *zone, struct page *page, | 424 | expand(struct zone *zone, struct page *page, |
425 | int low, int high, struct free_area *area) | 425 | int low, int high, struct free_area *area) |
426 | { | 426 | { |
427 | unsigned long size = 1 << high; | 427 | unsigned long size = 1 << high; |
428 | 428 | ||
429 | while (high > low) { | 429 | while (high > low) { |
430 | area--; | 430 | area--; |
431 | high--; | 431 | high--; |
432 | size >>= 1; | 432 | size >>= 1; |
433 | BUG_ON(bad_range(zone, &page[size])); | 433 | BUG_ON(bad_range(zone, &page[size])); |
434 | list_add(&page[size].lru, &area->free_list); | 434 | list_add(&page[size].lru, &area->free_list); |
435 | area->nr_free++; | 435 | area->nr_free++; |
436 | set_page_order(&page[size], high); | 436 | set_page_order(&page[size], high); |
437 | } | 437 | } |
438 | return page; | 438 | return page; |
439 | } | 439 | } |
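A concrete split through the loop above: a request for order 0 (low = 0) served from an order-3 block (high = 3) spanning pages 0-7 gives pages 4-7 back to the order-2 free list, pages 2-3 to the order-1 list and page 1 to the order-0 list, returning page 0 to the caller - each iteration halving size and handing the upper half back.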
440 | 440 | ||
441 | void set_page_refs(struct page *page, int order) | 441 | void set_page_refs(struct page *page, int order) |
442 | { | 442 | { |
443 | #ifdef CONFIG_MMU | 443 | #ifdef CONFIG_MMU |
444 | set_page_count(page, 1); | 444 | set_page_count(page, 1); |
445 | #else | 445 | #else |
446 | int i; | 446 | int i; |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * We need to reference all the pages for this order; otherwise, if | 449 | * We need to reference all the pages for this order; otherwise, if |
450 | * anyone takes and drops a reference on one of the pages (get/put), | 450 | * anyone takes and drops a reference on one of the pages (get/put), |
451 | * it will be freed - eg: by access_process_vm() | 451 | * it will be freed - eg: by access_process_vm() |
452 | */ | 452 | */ |
453 | for (i = 0; i < (1 << order); i++) | 453 | for (i = 0; i < (1 << order); i++) |
454 | set_page_count(page + i, 1); | 454 | set_page_count(page + i, 1); |
455 | #endif /* CONFIG_MMU */ | 455 | #endif /* CONFIG_MMU */ |
456 | } | 456 | } |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * This page is about to be returned from the page allocator | 459 | * This page is about to be returned from the page allocator |
460 | */ | 460 | */ |
461 | static void prep_new_page(struct page *page, int order) | 461 | static void prep_new_page(struct page *page, int order) |
462 | { | 462 | { |
463 | if ( page_mapcount(page) || | 463 | if ( page_mapcount(page) || |
464 | page->mapping != NULL || | 464 | page->mapping != NULL || |
465 | page_count(page) != 0 || | 465 | page_count(page) != 0 || |
466 | (page->flags & ( | 466 | (page->flags & ( |
467 | 1 << PG_lru | | 467 | 1 << PG_lru | |
468 | 1 << PG_private | | 468 | 1 << PG_private | |
469 | 1 << PG_locked | | 469 | 1 << PG_locked | |
470 | 1 << PG_active | | 470 | 1 << PG_active | |
471 | 1 << PG_dirty | | 471 | 1 << PG_dirty | |
472 | 1 << PG_reclaim | | 472 | 1 << PG_reclaim | |
473 | 1 << PG_slab | | 473 | 1 << PG_slab | |
474 | 1 << PG_swapcache | | 474 | 1 << PG_swapcache | |
475 | 1 << PG_writeback | | 475 | 1 << PG_writeback | |
476 | 1 << PG_reserved ))) | 476 | 1 << PG_reserved ))) |
477 | bad_page(__FUNCTION__, page); | 477 | bad_page(__FUNCTION__, page); |
478 | 478 | ||
479 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | 479 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
480 | 1 << PG_referenced | 1 << PG_arch_1 | | 480 | 1 << PG_referenced | 1 << PG_arch_1 | |
481 | 1 << PG_checked | 1 << PG_mappedtodisk); | 481 | 1 << PG_checked | 1 << PG_mappedtodisk); |
482 | set_page_private(page, 0); | 482 | set_page_private(page, 0); |
483 | set_page_refs(page, order); | 483 | set_page_refs(page, order); |
484 | kernel_map_pages(page, 1 << order, 1); | 484 | kernel_map_pages(page, 1 << order, 1); |
485 | } | 485 | } |
486 | 486 | ||
487 | /* | 487 | /* |
488 | * Do the hard work of removing an element from the buddy allocator. | 488 | * Do the hard work of removing an element from the buddy allocator. |
489 | * Call me with the zone->lock already held. | 489 | * Call me with the zone->lock already held. |
490 | */ | 490 | */ |
491 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 491 | static struct page *__rmqueue(struct zone *zone, unsigned int order) |
492 | { | 492 | { |
493 | struct free_area * area; | 493 | struct free_area * area; |
494 | unsigned int current_order; | 494 | unsigned int current_order; |
495 | struct page *page; | 495 | struct page *page; |
496 | 496 | ||
497 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 497 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
498 | area = zone->free_area + current_order; | 498 | area = zone->free_area + current_order; |
499 | if (list_empty(&area->free_list)) | 499 | if (list_empty(&area->free_list)) |
500 | continue; | 500 | continue; |
501 | 501 | ||
502 | page = list_entry(area->free_list.next, struct page, lru); | 502 | page = list_entry(area->free_list.next, struct page, lru); |
503 | list_del(&page->lru); | 503 | list_del(&page->lru); |
504 | rmv_page_order(page); | 504 | rmv_page_order(page); |
505 | area->nr_free--; | 505 | area->nr_free--; |
506 | zone->free_pages -= 1UL << order; | 506 | zone->free_pages -= 1UL << order; |
507 | return expand(zone, page, order, current_order, area); | 507 | return expand(zone, page, order, current_order, area); |
508 | } | 508 | } |
509 | 509 | ||
510 | return NULL; | 510 | return NULL; |
511 | } | 511 | } |
512 | 512 | ||
513 | /* | 513 | /* |
514 | * Obtain a specified number of elements from the buddy allocator, all under | 514 | * Obtain a specified number of elements from the buddy allocator, all under |
515 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 515 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
516 | * Returns the number of new pages which were placed at *list. | 516 | * Returns the number of new pages which were placed at *list. |
517 | */ | 517 | */ |
518 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 518 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
519 | unsigned long count, struct list_head *list) | 519 | unsigned long count, struct list_head *list) |
520 | { | 520 | { |
521 | unsigned long flags; | 521 | unsigned long flags; |
522 | int i; | 522 | int i; |
523 | int allocated = 0; | 523 | int allocated = 0; |
524 | struct page *page; | 524 | struct page *page; |
525 | 525 | ||
526 | spin_lock_irqsave(&zone->lock, flags); | 526 | spin_lock_irqsave(&zone->lock, flags); |
527 | for (i = 0; i < count; ++i) { | 527 | for (i = 0; i < count; ++i) { |
528 | page = __rmqueue(zone, order); | 528 | page = __rmqueue(zone, order); |
529 | if (page == NULL) | 529 | if (page == NULL) |
530 | break; | 530 | break; |
531 | allocated++; | 531 | allocated++; |
532 | list_add_tail(&page->lru, list); | 532 | list_add_tail(&page->lru, list); |
533 | } | 533 | } |
534 | spin_unlock_irqrestore(&zone->lock, flags); | 534 | spin_unlock_irqrestore(&zone->lock, flags); |
535 | return allocated; | 535 | return allocated; |
536 | } | 536 | } |
537 | 537 | ||
538 | #ifdef CONFIG_NUMA | 538 | #ifdef CONFIG_NUMA |
539 | /* Called from the slab reaper to drain remote pagesets */ | 539 | /* Called from the slab reaper to drain remote pagesets */ |
540 | void drain_remote_pages(void) | 540 | void drain_remote_pages(void) |
541 | { | 541 | { |
542 | struct zone *zone; | 542 | struct zone *zone; |
543 | int i; | 543 | int i; |
544 | unsigned long flags; | 544 | unsigned long flags; |
545 | 545 | ||
546 | local_irq_save(flags); | 546 | local_irq_save(flags); |
547 | for_each_zone(zone) { | 547 | for_each_zone(zone) { |
548 | struct per_cpu_pageset *pset; | 548 | struct per_cpu_pageset *pset; |
549 | 549 | ||
550 | /* Do not drain local pagesets */ | 550 | /* Do not drain local pagesets */ |
551 | if (zone->zone_pgdat->node_id == numa_node_id()) | 551 | if (zone->zone_pgdat->node_id == numa_node_id()) |
552 | continue; | 552 | continue; |
553 | 553 | ||
554 | pset = zone->pageset[smp_processor_id()]; | 554 | pset = zone->pageset[smp_processor_id()]; |
555 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 555 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
556 | struct per_cpu_pages *pcp; | 556 | struct per_cpu_pages *pcp; |
557 | 557 | ||
558 | pcp = &pset->pcp[i]; | 558 | pcp = &pset->pcp[i]; |
559 | if (pcp->count) | 559 | if (pcp->count) |
560 | pcp->count -= free_pages_bulk(zone, pcp->count, | 560 | pcp->count -= free_pages_bulk(zone, pcp->count, |
561 | &pcp->list, 0); | 561 | &pcp->list, 0); |
562 | } | 562 | } |
563 | } | 563 | } |
564 | local_irq_restore(flags); | 564 | local_irq_restore(flags); |
565 | } | 565 | } |
566 | #endif | 566 | #endif |
567 | 567 | ||
568 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 568 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) |
569 | static void __drain_pages(unsigned int cpu) | 569 | static void __drain_pages(unsigned int cpu) |
570 | { | 570 | { |
571 | struct zone *zone; | 571 | struct zone *zone; |
572 | int i; | 572 | int i; |
573 | 573 | ||
574 | for_each_zone(zone) { | 574 | for_each_zone(zone) { |
575 | struct per_cpu_pageset *pset; | 575 | struct per_cpu_pageset *pset; |
576 | 576 | ||
577 | pset = zone_pcp(zone, cpu); | 577 | pset = zone_pcp(zone, cpu); |
578 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 578 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
579 | struct per_cpu_pages *pcp; | 579 | struct per_cpu_pages *pcp; |
580 | 580 | ||
581 | pcp = &pset->pcp[i]; | 581 | pcp = &pset->pcp[i]; |
582 | pcp->count -= free_pages_bulk(zone, pcp->count, | 582 | pcp->count -= free_pages_bulk(zone, pcp->count, |
583 | &pcp->list, 0); | 583 | &pcp->list, 0); |
584 | } | 584 | } |
585 | } | 585 | } |
586 | } | 586 | } |
587 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | 587 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ |
588 | 588 | ||
589 | #ifdef CONFIG_PM | 589 | #ifdef CONFIG_PM |
590 | 590 | ||
591 | void mark_free_pages(struct zone *zone) | 591 | void mark_free_pages(struct zone *zone) |
592 | { | 592 | { |
593 | unsigned long zone_pfn, flags; | 593 | unsigned long zone_pfn, flags; |
594 | int order; | 594 | int order; |
595 | struct list_head *curr; | 595 | struct list_head *curr; |
596 | 596 | ||
597 | if (!zone->spanned_pages) | 597 | if (!zone->spanned_pages) |
598 | return; | 598 | return; |
599 | 599 | ||
600 | spin_lock_irqsave(&zone->lock, flags); | 600 | spin_lock_irqsave(&zone->lock, flags); |
601 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 601 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
602 | ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); | 602 | ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); |
603 | 603 | ||
604 | for (order = MAX_ORDER - 1; order >= 0; --order) | 604 | for (order = MAX_ORDER - 1; order >= 0; --order) |
605 | list_for_each(curr, &zone->free_area[order].free_list) { | 605 | list_for_each(curr, &zone->free_area[order].free_list) { |
606 | unsigned long start_pfn, i; | 606 | unsigned long start_pfn, i; |
607 | 607 | ||
608 | start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 608 | start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
609 | 609 | ||
610 | for (i=0; i < (1<<order); i++) | 610 | for (i=0; i < (1<<order); i++) |
611 | SetPageNosaveFree(pfn_to_page(start_pfn+i)); | 611 | SetPageNosaveFree(pfn_to_page(start_pfn+i)); |
612 | } | 612 | } |
613 | spin_unlock_irqrestore(&zone->lock, flags); | 613 | spin_unlock_irqrestore(&zone->lock, flags); |
614 | } | 614 | } |
615 | 615 | ||
616 | /* | 616 | /* |
617 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 617 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
618 | */ | 618 | */ |
619 | void drain_local_pages(void) | 619 | void drain_local_pages(void) |
620 | { | 620 | { |
621 | unsigned long flags; | 621 | unsigned long flags; |
622 | 622 | ||
623 | local_irq_save(flags); | 623 | local_irq_save(flags); |
624 | __drain_pages(smp_processor_id()); | 624 | __drain_pages(smp_processor_id()); |
625 | local_irq_restore(flags); | 625 | local_irq_restore(flags); |
626 | } | 626 | } |
627 | #endif /* CONFIG_PM */ | 627 | #endif /* CONFIG_PM */ |
628 | 628 | ||
629 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) | 629 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) |
630 | { | 630 | { |
631 | #ifdef CONFIG_NUMA | 631 | #ifdef CONFIG_NUMA |
632 | unsigned long flags; | 632 | unsigned long flags; |
633 | int cpu; | 633 | int cpu; |
634 | pg_data_t *pg = z->zone_pgdat; | 634 | pg_data_t *pg = z->zone_pgdat; |
635 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 635 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; |
636 | struct per_cpu_pageset *p; | 636 | struct per_cpu_pageset *p; |
637 | 637 | ||
638 | local_irq_save(flags); | 638 | local_irq_save(flags); |
639 | cpu = smp_processor_id(); | 639 | cpu = smp_processor_id(); |
640 | p = zone_pcp(z,cpu); | 640 | p = zone_pcp(z,cpu); |
641 | if (pg == orig) { | 641 | if (pg == orig) { |
642 | p->numa_hit++; | 642 | p->numa_hit++; |
643 | } else { | 643 | } else { |
644 | p->numa_miss++; | 644 | p->numa_miss++; |
645 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; | 645 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; |
646 | } | 646 | } |
647 | if (pg == NODE_DATA(numa_node_id())) | 647 | if (pg == NODE_DATA(numa_node_id())) |
648 | p->local_node++; | 648 | p->local_node++; |
649 | else | 649 | else |
650 | p->other_node++; | 650 | p->other_node++; |
651 | local_irq_restore(flags); | 651 | local_irq_restore(flags); |
652 | #endif | 652 | #endif |
653 | } | 653 | } |
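In other words, taking the code above at face value: on a two-node box, an allocation whose preferred (first) zonelist entry is on node 0 but which is satisfied from node 1 bumps numa_miss on the satisfying zone's pageset and numa_foreign on the preferred zone's; had both been on the same node it would have counted as numa_hit. Independently, local_node vs. other_node records whether the satisfying zone belongs to the node of the CPU doing the allocating.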
654 | 654 | ||
655 | /* | 655 | /* |
656 | * Free a 0-order page | 656 | * Free a 0-order page |
657 | */ | 657 | */ |
658 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | 658 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); |
659 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 659 | static void fastcall free_hot_cold_page(struct page *page, int cold) |
660 | { | 660 | { |
661 | struct zone *zone = page_zone(page); | 661 | struct zone *zone = page_zone(page); |
662 | struct per_cpu_pages *pcp; | 662 | struct per_cpu_pages *pcp; |
663 | unsigned long flags; | 663 | unsigned long flags; |
664 | 664 | ||
665 | arch_free_page(page, 0); | 665 | arch_free_page(page, 0); |
666 | 666 | ||
667 | kernel_map_pages(page, 1, 0); | 667 | kernel_map_pages(page, 1, 0); |
668 | inc_page_state(pgfree); | 668 | inc_page_state(pgfree); |
669 | if (PageAnon(page)) | 669 | if (PageAnon(page)) |
670 | page->mapping = NULL; | 670 | page->mapping = NULL; |
671 | free_pages_check(__FUNCTION__, page); | 671 | free_pages_check(__FUNCTION__, page); |
672 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 672 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
673 | local_irq_save(flags); | 673 | local_irq_save(flags); |
674 | list_add(&page->lru, &pcp->list); | 674 | list_add(&page->lru, &pcp->list); |
675 | pcp->count++; | 675 | pcp->count++; |
676 | if (pcp->count >= pcp->high) | 676 | if (pcp->count >= pcp->high) |
677 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 677 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
678 | local_irq_restore(flags); | 678 | local_irq_restore(flags); |
679 | put_cpu(); | 679 | put_cpu(); |
680 | } | 680 | } |
681 | 681 | ||
682 | void fastcall free_hot_page(struct page *page) | 682 | void fastcall free_hot_page(struct page *page) |
683 | { | 683 | { |
684 | free_hot_cold_page(page, 0); | 684 | free_hot_cold_page(page, 0); |
685 | } | 685 | } |
686 | 686 | ||
687 | void fastcall free_cold_page(struct page *page) | 687 | void fastcall free_cold_page(struct page *page) |
688 | { | 688 | { |
689 | free_hot_cold_page(page, 1); | 689 | free_hot_cold_page(page, 1); |
690 | } | 690 | } |
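A numeric illustration of the per-cpu thresholds above (values invented for the example): with pcp->high = 6 and pcp->batch = 2, freed order-0 pages simply accumulate on the per-cpu list until the count reaches 6; at that point free_pages_bulk() returns 2 of them to the buddy lists under a single zone->lock acquisition, leaving 4 cached for cheap reuse by the next allocations on that CPU.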
691 | 691 | ||
692 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 692 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
693 | { | 693 | { |
694 | int i; | 694 | int i; |
695 | 695 | ||
696 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 696 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); |
697 | for(i = 0; i < (1 << order); i++) | 697 | for(i = 0; i < (1 << order); i++) |
698 | clear_highpage(page + i); | 698 | clear_highpage(page + i); |
699 | } | 699 | } |
700 | 700 | ||
701 | /* | 701 | /* |
702 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 702 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
703 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 703 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
704 | * or two. | 704 | * or two. |
705 | */ | 705 | */ |
706 | static struct page * | 706 | static struct page * |
707 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) | 707 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) |
708 | { | 708 | { |
709 | unsigned long flags; | 709 | unsigned long flags; |
710 | struct page *page = NULL; | 710 | struct page *page = NULL; |
711 | int cold = !!(gfp_flags & __GFP_COLD); | 711 | int cold = !!(gfp_flags & __GFP_COLD); |
712 | 712 | ||
713 | if (order == 0) { | 713 | if (order == 0) { |
714 | struct per_cpu_pages *pcp; | 714 | struct per_cpu_pages *pcp; |
715 | 715 | ||
716 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 716 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
717 | local_irq_save(flags); | 717 | local_irq_save(flags); |
718 | if (pcp->count <= pcp->low) | 718 | if (pcp->count <= pcp->low) |
719 | pcp->count += rmqueue_bulk(zone, 0, | 719 | pcp->count += rmqueue_bulk(zone, 0, |
720 | pcp->batch, &pcp->list); | 720 | pcp->batch, &pcp->list); |
721 | if (pcp->count) { | 721 | if (pcp->count) { |
722 | page = list_entry(pcp->list.next, struct page, lru); | 722 | page = list_entry(pcp->list.next, struct page, lru); |
723 | list_del(&page->lru); | 723 | list_del(&page->lru); |
724 | pcp->count--; | 724 | pcp->count--; |
725 | } | 725 | } |
726 | local_irq_restore(flags); | 726 | local_irq_restore(flags); |
727 | put_cpu(); | 727 | put_cpu(); |
728 | } | 728 | } |
729 | 729 | ||
730 | if (page == NULL) { | 730 | if (page == NULL) { |
731 | spin_lock_irqsave(&zone->lock, flags); | 731 | spin_lock_irqsave(&zone->lock, flags); |
732 | page = __rmqueue(zone, order); | 732 | page = __rmqueue(zone, order); |
733 | spin_unlock_irqrestore(&zone->lock, flags); | 733 | spin_unlock_irqrestore(&zone->lock, flags); |
734 | } | 734 | } |
735 | 735 | ||
736 | if (page != NULL) { | 736 | if (page != NULL) { |
737 | BUG_ON(bad_range(zone, page)); | 737 | BUG_ON(bad_range(zone, page)); |
738 | mod_page_state_zone(zone, pgalloc, 1 << order); | 738 | mod_page_state_zone(zone, pgalloc, 1 << order); |
739 | prep_new_page(page, order); | 739 | prep_new_page(page, order); |
740 | 740 | ||
741 | if (gfp_flags & __GFP_ZERO) | 741 | if (gfp_flags & __GFP_ZERO) |
742 | prep_zero_page(page, order, gfp_flags); | 742 | prep_zero_page(page, order, gfp_flags); |
743 | 743 | ||
744 | if (order && (gfp_flags & __GFP_COMP)) | 744 | if (order && (gfp_flags & __GFP_COMP)) |
745 | prep_compound_page(page, order); | 745 | prep_compound_page(page, order); |
746 | } | 746 | } |
747 | return page; | 747 | return page; |
748 | } | 748 | } |
749 | 749 | ||
750 | /* | 750 | /* |
751 | * Return 1 if free pages are above 'mark'. This takes into account the order | 751 | * Return 1 if free pages are above 'mark'. This takes into account the order |
752 | * of the allocation. | 752 | * of the allocation. |
753 | */ | 753 | */ |
754 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 754 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
755 | int classzone_idx, int can_try_harder, gfp_t gfp_high) | 755 | int classzone_idx, int can_try_harder, gfp_t gfp_high) |
756 | { | 756 | { |
757 | /* free_pages may go negative - that's OK */ | 757 | /* free_pages may go negative - that's OK */ |
758 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; | 758 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; |
759 | int o; | 759 | int o; |
760 | 760 | ||
761 | if (gfp_high) | 761 | if (gfp_high) |
762 | min -= min / 2; | 762 | min -= min / 2; |
763 | if (can_try_harder) | 763 | if (can_try_harder) |
764 | min -= min / 4; | 764 | min -= min / 4; |
765 | 765 | ||
766 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 766 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
767 | return 0; | 767 | return 0; |
768 | for (o = 0; o < order; o++) { | 768 | for (o = 0; o < order; o++) { |
769 | /* At the next order, this order's pages become unavailable */ | 769 | /* At the next order, this order's pages become unavailable */ |
770 | free_pages -= z->free_area[o].nr_free << o; | 770 | free_pages -= z->free_area[o].nr_free << o; |
771 | 771 | ||
772 | /* Require fewer higher order pages to be free */ | 772 | /* Require fewer higher order pages to be free */ |
773 | min >>= 1; | 773 | min >>= 1; |
774 | 774 | ||
775 | if (free_pages <= min) | 775 | if (free_pages <= min) |
776 | return 0; | 776 | return 0; |
777 | } | 777 | } |
778 | return 1; | 778 | return 1; |
779 | } | 779 | } |
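A worked pass through the check above, with invented numbers: mark = 64, order = 2, z->free_pages = 120, no __GFP_HIGH or can_try_harder, and a zero lowmem_reserve. Then free_pages = 120 - 4 + 1 = 117 against min = 64, so the base check passes. At o = 0, with 80 free singletons, free_pages drops to 37 while min halves to 32: still OK. At o = 1, with 12 free order-1 blocks, free_pages drops by 24 to 13 while min halves to 16, and the function returns 0 - plenty of memory overall, but too little of it in blocks of order >= 2.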
780 | 780 | ||
781 | static inline int | 781 | static inline int |
782 | should_reclaim_zone(struct zone *z, gfp_t gfp_mask) | 782 | should_reclaim_zone(struct zone *z, gfp_t gfp_mask) |
783 | { | 783 | { |
784 | if (!z->reclaim_pages) | 784 | if (!z->reclaim_pages) |
785 | return 0; | 785 | return 0; |
786 | if (gfp_mask & __GFP_NORECLAIM) | 786 | if (gfp_mask & __GFP_NORECLAIM) |
787 | return 0; | 787 | return 0; |
788 | return 1; | 788 | return 1; |
789 | } | 789 | } |
790 | 790 | ||
791 | /* | 791 | /* |
792 | * This is the 'heart' of the zoned buddy allocator. | 792 | * This is the 'heart' of the zoned buddy allocator. |
793 | */ | 793 | */ |
794 | struct page * fastcall | 794 | struct page * fastcall |
795 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | 795 | __alloc_pages(gfp_t gfp_mask, unsigned int order, |
796 | struct zonelist *zonelist) | 796 | struct zonelist *zonelist) |
797 | { | 797 | { |
798 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 798 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
799 | struct zone **zones, *z; | 799 | struct zone **zones, *z; |
800 | struct page *page; | 800 | struct page *page; |
801 | struct reclaim_state reclaim_state; | 801 | struct reclaim_state reclaim_state; |
802 | struct task_struct *p = current; | 802 | struct task_struct *p = current; |
803 | int i; | 803 | int i; |
804 | int classzone_idx; | 804 | int classzone_idx; |
805 | int do_retry; | 805 | int do_retry; |
806 | int can_try_harder; | 806 | int can_try_harder; |
807 | int did_some_progress; | 807 | int did_some_progress; |
808 | 808 | ||
809 | might_sleep_if(wait); | 809 | might_sleep_if(wait); |
810 | 810 | ||
811 | /* | 811 | /* |
812 | * The caller may dip into page reserves a bit more if the caller | 812 | * The caller may dip into page reserves a bit more if the caller |
813 | * cannot run direct reclaim, or if the caller has realtime scheduling | 813 | * cannot run direct reclaim, or if the caller has realtime scheduling |
814 | * policy. | 814 | * policy. |
815 | */ | 815 | */ |
816 | can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; | 816 | can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; |
817 | 817 | ||
818 | zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 818 | zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
819 | 819 | ||
820 | if (unlikely(zones[0] == NULL)) { | 820 | if (unlikely(zones[0] == NULL)) { |
821 | /* Should this ever happen?? */ | 821 | /* Should this ever happen?? */ |
822 | return NULL; | 822 | return NULL; |
823 | } | 823 | } |
824 | 824 | ||
825 | classzone_idx = zone_idx(zones[0]); | 825 | classzone_idx = zone_idx(zones[0]); |
826 | 826 | ||
827 | restart: | 827 | restart: |
828 | /* | 828 | /* |
829 | * Go through the zonelist once, looking for a zone with enough free. | 829 | * Go through the zonelist once, looking for a zone with enough free. |
830 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 830 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
831 | */ | 831 | */ |
832 | for (i = 0; (z = zones[i]) != NULL; i++) { | 832 | for (i = 0; (z = zones[i]) != NULL; i++) { |
833 | int do_reclaim = should_reclaim_zone(z, gfp_mask); | 833 | int do_reclaim = should_reclaim_zone(z, gfp_mask); |
834 | 834 | ||
835 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) | 835 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) |
836 | continue; | 836 | continue; |
837 | 837 | ||
838 | /* | 838 | /* |
839 | * If the zone is to attempt early page reclaim then this loop | 839 | * If the zone is to attempt early page reclaim then this loop |
840 | * will try to reclaim pages and check the watermark a second | 840 | * will try to reclaim pages and check the watermark a second |
841 | * time before giving up and falling back to the next zone. | 841 | * time before giving up and falling back to the next zone. |
842 | */ | 842 | */ |
843 | zone_reclaim_retry: | 843 | zone_reclaim_retry: |
844 | if (!zone_watermark_ok(z, order, z->pages_low, | 844 | if (!zone_watermark_ok(z, order, z->pages_low, |
845 | classzone_idx, 0, 0)) { | 845 | classzone_idx, 0, 0)) { |
846 | if (!do_reclaim) | 846 | if (!do_reclaim) |
847 | continue; | 847 | continue; |
848 | else { | 848 | else { |
849 | zone_reclaim(z, gfp_mask, order); | 849 | zone_reclaim(z, gfp_mask, order); |
850 | /* Only try reclaim once */ | 850 | /* Only try reclaim once */ |
851 | do_reclaim = 0; | 851 | do_reclaim = 0; |
852 | goto zone_reclaim_retry; | 852 | goto zone_reclaim_retry; |
853 | } | 853 | } |
854 | } | 854 | } |
855 | 855 | ||
856 | page = buffered_rmqueue(z, order, gfp_mask); | 856 | page = buffered_rmqueue(z, order, gfp_mask); |
857 | if (page) | 857 | if (page) |
858 | goto got_pg; | 858 | goto got_pg; |
859 | } | 859 | } |
860 | 860 | ||
861 | for (i = 0; (z = zones[i]) != NULL; i++) | 861 | for (i = 0; (z = zones[i]) != NULL; i++) |
862 | wakeup_kswapd(z, order); | 862 | wakeup_kswapd(z, order); |
863 | 863 | ||
864 | /* | 864 | /* |
865 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 865 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
866 | * coming from realtime tasks go deeper into reserves | 866 | * coming from realtime tasks go deeper into reserves |
867 | * | 867 | * |
868 | * This is the last chance, in general, before the goto nopage. | 868 | * This is the last chance, in general, before the goto nopage. |
869 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 869 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
870 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 870 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
871 | */ | 871 | */ |
872 | for (i = 0; (z = zones[i]) != NULL; i++) { | 872 | for (i = 0; (z = zones[i]) != NULL; i++) { |
873 | if (!zone_watermark_ok(z, order, z->pages_min, | 873 | if (!zone_watermark_ok(z, order, z->pages_min, |
874 | classzone_idx, can_try_harder, | 874 | classzone_idx, can_try_harder, |
875 | gfp_mask & __GFP_HIGH)) | 875 | gfp_mask & __GFP_HIGH)) |
876 | continue; | 876 | continue; |
877 | 877 | ||
878 | if (wait && !cpuset_zone_allowed(z, gfp_mask)) | 878 | if (wait && !cpuset_zone_allowed(z, gfp_mask)) |
879 | continue; | 879 | continue; |
880 | 880 | ||
881 | page = buffered_rmqueue(z, order, gfp_mask); | 881 | page = buffered_rmqueue(z, order, gfp_mask); |
882 | if (page) | 882 | if (page) |
883 | goto got_pg; | 883 | goto got_pg; |
884 | } | 884 | } |
885 | 885 | ||
886 | /* This allocation should allow future memory freeing. */ | 886 | /* This allocation should allow future memory freeing. */ |
887 | 887 | ||
888 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 888 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
889 | && !in_interrupt()) { | 889 | && !in_interrupt()) { |
890 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 890 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
891 | /* go through the zonelist yet again, ignoring mins */ | 891 | /* go through the zonelist yet again, ignoring mins */ |
892 | for (i = 0; (z = zones[i]) != NULL; i++) { | 892 | for (i = 0; (z = zones[i]) != NULL; i++) { |
893 | if (!cpuset_zone_allowed(z, gfp_mask)) | 893 | if (!cpuset_zone_allowed(z, gfp_mask)) |
894 | continue; | 894 | continue; |
895 | page = buffered_rmqueue(z, order, gfp_mask); | 895 | page = buffered_rmqueue(z, order, gfp_mask); |
896 | if (page) | 896 | if (page) |
897 | goto got_pg; | 897 | goto got_pg; |
898 | } | 898 | } |
899 | } | 899 | } |
900 | goto nopage; | 900 | goto nopage; |
901 | } | 901 | } |
902 | 902 | ||
903 | /* Atomic allocations - we can't balance anything */ | 903 | /* Atomic allocations - we can't balance anything */ |
904 | if (!wait) | 904 | if (!wait) |
905 | goto nopage; | 905 | goto nopage; |
906 | 906 | ||
907 | rebalance: | 907 | rebalance: |
908 | cond_resched(); | 908 | cond_resched(); |
909 | 909 | ||
910 | /* We now go into synchronous reclaim */ | 910 | /* We now go into synchronous reclaim */ |
911 | p->flags |= PF_MEMALLOC; | 911 | p->flags |= PF_MEMALLOC; |
912 | reclaim_state.reclaimed_slab = 0; | 912 | reclaim_state.reclaimed_slab = 0; |
913 | p->reclaim_state = &reclaim_state; | 913 | p->reclaim_state = &reclaim_state; |
914 | 914 | ||
915 | did_some_progress = try_to_free_pages(zones, gfp_mask); | 915 | did_some_progress = try_to_free_pages(zones, gfp_mask); |
916 | 916 | ||
917 | p->reclaim_state = NULL; | 917 | p->reclaim_state = NULL; |
918 | p->flags &= ~PF_MEMALLOC; | 918 | p->flags &= ~PF_MEMALLOC; |
919 | 919 | ||
920 | cond_resched(); | 920 | cond_resched(); |
921 | 921 | ||
922 | if (likely(did_some_progress)) { | 922 | if (likely(did_some_progress)) { |
923 | for (i = 0; (z = zones[i]) != NULL; i++) { | 923 | for (i = 0; (z = zones[i]) != NULL; i++) { |
924 | if (!zone_watermark_ok(z, order, z->pages_min, | 924 | if (!zone_watermark_ok(z, order, z->pages_min, |
925 | classzone_idx, can_try_harder, | 925 | classzone_idx, can_try_harder, |
926 | gfp_mask & __GFP_HIGH)) | 926 | gfp_mask & __GFP_HIGH)) |
927 | continue; | 927 | continue; |
928 | 928 | ||
929 | if (!cpuset_zone_allowed(z, gfp_mask)) | 929 | if (!cpuset_zone_allowed(z, gfp_mask)) |
930 | continue; | 930 | continue; |
931 | 931 | ||
932 | page = buffered_rmqueue(z, order, gfp_mask); | 932 | page = buffered_rmqueue(z, order, gfp_mask); |
933 | if (page) | 933 | if (page) |
934 | goto got_pg; | 934 | goto got_pg; |
935 | } | 935 | } |
936 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 936 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
937 | /* | 937 | /* |
938 | * Go through the zonelist one more time, keeping a | 938 | * Go through the zonelist one more time, keeping a |
939 | * very high watermark; this only catches a parallel | 939 | * very high watermark; this only catches a parallel |
940 | * OOM kill, and we must fail if we're still under | 940 | * OOM kill, and we must fail if we're still under |
941 | * heavy pressure. | 941 | * heavy pressure. |
942 | */ | 942 | */ |
943 | for (i = 0; (z = zones[i]) != NULL; i++) { | 943 | for (i = 0; (z = zones[i]) != NULL; i++) { |
944 | if (!zone_watermark_ok(z, order, z->pages_high, | 944 | if (!zone_watermark_ok(z, order, z->pages_high, |
945 | classzone_idx, 0, 0)) | 945 | classzone_idx, 0, 0)) |
946 | continue; | 946 | continue; |
947 | 947 | ||
948 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) | 948 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) |
949 | continue; | 949 | continue; |
950 | 950 | ||
951 | page = buffered_rmqueue(z, order, gfp_mask); | 951 | page = buffered_rmqueue(z, order, gfp_mask); |
952 | if (page) | 952 | if (page) |
953 | goto got_pg; | 953 | goto got_pg; |
954 | } | 954 | } |
955 | 955 | ||
956 | out_of_memory(gfp_mask, order); | 956 | out_of_memory(gfp_mask, order); |
957 | goto restart; | 957 | goto restart; |
958 | } | 958 | } |
959 | 959 | ||
960 | /* | 960 | /* |
961 | * Don't let big-order allocations loop unless the caller explicitly | 961 | * Don't let big-order allocations loop unless the caller explicitly |
962 | * requests that. Wait for some write requests to complete then retry. | 962 | * requests that. Wait for some write requests to complete then retry. |
963 | * | 963 | * |
964 | * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order | 964 | * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order |
965 | * <= 3, but that may not be true in other implementations. | 965 | * <= 3, but that may not be true in other implementations. |
966 | */ | 966 | */ |
967 | do_retry = 0; | 967 | do_retry = 0; |
968 | if (!(gfp_mask & __GFP_NORETRY)) { | 968 | if (!(gfp_mask & __GFP_NORETRY)) { |
969 | if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) | 969 | if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) |
970 | do_retry = 1; | 970 | do_retry = 1; |
971 | if (gfp_mask & __GFP_NOFAIL) | 971 | if (gfp_mask & __GFP_NOFAIL) |
972 | do_retry = 1; | 972 | do_retry = 1; |
973 | } | 973 | } |
974 | if (do_retry) { | 974 | if (do_retry) { |
975 | blk_congestion_wait(WRITE, HZ/50); | 975 | blk_congestion_wait(WRITE, HZ/50); |
976 | goto rebalance; | 976 | goto rebalance; |
977 | } | 977 | } |
978 | 978 | ||
979 | nopage: | 979 | nopage: |
980 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 980 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
981 | printk(KERN_WARNING "%s: page allocation failure." | 981 | printk(KERN_WARNING "%s: page allocation failure." |
982 | " order:%d, mode:0x%x\n", | 982 | " order:%d, mode:0x%x\n", |
983 | p->comm, order, gfp_mask); | 983 | p->comm, order, gfp_mask); |
984 | dump_stack(); | 984 | dump_stack(); |
985 | show_mem(); | 985 | show_mem(); |
986 | } | 986 | } |
987 | return NULL; | 987 | return NULL; |
988 | got_pg: | 988 | got_pg: |
989 | zone_statistics(zonelist, z); | 989 | zone_statistics(zonelist, z); |
990 | return page; | 990 | return page; |
991 | } | 991 | } |
992 | 992 | ||
993 | EXPORT_SYMBOL(__alloc_pages); | 993 | EXPORT_SYMBOL(__alloc_pages); |
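The retry decision above reduces to a small predicate: never loop when __GFP_NORETRY is set; otherwise loop for order <= 3, or when __GFP_REPEAT or __GFP_NOFAIL asks for it. A stand-alone C model of that logic (the flag bit positions here are invented for illustration, not the real gfp.h values):

    #include <stdio.h>

    #define __GFP_NORETRY (1u << 0)     /* invented bit positions */
    #define __GFP_REPEAT  (1u << 1)
    #define __GFP_NOFAIL  (1u << 2)

    static int should_retry(unsigned int gfp_mask, unsigned int order)
    {
            if (gfp_mask & __GFP_NORETRY)
                    return 0;           /* caller opted out of looping */
            if (order <= 3)
                    return 1;           /* small orders retry implicitly */
            if (gfp_mask & (__GFP_REPEAT | __GFP_NOFAIL))
                    return 1;           /* caller asked to keep trying */
            return 0;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   should_retry(0, 3),                  /* 1 */
                   should_retry(0, 5),                  /* 0 */
                   should_retry(__GFP_REPEAT, 5));      /* 1 */
            return 0;
    }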
994 | 994 | ||
995 | /* | 995 | /* |
996 | * Common helper functions. | 996 | * Common helper functions. |
997 | */ | 997 | */ |
998 | fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 998 | fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
999 | { | 999 | { |
1000 | struct page * page; | 1000 | struct page * page; |
1001 | page = alloc_pages(gfp_mask, order); | 1001 | page = alloc_pages(gfp_mask, order); |
1002 | if (!page) | 1002 | if (!page) |
1003 | return 0; | 1003 | return 0; |
1004 | return (unsigned long) page_address(page); | 1004 | return (unsigned long) page_address(page); |
1005 | } | 1005 | } |
1006 | 1006 | ||
1007 | EXPORT_SYMBOL(__get_free_pages); | 1007 | EXPORT_SYMBOL(__get_free_pages); |
1008 | 1008 | ||
1009 | fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) | 1009 | fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) |
1010 | { | 1010 | { |
1011 | struct page * page; | 1011 | struct page * page; |
1012 | 1012 | ||
1013 | /* | 1013 | /* |
1014 | * get_zeroed_page() returns a 32-bit address, which cannot represent | 1014 | * get_zeroed_page() returns a 32-bit address, which cannot represent |
1015 | * a highmem page | 1015 | * a highmem page |
1016 | */ | 1016 | */ |
1017 | BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 1017 | BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
1018 | 1018 | ||
1019 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 1019 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); |
1020 | if (page) | 1020 | if (page) |
1021 | return (unsigned long) page_address(page); | 1021 | return (unsigned long) page_address(page); |
1022 | return 0; | 1022 | return 0; |
1023 | } | 1023 | } |
1024 | 1024 | ||
1025 | EXPORT_SYMBOL(get_zeroed_page); | 1025 | EXPORT_SYMBOL(get_zeroed_page); |
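A sketch of the two helpers in use, written as a hypothetical module (the module name and error handling are ours; only the allocator calls come from the code above). get_zeroed_page() hands back a zeroed lowmem page, __get_free_pages() an order-1 (two-page) run; both are returned with free_pages():

    #include <linux/module.h>
    #include <linux/mm.h>
    #include <linux/errno.h>

    static int __init helper_demo_init(void)
    {
            unsigned long zeroed = get_zeroed_page(GFP_KERNEL);
            unsigned long run = __get_free_pages(GFP_KERNEL, 1);

            if (!zeroed || !run)
                    goto out;
            /* ... use the pages ... */
    out:
            if (run)
                    free_pages(run, 1);
            if (zeroed)
                    free_pages(zeroed, 0);
            return (zeroed && run) ? 0 : -ENOMEM;
    }

    static void __exit helper_demo_exit(void)
    {
    }

    module_init(helper_demo_init);
    module_exit(helper_demo_exit);
    MODULE_LICENSE("GPL");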
1026 | 1026 | ||
1027 | void __pagevec_free(struct pagevec *pvec) | 1027 | void __pagevec_free(struct pagevec *pvec) |
1028 | { | 1028 | { |
1029 | int i = pagevec_count(pvec); | 1029 | int i = pagevec_count(pvec); |
1030 | 1030 | ||
1031 | while (--i >= 0) | 1031 | while (--i >= 0) |
1032 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 1032 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
1033 | } | 1033 | } |
1034 | 1034 | ||
1035 | fastcall void __free_pages(struct page *page, unsigned int order) | 1035 | fastcall void __free_pages(struct page *page, unsigned int order) |
1036 | { | 1036 | { |
1037 | if (put_page_testzero(page)) { | 1037 | if (put_page_testzero(page)) { |
1038 | if (order == 0) | 1038 | if (order == 0) |
1039 | free_hot_page(page); | 1039 | free_hot_page(page); |
1040 | else | 1040 | else |
1041 | __free_pages_ok(page, order); | 1041 | __free_pages_ok(page, order); |
1042 | } | 1042 | } |
1043 | } | 1043 | } |
1044 | 1044 | ||
1045 | EXPORT_SYMBOL(__free_pages); | 1045 | EXPORT_SYMBOL(__free_pages); |
1046 | 1046 | ||
1047 | fastcall void free_pages(unsigned long addr, unsigned int order) | 1047 | fastcall void free_pages(unsigned long addr, unsigned int order) |
1048 | { | 1048 | { |
1049 | if (addr != 0) { | 1049 | if (addr != 0) { |
1050 | BUG_ON(!virt_addr_valid((void *)addr)); | 1050 | BUG_ON(!virt_addr_valid((void *)addr)); |
1051 | __free_pages(virt_to_page((void *)addr), order); | 1051 | __free_pages(virt_to_page((void *)addr), order); |
1052 | } | 1052 | } |
1053 | } | 1053 | } |
1054 | 1054 | ||
1055 | EXPORT_SYMBOL(free_pages); | 1055 | EXPORT_SYMBOL(free_pages); |
1056 | 1056 | ||
1057 | /* | 1057 | /* |
1058 | * Total amount of free (allocatable) RAM: | 1058 | * Total amount of free (allocatable) RAM: |
1059 | */ | 1059 | */ |
1060 | unsigned int nr_free_pages(void) | 1060 | unsigned int nr_free_pages(void) |
1061 | { | 1061 | { |
1062 | unsigned int sum = 0; | 1062 | unsigned int sum = 0; |
1063 | struct zone *zone; | 1063 | struct zone *zone; |
1064 | 1064 | ||
1065 | for_each_zone(zone) | 1065 | for_each_zone(zone) |
1066 | sum += zone->free_pages; | 1066 | sum += zone->free_pages; |
1067 | 1067 | ||
1068 | return sum; | 1068 | return sum; |
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | EXPORT_SYMBOL(nr_free_pages); | 1071 | EXPORT_SYMBOL(nr_free_pages); |
1072 | 1072 | ||
1073 | #ifdef CONFIG_NUMA | 1073 | #ifdef CONFIG_NUMA |
1074 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) | 1074 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) |
1075 | { | 1075 | { |
1076 | unsigned int i, sum = 0; | 1076 | unsigned int i, sum = 0; |
1077 | 1077 | ||
1078 | for (i = 0; i < MAX_NR_ZONES; i++) | 1078 | for (i = 0; i < MAX_NR_ZONES; i++) |
1079 | sum += pgdat->node_zones[i].free_pages; | 1079 | sum += pgdat->node_zones[i].free_pages; |
1080 | 1080 | ||
1081 | return sum; | 1081 | return sum; |
1082 | } | 1082 | } |
1083 | #endif | 1083 | #endif |
1084 | 1084 | ||
1085 | static unsigned int nr_free_zone_pages(int offset) | 1085 | static unsigned int nr_free_zone_pages(int offset) |
1086 | { | 1086 | { |
1087 | /* Just pick one node, since fallback list is circular */ | 1087 | /* Just pick one node, since fallback list is circular */ |
1088 | pg_data_t *pgdat = NODE_DATA(numa_node_id()); | 1088 | pg_data_t *pgdat = NODE_DATA(numa_node_id()); |
1089 | unsigned int sum = 0; | 1089 | unsigned int sum = 0; |
1090 | 1090 | ||
1091 | struct zonelist *zonelist = pgdat->node_zonelists + offset; | 1091 | struct zonelist *zonelist = pgdat->node_zonelists + offset; |
1092 | struct zone **zonep = zonelist->zones; | 1092 | struct zone **zonep = zonelist->zones; |
1093 | struct zone *zone; | 1093 | struct zone *zone; |
1094 | 1094 | ||
1095 | for (zone = *zonep++; zone; zone = *zonep++) { | 1095 | for (zone = *zonep++; zone; zone = *zonep++) { |
1096 | unsigned long size = zone->present_pages; | 1096 | unsigned long size = zone->present_pages; |
1097 | unsigned long high = zone->pages_high; | 1097 | unsigned long high = zone->pages_high; |
1098 | if (size > high) | 1098 | if (size > high) |
1099 | sum += size - high; | 1099 | sum += size - high; |
1100 | } | 1100 | } |
1101 | 1101 | ||
1102 | return sum; | 1102 | return sum; |
1103 | } | 1103 | } |
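Worked example of the sum above, with invented zone sizes: allocatable memory is whatever each zone holds beyond its pages_high watermark.

    #include <stdio.h>

    struct zone_model { unsigned long present_pages, pages_high; };

    int main(void)
    {
            struct zone_model zones[] = {
                    { 4096,   128 },    /* a DMA-sized zone */
                    { 229376, 512 },    /* a Normal-sized zone */
            };
            unsigned long sum = 0;
            int i;

            for (i = 0; i < 2; i++)
                    if (zones[i].present_pages > zones[i].pages_high)
                            sum += zones[i].present_pages - zones[i].pages_high;
            printf("allocatable: %lu pages\n", sum);    /* 3968 + 228864 */
            return 0;
    }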
1104 | 1104 | ||
1105 | /* | 1105 | /* |
1106 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 1106 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL |
1107 | */ | 1107 | */ |
1108 | unsigned int nr_free_buffer_pages(void) | 1108 | unsigned int nr_free_buffer_pages(void) |
1109 | { | 1109 | { |
1110 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 1110 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
1111 | } | 1111 | } |
1112 | 1112 | ||
1113 | /* | 1113 | /* |
1114 | * Amount of free RAM allocatable within all zones | 1114 | * Amount of free RAM allocatable within all zones |
1115 | */ | 1115 | */ |
1116 | unsigned int nr_free_pagecache_pages(void) | 1116 | unsigned int nr_free_pagecache_pages(void) |
1117 | { | 1117 | { |
1118 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1118 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
1119 | } | 1119 | } |
1120 | 1120 | ||
1121 | #ifdef CONFIG_HIGHMEM | 1121 | #ifdef CONFIG_HIGHMEM |
1122 | unsigned int nr_free_highpages (void) | 1122 | unsigned int nr_free_highpages (void) |
1123 | { | 1123 | { |
1124 | pg_data_t *pgdat; | 1124 | pg_data_t *pgdat; |
1125 | unsigned int pages = 0; | 1125 | unsigned int pages = 0; |
1126 | 1126 | ||
1127 | for_each_pgdat(pgdat) | 1127 | for_each_pgdat(pgdat) |
1128 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 1128 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; |
1129 | 1129 | ||
1130 | return pages; | 1130 | return pages; |
1131 | } | 1131 | } |
1132 | #endif | 1132 | #endif |
1133 | 1133 | ||
1134 | #ifdef CONFIG_NUMA | 1134 | #ifdef CONFIG_NUMA |
1135 | static void show_node(struct zone *zone) | 1135 | static void show_node(struct zone *zone) |
1136 | { | 1136 | { |
1137 | printk("Node %d ", zone->zone_pgdat->node_id); | 1137 | printk("Node %d ", zone->zone_pgdat->node_id); |
1138 | } | 1138 | } |
1139 | #else | 1139 | #else |
1140 | #define show_node(zone) do { } while (0) | 1140 | #define show_node(zone) do { } while (0) |
1141 | #endif | 1141 | #endif |
1142 | 1142 | ||
1143 | /* | 1143 | /* |
1144 | * Accumulate the page_state information across all CPUs. | 1144 | * Accumulate the page_state information across all CPUs. |
1145 | * The result is unavoidably approximate - it can change | 1145 | * The result is unavoidably approximate - it can change |
1146 | * during and after execution of this function. | 1146 | * during and after execution of this function. |
1147 | */ | 1147 | */ |
1148 | static DEFINE_PER_CPU(struct page_state, page_states) = {0}; | 1148 | static DEFINE_PER_CPU(struct page_state, page_states) = {0}; |
1149 | 1149 | ||
1150 | atomic_t nr_pagecache = ATOMIC_INIT(0); | 1150 | atomic_t nr_pagecache = ATOMIC_INIT(0); |
1151 | EXPORT_SYMBOL(nr_pagecache); | 1151 | EXPORT_SYMBOL(nr_pagecache); |
1152 | #ifdef CONFIG_SMP | 1152 | #ifdef CONFIG_SMP |
1153 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 1153 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; |
1154 | #endif | 1154 | #endif |
1155 | 1155 | ||
1156 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1156 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
1157 | { | 1157 | { |
1158 | int cpu = 0; | 1158 | int cpu = 0; |
1159 | 1159 | ||
1160 | memset(ret, 0, sizeof(*ret)); | 1160 | memset(ret, 0, sizeof(*ret)); |
1161 | cpus_and(*cpumask, *cpumask, cpu_online_map); | 1161 | cpus_and(*cpumask, *cpumask, cpu_online_map); |
1162 | 1162 | ||
1163 | cpu = first_cpu(*cpumask); | 1163 | cpu = first_cpu(*cpumask); |
1164 | while (cpu < NR_CPUS) { | 1164 | while (cpu < NR_CPUS) { |
1165 | unsigned long *in, *out, off; | 1165 | unsigned long *in, *out, off; |
1166 | 1166 | ||
1167 | in = (unsigned long *)&per_cpu(page_states, cpu); | 1167 | in = (unsigned long *)&per_cpu(page_states, cpu); |
1168 | 1168 | ||
1169 | cpu = next_cpu(cpu, *cpumask); | 1169 | cpu = next_cpu(cpu, *cpumask); |
1170 | 1170 | ||
1171 | if (cpu < NR_CPUS) | 1171 | if (cpu < NR_CPUS) |
1172 | prefetch(&per_cpu(page_states, cpu)); | 1172 | prefetch(&per_cpu(page_states, cpu)); |
1173 | 1173 | ||
1174 | out = (unsigned long *)ret; | 1174 | out = (unsigned long *)ret; |
1175 | for (off = 0; off < nr; off++) | 1175 | for (off = 0; off < nr; off++) |
1176 | *out++ += *in++; | 1176 | *out++ += *in++; |
1177 | } | 1177 | } |
1178 | } | 1178 | } |
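The accumulation works because struct page_state is laid out as a run of unsigned longs, so a flat pointer can walk the first nr counters of each CPU's copy. A small stand-alone model (our own three-field struct, not the real page_state; no padding can occur between same-typed fields):

    #include <stdio.h>
    #include <string.h>

    struct stats { unsigned long nr_dirty, nr_writeback, nr_mapped; };

    int main(void)
    {
            struct stats percpu[2] = { { 1, 2, 3 }, { 10, 20, 30 } };
            struct stats total;
            int nr = sizeof(struct stats) / sizeof(unsigned long);
            int cpu, off;

            memset(&total, 0, sizeof(total));
            for (cpu = 0; cpu < 2; cpu++) {
                    unsigned long *in = (unsigned long *)&percpu[cpu];
                    unsigned long *out = (unsigned long *)&total;

                    for (off = 0; off < nr; off++)
                            *out++ += *in++;
            }
            printf("%lu %lu %lu\n", total.nr_dirty,
                   total.nr_writeback, total.nr_mapped);   /* 11 22 33 */
            return 0;
    }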
1179 | 1179 | ||
1180 | void get_page_state_node(struct page_state *ret, int node) | 1180 | void get_page_state_node(struct page_state *ret, int node) |
1181 | { | 1181 | { |
1182 | int nr; | 1182 | int nr; |
1183 | cpumask_t mask = node_to_cpumask(node); | 1183 | cpumask_t mask = node_to_cpumask(node); |
1184 | 1184 | ||
1185 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | 1185 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); |
1186 | nr /= sizeof(unsigned long); | 1186 | nr /= sizeof(unsigned long); |
1187 | 1187 | ||
1188 | __get_page_state(ret, nr+1, &mask); | 1188 | __get_page_state(ret, nr+1, &mask); |
1189 | } | 1189 | } |
1190 | 1190 | ||
1191 | void get_page_state(struct page_state *ret) | 1191 | void get_page_state(struct page_state *ret) |
1192 | { | 1192 | { |
1193 | int nr; | 1193 | int nr; |
1194 | cpumask_t mask = CPU_MASK_ALL; | 1194 | cpumask_t mask = CPU_MASK_ALL; |
1195 | 1195 | ||
1196 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | 1196 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); |
1197 | nr /= sizeof(unsigned long); | 1197 | nr /= sizeof(unsigned long); |
1198 | 1198 | ||
1199 | __get_page_state(ret, nr + 1, &mask); | 1199 | __get_page_state(ret, nr + 1, &mask); |
1200 | } | 1200 | } |
1201 | 1201 | ||
1202 | void get_full_page_state(struct page_state *ret) | 1202 | void get_full_page_state(struct page_state *ret) |
1203 | { | 1203 | { |
1204 | cpumask_t mask = CPU_MASK_ALL; | 1204 | cpumask_t mask = CPU_MASK_ALL; |
1205 | 1205 | ||
1206 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 1206 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); |
1207 | } | 1207 | } |
1208 | 1208 | ||
1209 | unsigned long __read_page_state(unsigned long offset) | 1209 | unsigned long __read_page_state(unsigned long offset) |
1210 | { | 1210 | { |
1211 | unsigned long ret = 0; | 1211 | unsigned long ret = 0; |
1212 | int cpu; | 1212 | int cpu; |
1213 | 1213 | ||
1214 | for_each_online_cpu(cpu) { | 1214 | for_each_online_cpu(cpu) { |
1215 | unsigned long in; | 1215 | unsigned long in; |
1216 | 1216 | ||
1217 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | 1217 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; |
1218 | ret += *((unsigned long *)in); | 1218 | ret += *((unsigned long *)in); |
1219 | } | 1219 | } |
1220 | return ret; | 1220 | return ret; |
1221 | } | 1221 | } |
1222 | 1222 | ||
1223 | void __mod_page_state(unsigned long offset, unsigned long delta) | 1223 | void __mod_page_state(unsigned long offset, unsigned long delta) |
1224 | { | 1224 | { |
1225 | unsigned long flags; | 1225 | unsigned long flags; |
1226 | void* ptr; | 1226 | void* ptr; |
1227 | 1227 | ||
1228 | local_irq_save(flags); | 1228 | local_irq_save(flags); |
1229 | ptr = &__get_cpu_var(page_states); | 1229 | ptr = &__get_cpu_var(page_states); |
1230 | *(unsigned long*)(ptr + offset) += delta; | 1230 | *(unsigned long*)(ptr + offset) += delta; |
1231 | local_irq_restore(flags); | 1231 | local_irq_restore(flags); |
1232 | } | 1232 | } |
1233 | 1233 | ||
1234 | EXPORT_SYMBOL(__mod_page_state); | 1234 | EXPORT_SYMBOL(__mod_page_state); |
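__read_page_state() and __mod_page_state() address a single counter by its byte offset into the per-CPU struct, an offset the callers typically produce with offsetof(). A minimal model of that access pattern (our own struct, not the real page_state):

    #include <stdio.h>
    #include <stddef.h>

    struct stats { unsigned long nr_dirty, nr_writeback; };

    int main(void)
    {
            struct stats s = { 7, 11 };
            size_t off = offsetof(struct stats, nr_writeback);

            /* read the field through its byte offset */
            unsigned long val = *(unsigned long *)((char *)&s + off);
            printf("%lu\n", val);                   /* 11 */

            /* and bump it, mirroring __mod_page_state() */
            *(unsigned long *)((char *)&s + off) += 5;
            printf("%lu\n", s.nr_writeback);        /* 16 */
            return 0;
    }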
1235 | 1235 | ||
1236 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 1236 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
1237 | unsigned long *free, struct pglist_data *pgdat) | 1237 | unsigned long *free, struct pglist_data *pgdat) |
1238 | { | 1238 | { |
1239 | struct zone *zones = pgdat->node_zones; | 1239 | struct zone *zones = pgdat->node_zones; |
1240 | int i; | 1240 | int i; |
1241 | 1241 | ||
1242 | *active = 0; | 1242 | *active = 0; |
1243 | *inactive = 0; | 1243 | *inactive = 0; |
1244 | *free = 0; | 1244 | *free = 0; |
1245 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1245 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1246 | *active += zones[i].nr_active; | 1246 | *active += zones[i].nr_active; |
1247 | *inactive += zones[i].nr_inactive; | 1247 | *inactive += zones[i].nr_inactive; |
1248 | *free += zones[i].free_pages; | 1248 | *free += zones[i].free_pages; |
1249 | } | 1249 | } |
1250 | } | 1250 | } |
1251 | 1251 | ||
1252 | void get_zone_counts(unsigned long *active, | 1252 | void get_zone_counts(unsigned long *active, |
1253 | unsigned long *inactive, unsigned long *free) | 1253 | unsigned long *inactive, unsigned long *free) |
1254 | { | 1254 | { |
1255 | struct pglist_data *pgdat; | 1255 | struct pglist_data *pgdat; |
1256 | 1256 | ||
1257 | *active = 0; | 1257 | *active = 0; |
1258 | *inactive = 0; | 1258 | *inactive = 0; |
1259 | *free = 0; | 1259 | *free = 0; |
1260 | for_each_pgdat(pgdat) { | 1260 | for_each_pgdat(pgdat) { |
1261 | unsigned long l, m, n; | 1261 | unsigned long l, m, n; |
1262 | __get_zone_counts(&l, &m, &n, pgdat); | 1262 | __get_zone_counts(&l, &m, &n, pgdat); |
1263 | *active += l; | 1263 | *active += l; |
1264 | *inactive += m; | 1264 | *inactive += m; |
1265 | *free += n; | 1265 | *free += n; |
1266 | } | 1266 | } |
1267 | } | 1267 | } |
1268 | 1268 | ||
1269 | void si_meminfo(struct sysinfo *val) | 1269 | void si_meminfo(struct sysinfo *val) |
1270 | { | 1270 | { |
1271 | val->totalram = totalram_pages; | 1271 | val->totalram = totalram_pages; |
1272 | val->sharedram = 0; | 1272 | val->sharedram = 0; |
1273 | val->freeram = nr_free_pages(); | 1273 | val->freeram = nr_free_pages(); |
1274 | val->bufferram = nr_blockdev_pages(); | 1274 | val->bufferram = nr_blockdev_pages(); |
1275 | #ifdef CONFIG_HIGHMEM | 1275 | #ifdef CONFIG_HIGHMEM |
1276 | val->totalhigh = totalhigh_pages; | 1276 | val->totalhigh = totalhigh_pages; |
1277 | val->freehigh = nr_free_highpages(); | 1277 | val->freehigh = nr_free_highpages(); |
1278 | #else | 1278 | #else |
1279 | val->totalhigh = 0; | 1279 | val->totalhigh = 0; |
1280 | val->freehigh = 0; | 1280 | val->freehigh = 0; |
1281 | #endif | 1281 | #endif |
1282 | val->mem_unit = PAGE_SIZE; | 1282 | val->mem_unit = PAGE_SIZE; |
1283 | } | 1283 | } |
1284 | 1284 | ||
1285 | EXPORT_SYMBOL(si_meminfo); | 1285 | EXPORT_SYMBOL(si_meminfo); |
1286 | 1286 | ||
1287 | #ifdef CONFIG_NUMA | 1287 | #ifdef CONFIG_NUMA |
1288 | void si_meminfo_node(struct sysinfo *val, int nid) | 1288 | void si_meminfo_node(struct sysinfo *val, int nid) |
1289 | { | 1289 | { |
1290 | pg_data_t *pgdat = NODE_DATA(nid); | 1290 | pg_data_t *pgdat = NODE_DATA(nid); |
1291 | 1291 | ||
1292 | val->totalram = pgdat->node_present_pages; | 1292 | val->totalram = pgdat->node_present_pages; |
1293 | val->freeram = nr_free_pages_pgdat(pgdat); | 1293 | val->freeram = nr_free_pages_pgdat(pgdat); |
1294 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 1294 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
1295 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 1295 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; |
1296 | val->mem_unit = PAGE_SIZE; | 1296 | val->mem_unit = PAGE_SIZE; |
1297 | } | 1297 | } |
1298 | #endif | 1298 | #endif |
1299 | 1299 | ||
1300 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1300 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
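For example, with 4 KiB pages (PAGE_SHIFT == 12) K(x) shifts left by two, converting a page count to kilobytes: K(300) == 1200 kB. The macro assumes PAGE_SHIFT >= 10.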
1301 | 1301 | ||
1302 | /* | 1302 | /* |
1303 | * Show free area list (used inside shift_scroll-lock stuff) | 1303 | * Show free area list (used inside shift_scroll-lock stuff) |
1304 | * We also calculate the percentage fragmentation. We do this by counting the | 1304 | * We also calculate the percentage fragmentation. We do this by counting the |
1305 | * memory on each free list with the exception of the first item on the list. | 1305 | * memory on each free list with the exception of the first item on the list. |
1306 | */ | 1306 | */ |
1307 | void show_free_areas(void) | 1307 | void show_free_areas(void) |
1308 | { | 1308 | { |
1309 | struct page_state ps; | 1309 | struct page_state ps; |
1310 | int cpu, temperature; | 1310 | int cpu, temperature; |
1311 | unsigned long active; | 1311 | unsigned long active; |
1312 | unsigned long inactive; | 1312 | unsigned long inactive; |
1313 | unsigned long free; | 1313 | unsigned long free; |
1314 | struct zone *zone; | 1314 | struct zone *zone; |
1315 | 1315 | ||
1316 | for_each_zone(zone) { | 1316 | for_each_zone(zone) { |
1317 | show_node(zone); | 1317 | show_node(zone); |
1318 | printk("%s per-cpu:", zone->name); | 1318 | printk("%s per-cpu:", zone->name); |
1319 | 1319 | ||
1320 | if (!zone->present_pages) { | 1320 | if (!zone->present_pages) { |
1321 | printk(" empty\n"); | 1321 | printk(" empty\n"); |
1322 | continue; | 1322 | continue; |
1323 | } else | 1323 | } else |
1324 | printk("\n"); | 1324 | printk("\n"); |
1325 | 1325 | ||
1326 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { | 1326 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { |
1327 | struct per_cpu_pageset *pageset; | 1327 | struct per_cpu_pageset *pageset; |
1328 | 1328 | ||
1329 | if (!cpu_possible(cpu)) | 1329 | if (!cpu_possible(cpu)) |
1330 | continue; | 1330 | continue; |
1331 | 1331 | ||
1332 | pageset = zone_pcp(zone, cpu); | 1332 | pageset = zone_pcp(zone, cpu); |
1333 | 1333 | ||
1334 | for (temperature = 0; temperature < 2; temperature++) | 1334 | for (temperature = 0; temperature < 2; temperature++) |
1335 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", | 1335 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", |
1336 | cpu, | 1336 | cpu, |
1337 | temperature ? "cold" : "hot", | 1337 | temperature ? "cold" : "hot", |
1338 | pageset->pcp[temperature].low, | 1338 | pageset->pcp[temperature].low, |
1339 | pageset->pcp[temperature].high, | 1339 | pageset->pcp[temperature].high, |
1340 | pageset->pcp[temperature].batch, | 1340 | pageset->pcp[temperature].batch, |
1341 | pageset->pcp[temperature].count); | 1341 | pageset->pcp[temperature].count); |
1342 | } | 1342 | } |
1343 | } | 1343 | } |
1344 | 1344 | ||
1345 | get_page_state(&ps); | 1345 | get_page_state(&ps); |
1346 | get_zone_counts(&active, &inactive, &free); | 1346 | get_zone_counts(&active, &inactive, &free); |
1347 | 1347 | ||
1348 | printk("Free pages: %11ukB (%ukB HighMem)\n", | 1348 | printk("Free pages: %11ukB (%ukB HighMem)\n", |
1349 | K(nr_free_pages()), | 1349 | K(nr_free_pages()), |
1350 | K(nr_free_highpages())); | 1350 | K(nr_free_highpages())); |
1351 | 1351 | ||
1352 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | 1352 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " |
1353 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1353 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", |
1354 | active, | 1354 | active, |
1355 | inactive, | 1355 | inactive, |
1356 | ps.nr_dirty, | 1356 | ps.nr_dirty, |
1357 | ps.nr_writeback, | 1357 | ps.nr_writeback, |
1358 | ps.nr_unstable, | 1358 | ps.nr_unstable, |
1359 | nr_free_pages(), | 1359 | nr_free_pages(), |
1360 | ps.nr_slab, | 1360 | ps.nr_slab, |
1361 | ps.nr_mapped, | 1361 | ps.nr_mapped, |
1362 | ps.nr_page_table_pages); | 1362 | ps.nr_page_table_pages); |
1363 | 1363 | ||
1364 | for_each_zone(zone) { | 1364 | for_each_zone(zone) { |
1365 | int i; | 1365 | int i; |
1366 | 1366 | ||
1367 | show_node(zone); | 1367 | show_node(zone); |
1368 | printk("%s" | 1368 | printk("%s" |
1369 | " free:%lukB" | 1369 | " free:%lukB" |
1370 | " min:%lukB" | 1370 | " min:%lukB" |
1371 | " low:%lukB" | 1371 | " low:%lukB" |
1372 | " high:%lukB" | 1372 | " high:%lukB" |
1373 | " active:%lukB" | 1373 | " active:%lukB" |
1374 | " inactive:%lukB" | 1374 | " inactive:%lukB" |
1375 | " present:%lukB" | 1375 | " present:%lukB" |
1376 | " pages_scanned:%lu" | 1376 | " pages_scanned:%lu" |
1377 | " all_unreclaimable? %s" | 1377 | " all_unreclaimable? %s" |
1378 | "\n", | 1378 | "\n", |
1379 | zone->name, | 1379 | zone->name, |
1380 | K(zone->free_pages), | 1380 | K(zone->free_pages), |
1381 | K(zone->pages_min), | 1381 | K(zone->pages_min), |
1382 | K(zone->pages_low), | 1382 | K(zone->pages_low), |
1383 | K(zone->pages_high), | 1383 | K(zone->pages_high), |
1384 | K(zone->nr_active), | 1384 | K(zone->nr_active), |
1385 | K(zone->nr_inactive), | 1385 | K(zone->nr_inactive), |
1386 | K(zone->present_pages), | 1386 | K(zone->present_pages), |
1387 | zone->pages_scanned, | 1387 | zone->pages_scanned, |
1388 | (zone->all_unreclaimable ? "yes" : "no") | 1388 | (zone->all_unreclaimable ? "yes" : "no") |
1389 | ); | 1389 | ); |
1390 | printk("lowmem_reserve[]:"); | 1390 | printk("lowmem_reserve[]:"); |
1391 | for (i = 0; i < MAX_NR_ZONES; i++) | 1391 | for (i = 0; i < MAX_NR_ZONES; i++) |
1392 | printk(" %lu", zone->lowmem_reserve[i]); | 1392 | printk(" %lu", zone->lowmem_reserve[i]); |
1393 | printk("\n"); | 1393 | printk("\n"); |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | for_each_zone(zone) { | 1396 | for_each_zone(zone) { |
1397 | unsigned long nr, flags, order, total = 0; | 1397 | unsigned long nr, flags, order, total = 0; |
1398 | 1398 | ||
1399 | show_node(zone); | 1399 | show_node(zone); |
1400 | printk("%s: ", zone->name); | 1400 | printk("%s: ", zone->name); |
1401 | if (!zone->present_pages) { | 1401 | if (!zone->present_pages) { |
1402 | printk("empty\n"); | 1402 | printk("empty\n"); |
1403 | continue; | 1403 | continue; |
1404 | } | 1404 | } |
1405 | 1405 | ||
1406 | spin_lock_irqsave(&zone->lock, flags); | 1406 | spin_lock_irqsave(&zone->lock, flags); |
1407 | for (order = 0; order < MAX_ORDER; order++) { | 1407 | for (order = 0; order < MAX_ORDER; order++) { |
1408 | nr = zone->free_area[order].nr_free; | 1408 | nr = zone->free_area[order].nr_free; |
1409 | total += nr << order; | 1409 | total += nr << order; |
1410 | printk("%lu*%lukB ", nr, K(1UL) << order); | 1410 | printk("%lu*%lukB ", nr, K(1UL) << order); |
1411 | } | 1411 | } |
1412 | spin_unlock_irqrestore(&zone->lock, flags); | 1412 | spin_unlock_irqrestore(&zone->lock, flags); |
1413 | printk("= %lukB\n", K(total)); | 1413 | printk("= %lukB\n", K(total)); |
1414 | } | 1414 | } |
1415 | 1415 | ||
1416 | show_swap_cache_info(); | 1416 | show_swap_cache_info(); |
1417 | } | 1417 | } |
1418 | 1418 | ||
1419 | /* | 1419 | /* |
1420 | * Builds allocation fallback zone lists. | 1420 | * Builds allocation fallback zone lists. |
1421 | */ | 1421 | */ |
1422 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | 1422 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) |
1423 | { | 1423 | { |
1424 | switch (k) { | 1424 | switch (k) { |
1425 | struct zone *zone; | 1425 | struct zone *zone; |
1426 | default: | 1426 | default: |
1427 | BUG(); | 1427 | BUG(); |
1428 | case ZONE_HIGHMEM: | 1428 | case ZONE_HIGHMEM: |
1429 | zone = pgdat->node_zones + ZONE_HIGHMEM; | 1429 | zone = pgdat->node_zones + ZONE_HIGHMEM; |
1430 | if (zone->present_pages) { | 1430 | if (zone->present_pages) { |
1431 | #ifndef CONFIG_HIGHMEM | 1431 | #ifndef CONFIG_HIGHMEM |
1432 | BUG(); | 1432 | BUG(); |
1433 | #endif | 1433 | #endif |
1434 | zonelist->zones[j++] = zone; | 1434 | zonelist->zones[j++] = zone; |
1435 | } | 1435 | } |
1436 | case ZONE_NORMAL: | 1436 | case ZONE_NORMAL: |
1437 | zone = pgdat->node_zones + ZONE_NORMAL; | 1437 | zone = pgdat->node_zones + ZONE_NORMAL; |
1438 | if (zone->present_pages) | 1438 | if (zone->present_pages) |
1439 | zonelist->zones[j++] = zone; | 1439 | zonelist->zones[j++] = zone; |
1440 | case ZONE_DMA: | 1440 | case ZONE_DMA: |
1441 | zone = pgdat->node_zones + ZONE_DMA; | 1441 | zone = pgdat->node_zones + ZONE_DMA; |
1442 | if (zone->present_pages) | 1442 | if (zone->present_pages) |
1443 | zonelist->zones[j++] = zone; | 1443 | zonelist->zones[j++] = zone; |
1444 | } | 1444 | } |
1445 | 1445 | ||
1446 | return j; | 1446 | return j; |
1447 | } | 1447 | } |
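The switch above deliberately falls through, so entering at case k appends every populated zone at or below k in descending order. The same ordering, modelled as a plain loop with invented zone sizes:

    #include <stdio.h>

    enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

    int main(void)
    {
            unsigned long present[] = { 4096, 225280, 0 }; /* pages per zone */
            const char *names[] = { "DMA", "Normal", "HighMem" };
            int k = ZONE_HIGHMEM;   /* highest zone this gfp mask may use */
            int z, j = 0;

            for (z = k; z >= ZONE_DMA; z--)     /* the fall-through order */
                    if (present[z])
                            printf("zonelist[%d] = %s\n", j++, names[z]);
            return 0;   /* prints Normal then DMA; HighMem is empty */
    }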
1448 | 1448 | ||
1449 | static inline int highest_zone(int zone_bits) | 1449 | static inline int highest_zone(int zone_bits) |
1450 | { | 1450 | { |
1451 | int res = ZONE_NORMAL; | 1451 | int res = ZONE_NORMAL; |
1452 | if (zone_bits & (__force int)__GFP_HIGHMEM) | 1452 | if (zone_bits & (__force int)__GFP_HIGHMEM) |
1453 | res = ZONE_HIGHMEM; | 1453 | res = ZONE_HIGHMEM; |
1454 | if (zone_bits & (__force int)__GFP_DMA) | 1454 | if (zone_bits & (__force int)__GFP_DMA) |
1455 | res = ZONE_DMA; | 1455 | res = ZONE_DMA; |
1456 | return res; | 1456 | return res; |
1457 | } | 1457 | } |
1458 | 1458 | ||
1459 | #ifdef CONFIG_NUMA | 1459 | #ifdef CONFIG_NUMA |
1460 | #define MAX_NODE_LOAD (num_online_nodes()) | 1460 | #define MAX_NODE_LOAD (num_online_nodes()) |
1461 | static int __initdata node_load[MAX_NUMNODES]; | 1461 | static int __initdata node_load[MAX_NUMNODES]; |
1462 | /** | 1462 | /** |
1463 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1463 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1464 | * @node: node whose fallback list we're appending | 1464 | * @node: node whose fallback list we're appending |
1465 | * @used_node_mask: nodemask_t of already used nodes | 1465 | * @used_node_mask: nodemask_t of already used nodes |
1466 | * | 1466 | * |
1467 | * We use a number of factors to determine which is the next node that should | 1467 | * We use a number of factors to determine which is the next node that should |
1468 | * appear on a given node's fallback list. The node should not have appeared | 1468 | * appear on a given node's fallback list. The node should not have appeared |
1469 | * already in @node's fallback list, and it should be the next closest node | 1469 | * already in @node's fallback list, and it should be the next closest node |
1470 | * according to the distance array (which contains arbitrary distance values | 1470 | * according to the distance array (which contains arbitrary distance values |
1471 | * from each node to each node in the system), and should also prefer nodes | 1471 | * from each node to each node in the system), and should also prefer nodes |
1472 | * with no CPUs, since presumably they'll have very little allocation pressure | 1472 | * with no CPUs, since presumably they'll have very little allocation pressure |
1473 | * on them otherwise. | 1473 | * on them otherwise. |
1474 | * It returns -1 if no node is found. | 1474 | * It returns -1 if no node is found. |
1475 | */ | 1475 | */ |
1476 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | 1476 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) |
1477 | { | 1477 | { |
1478 | int i, n, val; | 1478 | int i, n, val; |
1479 | int min_val = INT_MAX; | 1479 | int min_val = INT_MAX; |
1480 | int best_node = -1; | 1480 | int best_node = -1; |
1481 | 1481 | ||
1482 | for_each_online_node(i) { | 1482 | for_each_online_node(i) { |
1483 | cpumask_t tmp; | 1483 | cpumask_t tmp; |
1484 | 1484 | ||
1485 | /* Start from local node */ | 1485 | /* Start from local node */ |
1486 | n = (node+i) % num_online_nodes(); | 1486 | n = (node+i) % num_online_nodes(); |
1487 | 1487 | ||
1488 | /* Don't want a node to appear more than once */ | 1488 | /* Don't want a node to appear more than once */ |
1489 | if (node_isset(n, *used_node_mask)) | 1489 | if (node_isset(n, *used_node_mask)) |
1490 | continue; | 1490 | continue; |
1491 | 1491 | ||
1492 | /* Use the local node if we haven't already */ | 1492 | /* Use the local node if we haven't already */ |
1493 | if (!node_isset(node, *used_node_mask)) { | 1493 | if (!node_isset(node, *used_node_mask)) { |
1494 | best_node = node; | 1494 | best_node = node; |
1495 | break; | 1495 | break; |
1496 | } | 1496 | } |
1497 | 1497 | ||
1498 | /* Use the distance array to find the distance */ | 1498 | /* Use the distance array to find the distance */ |
1499 | val = node_distance(node, n); | 1499 | val = node_distance(node, n); |
1500 | 1500 | ||
1501 | /* Give preference to headless and unused nodes */ | 1501 | /* Give preference to headless and unused nodes */ |
1502 | tmp = node_to_cpumask(n); | 1502 | tmp = node_to_cpumask(n); |
1503 | if (!cpus_empty(tmp)) | 1503 | if (!cpus_empty(tmp)) |
1504 | val += PENALTY_FOR_NODE_WITH_CPUS; | 1504 | val += PENALTY_FOR_NODE_WITH_CPUS; |
1505 | 1505 | ||
1506 | /* Slight preference for less loaded node */ | 1506 | /* Slight preference for less loaded node */ |
1507 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | 1507 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); |
1508 | val += node_load[n]; | 1508 | val += node_load[n]; |
1509 | 1509 | ||
1510 | if (val < min_val) { | 1510 | if (val < min_val) { |
1511 | min_val = val; | 1511 | min_val = val; |
1512 | best_node = n; | 1512 | best_node = n; |
1513 | } | 1513 | } |
1514 | } | 1514 | } |
1515 | 1515 | ||
1516 | if (best_node >= 0) | 1516 | if (best_node >= 0) |
1517 | node_set(best_node, *used_node_mask); | 1517 | node_set(best_node, *used_node_mask); |
1518 | 1518 | ||
1519 | return best_node; | 1519 | return best_node; |
1520 | } | 1520 | } |
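The score combines distance and load so that distance always dominates: val starts as the node distance (plus a penalty if the node has CPUs), is then scaled by MAX_NODE_LOAD * MAX_NUMNODES, and only then has the load added, so load merely breaks ties between equally distant nodes. A model with invented numbers:

    #include <stdio.h>
    #include <limits.h>

    #define PENALTY 1   /* stands in for PENALTY_FOR_NODE_WITH_CPUS */
    #define SCALE   16  /* stands in for MAX_NODE_LOAD * MAX_NUMNODES */

    int main(void)
    {
            int distance[] = { 10, 20, 20 };    /* SLIT-style distances */
            int has_cpus[] = { 1, 1, 0 };
            int load[]     = { 0, 3, 5 };
            int n, best = -1, min_val = INT_MAX;

            for (n = 0; n < 3; n++) {
                    int val = distance[n] + (has_cpus[n] ? PENALTY : 0);

                    val = val * SCALE + load[n];
                    if (val < min_val) {
                            min_val = val;
                            best = n;
                    }
            }
            /* node 0 wins (closest); headless node 2 would beat node 1 */
            printf("best node: %d\n", best);
            return 0;
    }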
1521 | 1521 | ||
1522 | static void __init build_zonelists(pg_data_t *pgdat) | 1522 | static void __init build_zonelists(pg_data_t *pgdat) |
1523 | { | 1523 | { |
1524 | int i, j, k, node, local_node; | 1524 | int i, j, k, node, local_node; |
1525 | int prev_node, load; | 1525 | int prev_node, load; |
1526 | struct zonelist *zonelist; | 1526 | struct zonelist *zonelist; |
1527 | nodemask_t used_mask; | 1527 | nodemask_t used_mask; |
1528 | 1528 | ||
1529 | /* initialize zonelists */ | 1529 | /* initialize zonelists */ |
1530 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1530 | for (i = 0; i < GFP_ZONETYPES; i++) { |
1531 | zonelist = pgdat->node_zonelists + i; | 1531 | zonelist = pgdat->node_zonelists + i; |
1532 | zonelist->zones[0] = NULL; | 1532 | zonelist->zones[0] = NULL; |
1533 | } | 1533 | } |
1534 | 1534 | ||
1535 | /* NUMA-aware ordering of nodes */ | 1535 | /* NUMA-aware ordering of nodes */ |
1536 | local_node = pgdat->node_id; | 1536 | local_node = pgdat->node_id; |
1537 | load = num_online_nodes(); | 1537 | load = num_online_nodes(); |
1538 | prev_node = local_node; | 1538 | prev_node = local_node; |
1539 | nodes_clear(used_mask); | 1539 | nodes_clear(used_mask); |
1540 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1540 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1541 | /* | 1541 | /* |
1542 | * We don't want to pressure a particular node, so we | 1542 | * We don't want to pressure a particular node, so we |
1543 | * add a penalty to the first node in the same distance | 1543 | * add a penalty to the first node in the same distance |
1544 | * group to make the ordering round-robin. | 1544 | * group to make the ordering round-robin. |
1545 | */ | 1545 | */ |
1546 | if (node_distance(local_node, node) != | 1546 | if (node_distance(local_node, node) != |
1547 | node_distance(local_node, prev_node)) | 1547 | node_distance(local_node, prev_node)) |
1548 | node_load[node] += load; | 1548 | node_load[node] += load; |
1549 | prev_node = node; | 1549 | prev_node = node; |
1550 | load--; | 1550 | load--; |
1551 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1551 | for (i = 0; i < GFP_ZONETYPES; i++) { |
1552 | zonelist = pgdat->node_zonelists + i; | 1552 | zonelist = pgdat->node_zonelists + i; |
1553 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1553 | for (j = 0; zonelist->zones[j] != NULL; j++); |
1554 | 1554 | ||
1555 | k = highest_zone(i); | 1555 | k = highest_zone(i); |
1556 | 1556 | ||
1557 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1557 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); |
1558 | zonelist->zones[j] = NULL; | 1558 | zonelist->zones[j] = NULL; |
1559 | } | 1559 | } |
1560 | } | 1560 | } |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | #else /* CONFIG_NUMA */ | 1563 | #else /* CONFIG_NUMA */ |
1564 | 1564 | ||
1565 | static void __init build_zonelists(pg_data_t *pgdat) | 1565 | static void __init build_zonelists(pg_data_t *pgdat) |
1566 | { | 1566 | { |
1567 | int i, j, k, node, local_node; | 1567 | int i, j, k, node, local_node; |
1568 | 1568 | ||
1569 | local_node = pgdat->node_id; | 1569 | local_node = pgdat->node_id; |
1570 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1570 | for (i = 0; i < GFP_ZONETYPES; i++) { |
1571 | struct zonelist *zonelist; | 1571 | struct zonelist *zonelist; |
1572 | 1572 | ||
1573 | zonelist = pgdat->node_zonelists + i; | 1573 | zonelist = pgdat->node_zonelists + i; |
1574 | 1574 | ||
1575 | j = 0; | 1575 | j = 0; |
1576 | k = highest_zone(i); | 1576 | k = highest_zone(i); |
1577 | j = build_zonelists_node(pgdat, zonelist, j, k); | 1577 | j = build_zonelists_node(pgdat, zonelist, j, k); |
1578 | /* | 1578 | /* |
1579 | * Now we build the zonelist so that it contains the zones | 1579 | * Now we build the zonelist so that it contains the zones |
1580 | * of all the other nodes. | 1580 | * of all the other nodes. |
1581 | * We don't want to pressure a particular node, so when | 1581 | * We don't want to pressure a particular node, so when |
1582 | * building the zones for node N, we make sure that the | 1582 | * building the zones for node N, we make sure that the |
1583 | * zones coming right after the local ones are those from | 1583 | * zones coming right after the local ones are those from |
1584 | * node N+1 (wrapping around to node 0). | 1584 | * node N+1 (wrapping around to node 0). |
1585 | */ | 1585 | */ |
1586 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 1586 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
1587 | if (!node_online(node)) | 1587 | if (!node_online(node)) |
1588 | continue; | 1588 | continue; |
1589 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1589 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); |
1590 | } | 1590 | } |
1591 | for (node = 0; node < local_node; node++) { | 1591 | for (node = 0; node < local_node; node++) { |
1592 | if (!node_online(node)) | 1592 | if (!node_online(node)) |
1593 | continue; | 1593 | continue; |
1594 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1594 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); |
1595 | } | 1595 | } |
1596 | 1596 | ||
1597 | zonelist->zones[j] = NULL; | 1597 | zonelist->zones[j] = NULL; |
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | #endif /* CONFIG_NUMA */ | 1601 | #endif /* CONFIG_NUMA */ |
1602 | 1602 | ||
1603 | void __init build_all_zonelists(void) | 1603 | void __init build_all_zonelists(void) |
1604 | { | 1604 | { |
1605 | int i; | 1605 | int i; |
1606 | 1606 | ||
1607 | for_each_online_node(i) | 1607 | for_each_online_node(i) |
1608 | build_zonelists(NODE_DATA(i)); | 1608 | build_zonelists(NODE_DATA(i)); |
1609 | printk("Built %i zonelists\n", num_online_nodes()); | 1609 | printk("Built %i zonelists\n", num_online_nodes()); |
1610 | cpuset_init_current_mems_allowed(); | 1610 | cpuset_init_current_mems_allowed(); |
1611 | } | 1611 | } |
1612 | 1612 | ||
1613 | /* | 1613 | /* |
1614 | * Helper functions to size the waitqueue hash table. | 1614 | * Helper functions to size the waitqueue hash table. |
1615 | * Essentially these want to choose hash table sizes sufficiently | 1615 | * Essentially these want to choose hash table sizes sufficiently |
1616 | * large so that collisions trying to wait on pages are rare. | 1616 | * large so that collisions trying to wait on pages are rare. |
1617 | * But in fact, the number of active page waitqueues on typical | 1617 | * But in fact, the number of active page waitqueues on typical |
1618 | * systems is ridiculously low, less than 200. So this is even | 1618 | * systems is ridiculously low, less than 200. So this is even |
1619 | * conservative, even though it seems large. | 1619 | * conservative, even though it seems large. |
1620 | * | 1620 | * |
1621 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to | 1621 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
1622 | * waitqueues, i.e. the size of the waitq table given the number of pages. | 1622 | * waitqueues, i.e. the size of the waitq table given the number of pages. |
1623 | */ | 1623 | */ |
1624 | #define PAGES_PER_WAITQUEUE 256 | 1624 | #define PAGES_PER_WAITQUEUE 256 |
1625 | 1625 | ||
1626 | static inline unsigned long wait_table_size(unsigned long pages) | 1626 | static inline unsigned long wait_table_size(unsigned long pages) |
1627 | { | 1627 | { |
1628 | unsigned long size = 1; | 1628 | unsigned long size = 1; |
1629 | 1629 | ||
1630 | pages /= PAGES_PER_WAITQUEUE; | 1630 | pages /= PAGES_PER_WAITQUEUE; |
1631 | 1631 | ||
1632 | while (size < pages) | 1632 | while (size < pages) |
1633 | size <<= 1; | 1633 | size <<= 1; |
1634 | 1634 | ||
1635 | /* | 1635 | /* |
1636 | * Once we have dozens or even hundreds of threads sleeping | 1636 | * Once we have dozens or even hundreds of threads sleeping |
1637 | * on IO we've got bigger problems than wait queue collision. | 1637 | * on IO we've got bigger problems than wait queue collision. |
1638 | * Limit the size of the wait table to a reasonable size. | 1638 | * Limit the size of the wait table to a reasonable size. |
1639 | */ | 1639 | */ |
1640 | size = min(size, 4096UL); | 1640 | size = min(size, 4096UL); |
1641 | 1641 | ||
1642 | return max(size, 4UL); | 1642 | return max(size, 4UL); |
1643 | } | 1643 | } |
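Worked example of the sizing: a 1 GiB zone of 4 KiB pages has 262144 pages, giving 262144 / 256 = 1024 buckets, already a power of two and inside the [4, 4096] clamp. A stand-alone model of the same arithmetic:

    #include <stdio.h>

    static unsigned long wait_table_size_model(unsigned long pages)
    {
            unsigned long size = 1;

            pages /= 256;               /* PAGES_PER_WAITQUEUE */
            while (size < pages)
                    size <<= 1;         /* round up to a power of two */
            if (size > 4096UL)
                    size = 4096UL;
            return size < 4UL ? 4UL : size;
    }

    int main(void)
    {
            printf("%lu\n", wait_table_size_model(262144)); /* 1024 */
            printf("%lu\n", wait_table_size_model(100));    /* clamped to 4 */
            return 0;
    }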
1644 | 1644 | ||
1645 | /* | 1645 | /* |
1646 | * This is an integer logarithm so that shifts can be used later | 1646 | * This is an integer logarithm so that shifts can be used later |
1647 | * to extract the more random high bits from the multiplicative | 1647 | * to extract the more random high bits from the multiplicative |
1648 | * hash function before the remainder is taken. | 1648 | * hash function before the remainder is taken. |
1649 | */ | 1649 | */ |
1650 | static inline unsigned long wait_table_bits(unsigned long size) | 1650 | static inline unsigned long wait_table_bits(unsigned long size) |
1651 | { | 1651 | { |
1652 | return ffz(~size); | 1652 | return ffz(~size); |
1653 | } | 1653 | } |
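For a power-of-two size such as 1024 (bit 10 set), ~size clears exactly that bit, so ffz(~size) returns 10 — in effect log2(size), the shift count the hash uses later.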
1654 | 1654 | ||
1655 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1655 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
1656 | 1656 | ||
1657 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | 1657 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, |
1658 | unsigned long *zones_size, unsigned long *zholes_size) | 1658 | unsigned long *zones_size, unsigned long *zholes_size) |
1659 | { | 1659 | { |
1660 | unsigned long realtotalpages, totalpages = 0; | 1660 | unsigned long realtotalpages, totalpages = 0; |
1661 | int i; | 1661 | int i; |
1662 | 1662 | ||
1663 | for (i = 0; i < MAX_NR_ZONES; i++) | 1663 | for (i = 0; i < MAX_NR_ZONES; i++) |
1664 | totalpages += zones_size[i]; | 1664 | totalpages += zones_size[i]; |
1665 | pgdat->node_spanned_pages = totalpages; | 1665 | pgdat->node_spanned_pages = totalpages; |
1666 | 1666 | ||
1667 | realtotalpages = totalpages; | 1667 | realtotalpages = totalpages; |
1668 | if (zholes_size) | 1668 | if (zholes_size) |
1669 | for (i = 0; i < MAX_NR_ZONES; i++) | 1669 | for (i = 0; i < MAX_NR_ZONES; i++) |
1670 | realtotalpages -= zholes_size[i]; | 1670 | realtotalpages -= zholes_size[i]; |
1671 | pgdat->node_present_pages = realtotalpages; | 1671 | pgdat->node_present_pages = realtotalpages; |
1672 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | 1672 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); |
1673 | } | 1673 | } |
1674 | 1674 | ||
1675 | 1675 | ||
1676 | /* | 1676 | /* |
1677 | * Initially all pages are reserved - free ones are freed | 1677 | * Initially all pages are reserved - free ones are freed |
1678 | * up by free_all_bootmem() once the early boot process is | 1678 | * up by free_all_bootmem() once the early boot process is |
1679 | * done. Non-atomic initialization, single-pass. | 1679 | * done. Non-atomic initialization, single-pass. |
1680 | */ | 1680 | */ |
1681 | void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1681 | void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1682 | unsigned long start_pfn) | 1682 | unsigned long start_pfn) |
1683 | { | 1683 | { |
1684 | struct page *page; | 1684 | struct page *page; |
1685 | unsigned long end_pfn = start_pfn + size; | 1685 | unsigned long end_pfn = start_pfn + size; |
1686 | unsigned long pfn; | 1686 | unsigned long pfn; |
1687 | 1687 | ||
1688 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { | 1688 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { |
1689 | if (!early_pfn_valid(pfn)) | 1689 | if (!early_pfn_valid(pfn)) |
1690 | continue; | 1690 | continue; |
1691 | if (!early_pfn_in_nid(pfn, nid)) | 1691 | if (!early_pfn_in_nid(pfn, nid)) |
1692 | continue; | 1692 | continue; |
1693 | page = pfn_to_page(pfn); | 1693 | page = pfn_to_page(pfn); |
1694 | set_page_links(page, zone, nid, pfn); | 1694 | set_page_links(page, zone, nid, pfn); |
1695 | set_page_count(page, 1); | 1695 | set_page_count(page, 1); |
1696 | reset_page_mapcount(page); | 1696 | reset_page_mapcount(page); |
1697 | SetPageReserved(page); | 1697 | SetPageReserved(page); |
1698 | INIT_LIST_HEAD(&page->lru); | 1698 | INIT_LIST_HEAD(&page->lru); |
1699 | #ifdef WANT_PAGE_VIRTUAL | 1699 | #ifdef WANT_PAGE_VIRTUAL |
1700 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 1700 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
1701 | if (!is_highmem_idx(zone)) | 1701 | if (!is_highmem_idx(zone)) |
1702 | set_page_address(page, __va(pfn << PAGE_SHIFT)); | 1702 | set_page_address(page, __va(pfn << PAGE_SHIFT)); |
1703 | #endif | 1703 | #endif |
1704 | } | 1704 | } |
1705 | } | 1705 | } |
1706 | 1706 | ||
1707 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | 1707 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, |
1708 | unsigned long size) | 1708 | unsigned long size) |
1709 | { | 1709 | { |
1710 | int order; | 1710 | int order; |
1711 | for (order = 0; order < MAX_ORDER ; order++) { | 1711 | for (order = 0; order < MAX_ORDER ; order++) { |
1712 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 1712 | INIT_LIST_HEAD(&zone->free_area[order].free_list); |
1713 | zone->free_area[order].nr_free = 0; | 1713 | zone->free_area[order].nr_free = 0; |
1714 | } | 1714 | } |
1715 | } | 1715 | } |
1716 | 1716 | ||
1717 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | 1717 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) |
1718 | void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 1718 | void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, |
1719 | unsigned long size) | 1719 | unsigned long size) |
1720 | { | 1720 | { |
1721 | unsigned long snum = pfn_to_section_nr(pfn); | 1721 | unsigned long snum = pfn_to_section_nr(pfn); |
1722 | unsigned long end = pfn_to_section_nr(pfn + size); | 1722 | unsigned long end = pfn_to_section_nr(pfn + size); |
1723 | 1723 | ||
1724 | if (FLAGS_HAS_NODE) | 1724 | if (FLAGS_HAS_NODE) |
1725 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | 1725 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; |
1726 | else | 1726 | else |
1727 | for (; snum <= end; snum++) | 1727 | for (; snum <= end; snum++) |
1728 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | 1728 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1731 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1732 | #define memmap_init(size, nid, zone, start_pfn) \ | 1732 | #define memmap_init(size, nid, zone, start_pfn) \ |
1733 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1733 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
1734 | #endif | 1734 | #endif |
1735 | 1735 | ||
1736 | static int __devinit zone_batchsize(struct zone *zone) | 1736 | static int __devinit zone_batchsize(struct zone *zone) |
1737 | { | 1737 | { |
1738 | int batch; | 1738 | int batch; |
1739 | 1739 | ||
1740 | /* | 1740 | /* |
1741 | * The per-cpu-pages pools are set to around 1000th of the | 1741 | * The per-cpu-pages pools are set to around 1000th of the |
1742 | * size of the zone. But no more than 1/2 of a meg. | 1742 | * size of the zone. But no more than 1/2 of a meg. |
1743 | * | 1743 | * |
1744 | * OK, so we don't know how big the cache is. So guess. | 1744 | * OK, so we don't know how big the cache is. So guess. |
1745 | */ | 1745 | */ |
1746 | batch = zone->present_pages / 1024; | 1746 | batch = zone->present_pages / 1024; |
1747 | if (batch * PAGE_SIZE > 512 * 1024) | 1747 | if (batch * PAGE_SIZE > 512 * 1024) |
1748 | batch = (512 * 1024) / PAGE_SIZE; | 1748 | batch = (512 * 1024) / PAGE_SIZE; |
1749 | batch /= 4; /* We effectively *= 4 below */ | 1749 | batch /= 4; /* We effectively *= 4 below */ |
1750 | if (batch < 1) | 1750 | if (batch < 1) |
1751 | batch = 1; | 1751 | batch = 1; |
1752 | 1752 | ||
1753 | /* | 1753 | /* |
1754 | * We will be trying to allocate bigger chunks of contiguous | 1754 | * We will be trying to allocate bigger chunks of contiguous |
1755 | * memory of the order of fls(batch). This should result in | 1755 | * memory of the order of fls(batch). This should result in |
1756 | * better cache coloring. | 1756 | * better cache coloring. |
1757 | * | 1757 | * |
1758 | * Also a sanity check to ensure that batch stays within limits. | 1758 | * Also a sanity check to ensure that batch stays within limits. |
1759 | */ | 1759 | */ |
1760 | batch = (1 << fls(batch + batch/2)); | 1760 | batch = (1 << fls(batch + batch/2)); |
1761 | 1761 | ||
1762 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) | 1762 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) |
1763 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); | 1763 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); |
1764 | 1764 | ||
1765 | return batch; | 1765 | return batch; |
1766 | } | 1766 | } |
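Worked example of the batch arithmetic for a ~896 MB zone of 4 KiB pages (the page size and zone size are assumptions for illustration):

    #include <stdio.h>

    static int fls_model(unsigned int x)    /* highest set bit, 1-based */
    {
            int r = 0;

            while (x) {
                    r++;
                    x >>= 1;
            }
            return r;
    }

    int main(void)
    {
            unsigned long present_pages = 229376;       /* 896 MB / 4 KiB */
            int batch = present_pages / 1024;           /* 224 */

            if (batch * 4096 > 512 * 1024)
                    batch = (512 * 1024) / 4096;        /* cap: 128 */
            batch /= 4;                                 /* 32 */
            if (batch < 1)
                    batch = 1;
            batch = 1 << fls_model(batch + batch / 2);  /* fls(48) = 6 */
            printf("batch = %d\n", batch);              /* 64 */
            return 0;
    }

(The final MAX_ORDER clamp in zone_batchsize() is a no-op for these numbers and is omitted from the model.)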
1767 | 1767 | ||
1768 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 1768 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
1769 | { | 1769 | { |
1770 | struct per_cpu_pages *pcp; | 1770 | struct per_cpu_pages *pcp; |
1771 | 1771 | ||
1772 | memset(p, 0, sizeof(*p)); | 1772 | memset(p, 0, sizeof(*p)); |
1773 | 1773 | ||
1774 | pcp = &p->pcp[0]; /* hot */ | 1774 | pcp = &p->pcp[0]; /* hot */ |
1775 | pcp->count = 0; | 1775 | pcp->count = 0; |
1776 | pcp->low = 0; | 1776 | pcp->low = 0; |
1777 | pcp->high = 6 * batch; | 1777 | pcp->high = 6 * batch; |
1778 | pcp->batch = max(1UL, 1 * batch); | 1778 | pcp->batch = max(1UL, 1 * batch); |
1779 | INIT_LIST_HEAD(&pcp->list); | 1779 | INIT_LIST_HEAD(&pcp->list); |
1780 | 1780 | ||
1781 | pcp = &p->pcp[1]; /* cold*/ | 1781 | pcp = &p->pcp[1]; /* cold*/ |
1782 | pcp->count = 0; | 1782 | pcp->count = 0; |
1783 | pcp->low = 0; | 1783 | pcp->low = 0; |
1784 | pcp->high = 2 * batch; | 1784 | pcp->high = 2 * batch; |
1785 | pcp->batch = max(1UL, batch/2); | 1785 | pcp->batch = max(1UL, batch/2); |
1786 | INIT_LIST_HEAD(&pcp->list); | 1786 | INIT_LIST_HEAD(&pcp->list); |
1787 | } | 1787 | } |
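With batch == 64 from the zone_batchsize() example above, the hot list may grow to 6 * 64 = 384 pages before draining and refills 64 at a time, while the cold list caps at 2 * 64 = 128 and moves 32 pages per batch.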
1788 | 1788 | ||
1789 | #ifdef CONFIG_NUMA | 1789 | #ifdef CONFIG_NUMA |
1790 | /* | 1790 | /* |
1791 | * Boot pageset table. One per cpu which is going to be used for all | 1791 | * Boot pageset table. One per cpu which is going to be used for all |
1792 | * zones and all nodes. The parameters will be set in such a way | 1792 | * zones and all nodes. The parameters will be set in such a way |
1793 | * that an item put on a list will immediately be handed over to | 1793 | * that an item put on a list will immediately be handed over to |
1794 | * the buddy list. This is safe since pageset manipulation is done | 1794 | * the buddy list. This is safe since pageset manipulation is done |
1795 | * with interrupts disabled. | 1795 | * with interrupts disabled. |
1796 | * | 1796 | * |
1797 | * Some NUMA counter updates may also be caught by the boot pagesets. | 1797 | * Some NUMA counter updates may also be caught by the boot pagesets. |
1798 | * | 1798 | * |
1799 | * The boot_pagesets must be kept even after bootup is complete for | 1799 | * The boot_pagesets must be kept even after bootup is complete for |
1800 | * unused processors and/or zones. They do play a role for bootstrapping | 1800 | * unused processors and/or zones. They do play a role for bootstrapping |
1801 | * hotplugged processors. | 1801 | * hotplugged processors. |
1802 | * | 1802 | * |
1803 | * zoneinfo_show() and maybe other functions do | 1803 | * zoneinfo_show() and maybe other functions do |
1804 | * not check if the processor is online before following the pageset pointer. | 1804 | * not check if the processor is online before following the pageset pointer. |
1805 | * Other parts of the kernel may not check if the zone is available. | 1805 | * Other parts of the kernel may not check if the zone is available. |
1806 | */ | 1806 | */ |
1807 | static struct per_cpu_pageset | 1807 | static struct per_cpu_pageset |
1808 | boot_pageset[NR_CPUS]; | 1808 | boot_pageset[NR_CPUS]; |
1809 | 1809 | ||
1810 | /* | 1810 | /* |
1811 | * Dynamically allocate memory for the | 1811 | * Dynamically allocate memory for the |
1812 | * per cpu pageset array in struct zone. | 1812 | * per cpu pageset array in struct zone. |
1813 | */ | 1813 | */ |
1814 | static int __devinit process_zones(int cpu) | 1814 | static int __devinit process_zones(int cpu) |
1815 | { | 1815 | { |
1816 | struct zone *zone, *dzone; | 1816 | struct zone *zone, *dzone; |
1817 | 1817 | ||
1818 | for_each_zone(zone) { | 1818 | for_each_zone(zone) { |
1819 | 1819 | ||
1820 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | 1820 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), |
1821 | GFP_KERNEL, cpu_to_node(cpu)); | 1821 | GFP_KERNEL, cpu_to_node(cpu)); |
1822 | if (!zone->pageset[cpu]) | 1822 | if (!zone->pageset[cpu]) |
1823 | goto bad; | 1823 | goto bad; |
1824 | 1824 | ||
1825 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1825 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); |
1826 | } | 1826 | } |
1827 | 1827 | ||
1828 | return 0; | 1828 | return 0; |
1829 | bad: | 1829 | bad: |
1830 | for_each_zone(dzone) { | 1830 | for_each_zone(dzone) { |
1831 | if (dzone == zone) | 1831 | if (dzone == zone) |
1832 | break; | 1832 | break; |
1833 | kfree(dzone->pageset[cpu]); | 1833 | kfree(dzone->pageset[cpu]); |
1834 | dzone->pageset[cpu] = NULL; | 1834 | dzone->pageset[cpu] = NULL; |
1835 | } | 1835 | } |
1836 | return -ENOMEM; | 1836 | return -ENOMEM; |
1837 | } | 1837 | } |
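On failure, process_zones() unwinds only the zones populated before the failing one, walking the same zone order until it meets the zone that failed. The same pattern, as a generic stand-alone sketch:

    #include <stdlib.h>
    #include <errno.h>

    #define N 8

    static void *items[N];

    static int alloc_all(void)
    {
            int i;

            for (i = 0; i < N; i++) {
                    items[i] = malloc(64);
                    if (!items[i])
                            goto bad;
            }
            return 0;
    bad:
            while (--i >= 0) {      /* free only what succeeded */
                    free(items[i]);
                    items[i] = NULL;
            }
            return -ENOMEM;
    }

    int main(void)
    {
            return alloc_all() ? 1 : 0;
    }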
1838 | 1838 | ||
1839 | static inline void free_zone_pagesets(int cpu) | 1839 | static inline void free_zone_pagesets(int cpu) |
1840 | { | 1840 | { |
1841 | #ifdef CONFIG_NUMA | 1841 | #ifdef CONFIG_NUMA |
1842 | struct zone *zone; | 1842 | struct zone *zone; |
1843 | 1843 | ||
1844 | for_each_zone(zone) { | 1844 | for_each_zone(zone) { |
1845 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 1845 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); |
1846 | 1846 | ||
1847 | zone_pcp(zone, cpu) = NULL; | 1847 | zone_pcp(zone, cpu) = NULL; |
1848 | kfree(pset); | 1848 | kfree(pset); |
1849 | } | 1849 | } |
1850 | #endif | 1850 | #endif |
1851 | } | 1851 | } |
1852 | 1852 | ||
1853 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 1853 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, |
1854 | unsigned long action, | 1854 | unsigned long action, |
1855 | void *hcpu) | 1855 | void *hcpu) |
1856 | { | 1856 | { |
1857 | int cpu = (long)hcpu; | 1857 | int cpu = (long)hcpu; |
1858 | int ret = NOTIFY_OK; | 1858 | int ret = NOTIFY_OK; |
1859 | 1859 | ||
1860 | switch (action) { | 1860 | switch (action) { |
1861 | case CPU_UP_PREPARE: | 1861 | case CPU_UP_PREPARE: |
1862 | if (process_zones(cpu)) | 1862 | if (process_zones(cpu)) |
1863 | ret = NOTIFY_BAD; | 1863 | ret = NOTIFY_BAD; |
1864 | break; | 1864 | break; |
1865 | #ifdef CONFIG_HOTPLUG_CPU | 1865 | #ifdef CONFIG_HOTPLUG_CPU |
1866 | case CPU_DEAD: | 1866 | case CPU_DEAD: |
1867 | free_zone_pagesets(cpu); | 1867 | free_zone_pagesets(cpu); |
1868 | break; | 1868 | break; |
1869 | #endif | 1869 | #endif |
1870 | default: | 1870 | default: |
1871 | break; | 1871 | break; |
1872 | } | 1872 | } |
1873 | return ret; | 1873 | return ret; |
1874 | } | 1874 | } |
1875 | 1875 | ||
1876 | static struct notifier_block pageset_notifier = | 1876 | static struct notifier_block pageset_notifier = |
1877 | { .notifier_call = pageset_cpuup_callback }; | 1877 | { .notifier_call = pageset_cpuup_callback }; |
1878 | 1878 | ||
1879 | void __init setup_per_cpu_pageset(void) | 1879 | void __init setup_per_cpu_pageset(void) |
1880 | { | 1880 | { |
1881 | int err; | 1881 | int err; |
1882 | 1882 | ||
1883 | /* Initialize per_cpu_pageset for cpu 0. | 1883 | /* Initialize per_cpu_pageset for cpu 0. |
1884 | * A cpuup callback will do this for every cpu | 1884 | * A cpuup callback will do this for every cpu |
1885 | * as it comes online | 1885 | * as it comes online |
1886 | */ | 1886 | */ |
1887 | err = process_zones(smp_processor_id()); | 1887 | err = process_zones(smp_processor_id()); |
1888 | BUG_ON(err); | 1888 | BUG_ON(err); |
1889 | register_cpu_notifier(&pageset_notifier); | 1889 | register_cpu_notifier(&pageset_notifier); |
1890 | } | 1890 | } |
1891 | 1891 | ||
1892 | #endif | 1892 | #endif |
1893 | 1893 | ||
1894 | static __devinit | 1894 | static __devinit |
1895 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 1895 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
1896 | { | 1896 | { |
1897 | int i; | 1897 | int i; |
1898 | struct pglist_data *pgdat = zone->zone_pgdat; | 1898 | struct pglist_data *pgdat = zone->zone_pgdat; |
1899 | 1899 | ||
1900 | /* | 1900 | /* |
1901 | * The per-page waitqueue mechanism uses hashed waitqueues | 1901 | * The per-page waitqueue mechanism uses hashed waitqueues |
1902 | * per zone. | 1902 | * per zone. |
1903 | */ | 1903 | */ |
1904 | zone->wait_table_size = wait_table_size(zone_size_pages); | 1904 | zone->wait_table_size = wait_table_size(zone_size_pages); |
1905 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | 1905 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); |
1906 | zone->wait_table = (wait_queue_head_t *) | 1906 | zone->wait_table = (wait_queue_head_t *) |
1907 | alloc_bootmem_node(pgdat, zone->wait_table_size | 1907 | alloc_bootmem_node(pgdat, zone->wait_table_size |
1908 | * sizeof(wait_queue_head_t)); | 1908 | * sizeof(wait_queue_head_t)); |
1909 | 1909 | ||
1910 | for (i = 0; i < zone->wait_table_size; ++i) | 1910 | for (i = 0; i < zone->wait_table_size; ++i) |
1911 | init_waitqueue_head(zone->wait_table + i); | 1911 | init_waitqueue_head(zone->wait_table + i); |
1912 | } | 1912 | } |
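wait_table_size() and wait_table_bits() are defined outside this hunk; the idea is one hashed waitqueue per chunk of pages, with the table size a power of two so a page hashes to a queue with shift-and-mask. A standalone sketch of that sizing (the 256-pages-per-queue granularity and the 4096-entry clamp are illustrative assumptions, not necessarily the kernel's exact constants):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256UL	/* assumed granularity */

/* Smallest power of two >= pages/PAGES_PER_WAITQUEUE, clamped. */
static unsigned long model_wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096)
		size = 4096;	/* assumed upper clamp */
	return size;
}

/* log2 of the power-of-two table size: the number of hash bits. */
static unsigned long model_wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while ((1UL << bits) < size)
		bits++;
	return bits;
}

int main(void)
{
	unsigned long pages = 131072;	/* a 512MB zone of 4K pages */
	unsigned long size = model_wait_table_size(pages);

	printf("%lu pages -> %lu waitqueues (%lu hash bits)\n",
	       pages, size, model_wait_table_bits(size));
	return 0;
}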
1913 | 1913 | ||
1914 | static __devinit void zone_pcp_init(struct zone *zone) | 1914 | static __devinit void zone_pcp_init(struct zone *zone) |
1915 | { | 1915 | { |
1916 | int cpu; | 1916 | int cpu; |
1917 | unsigned long batch = zone_batchsize(zone); | 1917 | unsigned long batch = zone_batchsize(zone); |
1918 | 1918 | ||
1919 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1919 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1920 | #ifdef CONFIG_NUMA | 1920 | #ifdef CONFIG_NUMA |
1921 | /* Early boot. Slab allocator not functional yet */ | 1921 | /* Early boot. Slab allocator not functional yet */ |
1922 | zone->pageset[cpu] = &boot_pageset[cpu]; | 1922 | zone->pageset[cpu] = &boot_pageset[cpu]; |
1923 | setup_pageset(&boot_pageset[cpu], 0); | 1923 | setup_pageset(&boot_pageset[cpu], 0); |
1924 | #else | 1924 | #else |
1925 | setup_pageset(zone_pcp(zone, cpu), batch); | 1925 | setup_pageset(zone_pcp(zone, cpu), batch); |
1926 | #endif | 1926 | #endif |
1927 | } | 1927 | } |
1928 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 1928 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
1929 | zone->name, zone->present_pages, batch); | 1929 | zone->name, zone->present_pages, batch); |
1930 | } | 1930 | } |
1931 | 1931 | ||
1932 | static __devinit void init_currently_empty_zone(struct zone *zone, | 1932 | static __devinit void init_currently_empty_zone(struct zone *zone, |
1933 | unsigned long zone_start_pfn, unsigned long size) | 1933 | unsigned long zone_start_pfn, unsigned long size) |
1934 | { | 1934 | { |
1935 | struct pglist_data *pgdat = zone->zone_pgdat; | 1935 | struct pglist_data *pgdat = zone->zone_pgdat; |
1936 | 1936 | ||
1937 | zone_wait_table_init(zone, size); | 1937 | zone_wait_table_init(zone, size); |
1938 | pgdat->nr_zones = zone_idx(zone) + 1; | 1938 | pgdat->nr_zones = zone_idx(zone) + 1; |
1939 | 1939 | ||
1940 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | 1940 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); |
1941 | zone->zone_start_pfn = zone_start_pfn; | 1941 | zone->zone_start_pfn = zone_start_pfn; |
1942 | 1942 | ||
1943 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 1943 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
1944 | 1944 | ||
1945 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 1945 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); |
1946 | } | 1946 | } |
1947 | 1947 | ||
1948 | /* | 1948 | /* |
1949 | * Set up the zone data structures: | 1949 | * Set up the zone data structures: |
1950 | * - mark all pages reserved | 1950 | * - mark all pages reserved |
1951 | * - mark all memory queues empty | 1951 | * - mark all memory queues empty |
1952 | * - clear the memory bitmaps | 1952 | * - clear the memory bitmaps |
1953 | */ | 1953 | */ |
1954 | static void __init free_area_init_core(struct pglist_data *pgdat, | 1954 | static void __init free_area_init_core(struct pglist_data *pgdat, |
1955 | unsigned long *zones_size, unsigned long *zholes_size) | 1955 | unsigned long *zones_size, unsigned long *zholes_size) |
1956 | { | 1956 | { |
1957 | unsigned long j; | 1957 | unsigned long j; |
1958 | int nid = pgdat->node_id; | 1958 | int nid = pgdat->node_id; |
1959 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 1959 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
1960 | 1960 | ||
1961 | pgdat_resize_init(pgdat); | ||
1961 | pgdat->nr_zones = 0; | 1962 | pgdat->nr_zones = 0; |
1962 | init_waitqueue_head(&pgdat->kswapd_wait); | 1963 | init_waitqueue_head(&pgdat->kswapd_wait); |
1963 | pgdat->kswapd_max_order = 0; | 1964 | pgdat->kswapd_max_order = 0; |
1964 | 1965 | ||
1965 | for (j = 0; j < MAX_NR_ZONES; j++) { | 1966 | for (j = 0; j < MAX_NR_ZONES; j++) { |
1966 | struct zone *zone = pgdat->node_zones + j; | 1967 | struct zone *zone = pgdat->node_zones + j; |
1967 | unsigned long size, realsize; | 1968 | unsigned long size, realsize; |
1968 | 1969 | ||
1969 | realsize = size = zones_size[j]; | 1970 | realsize = size = zones_size[j]; |
1970 | if (zholes_size) | 1971 | if (zholes_size) |
1971 | realsize -= zholes_size[j]; | 1972 | realsize -= zholes_size[j]; |
1972 | 1973 | ||
1973 | if (j == ZONE_DMA || j == ZONE_NORMAL) | 1974 | if (j == ZONE_DMA || j == ZONE_NORMAL) |
1974 | nr_kernel_pages += realsize; | 1975 | nr_kernel_pages += realsize; |
1975 | nr_all_pages += realsize; | 1976 | nr_all_pages += realsize; |
1976 | 1977 | ||
1977 | zone->spanned_pages = size; | 1978 | zone->spanned_pages = size; |
1978 | zone->present_pages = realsize; | 1979 | zone->present_pages = realsize; |
1979 | zone->name = zone_names[j]; | 1980 | zone->name = zone_names[j]; |
1980 | spin_lock_init(&zone->lock); | 1981 | spin_lock_init(&zone->lock); |
1981 | spin_lock_init(&zone->lru_lock); | 1982 | spin_lock_init(&zone->lru_lock); |
1982 | zone->zone_pgdat = pgdat; | 1983 | zone->zone_pgdat = pgdat; |
1983 | zone->free_pages = 0; | 1984 | zone->free_pages = 0; |
1984 | 1985 | ||
1985 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 1986 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; |
1986 | 1987 | ||
1987 | zone_pcp_init(zone); | 1988 | zone_pcp_init(zone); |
1988 | INIT_LIST_HEAD(&zone->active_list); | 1989 | INIT_LIST_HEAD(&zone->active_list); |
1989 | INIT_LIST_HEAD(&zone->inactive_list); | 1990 | INIT_LIST_HEAD(&zone->inactive_list); |
1990 | zone->nr_scan_active = 0; | 1991 | zone->nr_scan_active = 0; |
1991 | zone->nr_scan_inactive = 0; | 1992 | zone->nr_scan_inactive = 0; |
1992 | zone->nr_active = 0; | 1993 | zone->nr_active = 0; |
1993 | zone->nr_inactive = 0; | 1994 | zone->nr_inactive = 0; |
1994 | atomic_set(&zone->reclaim_in_progress, 0); | 1995 | atomic_set(&zone->reclaim_in_progress, 0); |
1995 | if (!size) | 1996 | if (!size) |
1996 | continue; | 1997 | continue; |
1997 | 1998 | ||
1998 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 1999 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
1999 | init_currently_empty_zone(zone, zone_start_pfn, size); | 2000 | init_currently_empty_zone(zone, zone_start_pfn, size); |
2000 | zone_start_pfn += size; | 2001 | zone_start_pfn += size; |
2001 | } | 2002 | } |
2002 | } | 2003 | } |
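The spanned/present split in the loop above is just "pfns the zone covers" minus "pfns that are holes". A worked example with hypothetical per-zone sizes:

#include <stdio.h>

int main(void)
{
	/* Hypothetical node: zones_size[] counts every pfn a zone spans,
	 * zholes_size[] counts spanned pfns that back no real memory. */
	unsigned long zones_size[]  = { 4096, 126976, 0 };	/* DMA, NORMAL, HIGHMEM */
	unsigned long zholes_size[] = {    0,   2048, 0 };
	int j;

	for (j = 0; j < 3; j++) {
		unsigned long spanned = zones_size[j];
		unsigned long present = spanned - zholes_size[j];

		printf("zone %d: spanned=%lu present=%lu\n",
		       j, spanned, present);
	}
	return 0;
}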
2003 | 2004 | ||
2004 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) | 2005 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) |
2005 | { | 2006 | { |
2006 | /* Skip empty nodes */ | 2007 | /* Skip empty nodes */ |
2007 | if (!pgdat->node_spanned_pages) | 2008 | if (!pgdat->node_spanned_pages) |
2008 | return; | 2009 | return; |
2009 | 2010 | ||
2010 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 2011 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
2011 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 2012 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
2012 | if (!pgdat->node_mem_map) { | 2013 | if (!pgdat->node_mem_map) { |
2013 | unsigned long size; | 2014 | unsigned long size; |
2014 | struct page *map; | 2015 | struct page *map; |
2015 | 2016 | ||
2016 | size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); | 2017 | size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); |
2017 | map = alloc_remap(pgdat->node_id, size); | 2018 | map = alloc_remap(pgdat->node_id, size); |
2018 | if (!map) | 2019 | if (!map) |
2019 | map = alloc_bootmem_node(pgdat, size); | 2020 | map = alloc_bootmem_node(pgdat, size); |
2020 | pgdat->node_mem_map = map; | 2021 | pgdat->node_mem_map = map; |
2021 | } | 2022 | } |
2022 | #ifdef CONFIG_FLATMEM | 2023 | #ifdef CONFIG_FLATMEM |
2023 | /* | 2024 | /* |
2024 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2025 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2025 | */ | 2026 | */ |
2026 | if (pgdat == NODE_DATA(0)) | 2027 | if (pgdat == NODE_DATA(0)) |
2027 | mem_map = NODE_DATA(0)->node_mem_map; | 2028 | mem_map = NODE_DATA(0)->node_mem_map; |
2028 | #endif | 2029 | #endif |
2029 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2030 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2030 | } | 2031 | } |
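alloc_node_mem_map() sizes the array as (node_spanned_pages + 1) struct pages. A back-of-the-envelope check for a hypothetical node (the sizeof(struct page) value here is an assumption; it varies with config):

#include <stdio.h>

int main(void)
{
	unsigned long spanned_pages = 262144;	/* hypothetical 1GB node, 4K pages */
	unsigned long sizeof_page = 40;		/* assumed sizeof(struct page) */
	unsigned long bytes = (spanned_pages + 1) * sizeof_page;

	/* Roughly 1% of the node's memory goes to its mem_map here. */
	printf("mem_map: %lu bytes (~%lu KB)\n", bytes, bytes >> 10);
	return 0;
}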
2031 | 2032 | ||
2032 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, | 2033 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, |
2033 | unsigned long *zones_size, unsigned long node_start_pfn, | 2034 | unsigned long *zones_size, unsigned long node_start_pfn, |
2034 | unsigned long *zholes_size) | 2035 | unsigned long *zholes_size) |
2035 | { | 2036 | { |
2036 | pgdat->node_id = nid; | 2037 | pgdat->node_id = nid; |
2037 | pgdat->node_start_pfn = node_start_pfn; | 2038 | pgdat->node_start_pfn = node_start_pfn; |
2038 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2039 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); |
2039 | 2040 | ||
2040 | alloc_node_mem_map(pgdat); | 2041 | alloc_node_mem_map(pgdat); |
2041 | 2042 | ||
2042 | free_area_init_core(pgdat, zones_size, zholes_size); | 2043 | free_area_init_core(pgdat, zones_size, zholes_size); |
2043 | } | 2044 | } |
2044 | 2045 | ||
2045 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2046 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2046 | static bootmem_data_t contig_bootmem_data; | 2047 | static bootmem_data_t contig_bootmem_data; |
2047 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2048 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
2048 | 2049 | ||
2049 | EXPORT_SYMBOL(contig_page_data); | 2050 | EXPORT_SYMBOL(contig_page_data); |
2050 | #endif | 2051 | #endif |
2051 | 2052 | ||
2052 | void __init free_area_init(unsigned long *zones_size) | 2053 | void __init free_area_init(unsigned long *zones_size) |
2053 | { | 2054 | { |
2054 | free_area_init_node(0, NODE_DATA(0), zones_size, | 2055 | free_area_init_node(0, NODE_DATA(0), zones_size, |
2055 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2056 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2056 | } | 2057 | } |
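For reference, a flat-memory architecture's setup code calls this with a zones_size[] array computed from the firmware memory map. Schematically (not standalone: max_low_pfn and the zone indices come from the arch and mm headers, and the sizes are hypothetical):

	unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };

	zones_size[ZONE_DMA] = 4096;			/* pfns below 16MB */
	zones_size[ZONE_NORMAL] = max_low_pfn - 4096;	/* rest of lowmem */
	free_area_init(zones_size);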
2057 | 2058 | ||
2058 | #ifdef CONFIG_PROC_FS | 2059 | #ifdef CONFIG_PROC_FS |
2059 | 2060 | ||
2060 | #include <linux/seq_file.h> | 2061 | #include <linux/seq_file.h> |
2061 | 2062 | ||
2062 | static void *frag_start(struct seq_file *m, loff_t *pos) | 2063 | static void *frag_start(struct seq_file *m, loff_t *pos) |
2063 | { | 2064 | { |
2064 | pg_data_t *pgdat; | 2065 | pg_data_t *pgdat; |
2065 | loff_t node = *pos; | 2066 | loff_t node = *pos; |
2066 | 2067 | ||
2067 | for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) | 2068 | for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) |
2068 | --node; | 2069 | --node; |
2069 | 2070 | ||
2070 | return pgdat; | 2071 | return pgdat; |
2071 | } | 2072 | } |
2072 | 2073 | ||
2073 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | 2074 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) |
2074 | { | 2075 | { |
2075 | pg_data_t *pgdat = (pg_data_t *)arg; | 2076 | pg_data_t *pgdat = (pg_data_t *)arg; |
2076 | 2077 | ||
2077 | (*pos)++; | 2078 | (*pos)++; |
2078 | return pgdat->pgdat_next; | 2079 | return pgdat->pgdat_next; |
2079 | } | 2080 | } |
2080 | 2081 | ||
2081 | static void frag_stop(struct seq_file *m, void *arg) | 2082 | static void frag_stop(struct seq_file *m, void *arg) |
2082 | { | 2083 | { |
2083 | } | 2084 | } |
2084 | 2085 | ||
2085 | /* | 2086 | /* |
2086 | * This walks the free areas for each zone. | 2087 | * This walks the free areas for each zone. |
2087 | */ | 2088 | */ |
2088 | static int frag_show(struct seq_file *m, void *arg) | 2089 | static int frag_show(struct seq_file *m, void *arg) |
2089 | { | 2090 | { |
2090 | pg_data_t *pgdat = (pg_data_t *)arg; | 2091 | pg_data_t *pgdat = (pg_data_t *)arg; |
2091 | struct zone *zone; | 2092 | struct zone *zone; |
2092 | struct zone *node_zones = pgdat->node_zones; | 2093 | struct zone *node_zones = pgdat->node_zones; |
2093 | unsigned long flags; | 2094 | unsigned long flags; |
2094 | int order; | 2095 | int order; |
2095 | 2096 | ||
2096 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2097 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
2097 | if (!zone->present_pages) | 2098 | if (!zone->present_pages) |
2098 | continue; | 2099 | continue; |
2099 | 2100 | ||
2100 | spin_lock_irqsave(&zone->lock, flags); | 2101 | spin_lock_irqsave(&zone->lock, flags); |
2101 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | 2102 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); |
2102 | for (order = 0; order < MAX_ORDER; ++order) | 2103 | for (order = 0; order < MAX_ORDER; ++order) |
2103 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | 2104 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); |
2104 | spin_unlock_irqrestore(&zone->lock, flags); | 2105 | spin_unlock_irqrestore(&zone->lock, flags); |
2105 | seq_putc(m, '\n'); | 2106 | seq_putc(m, '\n'); |
2106 | } | 2107 | } |
2107 | return 0; | 2108 | return 0; |
2108 | } | 2109 | } |
2109 | 2110 | ||
2110 | struct seq_operations fragmentation_op = { | 2111 | struct seq_operations fragmentation_op = { |
2111 | .start = frag_start, | 2112 | .start = frag_start, |
2112 | .next = frag_next, | 2113 | .next = frag_next, |
2113 | .stop = frag_stop, | 2114 | .stop = frag_stop, |
2114 | .show = frag_show, | 2115 | .show = frag_show, |
2115 | }; | 2116 | }; |
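frag_show() is what backs /proc/buddyinfo: per node/zone, one free-block count per order, in the "Node %d, zone %8s" format above. A trivial userspace reader to see it:

#include <stdio.h>

int main(void)
{
	/* Lines look roughly like:
	 *   Node 0, zone   Normal    210  86  45 ...
	 * with one count per order, i.e. free_area[order].nr_free. */
	FILE *f = fopen("/proc/buddyinfo", "r");
	char line[512];

	if (!f) {
		perror("/proc/buddyinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}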
2116 | 2117 | ||
2117 | /* | 2118 | /* |
2118 | * Output information about zones in @pgdat. | 2119 | * Output information about zones in @pgdat. |
2119 | */ | 2120 | */ |
2120 | static int zoneinfo_show(struct seq_file *m, void *arg) | 2121 | static int zoneinfo_show(struct seq_file *m, void *arg) |
2121 | { | 2122 | { |
2122 | pg_data_t *pgdat = arg; | 2123 | pg_data_t *pgdat = arg; |
2123 | struct zone *zone; | 2124 | struct zone *zone; |
2124 | struct zone *node_zones = pgdat->node_zones; | 2125 | struct zone *node_zones = pgdat->node_zones; |
2125 | unsigned long flags; | 2126 | unsigned long flags; |
2126 | 2127 | ||
2127 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 2128 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { |
2128 | int i; | 2129 | int i; |
2129 | 2130 | ||
2130 | if (!zone->present_pages) | 2131 | if (!zone->present_pages) |
2131 | continue; | 2132 | continue; |
2132 | 2133 | ||
2133 | spin_lock_irqsave(&zone->lock, flags); | 2134 | spin_lock_irqsave(&zone->lock, flags); |
2134 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 2135 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
2135 | seq_printf(m, | 2136 | seq_printf(m, |
2136 | "\n pages free %lu" | 2137 | "\n pages free %lu" |
2137 | "\n min %lu" | 2138 | "\n min %lu" |
2138 | "\n low %lu" | 2139 | "\n low %lu" |
2139 | "\n high %lu" | 2140 | "\n high %lu" |
2140 | "\n active %lu" | 2141 | "\n active %lu" |
2141 | "\n inactive %lu" | 2142 | "\n inactive %lu" |
2142 | "\n scanned %lu (a: %lu i: %lu)" | 2143 | "\n scanned %lu (a: %lu i: %lu)" |
2143 | "\n spanned %lu" | 2144 | "\n spanned %lu" |
2144 | "\n present %lu", | 2145 | "\n present %lu", |
2145 | zone->free_pages, | 2146 | zone->free_pages, |
2146 | zone->pages_min, | 2147 | zone->pages_min, |
2147 | zone->pages_low, | 2148 | zone->pages_low, |
2148 | zone->pages_high, | 2149 | zone->pages_high, |
2149 | zone->nr_active, | 2150 | zone->nr_active, |
2150 | zone->nr_inactive, | 2151 | zone->nr_inactive, |
2151 | zone->pages_scanned, | 2152 | zone->pages_scanned, |
2152 | zone->nr_scan_active, zone->nr_scan_inactive, | 2153 | zone->nr_scan_active, zone->nr_scan_inactive, |
2153 | zone->spanned_pages, | 2154 | zone->spanned_pages, |
2154 | zone->present_pages); | 2155 | zone->present_pages); |
2155 | seq_printf(m, | 2156 | seq_printf(m, |
2156 | "\n protection: (%lu", | 2157 | "\n protection: (%lu", |
2157 | zone->lowmem_reserve[0]); | 2158 | zone->lowmem_reserve[0]); |
2158 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 2159 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
2159 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 2160 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); |
2160 | seq_printf(m, | 2161 | seq_printf(m, |
2161 | ")" | 2162 | ")" |
2162 | "\n pagesets"); | 2163 | "\n pagesets"); |
2163 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | 2164 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { |
2164 | struct per_cpu_pageset *pageset; | 2165 | struct per_cpu_pageset *pageset; |
2165 | int j; | 2166 | int j; |
2166 | 2167 | ||
2167 | pageset = zone_pcp(zone, i); | 2168 | pageset = zone_pcp(zone, i); |
2168 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 2169 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
2169 | if (pageset->pcp[j].count) | 2170 | if (pageset->pcp[j].count) |
2170 | break; | 2171 | break; |
2171 | } | 2172 | } |
2172 | if (j == ARRAY_SIZE(pageset->pcp)) | 2173 | if (j == ARRAY_SIZE(pageset->pcp)) |
2173 | continue; | 2174 | continue; |
2174 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 2175 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
2175 | seq_printf(m, | 2176 | seq_printf(m, |
2176 | "\n cpu: %i pcp: %i" | 2177 | "\n cpu: %i pcp: %i" |
2177 | "\n count: %i" | 2178 | "\n count: %i" |
2178 | "\n low: %i" | 2179 | "\n low: %i" |
2179 | "\n high: %i" | 2180 | "\n high: %i" |
2180 | "\n batch: %i", | 2181 | "\n batch: %i", |
2181 | i, j, | 2182 | i, j, |
2182 | pageset->pcp[j].count, | 2183 | pageset->pcp[j].count, |
2183 | pageset->pcp[j].low, | 2184 | pageset->pcp[j].low, |
2184 | pageset->pcp[j].high, | 2185 | pageset->pcp[j].high, |
2185 | pageset->pcp[j].batch); | 2186 | pageset->pcp[j].batch); |
2186 | } | 2187 | } |
2187 | #ifdef CONFIG_NUMA | 2188 | #ifdef CONFIG_NUMA |
2188 | seq_printf(m, | 2189 | seq_printf(m, |
2189 | "\n numa_hit: %lu" | 2190 | "\n numa_hit: %lu" |
2190 | "\n numa_miss: %lu" | 2191 | "\n numa_miss: %lu" |
2191 | "\n numa_foreign: %lu" | 2192 | "\n numa_foreign: %lu" |
2192 | "\n interleave_hit: %lu" | 2193 | "\n interleave_hit: %lu" |
2193 | "\n local_node: %lu" | 2194 | "\n local_node: %lu" |
2194 | "\n other_node: %lu", | 2195 | "\n other_node: %lu", |
2195 | pageset->numa_hit, | 2196 | pageset->numa_hit, |
2196 | pageset->numa_miss, | 2197 | pageset->numa_miss, |
2197 | pageset->numa_foreign, | 2198 | pageset->numa_foreign, |
2198 | pageset->interleave_hit, | 2199 | pageset->interleave_hit, |
2199 | pageset->local_node, | 2200 | pageset->local_node, |
2200 | pageset->other_node); | 2201 | pageset->other_node); |
2201 | #endif | 2202 | #endif |
2202 | } | 2203 | } |
2203 | seq_printf(m, | 2204 | seq_printf(m, |
2204 | "\n all_unreclaimable: %u" | 2205 | "\n all_unreclaimable: %u" |
2205 | "\n prev_priority: %i" | 2206 | "\n prev_priority: %i" |
2206 | "\n temp_priority: %i" | 2207 | "\n temp_priority: %i" |
2207 | "\n start_pfn: %lu", | 2208 | "\n start_pfn: %lu", |
2208 | zone->all_unreclaimable, | 2209 | zone->all_unreclaimable, |
2209 | zone->prev_priority, | 2210 | zone->prev_priority, |
2210 | zone->temp_priority, | 2211 | zone->temp_priority, |
2211 | zone->zone_start_pfn); | 2212 | zone->zone_start_pfn); |
2212 | spin_unlock_irqrestore(&zone->lock, flags); | 2213 | spin_unlock_irqrestore(&zone->lock, flags); |
2213 | seq_putc(m, '\n'); | 2214 | seq_putc(m, '\n'); |
2214 | } | 2215 | } |
2215 | return 0; | 2216 | return 0; |
2216 | } | 2217 | } |
2217 | 2218 | ||
2218 | struct seq_operations zoneinfo_op = { | 2219 | struct seq_operations zoneinfo_op = { |
2219 | .start = frag_start, /* iterate over all zones. The same as in | 2220 | .start = frag_start, /* iterate over all zones. The same as in |
2220 | * fragmentation. */ | 2221 | * fragmentation. */ |
2221 | .next = frag_next, | 2222 | .next = frag_next, |
2222 | .stop = frag_stop, | 2223 | .stop = frag_stop, |
2223 | .show = zoneinfo_show, | 2224 | .show = zoneinfo_show, |
2224 | }; | 2225 | }; |
2225 | 2226 | ||
2226 | static char *vmstat_text[] = { | 2227 | static char *vmstat_text[] = { |
2227 | "nr_dirty", | 2228 | "nr_dirty", |
2228 | "nr_writeback", | 2229 | "nr_writeback", |
2229 | "nr_unstable", | 2230 | "nr_unstable", |
2230 | "nr_page_table_pages", | 2231 | "nr_page_table_pages", |
2231 | "nr_mapped", | 2232 | "nr_mapped", |
2232 | "nr_slab", | 2233 | "nr_slab", |
2233 | 2234 | ||
2234 | "pgpgin", | 2235 | "pgpgin", |
2235 | "pgpgout", | 2236 | "pgpgout", |
2236 | "pswpin", | 2237 | "pswpin", |
2237 | "pswpout", | 2238 | "pswpout", |
2238 | "pgalloc_high", | 2239 | "pgalloc_high", |
2239 | 2240 | ||
2240 | "pgalloc_normal", | 2241 | "pgalloc_normal", |
2241 | "pgalloc_dma", | 2242 | "pgalloc_dma", |
2242 | "pgfree", | 2243 | "pgfree", |
2243 | "pgactivate", | 2244 | "pgactivate", |
2244 | "pgdeactivate", | 2245 | "pgdeactivate", |
2245 | 2246 | ||
2246 | "pgfault", | 2247 | "pgfault", |
2247 | "pgmajfault", | 2248 | "pgmajfault", |
2248 | "pgrefill_high", | 2249 | "pgrefill_high", |
2249 | "pgrefill_normal", | 2250 | "pgrefill_normal", |
2250 | "pgrefill_dma", | 2251 | "pgrefill_dma", |
2251 | 2252 | ||
2252 | "pgsteal_high", | 2253 | "pgsteal_high", |
2253 | "pgsteal_normal", | 2254 | "pgsteal_normal", |
2254 | "pgsteal_dma", | 2255 | "pgsteal_dma", |
2255 | "pgscan_kswapd_high", | 2256 | "pgscan_kswapd_high", |
2256 | "pgscan_kswapd_normal", | 2257 | "pgscan_kswapd_normal", |
2257 | 2258 | ||
2258 | "pgscan_kswapd_dma", | 2259 | "pgscan_kswapd_dma", |
2259 | "pgscan_direct_high", | 2260 | "pgscan_direct_high", |
2260 | "pgscan_direct_normal", | 2261 | "pgscan_direct_normal", |
2261 | "pgscan_direct_dma", | 2262 | "pgscan_direct_dma", |
2262 | "pginodesteal", | 2263 | "pginodesteal", |
2263 | 2264 | ||
2264 | "slabs_scanned", | 2265 | "slabs_scanned", |
2265 | "kswapd_steal", | 2266 | "kswapd_steal", |
2266 | "kswapd_inodesteal", | 2267 | "kswapd_inodesteal", |
2267 | "pageoutrun", | 2268 | "pageoutrun", |
2268 | "allocstall", | 2269 | "allocstall", |
2269 | 2270 | ||
2270 | "pgrotated", | 2271 | "pgrotated", |
2271 | "nr_bounce", | 2272 | "nr_bounce", |
2272 | }; | 2273 | }; |
2273 | 2274 | ||
2274 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 2275 | static void *vmstat_start(struct seq_file *m, loff_t *pos) |
2275 | { | 2276 | { |
2276 | struct page_state *ps; | 2277 | struct page_state *ps; |
2277 | 2278 | ||
2278 | if (*pos >= ARRAY_SIZE(vmstat_text)) | 2279 | if (*pos >= ARRAY_SIZE(vmstat_text)) |
2279 | return NULL; | 2280 | return NULL; |
2280 | 2281 | ||
2281 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | 2282 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); |
2282 | m->private = ps; | 2283 | m->private = ps; |
2283 | if (!ps) | 2284 | if (!ps) |
2284 | return ERR_PTR(-ENOMEM); | 2285 | return ERR_PTR(-ENOMEM); |
2285 | get_full_page_state(ps); | 2286 | get_full_page_state(ps); |
2286 | ps->pgpgin /= 2; /* sectors -> kbytes */ | 2287 | ps->pgpgin /= 2; /* sectors -> kbytes */ |
2287 | ps->pgpgout /= 2; | 2288 | ps->pgpgout /= 2; |
2288 | return (unsigned long *)ps + *pos; | 2289 | return (unsigned long *)ps + *pos; |
2289 | } | 2290 | } |
2290 | 2291 | ||
2291 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | 2292 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) |
2292 | { | 2293 | { |
2293 | (*pos)++; | 2294 | (*pos)++; |
2294 | if (*pos >= ARRAY_SIZE(vmstat_text)) | 2295 | if (*pos >= ARRAY_SIZE(vmstat_text)) |
2295 | return NULL; | 2296 | return NULL; |
2296 | return (unsigned long *)m->private + *pos; | 2297 | return (unsigned long *)m->private + *pos; |
2297 | } | 2298 | } |
2298 | 2299 | ||
2299 | static int vmstat_show(struct seq_file *m, void *arg) | 2300 | static int vmstat_show(struct seq_file *m, void *arg) |
2300 | { | 2301 | { |
2301 | unsigned long *l = arg; | 2302 | unsigned long *l = arg; |
2302 | unsigned long off = l - (unsigned long *)m->private; | 2303 | unsigned long off = l - (unsigned long *)m->private; |
2303 | 2304 | ||
2304 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | 2305 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); |
2305 | return 0; | 2306 | return 0; |
2306 | } | 2307 | } |
2307 | 2308 | ||
2308 | static void vmstat_stop(struct seq_file *m, void *arg) | 2309 | static void vmstat_stop(struct seq_file *m, void *arg) |
2309 | { | 2310 | { |
2310 | kfree(m->private); | 2311 | kfree(m->private); |
2311 | m->private = NULL; | 2312 | m->private = NULL; |
2312 | } | 2313 | } |
2313 | 2314 | ||
2314 | struct seq_operations vmstat_op = { | 2315 | struct seq_operations vmstat_op = { |
2315 | .start = vmstat_start, | 2316 | .start = vmstat_start, |
2316 | .next = vmstat_next, | 2317 | .next = vmstat_next, |
2317 | .stop = vmstat_stop, | 2318 | .stop = vmstat_stop, |
2318 | .show = vmstat_show, | 2319 | .show = vmstat_show, |
2319 | }; | 2320 | }; |
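vmstat_start()/vmstat_next() above lean on struct page_state being laid out as consecutive unsigned longs, so a single cursor *pos indexes both the counter and its name in vmstat_text[]. The same trick as a standalone model (the counter struct is hypothetical):

#include <stdio.h>

/* Only unsigned longs, in the same order as names[], so that
 * (unsigned long *)&state + i lines up with names[i]. */
struct counters {
	unsigned long nr_dirty;
	unsigned long nr_writeback;
	unsigned long pgfault;
};

static const char *names[] = { "nr_dirty", "nr_writeback", "pgfault" };

int main(void)
{
	struct counters state = { 12, 3, 4711 };
	unsigned long *base = (unsigned long *)&state;
	size_t i;

	for (i = 0; i < sizeof(state) / sizeof(unsigned long); i++)
		printf("%s %lu\n", names[i], base[i]);
	return 0;
}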
2320 | 2321 | ||
2321 | #endif /* CONFIG_PROC_FS */ | 2322 | #endif /* CONFIG_PROC_FS */ |
2322 | 2323 | ||
2323 | #ifdef CONFIG_HOTPLUG_CPU | 2324 | #ifdef CONFIG_HOTPLUG_CPU |
2324 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2325 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2325 | unsigned long action, void *hcpu) | 2326 | unsigned long action, void *hcpu) |
2326 | { | 2327 | { |
2327 | int cpu = (unsigned long)hcpu; | 2328 | int cpu = (unsigned long)hcpu; |
2328 | long *count; | 2329 | long *count; |
2329 | unsigned long *src, *dest; | 2330 | unsigned long *src, *dest; |
2330 | 2331 | ||
2331 | if (action == CPU_DEAD) { | 2332 | if (action == CPU_DEAD) { |
2332 | int i; | 2333 | int i; |
2333 | 2334 | ||
2334 | /* Drain local pagecache count. */ | 2335 | /* Drain local pagecache count. */ |
2335 | count = &per_cpu(nr_pagecache_local, cpu); | 2336 | count = &per_cpu(nr_pagecache_local, cpu); |
2336 | atomic_add(*count, &nr_pagecache); | 2337 | atomic_add(*count, &nr_pagecache); |
2337 | *count = 0; | 2338 | *count = 0; |
2338 | local_irq_disable(); | 2339 | local_irq_disable(); |
2339 | __drain_pages(cpu); | 2340 | __drain_pages(cpu); |
2340 | 2341 | ||
2341 | /* Add dead cpu's page_states to our own. */ | 2342 | /* Add dead cpu's page_states to our own. */ |
2342 | dest = (unsigned long *)&__get_cpu_var(page_states); | 2343 | dest = (unsigned long *)&__get_cpu_var(page_states); |
2343 | src = (unsigned long *)&per_cpu(page_states, cpu); | 2344 | src = (unsigned long *)&per_cpu(page_states, cpu); |
2344 | 2345 | ||
2345 | for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); | 2346 | for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); |
2346 | i++) { | 2347 | i++) { |
2347 | dest[i] += src[i]; | 2348 | dest[i] += src[i]; |
2348 | src[i] = 0; | 2349 | src[i] = 0; |
2349 | } | 2350 | } |
2350 | 2351 | ||
2351 | local_irq_enable(); | 2352 | local_irq_enable(); |
2352 | } | 2353 | } |
2353 | return NOTIFY_OK; | 2354 | return NOTIFY_OK; |
2354 | } | 2355 | } |
2355 | #endif /* CONFIG_HOTPLUG_CPU */ | 2356 | #endif /* CONFIG_HOTPLUG_CPU */ |
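The CPU_DEAD branch above treats the per-cpu page_states as flat arrays of unsigned long and folds the dead CPU's counters into the surviving CPU's. The fold itself, as a standalone sketch:

#include <stdio.h>

#define NR_COUNTERS 4

/* Add src's counters into dest and zero src -- the same loop shape
 * used above for the dead CPU's page_states. */
static void fold_counters(unsigned long *dest, unsigned long *src)
{
	int i;

	for (i = 0; i < NR_COUNTERS; i++) {
		dest[i] += src[i];
		src[i] = 0;
	}
}

int main(void)
{
	unsigned long survivor[NR_COUNTERS] = { 10, 20, 30, 40 };
	unsigned long dead_cpu[NR_COUNTERS] = {  1,  2,  3,  4 };

	fold_counters(survivor, dead_cpu);
	printf("survivor[0]=%lu dead_cpu[0]=%lu\n", survivor[0], dead_cpu[0]);
	return 0;
}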
2356 | 2357 | ||
2357 | void __init page_alloc_init(void) | 2358 | void __init page_alloc_init(void) |
2358 | { | 2359 | { |
2359 | hotcpu_notifier(page_alloc_cpu_notify, 0); | 2360 | hotcpu_notifier(page_alloc_cpu_notify, 0); |
2360 | } | 2361 | } |
2361 | 2362 | ||
2362 | /* | 2363 | /* |
2363 | * setup_per_zone_lowmem_reserve - called whenever | 2364 | * setup_per_zone_lowmem_reserve - called whenever |
2364 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone | 2365 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone |
2365 | * has a correct pages reserved value, so an adequate number of | 2366 | * has a correct pages reserved value, so an adequate number of |
2366 | * pages are left in the zone after a successful __alloc_pages(). | 2367 | * pages are left in the zone after a successful __alloc_pages(). |
2367 | */ | 2368 | */ |
2368 | static void setup_per_zone_lowmem_reserve(void) | 2369 | static void setup_per_zone_lowmem_reserve(void) |
2369 | { | 2370 | { |
2370 | struct pglist_data *pgdat; | 2371 | struct pglist_data *pgdat; |
2371 | int j, idx; | 2372 | int j, idx; |
2372 | 2373 | ||
2373 | for_each_pgdat(pgdat) { | 2374 | for_each_pgdat(pgdat) { |
2374 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2375 | for (j = 0; j < MAX_NR_ZONES; j++) { |
2375 | struct zone *zone = pgdat->node_zones + j; | 2376 | struct zone *zone = pgdat->node_zones + j; |
2376 | unsigned long present_pages = zone->present_pages; | 2377 | unsigned long present_pages = zone->present_pages; |
2377 | 2378 | ||
2378 | zone->lowmem_reserve[j] = 0; | 2379 | zone->lowmem_reserve[j] = 0; |
2379 | 2380 | ||
2380 | for (idx = j-1; idx >= 0; idx--) { | 2381 | for (idx = j-1; idx >= 0; idx--) { |
2381 | struct zone *lower_zone; | 2382 | struct zone *lower_zone; |
2382 | 2383 | ||
2383 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 2384 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
2384 | sysctl_lowmem_reserve_ratio[idx] = 1; | 2385 | sysctl_lowmem_reserve_ratio[idx] = 1; |
2385 | 2386 | ||
2386 | lower_zone = pgdat->node_zones + idx; | 2387 | lower_zone = pgdat->node_zones + idx; |
2387 | lower_zone->lowmem_reserve[j] = present_pages / | 2388 | lower_zone->lowmem_reserve[j] = present_pages / |
2388 | sysctl_lowmem_reserve_ratio[idx]; | 2389 | sysctl_lowmem_reserve_ratio[idx]; |
2389 | present_pages += lower_zone->present_pages; | 2390 | present_pages += lower_zone->present_pages; |
2390 | } | 2391 | } |
2391 | } | 2392 | } |
2392 | } | 2393 | } |
2393 | } | 2394 | } |
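Concretely, each zone j reserves, in every lower zone idx, (present pages at or above idx+1) / ratio[idx] pages against allocations that could have been satisfied from zone j. Worked numbers in a standalone form (zone sizes and ratios are hypothetical; 256 and 32 only mirror the usual DMA/normal flavour of defaults):

#include <stdio.h>

#define NZONES 3

int main(void)
{
	/* Hypothetical node: DMA, NORMAL, HIGHMEM present pages. */
	unsigned long present[NZONES] = { 4096, 225280, 32768 };
	long ratio[NZONES] = { 256, 32, 32 };	/* assumed ratios */
	unsigned long reserve[NZONES][NZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < NZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			/* lower_zone->lowmem_reserve[j] in the code above */
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (idx = 0; idx < NZONES; idx++)
		printf("zone %d lowmem_reserve: %lu %lu %lu\n", idx,
		       reserve[idx][0], reserve[idx][1], reserve[idx][2]);
	return 0;
}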
2394 | 2395 | ||
2395 | /* | 2396 | /* |
2396 | * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures | 2397 | * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures |
2397 | * that the pages_{min,low,high} values for each zone are set correctly | 2398 | * that the pages_{min,low,high} values for each zone are set correctly |
2398 | * with respect to min_free_kbytes. | 2399 | * with respect to min_free_kbytes. |
2399 | */ | 2400 | */ |
2400 | static void setup_per_zone_pages_min(void) | 2401 | static void setup_per_zone_pages_min(void) |
2401 | { | 2402 | { |
2402 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 2403 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
2403 | unsigned long lowmem_pages = 0; | 2404 | unsigned long lowmem_pages = 0; |
2404 | struct zone *zone; | 2405 | struct zone *zone; |
2405 | unsigned long flags; | 2406 | unsigned long flags; |
2406 | 2407 | ||
2407 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 2408 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
2408 | for_each_zone(zone) { | 2409 | for_each_zone(zone) { |
2409 | if (!is_highmem(zone)) | 2410 | if (!is_highmem(zone)) |
2410 | lowmem_pages += zone->present_pages; | 2411 | lowmem_pages += zone->present_pages; |
2411 | } | 2412 | } |
2412 | 2413 | ||
2413 | for_each_zone(zone) { | 2414 | for_each_zone(zone) { |
2414 | spin_lock_irqsave(&zone->lru_lock, flags); | 2415 | spin_lock_irqsave(&zone->lru_lock, flags); |
2415 | if (is_highmem(zone)) { | 2416 | if (is_highmem(zone)) { |
2416 | /* | 2417 | /* |
2417 | * Often, highmem doesn't need to reserve any pages. | 2418 | * Often, highmem doesn't need to reserve any pages. |
2418 | * But the pages_min/low/high values are also used for | 2419 | * But the pages_min/low/high values are also used for |
2419 | * batching up page reclaim activity so we need a | 2420 | * batching up page reclaim activity so we need a |
2420 | * decent value here. | 2421 | * decent value here. |
2421 | */ | 2422 | */ |
2422 | int min_pages; | 2423 | int min_pages; |
2423 | 2424 | ||
2424 | min_pages = zone->present_pages / 1024; | 2425 | min_pages = zone->present_pages / 1024; |
2425 | if (min_pages < SWAP_CLUSTER_MAX) | 2426 | if (min_pages < SWAP_CLUSTER_MAX) |
2426 | min_pages = SWAP_CLUSTER_MAX; | 2427 | min_pages = SWAP_CLUSTER_MAX; |
2427 | if (min_pages > 128) | 2428 | if (min_pages > 128) |
2428 | min_pages = 128; | 2429 | min_pages = 128; |
2429 | zone->pages_min = min_pages; | 2430 | zone->pages_min = min_pages; |
2430 | } else { | 2431 | } else { |
2431 | /* if it's a lowmem zone, reserve a number of pages | 2432 | /* if it's a lowmem zone, reserve a number of pages |
2432 | * proportionate to the zone's size. | 2433 | * proportionate to the zone's size. |
2433 | */ | 2434 | */ |
2434 | zone->pages_min = (pages_min * zone->present_pages) / | 2435 | zone->pages_min = (pages_min * zone->present_pages) / |
2435 | lowmem_pages; | 2436 | lowmem_pages; |
2436 | } | 2437 | } |
2437 | 2438 | ||
2438 | /* | 2439 | /* |
2439 | * When interpreting these watermarks, just keep in mind that: | 2440 | * When interpreting these watermarks, just keep in mind that: |
2440 | * zone->pages_min == (zone->pages_min * 4) / 4; | 2441 | * zone->pages_min == (zone->pages_min * 4) / 4; |
2441 | */ | 2442 | */ |
2442 | zone->pages_low = (zone->pages_min * 5) / 4; | 2443 | zone->pages_low = (zone->pages_min * 5) / 4; |
2443 | zone->pages_high = (zone->pages_min * 6) / 4; | 2444 | zone->pages_high = (zone->pages_min * 6) / 4; |
2444 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2445 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2445 | } | 2446 | } |
2446 | } | 2447 | } |
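For a lowmem zone the three watermarks come out in a 4:5:6 ratio of that zone's share of pages_min; highmem gets only a small clamped value for reclaim batching. A worked example (all sizes hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 4096;	/* e.g. a ~1GB machine */
	unsigned long page_shift = 12;		/* 4K pages */
	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long lowmem_pages = 229376;	/* all !highmem pages */
	unsigned long zone_present = 225280;	/* this zone's share */

	unsigned long zmin = pages_min * zone_present / lowmem_pages;
	unsigned long zlow = zmin * 5 / 4;
	unsigned long zhigh = zmin * 6 / 4;

	printf("pages_min=%lu -> zone min=%lu low=%lu high=%lu\n",
	       pages_min, zmin, zlow, zhigh);
	return 0;
}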
2447 | 2448 | ||
2448 | /* | 2449 | /* |
2449 | * Initialise min_free_kbytes. | 2450 | * Initialise min_free_kbytes. |
2450 | * | 2451 | * |
2451 | * For small machines we want it small (128k min). For large machines | 2452 | * For small machines we want it small (128k min). For large machines |
2452 | * we want it large (64MB max). But it is not linear, because network | 2453 | * we want it large (64MB max). But it is not linear, because network |
2453 | * bandwidth does not increase linearly with machine size. We use | 2454 | * bandwidth does not increase linearly with machine size. We use |
2454 | * | 2455 | * |
2455 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 2456 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
2456 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 2457 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
2457 | * | 2458 | * |
2458 | * which yields | 2459 | * which yields |
2459 | * | 2460 | * |
2460 | * 16MB: 512k | 2461 | * 16MB: 512k |
2461 | * 32MB: 724k | 2462 | * 32MB: 724k |
2462 | * 64MB: 1024k | 2463 | * 64MB: 1024k |
2463 | * 128MB: 1448k | 2464 | * 128MB: 1448k |
2464 | * 256MB: 2048k | 2465 | * 256MB: 2048k |
2465 | * 512MB: 2896k | 2466 | * 512MB: 2896k |
2466 | * 1024MB: 4096k | 2467 | * 1024MB: 4096k |
2467 | * 2048MB: 5792k | 2468 | * 2048MB: 5792k |
2468 | * 4096MB: 8192k | 2469 | * 4096MB: 8192k |
2469 | * 8192MB: 11584k | 2470 | * 8192MB: 11584k |
2470 | * 16384MB: 16384k | 2471 | * 16384MB: 16384k |
2471 | */ | 2472 | */ |
2472 | static int __init init_per_zone_pages_min(void) | 2473 | static int __init init_per_zone_pages_min(void) |
2473 | { | 2474 | { |
2474 | unsigned long lowmem_kbytes; | 2475 | unsigned long lowmem_kbytes; |
2475 | 2476 | ||
2476 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 2477 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
2477 | 2478 | ||
2478 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 2479 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
2479 | if (min_free_kbytes < 128) | 2480 | if (min_free_kbytes < 128) |
2480 | min_free_kbytes = 128; | 2481 | min_free_kbytes = 128; |
2481 | if (min_free_kbytes > 65536) | 2482 | if (min_free_kbytes > 65536) |
2482 | min_free_kbytes = 65536; | 2483 | min_free_kbytes = 65536; |
2483 | setup_per_zone_pages_min(); | 2484 | setup_per_zone_pages_min(); |
2484 | setup_per_zone_lowmem_reserve(); | 2485 | setup_per_zone_lowmem_reserve(); |
2485 | return 0; | 2486 | return 0; |
2486 | } | 2487 | } |
2487 | module_init(init_per_zone_pages_min) | 2488 | module_init(init_per_zone_pages_min) |
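The table in the comment above falls straight out of min_free_kbytes = sqrt(lowmem_kbytes * 16) plus the 128k/65536k clamps; a quick standalone check, with an integer square root standing in for int_sqrt():

#include <stdio.h>

/* Integer square root by binary search; stands in for int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
	unsigned long lo = 0, hi = x, mid;

	while (lo < hi) {
		mid = lo + (hi - lo + 1) / 2;
		if (mid <= x / mid)
			lo = mid;
		else
			hi = mid - 1;
	}
	return lo;
}

int main(void)
{
	unsigned long mb;

	for (mb = 16; mb <= 16384; mb *= 2) {
		unsigned long lowmem_kbytes = mb << 10;
		unsigned long min_free = isqrt(lowmem_kbytes * 16);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("%6luMB: %luk\n", mb, min_free);
	}
	return 0;
}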
2488 | 2489 | ||
2489 | /* | 2490 | /* |
2490 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 2491 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
2491 | * that we can call two helper functions whenever min_free_kbytes | 2492 | * that we can call two helper functions whenever min_free_kbytes |
2492 | * changes. | 2493 | * changes. |
2493 | */ | 2494 | */ |
2494 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 2495 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
2495 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 2496 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
2496 | { | 2497 | { |
2497 | proc_dointvec(table, write, file, buffer, length, ppos); | 2498 | proc_dointvec(table, write, file, buffer, length, ppos); |
2498 | setup_per_zone_pages_min(); | 2499 | setup_per_zone_pages_min(); |
2499 | return 0; | 2500 | return 0; |
2500 | } | 2501 | } |
2501 | 2502 | ||
2502 | /* | 2503 | /* |
2503 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 2504 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
2504 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 2505 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
2505 | * whenever sysctl_lowmem_reserve_ratio changes. | 2506 | * whenever sysctl_lowmem_reserve_ratio changes. |
2506 | * | 2507 | * |
2507 | * The reserve ratio has no relation to the pages_min watermarks. | 2508 | * The reserve ratio has no relation to the pages_min watermarks. |
2508 | * The lowmem reserve ratio only makes sense as a function of the | 2509 | * The lowmem reserve ratio only makes sense as a function of the |
2509 | * boot-time zone sizes. | 2510 | * boot-time zone sizes. |
2510 | */ | 2511 | */ |
2511 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 2512 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
2512 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 2513 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
2513 | { | 2514 | { |
2514 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 2515 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
2515 | setup_per_zone_lowmem_reserve(); | 2516 | setup_per_zone_lowmem_reserve(); |
2516 | return 0; | 2517 | return 0; |
2517 | } | 2518 | } |
2518 | 2519 | ||
2519 | __initdata int hashdist = HASHDIST_DEFAULT; | 2520 | __initdata int hashdist = HASHDIST_DEFAULT; |
2520 | 2521 | ||
2521 | #ifdef CONFIG_NUMA | 2522 | #ifdef CONFIG_NUMA |
2522 | static int __init set_hashdist(char *str) | 2523 | static int __init set_hashdist(char *str) |
2523 | { | 2524 | { |
2524 | if (!str) | 2525 | if (!str) |
2525 | return 0; | 2526 | return 0; |
2526 | hashdist = simple_strtoul(str, &str, 0); | 2527 | hashdist = simple_strtoul(str, &str, 0); |
2527 | return 1; | 2528 | return 1; |
2528 | } | 2529 | } |
2529 | __setup("hashdist=", set_hashdist); | 2530 | __setup("hashdist=", set_hashdist); |
2530 | #endif | 2531 | #endif |
2531 | 2532 | ||
2532 | /* | 2533 | /* |
2533 | * allocate a large system hash table from bootmem | 2534 | * allocate a large system hash table from bootmem |
2534 | * - it is assumed that the hash table must contain an exact power-of-2 | 2535 | * - it is assumed that the hash table must contain an exact power-of-2 |
2535 | * quantity of entries | 2536 | * quantity of entries |
2536 | * - limit is the number of hash buckets, not the total allocation size | 2537 | * - limit is the number of hash buckets, not the total allocation size |
2537 | */ | 2538 | */ |
2538 | void *__init alloc_large_system_hash(const char *tablename, | 2539 | void *__init alloc_large_system_hash(const char *tablename, |
2539 | unsigned long bucketsize, | 2540 | unsigned long bucketsize, |
2540 | unsigned long numentries, | 2541 | unsigned long numentries, |
2541 | int scale, | 2542 | int scale, |
2542 | int flags, | 2543 | int flags, |
2543 | unsigned int *_hash_shift, | 2544 | unsigned int *_hash_shift, |
2544 | unsigned int *_hash_mask, | 2545 | unsigned int *_hash_mask, |
2545 | unsigned long limit) | 2546 | unsigned long limit) |
2546 | { | 2547 | { |
2547 | unsigned long long max = limit; | 2548 | unsigned long long max = limit; |
2548 | unsigned long log2qty, size; | 2549 | unsigned long log2qty, size; |
2549 | void *table = NULL; | 2550 | void *table = NULL; |
2550 | 2551 | ||
2551 | /* allow the kernel cmdline to have a say */ | 2552 | /* allow the kernel cmdline to have a say */ |
2552 | if (!numentries) { | 2553 | if (!numentries) { |
2553 | /* round applicable memory size up to nearest megabyte */ | 2554 | /* round applicable memory size up to nearest megabyte */ |
2554 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 2555 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; |
2555 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 2556 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
2556 | numentries >>= 20 - PAGE_SHIFT; | 2557 | numentries >>= 20 - PAGE_SHIFT; |
2557 | numentries <<= 20 - PAGE_SHIFT; | 2558 | numentries <<= 20 - PAGE_SHIFT; |
2558 | 2559 | ||
2559 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 2560 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
2560 | if (scale > PAGE_SHIFT) | 2561 | if (scale > PAGE_SHIFT) |
2561 | numentries >>= (scale - PAGE_SHIFT); | 2562 | numentries >>= (scale - PAGE_SHIFT); |
2562 | else | 2563 | else |
2563 | numentries <<= (PAGE_SHIFT - scale); | 2564 | numentries <<= (PAGE_SHIFT - scale); |
2564 | } | 2565 | } |
2566 | /* round numentries up to the nearest power of 2 */ | 2567 | /* round numentries up to the nearest power of 2 */ |
2566 | numentries = 1UL << (long_log2(numentries) + 1); | 2567 | numentries = 1UL << (long_log2(numentries) + 1); |
2567 | 2568 | ||
2568 | /* limit allocation size to 1/16 total memory by default */ | 2569 | /* limit allocation size to 1/16 total memory by default */ |
2569 | if (max == 0) { | 2570 | if (max == 0) { |
2570 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 2571 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
2571 | do_div(max, bucketsize); | 2572 | do_div(max, bucketsize); |
2572 | } | 2573 | } |
2573 | 2574 | ||
2574 | if (numentries > max) | 2575 | if (numentries > max) |
2575 | numentries = max; | 2576 | numentries = max; |
2576 | 2577 | ||
2577 | log2qty = long_log2(numentries); | 2578 | log2qty = long_log2(numentries); |
2578 | 2579 | ||
2579 | do { | 2580 | do { |
2580 | size = bucketsize << log2qty; | 2581 | size = bucketsize << log2qty; |
2581 | if (flags & HASH_EARLY) | 2582 | if (flags & HASH_EARLY) |
2582 | table = alloc_bootmem(size); | 2583 | table = alloc_bootmem(size); |
2583 | else if (hashdist) | 2584 | else if (hashdist) |
2584 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 2585 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
2585 | else { | 2586 | else { |
2586 | unsigned long order; | 2587 | unsigned long order; |
2587 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) | 2588 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) |
2588 | ; | 2589 | ; |
2589 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 2590 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
2590 | } | 2591 | } |
2591 | } while (!table && size > PAGE_SIZE && --log2qty); | 2592 | } while (!table && size > PAGE_SIZE && --log2qty); |
2592 | 2593 | ||
2593 | if (!table) | 2594 | if (!table) |
2594 | panic("Failed to allocate %s hash table\n", tablename); | 2595 | panic("Failed to allocate %s hash table\n", tablename); |
2595 | 2596 | ||
2596 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 2597 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
2597 | tablename, | 2598 | tablename, |
2598 | (1U << log2qty), | 2599 | (1U << log2qty), |
2599 | long_log2(size) - PAGE_SHIFT, | 2600 | long_log2(size) - PAGE_SHIFT, |
2600 | size); | 2601 | size); |
2601 | 2602 | ||
2602 | if (_hash_shift) | 2603 | if (_hash_shift) |
2603 | *_hash_shift = log2qty; | 2604 | *_hash_shift = log2qty; |
2604 | if (_hash_mask) | 2605 | if (_hash_mask) |
2605 | *_hash_mask = (1 << log2qty) - 1; | 2606 | *_hash_mask = (1 << log2qty) - 1; |
2606 | 2607 | ||
2607 | return table; | 2608 | return table; |
2608 | } | 2609 | } |
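The sizing arithmetic above, minus the three allocators, as a standalone sketch: pages are rounded up to whole megabytes, scaled to entries, rounded up to a power of two, capped at 1/16 of memory, then order and byte size fall out of log2 (all inputs hypothetical):

#include <stdio.h>

/* floor(log2(n)); stands in for the kernel's long_log2(). */
static unsigned long flog2(unsigned long n)
{
	unsigned long log = 0;

	while (n >>= 1)
		log++;
	return log;
}

int main(void)
{
	unsigned long page_shift = 12, page_size = 1UL << page_shift;
	unsigned long nr_kernel_pages = 229376;	/* hypothetical lowmem */
	unsigned long bucketsize = 8, scale = 17;	/* 1 bucket per 128K */
	unsigned long numentries, max, log2qty, size;

	/* Round lowmem up to whole megabytes' worth of pages. */
	numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;
	numentries >>= scale - page_shift;	/* scale > page_shift here */

	/* Round up to the next power of two. */
	numentries = 1UL << (flog2(numentries) + 1);

	/* Cap the table at 1/16 of total memory. */
	max = (nr_kernel_pages * page_size / 16) / bucketsize;
	if (numentries > max)
		numentries = max;

	log2qty = flog2(numentries);
	size = bucketsize << log2qty;
	printf("entries=%lu (2^%lu), table=%lu bytes\n",
	       1UL << log2qty, log2qty, size);
	return 0;
}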
2609 | 2610 | ||