Commit 9512938b885304f72c847379611d6018064af840

Authored by Wanlong Gao
Committed by Linus Torvalds
1 parent f1db7afd91

cpumask: update setup_node_to_cpumask_map() comments

node_to_cpumask() has been replaced by cpumask_of_node() and was removed
entirely in commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask
now everyone uses cpumask_of_node").

Update the comment above setup_node_to_cpumask_map() accordingly.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
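
For reference, the API the commit message refers to: callers that once used node_to_cpumask() now obtain a node's CPU mask through cpumask_of_node(), which returns a const struct cpumask pointer that can be iterated directly. Below is a minimal, illustrative sketch of that usage pattern in ordinary kernel code; the helper name cpus_on_node() is hypothetical and not part of this commit.

#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/printk.h>

/* Illustrative helper: count the online CPUs that belong to @node. */
static unsigned int cpus_on_node(int node)
{
	/*
	 * cpumask_of_node() is only valid once the node-to-cpumask map
	 * has been allocated; see setup_node_to_cpumask_map() in the
	 * file below.
	 */
	const struct cpumask *mask = cpumask_of_node(node);
	unsigned int cpu, count = 0;

	for_each_cpu(cpu, mask) {
		if (cpu_online(cpu))
			count++;
	}

	pr_debug("node %d: %u online cpus\n", node, count);
	return count;
}

The map such lookups rely on is the node_to_cpumask_map[] array allocated by setup_node_to_cpumask_map() in the diff below, which is why the comment being corrected warns that cpumask_of_node() is not valid until that setup has run.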

Showing 2 changed files with 2 additions and 2 deletions (inline diff view)

arch/powerpc/mm/numa.c
1 /* 1 /*
2 * pSeries NUMA support 2 * pSeries NUMA support
3 * 3 *
4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM 4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 #include <linux/threads.h> 11 #include <linux/threads.h>
12 #include <linux/bootmem.h> 12 #include <linux/bootmem.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/mmzone.h> 15 #include <linux/mmzone.h>
16 #include <linux/export.h> 16 #include <linux/export.h>
17 #include <linux/nodemask.h> 17 #include <linux/nodemask.h>
18 #include <linux/cpu.h> 18 #include <linux/cpu.h>
19 #include <linux/notifier.h> 19 #include <linux/notifier.h>
20 #include <linux/memblock.h> 20 #include <linux/memblock.h>
21 #include <linux/of.h> 21 #include <linux/of.h>
22 #include <linux/pfn.h> 22 #include <linux/pfn.h>
23 #include <linux/cpuset.h> 23 #include <linux/cpuset.h>
24 #include <linux/node.h> 24 #include <linux/node.h>
25 #include <asm/sparsemem.h> 25 #include <asm/sparsemem.h>
26 #include <asm/prom.h> 26 #include <asm/prom.h>
27 #include <asm/system.h> 27 #include <asm/system.h>
28 #include <asm/smp.h> 28 #include <asm/smp.h>
29 #include <asm/firmware.h> 29 #include <asm/firmware.h>
30 #include <asm/paca.h> 30 #include <asm/paca.h>
31 #include <asm/hvcall.h> 31 #include <asm/hvcall.h>
32 32
33 static int numa_enabled = 1; 33 static int numa_enabled = 1;
34 34
35 static char *cmdline __initdata; 35 static char *cmdline __initdata;
36 36
37 static int numa_debug; 37 static int numa_debug;
38 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } 38 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
39 39
40 int numa_cpu_lookup_table[NR_CPUS]; 40 int numa_cpu_lookup_table[NR_CPUS];
41 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 41 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
42 struct pglist_data *node_data[MAX_NUMNODES]; 42 struct pglist_data *node_data[MAX_NUMNODES];
43 43
44 EXPORT_SYMBOL(numa_cpu_lookup_table); 44 EXPORT_SYMBOL(numa_cpu_lookup_table);
45 EXPORT_SYMBOL(node_to_cpumask_map); 45 EXPORT_SYMBOL(node_to_cpumask_map);
46 EXPORT_SYMBOL(node_data); 46 EXPORT_SYMBOL(node_data);
47 47
48 static int min_common_depth; 48 static int min_common_depth;
49 static int n_mem_addr_cells, n_mem_size_cells; 49 static int n_mem_addr_cells, n_mem_size_cells;
50 static int form1_affinity; 50 static int form1_affinity;
51 51
52 #define MAX_DISTANCE_REF_POINTS 4 52 #define MAX_DISTANCE_REF_POINTS 4
53 static int distance_ref_points_depth; 53 static int distance_ref_points_depth;
54 static const unsigned int *distance_ref_points; 54 static const unsigned int *distance_ref_points;
55 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; 55 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
56 56
57 /* 57 /*
58 * Allocate node_to_cpumask_map based on number of available nodes 58 * Allocate node_to_cpumask_map based on number of available nodes
59 * Requires node_possible_map to be valid. 59 * Requires node_possible_map to be valid.
60 * 60 *
61 * Note: node_to_cpumask() is not valid until after this is done. 61 * Note: cpumask_of_node() is not valid until after this is done.
62 */ 62 */
63 static void __init setup_node_to_cpumask_map(void) 63 static void __init setup_node_to_cpumask_map(void)
64 { 64 {
65 unsigned int node, num = 0; 65 unsigned int node, num = 0;
66 66
67 /* setup nr_node_ids if not done yet */ 67 /* setup nr_node_ids if not done yet */
68 if (nr_node_ids == MAX_NUMNODES) { 68 if (nr_node_ids == MAX_NUMNODES) {
69 for_each_node_mask(node, node_possible_map) 69 for_each_node_mask(node, node_possible_map)
70 num = node; 70 num = node;
71 nr_node_ids = num + 1; 71 nr_node_ids = num + 1;
72 } 72 }
73 73
74 /* allocate the map */ 74 /* allocate the map */
75 for (node = 0; node < nr_node_ids; node++) 75 for (node = 0; node < nr_node_ids; node++)
76 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 76 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
77 77
78 /* cpumask_of_node() will now work */ 78 /* cpumask_of_node() will now work */
79 dbg("Node to cpumask map for %d nodes\n", nr_node_ids); 79 dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
80 } 80 }
81 81
82 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, 82 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
83 unsigned int *nid) 83 unsigned int *nid)
84 { 84 {
85 unsigned long long mem; 85 unsigned long long mem;
86 char *p = cmdline; 86 char *p = cmdline;
87 static unsigned int fake_nid; 87 static unsigned int fake_nid;
88 static unsigned long long curr_boundary; 88 static unsigned long long curr_boundary;
89 89
90 /* 90 /*
91 * Modify node id, iff we started creating NUMA nodes 91 * Modify node id, iff we started creating NUMA nodes
92 * We want to continue from where we left of the last time 92 * We want to continue from where we left of the last time
93 */ 93 */
94 if (fake_nid) 94 if (fake_nid)
95 *nid = fake_nid; 95 *nid = fake_nid;
96 /* 96 /*
97 * In case there are no more arguments to parse, the 97 * In case there are no more arguments to parse, the
98 * node_id should be the same as the last fake node id 98 * node_id should be the same as the last fake node id
99 * (we've handled this above). 99 * (we've handled this above).
100 */ 100 */
101 if (!p) 101 if (!p)
102 return 0; 102 return 0;
103 103
104 mem = memparse(p, &p); 104 mem = memparse(p, &p);
105 if (!mem) 105 if (!mem)
106 return 0; 106 return 0;
107 107
108 if (mem < curr_boundary) 108 if (mem < curr_boundary)
109 return 0; 109 return 0;
110 110
111 curr_boundary = mem; 111 curr_boundary = mem;
112 112
113 if ((end_pfn << PAGE_SHIFT) > mem) { 113 if ((end_pfn << PAGE_SHIFT) > mem) {
114 /* 114 /*
115 * Skip commas and spaces 115 * Skip commas and spaces
116 */ 116 */
117 while (*p == ',' || *p == ' ' || *p == '\t') 117 while (*p == ',' || *p == ' ' || *p == '\t')
118 p++; 118 p++;
119 119
120 cmdline = p; 120 cmdline = p;
121 fake_nid++; 121 fake_nid++;
122 *nid = fake_nid; 122 *nid = fake_nid;
123 dbg("created new fake_node with id %d\n", fake_nid); 123 dbg("created new fake_node with id %d\n", fake_nid);
124 return 1; 124 return 1;
125 } 125 }
126 return 0; 126 return 0;
127 } 127 }
128 128
129 /* 129 /*
130 * get_node_active_region - Return active region containing pfn 130 * get_node_active_region - Return active region containing pfn
131 * Active range returned is empty if none found. 131 * Active range returned is empty if none found.
132 * @pfn: The page to return the region for 132 * @pfn: The page to return the region for
133 * @node_ar: Returned set to the active region containing @pfn 133 * @node_ar: Returned set to the active region containing @pfn
134 */ 134 */
135 static void __init get_node_active_region(unsigned long pfn, 135 static void __init get_node_active_region(unsigned long pfn,
136 struct node_active_region *node_ar) 136 struct node_active_region *node_ar)
137 { 137 {
138 unsigned long start_pfn, end_pfn; 138 unsigned long start_pfn, end_pfn;
139 int i, nid; 139 int i, nid;
140 140
141 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 141 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
142 if (pfn >= start_pfn && pfn < end_pfn) { 142 if (pfn >= start_pfn && pfn < end_pfn) {
143 node_ar->nid = nid; 143 node_ar->nid = nid;
144 node_ar->start_pfn = start_pfn; 144 node_ar->start_pfn = start_pfn;
145 node_ar->end_pfn = end_pfn; 145 node_ar->end_pfn = end_pfn;
146 break; 146 break;
147 } 147 }
148 } 148 }
149 } 149 }
150 150
151 static void map_cpu_to_node(int cpu, int node) 151 static void map_cpu_to_node(int cpu, int node)
152 { 152 {
153 numa_cpu_lookup_table[cpu] = node; 153 numa_cpu_lookup_table[cpu] = node;
154 154
155 dbg("adding cpu %d to node %d\n", cpu, node); 155 dbg("adding cpu %d to node %d\n", cpu, node);
156 156
157 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) 157 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
158 cpumask_set_cpu(cpu, node_to_cpumask_map[node]); 158 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
159 } 159 }
160 160
161 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) 161 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
162 static void unmap_cpu_from_node(unsigned long cpu) 162 static void unmap_cpu_from_node(unsigned long cpu)
163 { 163 {
164 int node = numa_cpu_lookup_table[cpu]; 164 int node = numa_cpu_lookup_table[cpu];
165 165
166 dbg("removing cpu %lu from node %d\n", cpu, node); 166 dbg("removing cpu %lu from node %d\n", cpu, node);
167 167
168 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { 168 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
169 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); 169 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
170 } else { 170 } else {
171 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", 171 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
172 cpu, node); 172 cpu, node);
173 } 173 }
174 } 174 }
175 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ 175 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
176 176
177 /* must hold reference to node during call */ 177 /* must hold reference to node during call */
178 static const int *of_get_associativity(struct device_node *dev) 178 static const int *of_get_associativity(struct device_node *dev)
179 { 179 {
180 return of_get_property(dev, "ibm,associativity", NULL); 180 return of_get_property(dev, "ibm,associativity", NULL);
181 } 181 }
182 182
183 /* 183 /*
184 * Returns the property linux,drconf-usable-memory if 184 * Returns the property linux,drconf-usable-memory if
185 * it exists (the property exists only in kexec/kdump kernels, 185 * it exists (the property exists only in kexec/kdump kernels,
186 * added by kexec-tools) 186 * added by kexec-tools)
187 */ 187 */
188 static const u32 *of_get_usable_memory(struct device_node *memory) 188 static const u32 *of_get_usable_memory(struct device_node *memory)
189 { 189 {
190 const u32 *prop; 190 const u32 *prop;
191 u32 len; 191 u32 len;
192 prop = of_get_property(memory, "linux,drconf-usable-memory", &len); 192 prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
193 if (!prop || len < sizeof(unsigned int)) 193 if (!prop || len < sizeof(unsigned int))
194 return 0; 194 return 0;
195 return prop; 195 return prop;
196 } 196 }
197 197
198 int __node_distance(int a, int b) 198 int __node_distance(int a, int b)
199 { 199 {
200 int i; 200 int i;
201 int distance = LOCAL_DISTANCE; 201 int distance = LOCAL_DISTANCE;
202 202
203 if (!form1_affinity) 203 if (!form1_affinity)
204 return distance; 204 return distance;
205 205
206 for (i = 0; i < distance_ref_points_depth; i++) { 206 for (i = 0; i < distance_ref_points_depth; i++) {
207 if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) 207 if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
208 break; 208 break;
209 209
210 /* Double the distance for each NUMA level */ 210 /* Double the distance for each NUMA level */
211 distance *= 2; 211 distance *= 2;
212 } 212 }
213 213
214 return distance; 214 return distance;
215 } 215 }
216 216
217 static void initialize_distance_lookup_table(int nid, 217 static void initialize_distance_lookup_table(int nid,
218 const unsigned int *associativity) 218 const unsigned int *associativity)
219 { 219 {
220 int i; 220 int i;
221 221
222 if (!form1_affinity) 222 if (!form1_affinity)
223 return; 223 return;
224 224
225 for (i = 0; i < distance_ref_points_depth; i++) { 225 for (i = 0; i < distance_ref_points_depth; i++) {
226 distance_lookup_table[nid][i] = 226 distance_lookup_table[nid][i] =
227 associativity[distance_ref_points[i]]; 227 associativity[distance_ref_points[i]];
228 } 228 }
229 } 229 }
230 230
231 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa 231 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
232 * info is found. 232 * info is found.
233 */ 233 */
234 static int associativity_to_nid(const unsigned int *associativity) 234 static int associativity_to_nid(const unsigned int *associativity)
235 { 235 {
236 int nid = -1; 236 int nid = -1;
237 237
238 if (min_common_depth == -1) 238 if (min_common_depth == -1)
239 goto out; 239 goto out;
240 240
241 if (associativity[0] >= min_common_depth) 241 if (associativity[0] >= min_common_depth)
242 nid = associativity[min_common_depth]; 242 nid = associativity[min_common_depth];
243 243
244 /* POWER4 LPAR uses 0xffff as invalid node */ 244 /* POWER4 LPAR uses 0xffff as invalid node */
245 if (nid == 0xffff || nid >= MAX_NUMNODES) 245 if (nid == 0xffff || nid >= MAX_NUMNODES)
246 nid = -1; 246 nid = -1;
247 247
248 if (nid > 0 && associativity[0] >= distance_ref_points_depth) 248 if (nid > 0 && associativity[0] >= distance_ref_points_depth)
249 initialize_distance_lookup_table(nid, associativity); 249 initialize_distance_lookup_table(nid, associativity);
250 250
251 out: 251 out:
252 return nid; 252 return nid;
253 } 253 }
254 254
255 /* Returns the nid associated with the given device tree node, 255 /* Returns the nid associated with the given device tree node,
256 * or -1 if not found. 256 * or -1 if not found.
257 */ 257 */
258 static int of_node_to_nid_single(struct device_node *device) 258 static int of_node_to_nid_single(struct device_node *device)
259 { 259 {
260 int nid = -1; 260 int nid = -1;
261 const unsigned int *tmp; 261 const unsigned int *tmp;
262 262
263 tmp = of_get_associativity(device); 263 tmp = of_get_associativity(device);
264 if (tmp) 264 if (tmp)
265 nid = associativity_to_nid(tmp); 265 nid = associativity_to_nid(tmp);
266 return nid; 266 return nid;
267 } 267 }
268 268
269 /* Walk the device tree upwards, looking for an associativity id */ 269 /* Walk the device tree upwards, looking for an associativity id */
270 int of_node_to_nid(struct device_node *device) 270 int of_node_to_nid(struct device_node *device)
271 { 271 {
272 struct device_node *tmp; 272 struct device_node *tmp;
273 int nid = -1; 273 int nid = -1;
274 274
275 of_node_get(device); 275 of_node_get(device);
276 while (device) { 276 while (device) {
277 nid = of_node_to_nid_single(device); 277 nid = of_node_to_nid_single(device);
278 if (nid != -1) 278 if (nid != -1)
279 break; 279 break;
280 280
281 tmp = device; 281 tmp = device;
282 device = of_get_parent(tmp); 282 device = of_get_parent(tmp);
283 of_node_put(tmp); 283 of_node_put(tmp);
284 } 284 }
285 of_node_put(device); 285 of_node_put(device);
286 286
287 return nid; 287 return nid;
288 } 288 }
289 EXPORT_SYMBOL_GPL(of_node_to_nid); 289 EXPORT_SYMBOL_GPL(of_node_to_nid);
290 290
291 static int __init find_min_common_depth(void) 291 static int __init find_min_common_depth(void)
292 { 292 {
293 int depth; 293 int depth;
294 struct device_node *chosen; 294 struct device_node *chosen;
295 struct device_node *root; 295 struct device_node *root;
296 const char *vec5; 296 const char *vec5;
297 297
298 if (firmware_has_feature(FW_FEATURE_OPAL)) 298 if (firmware_has_feature(FW_FEATURE_OPAL))
299 root = of_find_node_by_path("/ibm,opal"); 299 root = of_find_node_by_path("/ibm,opal");
300 else 300 else
301 root = of_find_node_by_path("/rtas"); 301 root = of_find_node_by_path("/rtas");
302 if (!root) 302 if (!root)
303 root = of_find_node_by_path("/"); 303 root = of_find_node_by_path("/");
304 304
305 /* 305 /*
306 * This property is a set of 32-bit integers, each representing 306 * This property is a set of 32-bit integers, each representing
307 * an index into the ibm,associativity nodes. 307 * an index into the ibm,associativity nodes.
308 * 308 *
309 * With form 0 affinity the first integer is for an SMP configuration 309 * With form 0 affinity the first integer is for an SMP configuration
310 * (should be all 0's) and the second is for a normal NUMA 310 * (should be all 0's) and the second is for a normal NUMA
311 * configuration. We have only one level of NUMA. 311 * configuration. We have only one level of NUMA.
312 * 312 *
313 * With form 1 affinity the first integer is the most significant 313 * With form 1 affinity the first integer is the most significant
314 * NUMA boundary and the following are progressively less significant 314 * NUMA boundary and the following are progressively less significant
315 * boundaries. There can be more than one level of NUMA. 315 * boundaries. There can be more than one level of NUMA.
316 */ 316 */
317 distance_ref_points = of_get_property(root, 317 distance_ref_points = of_get_property(root,
318 "ibm,associativity-reference-points", 318 "ibm,associativity-reference-points",
319 &distance_ref_points_depth); 319 &distance_ref_points_depth);
320 320
321 if (!distance_ref_points) { 321 if (!distance_ref_points) {
322 dbg("NUMA: ibm,associativity-reference-points not found.\n"); 322 dbg("NUMA: ibm,associativity-reference-points not found.\n");
323 goto err; 323 goto err;
324 } 324 }
325 325
326 distance_ref_points_depth /= sizeof(int); 326 distance_ref_points_depth /= sizeof(int);
327 327
328 #define VEC5_AFFINITY_BYTE 5 328 #define VEC5_AFFINITY_BYTE 5
329 #define VEC5_AFFINITY 0x80 329 #define VEC5_AFFINITY 0x80
330 330
331 if (firmware_has_feature(FW_FEATURE_OPAL)) 331 if (firmware_has_feature(FW_FEATURE_OPAL))
332 form1_affinity = 1; 332 form1_affinity = 1;
333 else { 333 else {
334 chosen = of_find_node_by_path("/chosen"); 334 chosen = of_find_node_by_path("/chosen");
335 if (chosen) { 335 if (chosen) {
336 vec5 = of_get_property(chosen, 336 vec5 = of_get_property(chosen,
337 "ibm,architecture-vec-5", NULL); 337 "ibm,architecture-vec-5", NULL);
338 if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & 338 if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
339 VEC5_AFFINITY)) { 339 VEC5_AFFINITY)) {
340 dbg("Using form 1 affinity\n"); 340 dbg("Using form 1 affinity\n");
341 form1_affinity = 1; 341 form1_affinity = 1;
342 } 342 }
343 } 343 }
344 } 344 }
345 345
346 if (form1_affinity) { 346 if (form1_affinity) {
347 depth = distance_ref_points[0]; 347 depth = distance_ref_points[0];
348 } else { 348 } else {
349 if (distance_ref_points_depth < 2) { 349 if (distance_ref_points_depth < 2) {
350 printk(KERN_WARNING "NUMA: " 350 printk(KERN_WARNING "NUMA: "
351 "short ibm,associativity-reference-points\n"); 351 "short ibm,associativity-reference-points\n");
352 goto err; 352 goto err;
353 } 353 }
354 354
355 depth = distance_ref_points[1]; 355 depth = distance_ref_points[1];
356 } 356 }
357 357
358 /* 358 /*
359 * Warn and cap if the hardware supports more than 359 * Warn and cap if the hardware supports more than
360 * MAX_DISTANCE_REF_POINTS domains. 360 * MAX_DISTANCE_REF_POINTS domains.
361 */ 361 */
362 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { 362 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
363 printk(KERN_WARNING "NUMA: distance array capped at " 363 printk(KERN_WARNING "NUMA: distance array capped at "
364 "%d entries\n", MAX_DISTANCE_REF_POINTS); 364 "%d entries\n", MAX_DISTANCE_REF_POINTS);
365 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; 365 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
366 } 366 }
367 367
368 of_node_put(root); 368 of_node_put(root);
369 return depth; 369 return depth;
370 370
371 err: 371 err:
372 of_node_put(root); 372 of_node_put(root);
373 return -1; 373 return -1;
374 } 374 }
375 375
376 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) 376 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
377 { 377 {
378 struct device_node *memory = NULL; 378 struct device_node *memory = NULL;
379 379
380 memory = of_find_node_by_type(memory, "memory"); 380 memory = of_find_node_by_type(memory, "memory");
381 if (!memory) 381 if (!memory)
382 panic("numa.c: No memory nodes found!"); 382 panic("numa.c: No memory nodes found!");
383 383
384 *n_addr_cells = of_n_addr_cells(memory); 384 *n_addr_cells = of_n_addr_cells(memory);
385 *n_size_cells = of_n_size_cells(memory); 385 *n_size_cells = of_n_size_cells(memory);
386 of_node_put(memory); 386 of_node_put(memory);
387 } 387 }
388 388
389 static unsigned long read_n_cells(int n, const unsigned int **buf) 389 static unsigned long read_n_cells(int n, const unsigned int **buf)
390 { 390 {
391 unsigned long result = 0; 391 unsigned long result = 0;
392 392
393 while (n--) { 393 while (n--) {
394 result = (result << 32) | **buf; 394 result = (result << 32) | **buf;
395 (*buf)++; 395 (*buf)++;
396 } 396 }
397 return result; 397 return result;
398 } 398 }
399 399
400 struct of_drconf_cell { 400 struct of_drconf_cell {
401 u64 base_addr; 401 u64 base_addr;
402 u32 drc_index; 402 u32 drc_index;
403 u32 reserved; 403 u32 reserved;
404 u32 aa_index; 404 u32 aa_index;
405 u32 flags; 405 u32 flags;
406 }; 406 };
407 407
408 #define DRCONF_MEM_ASSIGNED 0x00000008 408 #define DRCONF_MEM_ASSIGNED 0x00000008
409 #define DRCONF_MEM_AI_INVALID 0x00000040 409 #define DRCONF_MEM_AI_INVALID 0x00000040
410 #define DRCONF_MEM_RESERVED 0x00000080 410 #define DRCONF_MEM_RESERVED 0x00000080
411 411
412 /* 412 /*
413 * Read the next memblock list entry from the ibm,dynamic-memory property 413 * Read the next memblock list entry from the ibm,dynamic-memory property
414 * and return the information in the provided of_drconf_cell structure. 414 * and return the information in the provided of_drconf_cell structure.
415 */ 415 */
416 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp) 416 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
417 { 417 {
418 const u32 *cp; 418 const u32 *cp;
419 419
420 drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); 420 drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
421 421
422 cp = *cellp; 422 cp = *cellp;
423 drmem->drc_index = cp[0]; 423 drmem->drc_index = cp[0];
424 drmem->reserved = cp[1]; 424 drmem->reserved = cp[1];
425 drmem->aa_index = cp[2]; 425 drmem->aa_index = cp[2];
426 drmem->flags = cp[3]; 426 drmem->flags = cp[3];
427 427
428 *cellp = cp + 4; 428 *cellp = cp + 4;
429 } 429 }
430 430
431 /* 431 /*
432 * Retrieve and validate the ibm,dynamic-memory property of the device tree. 432 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
433 * 433 *
434 * The layout of the ibm,dynamic-memory property is a number N of memblock 434 * The layout of the ibm,dynamic-memory property is a number N of memblock
435 * list entries followed by N memblock list entries. Each memblock list entry 435 * list entries followed by N memblock list entries. Each memblock list entry
436 * contains information as laid out in the of_drconf_cell struct above. 436 * contains information as laid out in the of_drconf_cell struct above.
437 */ 437 */
438 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm) 438 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
439 { 439 {
440 const u32 *prop; 440 const u32 *prop;
441 u32 len, entries; 441 u32 len, entries;
442 442
443 prop = of_get_property(memory, "ibm,dynamic-memory", &len); 443 prop = of_get_property(memory, "ibm,dynamic-memory", &len);
444 if (!prop || len < sizeof(unsigned int)) 444 if (!prop || len < sizeof(unsigned int))
445 return 0; 445 return 0;
446 446
447 entries = *prop++; 447 entries = *prop++;
448 448
449 /* Now that we know the number of entries, revalidate the size 449 /* Now that we know the number of entries, revalidate the size
450 * of the property read in to ensure we have everything 450 * of the property read in to ensure we have everything
451 */ 451 */
452 if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) 452 if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
453 return 0; 453 return 0;
454 454
455 *dm = prop; 455 *dm = prop;
456 return entries; 456 return entries;
457 } 457 }
458 458
459 /* 459 /*
460 * Retrieve and validate the ibm,lmb-size property for drconf memory 460 * Retrieve and validate the ibm,lmb-size property for drconf memory
461 * from the device tree. 461 * from the device tree.
462 */ 462 */
463 static u64 of_get_lmb_size(struct device_node *memory) 463 static u64 of_get_lmb_size(struct device_node *memory)
464 { 464 {
465 const u32 *prop; 465 const u32 *prop;
466 u32 len; 466 u32 len;
467 467
468 prop = of_get_property(memory, "ibm,lmb-size", &len); 468 prop = of_get_property(memory, "ibm,lmb-size", &len);
469 if (!prop || len < sizeof(unsigned int)) 469 if (!prop || len < sizeof(unsigned int))
470 return 0; 470 return 0;
471 471
472 return read_n_cells(n_mem_size_cells, &prop); 472 return read_n_cells(n_mem_size_cells, &prop);
473 } 473 }
474 474
475 struct assoc_arrays { 475 struct assoc_arrays {
476 u32 n_arrays; 476 u32 n_arrays;
477 u32 array_sz; 477 u32 array_sz;
478 const u32 *arrays; 478 const u32 *arrays;
479 }; 479 };
480 480
481 /* 481 /*
482 * Retrieve and validate the list of associativity arrays for drconf 482 * Retrieve and validate the list of associativity arrays for drconf
483 * memory from the ibm,associativity-lookup-arrays property of the 483 * memory from the ibm,associativity-lookup-arrays property of the
484 * device tree.. 484 * device tree..
485 * 485 *
486 * The layout of the ibm,associativity-lookup-arrays property is a number N 486 * The layout of the ibm,associativity-lookup-arrays property is a number N
487 * indicating the number of associativity arrays, followed by a number M 487 * indicating the number of associativity arrays, followed by a number M
488 * indicating the size of each associativity array, followed by a list 488 * indicating the size of each associativity array, followed by a list
489 * of N associativity arrays. 489 * of N associativity arrays.
490 */ 490 */
491 static int of_get_assoc_arrays(struct device_node *memory, 491 static int of_get_assoc_arrays(struct device_node *memory,
492 struct assoc_arrays *aa) 492 struct assoc_arrays *aa)
493 { 493 {
494 const u32 *prop; 494 const u32 *prop;
495 u32 len; 495 u32 len;
496 496
497 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); 497 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
498 if (!prop || len < 2 * sizeof(unsigned int)) 498 if (!prop || len < 2 * sizeof(unsigned int))
499 return -1; 499 return -1;
500 500
501 aa->n_arrays = *prop++; 501 aa->n_arrays = *prop++;
502 aa->array_sz = *prop++; 502 aa->array_sz = *prop++;
503 503
504 /* Now that we know the number of arrays and size of each array, 504 /* Now that we know the number of arrays and size of each array,
505 * revalidate the size of the property read in. 505 * revalidate the size of the property read in.
506 */ 506 */
507 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) 507 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
508 return -1; 508 return -1;
509 509
510 aa->arrays = prop; 510 aa->arrays = prop;
511 return 0; 511 return 0;
512 } 512 }
513 513
514 /* 514 /*
515 * This is like of_node_to_nid_single() for memory represented in the 515 * This is like of_node_to_nid_single() for memory represented in the
516 * ibm,dynamic-reconfiguration-memory node. 516 * ibm,dynamic-reconfiguration-memory node.
517 */ 517 */
518 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, 518 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
519 struct assoc_arrays *aa) 519 struct assoc_arrays *aa)
520 { 520 {
521 int default_nid = 0; 521 int default_nid = 0;
522 int nid = default_nid; 522 int nid = default_nid;
523 int index; 523 int index;
524 524
525 if (min_common_depth > 0 && min_common_depth <= aa->array_sz && 525 if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
526 !(drmem->flags & DRCONF_MEM_AI_INVALID) && 526 !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
527 drmem->aa_index < aa->n_arrays) { 527 drmem->aa_index < aa->n_arrays) {
528 index = drmem->aa_index * aa->array_sz + min_common_depth - 1; 528 index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
529 nid = aa->arrays[index]; 529 nid = aa->arrays[index];
530 530
531 if (nid == 0xffff || nid >= MAX_NUMNODES) 531 if (nid == 0xffff || nid >= MAX_NUMNODES)
532 nid = default_nid; 532 nid = default_nid;
533 } 533 }
534 534
535 return nid; 535 return nid;
536 } 536 }
537 537
538 /* 538 /*
539 * Figure out to which domain a cpu belongs and stick it there. 539 * Figure out to which domain a cpu belongs and stick it there.
540 * Return the id of the domain used. 540 * Return the id of the domain used.
541 */ 541 */
542 static int __cpuinit numa_setup_cpu(unsigned long lcpu) 542 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
543 { 543 {
544 int nid = 0; 544 int nid = 0;
545 struct device_node *cpu = of_get_cpu_node(lcpu, NULL); 545 struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
546 546
547 if (!cpu) { 547 if (!cpu) {
548 WARN_ON(1); 548 WARN_ON(1);
549 goto out; 549 goto out;
550 } 550 }
551 551
552 nid = of_node_to_nid_single(cpu); 552 nid = of_node_to_nid_single(cpu);
553 553
554 if (nid < 0 || !node_online(nid)) 554 if (nid < 0 || !node_online(nid))
555 nid = first_online_node; 555 nid = first_online_node;
556 out: 556 out:
557 map_cpu_to_node(lcpu, nid); 557 map_cpu_to_node(lcpu, nid);
558 558
559 of_node_put(cpu); 559 of_node_put(cpu);
560 560
561 return nid; 561 return nid;
562 } 562 }
563 563
564 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, 564 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
565 unsigned long action, 565 unsigned long action,
566 void *hcpu) 566 void *hcpu)
567 { 567 {
568 unsigned long lcpu = (unsigned long)hcpu; 568 unsigned long lcpu = (unsigned long)hcpu;
569 int ret = NOTIFY_DONE; 569 int ret = NOTIFY_DONE;
570 570
571 switch (action) { 571 switch (action) {
572 case CPU_UP_PREPARE: 572 case CPU_UP_PREPARE:
573 case CPU_UP_PREPARE_FROZEN: 573 case CPU_UP_PREPARE_FROZEN:
574 numa_setup_cpu(lcpu); 574 numa_setup_cpu(lcpu);
575 ret = NOTIFY_OK; 575 ret = NOTIFY_OK;
576 break; 576 break;
577 #ifdef CONFIG_HOTPLUG_CPU 577 #ifdef CONFIG_HOTPLUG_CPU
578 case CPU_DEAD: 578 case CPU_DEAD:
579 case CPU_DEAD_FROZEN: 579 case CPU_DEAD_FROZEN:
580 case CPU_UP_CANCELED: 580 case CPU_UP_CANCELED:
581 case CPU_UP_CANCELED_FROZEN: 581 case CPU_UP_CANCELED_FROZEN:
582 unmap_cpu_from_node(lcpu); 582 unmap_cpu_from_node(lcpu);
583 break; 583 break;
584 ret = NOTIFY_OK; 584 ret = NOTIFY_OK;
585 #endif 585 #endif
586 } 586 }
587 return ret; 587 return ret;
588 } 588 }
589 589
590 /* 590 /*
591 * Check and possibly modify a memory region to enforce the memory limit. 591 * Check and possibly modify a memory region to enforce the memory limit.
592 * 592 *
593 * Returns the size the region should have to enforce the memory limit. 593 * Returns the size the region should have to enforce the memory limit.
594 * This will either be the original value of size, a truncated value, 594 * This will either be the original value of size, a truncated value,
595 * or zero. If the returned value of size is 0 the region should be 595 * or zero. If the returned value of size is 0 the region should be
596 * discarded as it lies wholly above the memory limit. 596 * discarded as it lies wholly above the memory limit.
597 */ 597 */
598 static unsigned long __init numa_enforce_memory_limit(unsigned long start, 598 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
599 unsigned long size) 599 unsigned long size)
600 { 600 {
601 /* 601 /*
602 * We use memblock_end_of_DRAM() in here instead of memory_limit because 602 * We use memblock_end_of_DRAM() in here instead of memory_limit because
603 * we've already adjusted it for the limit and it takes care of 603 * we've already adjusted it for the limit and it takes care of
604 * having memory holes below the limit. Also, in the case of 604 * having memory holes below the limit. Also, in the case of
605 * iommu_is_off, memory_limit is not set but is implicitly enforced. 605 * iommu_is_off, memory_limit is not set but is implicitly enforced.
606 */ 606 */
607 607
608 if (start + size <= memblock_end_of_DRAM()) 608 if (start + size <= memblock_end_of_DRAM())
609 return size; 609 return size;
610 610
611 if (start >= memblock_end_of_DRAM()) 611 if (start >= memblock_end_of_DRAM())
612 return 0; 612 return 0;
613 613
614 return memblock_end_of_DRAM() - start; 614 return memblock_end_of_DRAM() - start;
615 } 615 }
616 616
617 /* 617 /*
618 * Reads the counter for a given entry in 618 * Reads the counter for a given entry in
619 * linux,drconf-usable-memory property 619 * linux,drconf-usable-memory property
620 */ 620 */
621 static inline int __init read_usm_ranges(const u32 **usm) 621 static inline int __init read_usm_ranges(const u32 **usm)
622 { 622 {
623 /* 623 /*
624 * For each lmb in ibm,dynamic-memory a corresponding 624 * For each lmb in ibm,dynamic-memory a corresponding
625 * entry in linux,drconf-usable-memory property contains 625 * entry in linux,drconf-usable-memory property contains
626 * a counter followed by that many (base, size) duple. 626 * a counter followed by that many (base, size) duple.
627 * read the counter from linux,drconf-usable-memory 627 * read the counter from linux,drconf-usable-memory
628 */ 628 */
629 return read_n_cells(n_mem_size_cells, usm); 629 return read_n_cells(n_mem_size_cells, usm);
630 } 630 }
631 631
632 /* 632 /*
633 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory 633 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
634 * node. This assumes n_mem_{addr,size}_cells have been set. 634 * node. This assumes n_mem_{addr,size}_cells have been set.
635 */ 635 */
636 static void __init parse_drconf_memory(struct device_node *memory) 636 static void __init parse_drconf_memory(struct device_node *memory)
637 { 637 {
638 const u32 *dm, *usm; 638 const u32 *dm, *usm;
639 unsigned int n, rc, ranges, is_kexec_kdump = 0; 639 unsigned int n, rc, ranges, is_kexec_kdump = 0;
640 unsigned long lmb_size, base, size, sz; 640 unsigned long lmb_size, base, size, sz;
641 int nid; 641 int nid;
642 struct assoc_arrays aa; 642 struct assoc_arrays aa;
643 643
644 n = of_get_drconf_memory(memory, &dm); 644 n = of_get_drconf_memory(memory, &dm);
645 if (!n) 645 if (!n)
646 return; 646 return;
647 647
648 lmb_size = of_get_lmb_size(memory); 648 lmb_size = of_get_lmb_size(memory);
649 if (!lmb_size) 649 if (!lmb_size)
650 return; 650 return;
651 651
652 rc = of_get_assoc_arrays(memory, &aa); 652 rc = of_get_assoc_arrays(memory, &aa);
653 if (rc) 653 if (rc)
654 return; 654 return;
655 655
656 /* check if this is a kexec/kdump kernel */ 656 /* check if this is a kexec/kdump kernel */
657 usm = of_get_usable_memory(memory); 657 usm = of_get_usable_memory(memory);
658 if (usm != NULL) 658 if (usm != NULL)
659 is_kexec_kdump = 1; 659 is_kexec_kdump = 1;
660 660
661 for (; n != 0; --n) { 661 for (; n != 0; --n) {
662 struct of_drconf_cell drmem; 662 struct of_drconf_cell drmem;
663 663
664 read_drconf_cell(&drmem, &dm); 664 read_drconf_cell(&drmem, &dm);
665 665
666 /* skip this block if the reserved bit is set in flags (0x80) 666 /* skip this block if the reserved bit is set in flags (0x80)
667 or if the block is not assigned to this partition (0x8) */ 667 or if the block is not assigned to this partition (0x8) */
668 if ((drmem.flags & DRCONF_MEM_RESERVED) 668 if ((drmem.flags & DRCONF_MEM_RESERVED)
669 || !(drmem.flags & DRCONF_MEM_ASSIGNED)) 669 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
670 continue; 670 continue;
671 671
672 base = drmem.base_addr; 672 base = drmem.base_addr;
673 size = lmb_size; 673 size = lmb_size;
674 ranges = 1; 674 ranges = 1;
675 675
676 if (is_kexec_kdump) { 676 if (is_kexec_kdump) {
677 ranges = read_usm_ranges(&usm); 677 ranges = read_usm_ranges(&usm);
678 if (!ranges) /* there are no (base, size) duple */ 678 if (!ranges) /* there are no (base, size) duple */
679 continue; 679 continue;
680 } 680 }
681 do { 681 do {
682 if (is_kexec_kdump) { 682 if (is_kexec_kdump) {
683 base = read_n_cells(n_mem_addr_cells, &usm); 683 base = read_n_cells(n_mem_addr_cells, &usm);
684 size = read_n_cells(n_mem_size_cells, &usm); 684 size = read_n_cells(n_mem_size_cells, &usm);
685 } 685 }
686 nid = of_drconf_to_nid_single(&drmem, &aa); 686 nid = of_drconf_to_nid_single(&drmem, &aa);
687 fake_numa_create_new_node( 687 fake_numa_create_new_node(
688 ((base + size) >> PAGE_SHIFT), 688 ((base + size) >> PAGE_SHIFT),
689 &nid); 689 &nid);
690 node_set_online(nid); 690 node_set_online(nid);
691 sz = numa_enforce_memory_limit(base, size); 691 sz = numa_enforce_memory_limit(base, size);
692 if (sz) 692 if (sz)
693 memblock_set_node(base, sz, nid); 693 memblock_set_node(base, sz, nid);
694 } while (--ranges); 694 } while (--ranges);
695 } 695 }
696 } 696 }
697 697
698 static int __init parse_numa_properties(void) 698 static int __init parse_numa_properties(void)
699 { 699 {
700 struct device_node *memory; 700 struct device_node *memory;
701 int default_nid = 0; 701 int default_nid = 0;
702 unsigned long i; 702 unsigned long i;
703 703
704 if (numa_enabled == 0) { 704 if (numa_enabled == 0) {
705 printk(KERN_WARNING "NUMA disabled by user\n"); 705 printk(KERN_WARNING "NUMA disabled by user\n");
706 return -1; 706 return -1;
707 } 707 }
708 708
709 min_common_depth = find_min_common_depth(); 709 min_common_depth = find_min_common_depth();
710 710
711 if (min_common_depth < 0) 711 if (min_common_depth < 0)
712 return min_common_depth; 712 return min_common_depth;
713 713
714 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); 714 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
715 715
716 /* 716 /*
717 * Even though we connect cpus to numa domains later in SMP 717 * Even though we connect cpus to numa domains later in SMP
718 * init, we need to know the node ids now. This is because 718 * init, we need to know the node ids now. This is because
719 * each node to be onlined must have NODE_DATA etc backing it. 719 * each node to be onlined must have NODE_DATA etc backing it.
720 */ 720 */
721 for_each_present_cpu(i) { 721 for_each_present_cpu(i) {
722 struct device_node *cpu; 722 struct device_node *cpu;
723 int nid; 723 int nid;
724 724
725 cpu = of_get_cpu_node(i, NULL); 725 cpu = of_get_cpu_node(i, NULL);
726 BUG_ON(!cpu); 726 BUG_ON(!cpu);
727 nid = of_node_to_nid_single(cpu); 727 nid = of_node_to_nid_single(cpu);
728 of_node_put(cpu); 728 of_node_put(cpu);
729 729
730 /* 730 /*
731 * Don't fall back to default_nid yet -- we will plug 731 * Don't fall back to default_nid yet -- we will plug
732 * cpus into nodes once the memory scan has discovered 732 * cpus into nodes once the memory scan has discovered
733 * the topology. 733 * the topology.
734 */ 734 */
735 if (nid < 0) 735 if (nid < 0)
736 continue; 736 continue;
737 node_set_online(nid); 737 node_set_online(nid);
738 } 738 }
739 739
740 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); 740 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
741 741
742 for_each_node_by_type(memory, "memory") { 742 for_each_node_by_type(memory, "memory") {
743 unsigned long start; 743 unsigned long start;
744 unsigned long size; 744 unsigned long size;
745 int nid; 745 int nid;
746 int ranges; 746 int ranges;
747 const unsigned int *memcell_buf; 747 const unsigned int *memcell_buf;
748 unsigned int len; 748 unsigned int len;
749 749
750 memcell_buf = of_get_property(memory, 750 memcell_buf = of_get_property(memory,
751 "linux,usable-memory", &len); 751 "linux,usable-memory", &len);
752 if (!memcell_buf || len <= 0) 752 if (!memcell_buf || len <= 0)
753 memcell_buf = of_get_property(memory, "reg", &len); 753 memcell_buf = of_get_property(memory, "reg", &len);
754 if (!memcell_buf || len <= 0) 754 if (!memcell_buf || len <= 0)
755 continue; 755 continue;
756 756
757 /* ranges in cell */ 757 /* ranges in cell */
758 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 758 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
759 new_range: 759 new_range:
760 /* these are order-sensitive, and modify the buffer pointer */ 760 /* these are order-sensitive, and modify the buffer pointer */
761 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 761 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
762 size = read_n_cells(n_mem_size_cells, &memcell_buf); 762 size = read_n_cells(n_mem_size_cells, &memcell_buf);
763 763
764 /* 764 /*
765 * Assumption: either all memory nodes or none will 765 * Assumption: either all memory nodes or none will
766 * have associativity properties. If none, then 766 * have associativity properties. If none, then
767 * everything goes to default_nid. 767 * everything goes to default_nid.
768 */ 768 */
769 nid = of_node_to_nid_single(memory); 769 nid = of_node_to_nid_single(memory);
770 if (nid < 0) 770 if (nid < 0)
771 nid = default_nid; 771 nid = default_nid;
772 772
773 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); 773 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
774 node_set_online(nid); 774 node_set_online(nid);
775 775
776 if (!(size = numa_enforce_memory_limit(start, size))) { 776 if (!(size = numa_enforce_memory_limit(start, size))) {
777 if (--ranges) 777 if (--ranges)
778 goto new_range; 778 goto new_range;
779 else 779 else
780 continue; 780 continue;
781 } 781 }
782 782
783 memblock_set_node(start, size, nid); 783 memblock_set_node(start, size, nid);
784 784
785 if (--ranges) 785 if (--ranges)
786 goto new_range; 786 goto new_range;
787 } 787 }
788 788
789 /* 789 /*
790 * Now do the same thing for each MEMBLOCK listed in the 790 * Now do the same thing for each MEMBLOCK listed in the
791 * ibm,dynamic-memory property in the 791 * ibm,dynamic-memory property in the
792 * ibm,dynamic-reconfiguration-memory node. 792 * ibm,dynamic-reconfiguration-memory node.
793 */ 793 */
794 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 794 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
795 if (memory) 795 if (memory)
796 parse_drconf_memory(memory); 796 parse_drconf_memory(memory);
797 797
798 return 0; 798 return 0;
799 } 799 }
800 800
801 static void __init setup_nonnuma(void) 801 static void __init setup_nonnuma(void)
802 { 802 {
803 unsigned long top_of_ram = memblock_end_of_DRAM(); 803 unsigned long top_of_ram = memblock_end_of_DRAM();
804 unsigned long total_ram = memblock_phys_mem_size(); 804 unsigned long total_ram = memblock_phys_mem_size();
805 unsigned long start_pfn, end_pfn; 805 unsigned long start_pfn, end_pfn;
806 unsigned int nid = 0; 806 unsigned int nid = 0;
807 struct memblock_region *reg; 807 struct memblock_region *reg;
808 808
809 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 809 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
810 top_of_ram, total_ram); 810 top_of_ram, total_ram);
811 printk(KERN_DEBUG "Memory hole size: %ldMB\n", 811 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
812 (top_of_ram - total_ram) >> 20); 812 (top_of_ram - total_ram) >> 20);
813 813
814 for_each_memblock(memory, reg) { 814 for_each_memblock(memory, reg) {
815 start_pfn = memblock_region_memory_base_pfn(reg); 815 start_pfn = memblock_region_memory_base_pfn(reg);
816 end_pfn = memblock_region_memory_end_pfn(reg); 816 end_pfn = memblock_region_memory_end_pfn(reg);
817 817
818 fake_numa_create_new_node(end_pfn, &nid); 818 fake_numa_create_new_node(end_pfn, &nid);
819 memblock_set_node(PFN_PHYS(start_pfn), 819 memblock_set_node(PFN_PHYS(start_pfn),
820 PFN_PHYS(end_pfn - start_pfn), nid); 820 PFN_PHYS(end_pfn - start_pfn), nid);
821 node_set_online(nid); 821 node_set_online(nid);
822 } 822 }
823 } 823 }
824 824
825 void __init dump_numa_cpu_topology(void) 825 void __init dump_numa_cpu_topology(void)
826 { 826 {
827 unsigned int node; 827 unsigned int node;
828 unsigned int cpu, count; 828 unsigned int cpu, count;
829 829
830 if (min_common_depth == -1 || !numa_enabled) 830 if (min_common_depth == -1 || !numa_enabled)
831 return; 831 return;
832 832
833 for_each_online_node(node) { 833 for_each_online_node(node) {
834 printk(KERN_DEBUG "Node %d CPUs:", node); 834 printk(KERN_DEBUG "Node %d CPUs:", node);
835 835
836 count = 0; 836 count = 0;
837 /* 837 /*
838 * If we used a CPU iterator here we would miss printing 838 * If we used a CPU iterator here we would miss printing
839 * the holes in the cpumap. 839 * the holes in the cpumap.
840 */ 840 */
841 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 841 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
842 if (cpumask_test_cpu(cpu, 842 if (cpumask_test_cpu(cpu,
843 node_to_cpumask_map[node])) { 843 node_to_cpumask_map[node])) {
844 if (count == 0) 844 if (count == 0)
845 printk(" %u", cpu); 845 printk(" %u", cpu);
846 ++count; 846 ++count;
847 } else { 847 } else {
848 if (count > 1) 848 if (count > 1)
849 printk("-%u", cpu - 1); 849 printk("-%u", cpu - 1);
850 count = 0; 850 count = 0;
851 } 851 }
852 } 852 }
853 853
854 if (count > 1) 854 if (count > 1)
855 printk("-%u", nr_cpu_ids - 1); 855 printk("-%u", nr_cpu_ids - 1);
856 printk("\n"); 856 printk("\n");
857 } 857 }
858 } 858 }
859 859
860 static void __init dump_numa_memory_topology(void) 860 static void __init dump_numa_memory_topology(void)
861 { 861 {
862 unsigned int node; 862 unsigned int node;
863 unsigned int count; 863 unsigned int count;
864 864
865 if (min_common_depth == -1 || !numa_enabled) 865 if (min_common_depth == -1 || !numa_enabled)
866 return; 866 return;
867 867
868 for_each_online_node(node) { 868 for_each_online_node(node) {
869 unsigned long i; 869 unsigned long i;
870 870
871 printk(KERN_DEBUG "Node %d Memory:", node); 871 printk(KERN_DEBUG "Node %d Memory:", node);
872 872
873 count = 0; 873 count = 0;
874 874
875 for (i = 0; i < memblock_end_of_DRAM(); 875 for (i = 0; i < memblock_end_of_DRAM();
876 i += (1 << SECTION_SIZE_BITS)) { 876 i += (1 << SECTION_SIZE_BITS)) {
877 if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { 877 if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
878 if (count == 0) 878 if (count == 0)
879 printk(" 0x%lx", i); 879 printk(" 0x%lx", i);
880 ++count; 880 ++count;
881 } else { 881 } else {
882 if (count > 0) 882 if (count > 0)
883 printk("-0x%lx", i); 883 printk("-0x%lx", i);
884 count = 0; 884 count = 0;
885 } 885 }
886 } 886 }
887 887
888 if (count > 0) 888 if (count > 0)
889 printk("-0x%lx", i); 889 printk("-0x%lx", i);
890 printk("\n"); 890 printk("\n");
891 } 891 }
892 } 892 }
893 893
894 /* 894 /*
895 * Allocate some memory, satisfying the memblock or bootmem allocator where 895 * Allocate some memory, satisfying the memblock or bootmem allocator where
896 * required. nid is the preferred node and end is the physical address of 896 * required. nid is the preferred node and end is the physical address of
897 * the highest address in the node. 897 * the highest address in the node.
898 * 898 *
899 * Returns the virtual address of the memory. 899 * Returns the virtual address of the memory.
900 */ 900 */
901 static void __init *careful_zallocation(int nid, unsigned long size, 901 static void __init *careful_zallocation(int nid, unsigned long size,
902 unsigned long align, 902 unsigned long align,
903 unsigned long end_pfn) 903 unsigned long end_pfn)
904 { 904 {
905 void *ret; 905 void *ret;
906 int new_nid; 906 int new_nid;
907 unsigned long ret_paddr; 907 unsigned long ret_paddr;
908 908
909 ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT); 909 ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
910 910
911 /* retry over all memory */ 911 /* retry over all memory */
912 if (!ret_paddr) 912 if (!ret_paddr)
913 ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM()); 913 ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
914 914
915 if (!ret_paddr) 915 if (!ret_paddr)
916 panic("numa.c: cannot allocate %lu bytes for node %d", 916 panic("numa.c: cannot allocate %lu bytes for node %d",
917 size, nid); 917 size, nid);
918 918
919 ret = __va(ret_paddr); 919 ret = __va(ret_paddr);
920 920
921 /* 921 /*
922 * We initialize the nodes in numeric order: 0, 1, 2... 922 * We initialize the nodes in numeric order: 0, 1, 2...
923 * and hand over control from the MEMBLOCK allocator to the 923 * and hand over control from the MEMBLOCK allocator to the
924 * bootmem allocator. If this function is called for 924 * bootmem allocator. If this function is called for
925 * node 5, then we know that all nodes <5 are using the 925 * node 5, then we know that all nodes <5 are using the
926 * bootmem allocator instead of the MEMBLOCK allocator. 926 * bootmem allocator instead of the MEMBLOCK allocator.
927 * 927 *
928 * So, check the nid from which this allocation came 928 * So, check the nid from which this allocation came
929 * and double check to see if we need to use bootmem 929 * and double check to see if we need to use bootmem
930 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory 930 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
931 * since it would be useless. 931 * since it would be useless.
932 */ 932 */
933 new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT); 933 new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
934 if (new_nid < nid) { 934 if (new_nid < nid) {
935 ret = __alloc_bootmem_node(NODE_DATA(new_nid), 935 ret = __alloc_bootmem_node(NODE_DATA(new_nid),
936 size, align, 0); 936 size, align, 0);
937 937
938 dbg("alloc_bootmem %p %lx\n", ret, size); 938 dbg("alloc_bootmem %p %lx\n", ret, size);
939 } 939 }
940 940
941 memset(ret, 0, size); 941 memset(ret, 0, size);
942 return ret; 942 return ret;
943 } 943 }
944 944
945 static struct notifier_block __cpuinitdata ppc64_numa_nb = { 945 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
946 .notifier_call = cpu_numa_callback, 946 .notifier_call = cpu_numa_callback,
947 .priority = 1 /* Must run before sched domains notifier. */ 947 .priority = 1 /* Must run before sched domains notifier. */
948 }; 948 };
949 949
950 static void __init mark_reserved_regions_for_nid(int nid) 950 static void __init mark_reserved_regions_for_nid(int nid)
951 { 951 {
952 struct pglist_data *node = NODE_DATA(nid); 952 struct pglist_data *node = NODE_DATA(nid);
953 struct memblock_region *reg; 953 struct memblock_region *reg;
954 954
955 for_each_memblock(reserved, reg) { 955 for_each_memblock(reserved, reg) {
956 unsigned long physbase = reg->base; 956 unsigned long physbase = reg->base;
957 unsigned long size = reg->size; 957 unsigned long size = reg->size;
958 unsigned long start_pfn = physbase >> PAGE_SHIFT; 958 unsigned long start_pfn = physbase >> PAGE_SHIFT;
959 unsigned long end_pfn = PFN_UP(physbase + size); 959 unsigned long end_pfn = PFN_UP(physbase + size);
960 struct node_active_region node_ar; 960 struct node_active_region node_ar;
961 unsigned long node_end_pfn = node->node_start_pfn + 961 unsigned long node_end_pfn = node->node_start_pfn +
962 node->node_spanned_pages; 962 node->node_spanned_pages;
963 963
964 /* 964 /*
965 * Check to make sure that this memblock.reserved area is 965 * Check to make sure that this memblock.reserved area is
966 * within the bounds of the node that we care about. 966 * within the bounds of the node that we care about.
967 * Checking the nid of the start and end points is not 967 * Checking the nid of the start and end points is not
968 * sufficient because the reserved area could span the 968 * sufficient because the reserved area could span the
969 * entire node. 969 * entire node.
970 */ 970 */
971 if (end_pfn <= node->node_start_pfn || 971 if (end_pfn <= node->node_start_pfn ||
972 start_pfn >= node_end_pfn) 972 start_pfn >= node_end_pfn)
973 continue; 973 continue;
974 974
975 get_node_active_region(start_pfn, &node_ar); 975 get_node_active_region(start_pfn, &node_ar);
976 while (start_pfn < end_pfn && 976 while (start_pfn < end_pfn &&
977 node_ar.start_pfn < node_ar.end_pfn) { 977 node_ar.start_pfn < node_ar.end_pfn) {
978 unsigned long reserve_size = size; 978 unsigned long reserve_size = size;
979 /* 979 /*
980 * if reserved region extends past active region 980 * if reserved region extends past active region
981 * then trim size to active region 981 * then trim size to active region
982 */ 982 */
983 if (end_pfn > node_ar.end_pfn) 983 if (end_pfn > node_ar.end_pfn)
984 reserve_size = (node_ar.end_pfn << PAGE_SHIFT) 984 reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
985 - physbase; 985 - physbase;
986 /* 986 /*
987 * Only worry about *this* node, others may not 987 * Only worry about *this* node, others may not
988 * yet have valid NODE_DATA(). 988 * yet have valid NODE_DATA().
989 */ 989 */
990 if (node_ar.nid == nid) { 990 if (node_ar.nid == nid) {
991 dbg("reserve_bootmem %lx %lx nid=%d\n", 991 dbg("reserve_bootmem %lx %lx nid=%d\n",
992 physbase, reserve_size, node_ar.nid); 992 physbase, reserve_size, node_ar.nid);
993 reserve_bootmem_node(NODE_DATA(node_ar.nid), 993 reserve_bootmem_node(NODE_DATA(node_ar.nid),
994 physbase, reserve_size, 994 physbase, reserve_size,
995 BOOTMEM_DEFAULT); 995 BOOTMEM_DEFAULT);
996 } 996 }
997 /* 997 /*
998 * if reserved region is contained in the active region 998 * if reserved region is contained in the active region
999 * then done. 999 * then done.
1000 */ 1000 */
1001 if (end_pfn <= node_ar.end_pfn) 1001 if (end_pfn <= node_ar.end_pfn)
1002 break; 1002 break;
1003 1003
1004 /* 1004 /*
1005 * reserved region extends past the active region 1005 * reserved region extends past the active region
1006 * get next active region that contains this 1006 * get next active region that contains this
1007 * reserved region 1007 * reserved region
1008 */ 1008 */
1009 start_pfn = node_ar.end_pfn; 1009 start_pfn = node_ar.end_pfn;
1010 physbase = start_pfn << PAGE_SHIFT; 1010 physbase = start_pfn << PAGE_SHIFT;
1011 size = size - reserve_size; 1011 size = size - reserve_size;
1012 get_node_active_region(start_pfn, &node_ar); 1012 get_node_active_region(start_pfn, &node_ar);
1013 } 1013 }
1014 } 1014 }
1015 } 1015 }
1016 1016
1017 1017
1018 void __init do_init_bootmem(void) 1018 void __init do_init_bootmem(void)
1019 { 1019 {
1020 int nid; 1020 int nid;
1021 1021
1022 min_low_pfn = 0; 1022 min_low_pfn = 0;
1023 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1023 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1024 max_pfn = max_low_pfn; 1024 max_pfn = max_low_pfn;
1025 1025
1026 if (parse_numa_properties()) 1026 if (parse_numa_properties())
1027 setup_nonnuma(); 1027 setup_nonnuma();
1028 else 1028 else
1029 dump_numa_memory_topology(); 1029 dump_numa_memory_topology();
1030 1030
1031 for_each_online_node(nid) { 1031 for_each_online_node(nid) {
1032 unsigned long start_pfn, end_pfn; 1032 unsigned long start_pfn, end_pfn;
1033 void *bootmem_vaddr; 1033 void *bootmem_vaddr;
1034 unsigned long bootmap_pages; 1034 unsigned long bootmap_pages;
1035 1035
1036 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1036 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1037 1037
1038 /* 1038 /*
1039 * Allocate the node structure node local if possible 1039 * Allocate the node structure node local if possible
1040 * 1040 *
1041 * Be careful moving this around, as it relies on all 1041 * Be careful moving this around, as it relies on all
1042 * previous nodes' bootmem to be initialized and have 1042 * previous nodes' bootmem to be initialized and have
1043 * all reserved areas marked. 1043 * all reserved areas marked.
1044 */ 1044 */
1045 NODE_DATA(nid) = careful_zallocation(nid, 1045 NODE_DATA(nid) = careful_zallocation(nid,
1046 sizeof(struct pglist_data), 1046 sizeof(struct pglist_data),
1047 SMP_CACHE_BYTES, end_pfn); 1047 SMP_CACHE_BYTES, end_pfn);
1048 1048
1049 dbg("node %d\n", nid); 1049 dbg("node %d\n", nid);
1050 dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); 1050 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
1051 1051
1052 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 1052 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
1053 NODE_DATA(nid)->node_start_pfn = start_pfn; 1053 NODE_DATA(nid)->node_start_pfn = start_pfn;
1054 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; 1054 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
1055 1055
1056 if (NODE_DATA(nid)->node_spanned_pages == 0) 1056 if (NODE_DATA(nid)->node_spanned_pages == 0)
1057 continue; 1057 continue;
1058 1058
1059 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); 1059 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
1060 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); 1060 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
1061 1061
1062 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 1062 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
1063 bootmem_vaddr = careful_zallocation(nid, 1063 bootmem_vaddr = careful_zallocation(nid,
1064 bootmap_pages << PAGE_SHIFT, 1064 bootmap_pages << PAGE_SHIFT,
1065 PAGE_SIZE, end_pfn); 1065 PAGE_SIZE, end_pfn);
1066 1066
1067 dbg("bootmap_vaddr = %p\n", bootmem_vaddr); 1067 dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
1068 1068
1069 init_bootmem_node(NODE_DATA(nid), 1069 init_bootmem_node(NODE_DATA(nid),
1070 __pa(bootmem_vaddr) >> PAGE_SHIFT, 1070 __pa(bootmem_vaddr) >> PAGE_SHIFT,
1071 start_pfn, end_pfn); 1071 start_pfn, end_pfn);
1072 1072
1073 free_bootmem_with_active_regions(nid, end_pfn); 1073 free_bootmem_with_active_regions(nid, end_pfn);
1074 /* 1074 /*
1075 * Be very careful about moving this around. Future 1075 * Be very careful about moving this around. Future
1076 * calls to careful_zallocation() depend on this getting 1076 * calls to careful_zallocation() depend on this getting
1077 * done correctly. 1077 * done correctly.
1078 */ 1078 */
1079 mark_reserved_regions_for_nid(nid); 1079 mark_reserved_regions_for_nid(nid);
1080 sparse_memory_present_with_active_regions(nid); 1080 sparse_memory_present_with_active_regions(nid);
1081 } 1081 }
1082 1082
1083 init_bootmem_done = 1; 1083 init_bootmem_done = 1;
1084 1084
1085 /* 1085 /*
1086 * Now bootmem is initialised we can create the node to cpumask 1086 * Now bootmem is initialised we can create the node to cpumask
1087 * lookup tables and setup the cpu callback to populate them. 1087 * lookup tables and setup the cpu callback to populate them.
1088 */ 1088 */
1089 setup_node_to_cpumask_map(); 1089 setup_node_to_cpumask_map();
1090 1090
1091 register_cpu_notifier(&ppc64_numa_nb); 1091 register_cpu_notifier(&ppc64_numa_nb);
1092 cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, 1092 cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
1093 (void *)(unsigned long)boot_cpuid); 1093 (void *)(unsigned long)boot_cpuid);
1094 } 1094 }
1095 1095
1096 void __init paging_init(void) 1096 void __init paging_init(void)
1097 { 1097 {
1098 unsigned long max_zone_pfns[MAX_NR_ZONES]; 1098 unsigned long max_zone_pfns[MAX_NR_ZONES];
1099 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 1099 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1100 max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT; 1100 max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
1101 free_area_init_nodes(max_zone_pfns); 1101 free_area_init_nodes(max_zone_pfns);
1102 } 1102 }
1103 1103
1104 static int __init early_numa(char *p) 1104 static int __init early_numa(char *p)
1105 { 1105 {
1106 if (!p) 1106 if (!p)
1107 return 0; 1107 return 0;
1108 1108
1109 if (strstr(p, "off")) 1109 if (strstr(p, "off"))
1110 numa_enabled = 0; 1110 numa_enabled = 0;
1111 1111
1112 if (strstr(p, "debug")) 1112 if (strstr(p, "debug"))
1113 numa_debug = 1; 1113 numa_debug = 1;
1114 1114
1115 p = strstr(p, "fake="); 1115 p = strstr(p, "fake=");
1116 if (p) 1116 if (p)
1117 cmdline = p + strlen("fake="); 1117 cmdline = p + strlen("fake=");
1118 1118
1119 return 0; 1119 return 0;
1120 } 1120 }
1121 early_param("numa", early_numa); 1121 early_param("numa", early_numa);
1122 1122
1123 #ifdef CONFIG_MEMORY_HOTPLUG 1123 #ifdef CONFIG_MEMORY_HOTPLUG
1124 /* 1124 /*
1125 * Find the node associated with a hot added memory section for 1125 * Find the node associated with a hot added memory section for
1126 * memory represented in the device tree by the property 1126 * memory represented in the device tree by the property
1127 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. 1127 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
1128 */ 1128 */
1129 static int hot_add_drconf_scn_to_nid(struct device_node *memory, 1129 static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1130 unsigned long scn_addr) 1130 unsigned long scn_addr)
1131 { 1131 {
1132 const u32 *dm; 1132 const u32 *dm;
1133 unsigned int drconf_cell_cnt, rc; 1133 unsigned int drconf_cell_cnt, rc;
1134 unsigned long lmb_size; 1134 unsigned long lmb_size;
1135 struct assoc_arrays aa; 1135 struct assoc_arrays aa;
1136 int nid = -1; 1136 int nid = -1;
1137 1137
1138 drconf_cell_cnt = of_get_drconf_memory(memory, &dm); 1138 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1139 if (!drconf_cell_cnt) 1139 if (!drconf_cell_cnt)
1140 return -1; 1140 return -1;
1141 1141
1142 lmb_size = of_get_lmb_size(memory); 1142 lmb_size = of_get_lmb_size(memory);
1143 if (!lmb_size) 1143 if (!lmb_size)
1144 return -1; 1144 return -1;
1145 1145
1146 rc = of_get_assoc_arrays(memory, &aa); 1146 rc = of_get_assoc_arrays(memory, &aa);
1147 if (rc) 1147 if (rc)
1148 return -1; 1148 return -1;
1149 1149
1150 for (; drconf_cell_cnt != 0; --drconf_cell_cnt) { 1150 for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
1151 struct of_drconf_cell drmem; 1151 struct of_drconf_cell drmem;
1152 1152
1153 read_drconf_cell(&drmem, &dm); 1153 read_drconf_cell(&drmem, &dm);
1154 1154
1155 /* skip this block if it is reserved or not assigned to 1155 /* skip this block if it is reserved or not assigned to
1156 * this partition */ 1156 * this partition */
1157 if ((drmem.flags & DRCONF_MEM_RESERVED) 1157 if ((drmem.flags & DRCONF_MEM_RESERVED)
1158 || !(drmem.flags & DRCONF_MEM_ASSIGNED)) 1158 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1159 continue; 1159 continue;
1160 1160
1161 if ((scn_addr < drmem.base_addr) 1161 if ((scn_addr < drmem.base_addr)
1162 || (scn_addr >= (drmem.base_addr + lmb_size))) 1162 || (scn_addr >= (drmem.base_addr + lmb_size)))
1163 continue; 1163 continue;
1164 1164
1165 nid = of_drconf_to_nid_single(&drmem, &aa); 1165 nid = of_drconf_to_nid_single(&drmem, &aa);
1166 break; 1166 break;
1167 } 1167 }
1168 1168
1169 return nid; 1169 return nid;
1170 } 1170 }
1171 1171
1172 /* 1172 /*
1173 * Find the node associated with a hot added memory section for memory 1173 * Find the node associated with a hot added memory section for memory
1174 * represented in the device tree as a node (i.e. memory@XXXX) for 1174 * represented in the device tree as a node (i.e. memory@XXXX) for
1175 * each memblock. 1175 * each memblock.
1176 */ 1176 */
1177 int hot_add_node_scn_to_nid(unsigned long scn_addr) 1177 int hot_add_node_scn_to_nid(unsigned long scn_addr)
1178 { 1178 {
1179 struct device_node *memory; 1179 struct device_node *memory;
1180 int nid = -1; 1180 int nid = -1;
1181 1181
1182 for_each_node_by_type(memory, "memory") { 1182 for_each_node_by_type(memory, "memory") {
1183 unsigned long start, size; 1183 unsigned long start, size;
1184 int ranges; 1184 int ranges;
1185 const unsigned int *memcell_buf; 1185 const unsigned int *memcell_buf;
1186 unsigned int len; 1186 unsigned int len;
1187 1187
1188 memcell_buf = of_get_property(memory, "reg", &len); 1188 memcell_buf = of_get_property(memory, "reg", &len);
1189 if (!memcell_buf || len <= 0) 1189 if (!memcell_buf || len <= 0)
1190 continue; 1190 continue;
1191 1191
1192 /* ranges in cell */ 1192 /* ranges in cell */
1193 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 1193 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1194 1194
1195 while (ranges--) { 1195 while (ranges--) {
1196 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 1196 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1197 size = read_n_cells(n_mem_size_cells, &memcell_buf); 1197 size = read_n_cells(n_mem_size_cells, &memcell_buf);
1198 1198
1199 if ((scn_addr < start) || (scn_addr >= (start + size))) 1199 if ((scn_addr < start) || (scn_addr >= (start + size)))
1200 continue; 1200 continue;
1201 1201
1202 nid = of_node_to_nid_single(memory); 1202 nid = of_node_to_nid_single(memory);
1203 break; 1203 break;
1204 } 1204 }
1205 1205
1206 if (nid >= 0) 1206 if (nid >= 0)
1207 break; 1207 break;
1208 } 1208 }
1209 1209
1210 of_node_put(memory); 1210 of_node_put(memory);
1211 1211
1212 return nid; 1212 return nid;
1213 } 1213 }
1214 1214
1215 /* 1215 /*
1216 * Find the node associated with a hot added memory section. Section 1216 * Find the node associated with a hot added memory section. Section
1217 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that 1217 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
1218 * sections are fully contained within a single MEMBLOCK. 1218 * sections are fully contained within a single MEMBLOCK.
1219 */ 1219 */
1220 int hot_add_scn_to_nid(unsigned long scn_addr) 1220 int hot_add_scn_to_nid(unsigned long scn_addr)
1221 { 1221 {
1222 struct device_node *memory = NULL; 1222 struct device_node *memory = NULL;
1223 int nid, found = 0; 1223 int nid, found = 0;
1224 1224
1225 if (!numa_enabled || (min_common_depth < 0)) 1225 if (!numa_enabled || (min_common_depth < 0))
1226 return first_online_node; 1226 return first_online_node;
1227 1227
1228 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1228 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1229 if (memory) { 1229 if (memory) {
1230 nid = hot_add_drconf_scn_to_nid(memory, scn_addr); 1230 nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1231 of_node_put(memory); 1231 of_node_put(memory);
1232 } else { 1232 } else {
1233 nid = hot_add_node_scn_to_nid(scn_addr); 1233 nid = hot_add_node_scn_to_nid(scn_addr);
1234 } 1234 }
1235 1235
1236 if (nid < 0 || !node_online(nid)) 1236 if (nid < 0 || !node_online(nid))
1237 nid = first_online_node; 1237 nid = first_online_node;
1238 1238
1239 if (NODE_DATA(nid)->node_spanned_pages) 1239 if (NODE_DATA(nid)->node_spanned_pages)
1240 return nid; 1240 return nid;
1241 1241
1242 for_each_online_node(nid) { 1242 for_each_online_node(nid) {
1243 if (NODE_DATA(nid)->node_spanned_pages) { 1243 if (NODE_DATA(nid)->node_spanned_pages) {
1244 found = 1; 1244 found = 1;
1245 break; 1245 break;
1246 } 1246 }
1247 } 1247 }
1248 1248
1249 BUG_ON(!found); 1249 BUG_ON(!found);
1250 return nid; 1250 return nid;
1251 } 1251 }
1252 1252
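hot_add_scn_to_nid() above tries the nid derived from the device tree first, then falls back to first_online_node, and finally to any online node that actually spans memory. A small userspace sketch of that fallback order, with toy node-state arrays standing in for node_online() and NODE_DATA() (illustrative only):

#include <stdio.h>

/* Illustrative stand-ins for the kernel's node state (not kernel API). */
#define NR_NODES 4
static int  node_online_flag[NR_NODES]   = { 1, 0, 1, 0 };
static long node_spanned_pages[NR_NODES] = { 0, 0, 4096, 0 };
static const int first_online = 0;

/* Mirror of the fallback order: use the derived nid if valid and online,
 * otherwise the first online node, otherwise any online node with memory. */
static int pick_nid(int derived_nid)
{
        int nid = derived_nid;

        if (nid < 0 || nid >= NR_NODES || !node_online_flag[nid])
                nid = first_online;

        if (node_spanned_pages[nid])
                return nid;

        for (nid = 0; nid < NR_NODES; nid++)
                if (node_online_flag[nid] && node_spanned_pages[nid])
                        return nid;

        return -1;      /* the kernel BUG()s at this point instead */
}

int main(void)
{
        printf("derived=-1 -> nid %d\n", pick_nid(-1));  /* falls through to node 2 */
        printf("derived=2  -> nid %d\n", pick_nid(2));   /* already a good choice */
        return 0;
}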
1253 static u64 hot_add_drconf_memory_max(void) 1253 static u64 hot_add_drconf_memory_max(void)
1254 { 1254 {
1255 struct device_node *memory = NULL; 1255 struct device_node *memory = NULL;
1256 unsigned int drconf_cell_cnt = 0; 1256 unsigned int drconf_cell_cnt = 0;
1257 u64 lmb_size = 0; 1257 u64 lmb_size = 0;
1258 const u32 *dm = 0; 1258 const u32 *dm = 0;
1259 1259
1260 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1260 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1261 if (memory) { 1261 if (memory) {
1262 drconf_cell_cnt = of_get_drconf_memory(memory, &dm); 1262 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1263 lmb_size = of_get_lmb_size(memory); 1263 lmb_size = of_get_lmb_size(memory);
1264 of_node_put(memory); 1264 of_node_put(memory);
1265 } 1265 }
1266 return lmb_size * drconf_cell_cnt; 1266 return lmb_size * drconf_cell_cnt;
1267 } 1267 }
1268 1268
1269 /* 1269 /*
1270 * memory_hotplug_max - return max address of memory that may be added 1270 * memory_hotplug_max - return max address of memory that may be added
1271 * 1271 *
1272 * This is currently only used on systems that support drconfig memory 1272 * This is currently only used on systems that support drconfig memory
1273 * hotplug. 1273 * hotplug.
1274 */ 1274 */
1275 u64 memory_hotplug_max(void) 1275 u64 memory_hotplug_max(void)
1276 { 1276 {
1277 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); 1277 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1278 } 1278 }
1279 #endif /* CONFIG_MEMORY_HOTPLUG */ 1279 #endif /* CONFIG_MEMORY_HOTPLUG */
1280 1280
1281 /* Virtual Processor Home Node (VPHN) support */ 1281 /* Virtual Processor Home Node (VPHN) support */
1282 #ifdef CONFIG_PPC_SPLPAR 1282 #ifdef CONFIG_PPC_SPLPAR
1283 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; 1283 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
1284 static cpumask_t cpu_associativity_changes_mask; 1284 static cpumask_t cpu_associativity_changes_mask;
1285 static int vphn_enabled; 1285 static int vphn_enabled;
1286 static void set_topology_timer(void); 1286 static void set_topology_timer(void);
1287 1287
1288 /* 1288 /*
1289 * Store the current values of the associativity change counters in the 1289 * Store the current values of the associativity change counters in the
1290 * hypervisor. 1290 * hypervisor.
1291 */ 1291 */
1292 static void setup_cpu_associativity_change_counters(void) 1292 static void setup_cpu_associativity_change_counters(void)
1293 { 1293 {
1294 int cpu; 1294 int cpu;
1295 1295
1296 /* The VPHN feature supports a maximum of 8 reference points */ 1296 /* The VPHN feature supports a maximum of 8 reference points */
1297 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); 1297 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1298 1298
1299 for_each_possible_cpu(cpu) { 1299 for_each_possible_cpu(cpu) {
1300 int i; 1300 int i;
1301 u8 *counts = vphn_cpu_change_counts[cpu]; 1301 u8 *counts = vphn_cpu_change_counts[cpu];
1302 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; 1302 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1303 1303
1304 for (i = 0; i < distance_ref_points_depth; i++) 1304 for (i = 0; i < distance_ref_points_depth; i++)
1305 counts[i] = hypervisor_counts[i]; 1305 counts[i] = hypervisor_counts[i];
1306 } 1306 }
1307 } 1307 }
1308 1308
1309 /* 1309 /*
1310 * The hypervisor maintains a set of 8 associativity change counters in 1310 * The hypervisor maintains a set of 8 associativity change counters in
1311 * the VPA of each cpu that correspond to the associativity levels in the 1311 * the VPA of each cpu that correspond to the associativity levels in the
1312 * ibm,associativity-reference-points property. When an associativity 1312 * ibm,associativity-reference-points property. When an associativity
1313 * level changes, the corresponding counter is incremented. 1313 * level changes, the corresponding counter is incremented.
1314 * 1314 *
1315 * Set a bit in cpu_associativity_changes_mask for each cpu whose home 1315 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1316 * node associativity levels have changed. 1316 * node associativity levels have changed.
1317 * 1317 *
1318 * Returns the number of cpus with unhandled associativity changes. 1318 * Returns the number of cpus with unhandled associativity changes.
1319 */ 1319 */
1320 static int update_cpu_associativity_changes_mask(void) 1320 static int update_cpu_associativity_changes_mask(void)
1321 { 1321 {
1322 int cpu, nr_cpus = 0; 1322 int cpu, nr_cpus = 0;
1323 cpumask_t *changes = &cpu_associativity_changes_mask; 1323 cpumask_t *changes = &cpu_associativity_changes_mask;
1324 1324
1325 cpumask_clear(changes); 1325 cpumask_clear(changes);
1326 1326
1327 for_each_possible_cpu(cpu) { 1327 for_each_possible_cpu(cpu) {
1328 int i, changed = 0; 1328 int i, changed = 0;
1329 u8 *counts = vphn_cpu_change_counts[cpu]; 1329 u8 *counts = vphn_cpu_change_counts[cpu];
1330 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; 1330 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1331 1331
1332 for (i = 0; i < distance_ref_points_depth; i++) { 1332 for (i = 0; i < distance_ref_points_depth; i++) {
1333 if (hypervisor_counts[i] != counts[i]) { 1333 if (hypervisor_counts[i] != counts[i]) {
1334 counts[i] = hypervisor_counts[i]; 1334 counts[i] = hypervisor_counts[i];
1335 changed = 1; 1335 changed = 1;
1336 } 1336 }
1337 } 1337 }
1338 if (changed) { 1338 if (changed) {
1339 cpumask_set_cpu(cpu, changes); 1339 cpumask_set_cpu(cpu, changes);
1340 nr_cpus++; 1340 nr_cpus++;
1341 } 1341 }
1342 } 1342 }
1343 1343
1344 return nr_cpus; 1344 return nr_cpus;
1345 } 1345 }
1346 1346
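The function above snapshots the per-cpu VPHN counters and flags the CPUs whose counters moved since the last poll. A compact userspace model of that compare-and-refresh step, with a plain bitmask standing in for cpumask_t and a 4-CPU, 3-level toy configuration (illustrative only):

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4
#define DEPTH   3

/* Saved snapshot vs. the counters the hypervisor would expose in the VPA. */
static uint8_t saved[NR_CPUS][DEPTH];
static uint8_t hv[NR_CPUS][DEPTH];

/* Refresh the snapshot and report which CPUs saw any counter change. */
static int update_changes(unsigned long *mask)
{
        int cpu, i, nr = 0;

        *mask = 0;
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                int changed = 0;

                for (i = 0; i < DEPTH; i++) {
                        if (hv[cpu][i] != saved[cpu][i]) {
                                saved[cpu][i] = hv[cpu][i];
                                changed = 1;
                        }
                }
                if (changed) {
                        *mask |= 1UL << cpu;
                        nr++;
                }
        }
        return nr;
}

int main(void)
{
        unsigned long mask;

        hv[2][1]++;                             /* pretend cpu 2 was moved */
        printf("%d changed, mask=%#lx\n", update_changes(&mask), mask);
        return 0;
}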
1347 /* 1347 /*
1348 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form 1348 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
1349 * the complete property we have to add the length in the first cell. 1349 * the complete property we have to add the length in the first cell.
1350 */ 1350 */
1351 #define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1) 1351 #define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
1352 1352
1353 /* 1353 /*
1354 * Convert the associativity domain numbers returned from the hypervisor 1354 * Convert the associativity domain numbers returned from the hypervisor
1355 * to the sequence they would appear in the ibm,associativity property. 1355 * to the sequence they would appear in the ibm,associativity property.
1356 */ 1356 */
1357 static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked) 1357 static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
1358 { 1358 {
1359 int i, nr_assoc_doms = 0; 1359 int i, nr_assoc_doms = 0;
1360 const u16 *field = (const u16*) packed; 1360 const u16 *field = (const u16*) packed;
1361 1361
1362 #define VPHN_FIELD_UNUSED (0xffff) 1362 #define VPHN_FIELD_UNUSED (0xffff)
1363 #define VPHN_FIELD_MSB (0x8000) 1363 #define VPHN_FIELD_MSB (0x8000)
1364 #define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) 1364 #define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
1365 1365
1366 for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { 1366 for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
1367 if (*field == VPHN_FIELD_UNUSED) { 1367 if (*field == VPHN_FIELD_UNUSED) {
1368 /* All significant fields processed, and remaining 1368 /* All significant fields processed, and remaining
1369 * fields contain the reserved value of all 1's. 1369 * fields contain the reserved value of all 1's.
1370 * Just store them. 1370 * Just store them.
1371 */ 1371 */
1372 unpacked[i] = *((u32*)field); 1372 unpacked[i] = *((u32*)field);
1373 field += 2; 1373 field += 2;
1374 } else if (*field & VPHN_FIELD_MSB) { 1374 } else if (*field & VPHN_FIELD_MSB) {
1375 /* Data is in the lower 15 bits of this field */ 1375 /* Data is in the lower 15 bits of this field */
1376 unpacked[i] = *field & VPHN_FIELD_MASK; 1376 unpacked[i] = *field & VPHN_FIELD_MASK;
1377 field++; 1377 field++;
1378 nr_assoc_doms++; 1378 nr_assoc_doms++;
1379 } else { 1379 } else {
1380 /* Data is in the lower 15 bits of this field 1380 /* Data is in the lower 15 bits of this field
1381 * concatenated with the next 16 bit field 1381 * concatenated with the next 16 bit field
1382 */ 1382 */
1383 unpacked[i] = *((u32*)field); 1383 unpacked[i] = *((u32*)field);
1384 field += 2; 1384 field += 2;
1385 nr_assoc_doms++; 1385 nr_assoc_doms++;
1386 } 1386 }
1387 } 1387 }
1388 1388
1389 /* The first cell contains the length of the property */ 1389 /* The first cell contains the length of the property */
1390 unpacked[0] = nr_assoc_doms; 1390 unpacked[0] = nr_assoc_doms;
1391 1391
1392 return nr_assoc_doms; 1392 return nr_assoc_doms;
1393 } 1393 }
1394 1394
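The packed buffer returned by H_HOME_NODE_ASSOCIATIVITY is a stream of 16-bit fields: a set MSB means the field holds a 15-bit domain number on its own, a clear MSB means the field is the upper half of a 32-bit domain number spanning two fields, and 0xffff marks unused trailing fields. A userspace sketch of that decoding follows; the kernel reads adjacent fields through a u32 pointer, which is fine on big-endian POWER, while this model combines the halves explicitly and simply stops at the reserved tail (a simplification of the loop above):

#include <stdio.h>
#include <stdint.h>

#define FIELD_UNUSED 0xffffu
#define FIELD_MSB    0x8000u
#define FIELD_MASK   (~FIELD_MSB & 0xffffu)

static int unpack(const uint16_t *field, size_t nfields,
                  uint32_t *out, size_t max_out)
{
        size_t i = 0, ndoms = 0;

        while (i < nfields && ndoms < max_out) {
                if (field[i] == FIELD_UNUSED) {
                        break;                          /* reserved tail */
                } else if (field[i] & FIELD_MSB) {
                        out[ndoms++] = field[i] & FIELD_MASK;
                        i += 1;                         /* 15-bit value */
                } else {
                        if (i + 1 >= nfields)
                                break;
                        out[ndoms++] = ((uint32_t)field[i] << 16) | field[i + 1];
                        i += 2;                         /* 32-bit value */
                }
        }
        return (int)ndoms;
}

int main(void)
{
        const uint16_t packed[] = { 0x8002, 0x0001, 0x2345, 0x8007, 0xffff, 0xffff };
        uint32_t doms[8];
        int n = unpack(packed, sizeof(packed) / sizeof(packed[0]), doms, 8);

        for (int i = 0; i < n; i++)
                printf("domain[%d] = 0x%x\n", i, doms[i]);
        return 0;
}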
1395 /* 1395 /*
1396 * Retrieve the new associativity information for a virtual processor's 1396 * Retrieve the new associativity information for a virtual processor's
1397 * home node. 1397 * home node.
1398 */ 1398 */
1399 static long hcall_vphn(unsigned long cpu, unsigned int *associativity) 1399 static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
1400 { 1400 {
1401 long rc; 1401 long rc;
1402 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; 1402 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1403 u64 flags = 1; 1403 u64 flags = 1;
1404 int hwcpu = get_hard_smp_processor_id(cpu); 1404 int hwcpu = get_hard_smp_processor_id(cpu);
1405 1405
1406 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); 1406 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1407 vphn_unpack_associativity(retbuf, associativity); 1407 vphn_unpack_associativity(retbuf, associativity);
1408 1408
1409 return rc; 1409 return rc;
1410 } 1410 }
1411 1411
1412 static long vphn_get_associativity(unsigned long cpu, 1412 static long vphn_get_associativity(unsigned long cpu,
1413 unsigned int *associativity) 1413 unsigned int *associativity)
1414 { 1414 {
1415 long rc; 1415 long rc;
1416 1416
1417 rc = hcall_vphn(cpu, associativity); 1417 rc = hcall_vphn(cpu, associativity);
1418 1418
1419 switch (rc) { 1419 switch (rc) {
1420 case H_FUNCTION: 1420 case H_FUNCTION:
1421 printk(KERN_INFO 1421 printk(KERN_INFO
1422 "VPHN is not supported. Disabling polling...\n"); 1422 "VPHN is not supported. Disabling polling...\n");
1423 stop_topology_update(); 1423 stop_topology_update();
1424 break; 1424 break;
1425 case H_HARDWARE: 1425 case H_HARDWARE:
1426 printk(KERN_ERR 1426 printk(KERN_ERR
1427 "hcall_vphn() experienced a hardware fault " 1427 "hcall_vphn() experienced a hardware fault "
1428 "preventing VPHN. Disabling polling...\n"); 1428 "preventing VPHN. Disabling polling...\n");
1429 stop_topology_update(); 1429 stop_topology_update();
1430 } 1430 }
1431 1431
1432 return rc; 1432 return rc;
1433 } 1433 }
1434 1434
1435 /* 1435 /*
1436 * Update the node maps and sysfs entries for each cpu whose home node 1436 * Update the node maps and sysfs entries for each cpu whose home node
1437 * has changed. 1437 * has changed.
1438 */ 1438 */
1439 int arch_update_cpu_topology(void) 1439 int arch_update_cpu_topology(void)
1440 { 1440 {
1441 int cpu, nid, old_nid; 1441 int cpu, nid, old_nid;
1442 unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1442 unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
1443 struct device *dev; 1443 struct device *dev;
1444 1444
1445 for_each_cpu(cpu,&cpu_associativity_changes_mask) { 1445 for_each_cpu(cpu,&cpu_associativity_changes_mask) {
1446 vphn_get_associativity(cpu, associativity); 1446 vphn_get_associativity(cpu, associativity);
1447 nid = associativity_to_nid(associativity); 1447 nid = associativity_to_nid(associativity);
1448 1448
1449 if (nid < 0 || !node_online(nid)) 1449 if (nid < 0 || !node_online(nid))
1450 nid = first_online_node; 1450 nid = first_online_node;
1451 1451
1452 old_nid = numa_cpu_lookup_table[cpu]; 1452 old_nid = numa_cpu_lookup_table[cpu];
1453 1453
1454 /* Disable hotplug while we update the cpu 1454 /* Disable hotplug while we update the cpu
1455 * masks and sysfs. 1455 * masks and sysfs.
1456 */ 1456 */
1457 get_online_cpus(); 1457 get_online_cpus();
1458 unregister_cpu_under_node(cpu, old_nid); 1458 unregister_cpu_under_node(cpu, old_nid);
1459 unmap_cpu_from_node(cpu); 1459 unmap_cpu_from_node(cpu);
1460 map_cpu_to_node(cpu, nid); 1460 map_cpu_to_node(cpu, nid);
1461 register_cpu_under_node(cpu, nid); 1461 register_cpu_under_node(cpu, nid);
1462 put_online_cpus(); 1462 put_online_cpus();
1463 1463
1464 dev = get_cpu_device(cpu); 1464 dev = get_cpu_device(cpu);
1465 if (dev) 1465 if (dev)
1466 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 1466 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
1467 } 1467 }
1468 1468
1469 return 1; 1469 return 1;
1470 } 1470 }
1471 1471
1472 static void topology_work_fn(struct work_struct *work) 1472 static void topology_work_fn(struct work_struct *work)
1473 { 1473 {
1474 rebuild_sched_domains(); 1474 rebuild_sched_domains();
1475 } 1475 }
1476 static DECLARE_WORK(topology_work, topology_work_fn); 1476 static DECLARE_WORK(topology_work, topology_work_fn);
1477 1477
1478 void topology_schedule_update(void) 1478 void topology_schedule_update(void)
1479 { 1479 {
1480 schedule_work(&topology_work); 1480 schedule_work(&topology_work);
1481 } 1481 }
1482 1482
1483 static void topology_timer_fn(unsigned long ignored) 1483 static void topology_timer_fn(unsigned long ignored)
1484 { 1484 {
1485 if (!vphn_enabled) 1485 if (!vphn_enabled)
1486 return; 1486 return;
1487 if (update_cpu_associativity_changes_mask() > 0) 1487 if (update_cpu_associativity_changes_mask() > 0)
1488 topology_schedule_update(); 1488 topology_schedule_update();
1489 set_topology_timer(); 1489 set_topology_timer();
1490 } 1490 }
1491 static struct timer_list topology_timer = 1491 static struct timer_list topology_timer =
1492 TIMER_INITIALIZER(topology_timer_fn, 0, 0); 1492 TIMER_INITIALIZER(topology_timer_fn, 0, 0);
1493 1493
1494 static void set_topology_timer(void) 1494 static void set_topology_timer(void)
1495 { 1495 {
1496 topology_timer.data = 0; 1496 topology_timer.data = 0;
1497 topology_timer.expires = jiffies + 60 * HZ; 1497 topology_timer.expires = jiffies + 60 * HZ;
1498 add_timer(&topology_timer); 1498 add_timer(&topology_timer);
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * Start polling for VPHN associativity changes. 1502 * Start polling for VPHN associativity changes.
1503 */ 1503 */
1504 int start_topology_update(void) 1504 int start_topology_update(void)
1505 { 1505 {
1506 int rc = 0; 1506 int rc = 0;
1507 1507
1508 /* Disabled until races with load balancing are fixed */ 1508 /* Disabled until races with load balancing are fixed */
1509 if (0 && firmware_has_feature(FW_FEATURE_VPHN) && 1509 if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
1510 get_lppaca()->shared_proc) { 1510 get_lppaca()->shared_proc) {
1511 vphn_enabled = 1; 1511 vphn_enabled = 1;
1512 setup_cpu_associativity_change_counters(); 1512 setup_cpu_associativity_change_counters();
1513 init_timer_deferrable(&topology_timer); 1513 init_timer_deferrable(&topology_timer);
1514 set_topology_timer(); 1514 set_topology_timer();
1515 rc = 1; 1515 rc = 1;
1516 } 1516 }
1517 1517
1518 return rc; 1518 return rc;
1519 } 1519 }
1520 __initcall(start_topology_update); 1520 __initcall(start_topology_update);
1521 1521
1522 /* 1522 /*
1523 * Disable polling for VPHN associativity changes. 1523 * Disable polling for VPHN associativity changes.
1524 */ 1524 */
1525 int stop_topology_update(void) 1525 int stop_topology_update(void)
1526 { 1526 {
1527 vphn_enabled = 0; 1527 vphn_enabled = 0;
1528 return del_timer_sync(&topology_timer); 1528 return del_timer_sync(&topology_timer);
1529 } 1529 }
1530 #endif /* CONFIG_PPC_SPLPAR */ 1530 #endif /* CONFIG_PPC_SPLPAR */
1531 1531
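arch/x86/mm/numa.c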
1 /* Common code for 32 and 64-bit NUMA */ 1 /* Common code for 32 and 64-bit NUMA */
2 #include <linux/kernel.h> 2 #include <linux/kernel.h>
3 #include <linux/mm.h> 3 #include <linux/mm.h>
4 #include <linux/string.h> 4 #include <linux/string.h>
5 #include <linux/init.h> 5 #include <linux/init.h>
6 #include <linux/bootmem.h> 6 #include <linux/bootmem.h>
7 #include <linux/memblock.h> 7 #include <linux/memblock.h>
8 #include <linux/mmzone.h> 8 #include <linux/mmzone.h>
9 #include <linux/ctype.h> 9 #include <linux/ctype.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/nodemask.h> 11 #include <linux/nodemask.h>
12 #include <linux/sched.h> 12 #include <linux/sched.h>
13 #include <linux/topology.h> 13 #include <linux/topology.h>
14 14
15 #include <asm/e820.h> 15 #include <asm/e820.h>
16 #include <asm/proto.h> 16 #include <asm/proto.h>
17 #include <asm/dma.h> 17 #include <asm/dma.h>
18 #include <asm/acpi.h> 18 #include <asm/acpi.h>
19 #include <asm/amd_nb.h> 19 #include <asm/amd_nb.h>
20 20
21 #include "numa_internal.h" 21 #include "numa_internal.h"
22 22
23 int __initdata numa_off; 23 int __initdata numa_off;
24 nodemask_t numa_nodes_parsed __initdata; 24 nodemask_t numa_nodes_parsed __initdata;
25 25
26 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27 EXPORT_SYMBOL(node_data); 27 EXPORT_SYMBOL(node_data);
28 28
29 static struct numa_meminfo numa_meminfo 29 static struct numa_meminfo numa_meminfo
30 #ifndef CONFIG_MEMORY_HOTPLUG 30 #ifndef CONFIG_MEMORY_HOTPLUG
31 __initdata 31 __initdata
32 #endif 32 #endif
33 ; 33 ;
34 34
35 static int numa_distance_cnt; 35 static int numa_distance_cnt;
36 static u8 *numa_distance; 36 static u8 *numa_distance;
37 37
38 static __init int numa_setup(char *opt) 38 static __init int numa_setup(char *opt)
39 { 39 {
40 if (!opt) 40 if (!opt)
41 return -EINVAL; 41 return -EINVAL;
42 if (!strncmp(opt, "off", 3)) 42 if (!strncmp(opt, "off", 3))
43 numa_off = 1; 43 numa_off = 1;
44 #ifdef CONFIG_NUMA_EMU 44 #ifdef CONFIG_NUMA_EMU
45 if (!strncmp(opt, "fake=", 5)) 45 if (!strncmp(opt, "fake=", 5))
46 numa_emu_cmdline(opt + 5); 46 numa_emu_cmdline(opt + 5);
47 #endif 47 #endif
48 #ifdef CONFIG_ACPI_NUMA 48 #ifdef CONFIG_ACPI_NUMA
49 if (!strncmp(opt, "noacpi", 6)) 49 if (!strncmp(opt, "noacpi", 6))
50 acpi_numa = -1; 50 acpi_numa = -1;
51 #endif 51 #endif
52 return 0; 52 return 0;
53 } 53 }
54 early_param("numa", numa_setup); 54 early_param("numa", numa_setup);
55 55
56 /* 56 /*
57 * apicid, cpu, node mappings 57 * apicid, cpu, node mappings
58 */ 58 */
59 s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 59 s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
61 }; 61 };
62 62
63 int __cpuinit numa_cpu_node(int cpu) 63 int __cpuinit numa_cpu_node(int cpu)
64 { 64 {
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66 66
67 if (apicid != BAD_APICID) 67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid]; 68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE; 69 return NUMA_NO_NODE;
70 } 70 }
71 71
72 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
73 EXPORT_SYMBOL(node_to_cpumask_map); 73 EXPORT_SYMBOL(node_to_cpumask_map);
74 74
75 /* 75 /*
76 * Map cpu index to node index 76 * Map cpu index to node index
77 */ 77 */
78 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 78 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
79 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 79 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
80 80
81 void __cpuinit numa_set_node(int cpu, int node) 81 void __cpuinit numa_set_node(int cpu, int node)
82 { 82 {
83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
84 84
85 /* early setting, no percpu area yet */ 85 /* early setting, no percpu area yet */
86 if (cpu_to_node_map) { 86 if (cpu_to_node_map) {
87 cpu_to_node_map[cpu] = node; 87 cpu_to_node_map[cpu] = node;
88 return; 88 return;
89 } 89 }
90 90
91 #ifdef CONFIG_DEBUG_PER_CPU_MAPS 91 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
92 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { 92 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
93 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); 93 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
94 dump_stack(); 94 dump_stack();
95 return; 95 return;
96 } 96 }
97 #endif 97 #endif
98 per_cpu(x86_cpu_to_node_map, cpu) = node; 98 per_cpu(x86_cpu_to_node_map, cpu) = node;
99 99
100 if (node != NUMA_NO_NODE) 100 if (node != NUMA_NO_NODE)
101 set_cpu_numa_node(cpu, node); 101 set_cpu_numa_node(cpu, node);
102 } 102 }
103 103
104 void __cpuinit numa_clear_node(int cpu) 104 void __cpuinit numa_clear_node(int cpu)
105 { 105 {
106 numa_set_node(cpu, NUMA_NO_NODE); 106 numa_set_node(cpu, NUMA_NO_NODE);
107 } 107 }
108 108
109 /* 109 /*
110 * Allocate node_to_cpumask_map based on number of available nodes 110 * Allocate node_to_cpumask_map based on number of available nodes
111 * Requires node_possible_map to be valid. 111 * Requires node_possible_map to be valid.
112 * 112 *
113 * Note: node_to_cpumask() is not valid until after this is done. 113 * Note: cpumask_of_node() is not valid until after this is done.
114 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) 114 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
115 */ 115 */
116 void __init setup_node_to_cpumask_map(void) 116 void __init setup_node_to_cpumask_map(void)
117 { 117 {
118 unsigned int node, num = 0; 118 unsigned int node, num = 0;
119 119
120 /* setup nr_node_ids if not done yet */ 120 /* setup nr_node_ids if not done yet */
121 if (nr_node_ids == MAX_NUMNODES) { 121 if (nr_node_ids == MAX_NUMNODES) {
122 for_each_node_mask(node, node_possible_map) 122 for_each_node_mask(node, node_possible_map)
123 num = node; 123 num = node;
124 nr_node_ids = num + 1; 124 nr_node_ids = num + 1;
125 } 125 }
126 126
127 /* allocate the map */ 127 /* allocate the map */
128 for (node = 0; node < nr_node_ids; node++) 128 for (node = 0; node < nr_node_ids; node++)
129 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 129 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
130 130
131 /* cpumask_of_node() will now work */ 131 /* cpumask_of_node() will now work */
132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
133 } 133 }
134 134
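This is the function the commit's comment fix is about: cpumask_of_node() only returns a usable mask once these per-node cpumasks have been allocated (on powerpc that happens from do_init_bootmem() above). A toy userspace model of the same two steps, deriving nr_node_ids from the possible-node map and then allocating one CPU bitmap per node; the names and sizes here are illustrative, not kernel API:

#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 64
#define NR_CPUS   256

/* Toy possible-node map: nodes 0 and 2 exist. */
static unsigned long long node_possible_map = (1ULL << 0) | (1ULL << 2);

static unsigned char *node_to_cpumask[MAX_NODES];   /* one CPU bitmap per node */
static int nr_node_ids = MAX_NODES;

/* Shrink nr_node_ids to the highest possible node + 1, then allocate the
 * per-node cpumasks.  Only after this does the node -> cpumask lookup
 * become usable. */
static void setup_map(void)
{
        int node, highest = 0;

        if (nr_node_ids == MAX_NODES) {
                for (node = 0; node < MAX_NODES; node++)
                        if (node_possible_map & (1ULL << node))
                                highest = node;
                nr_node_ids = highest + 1;
        }

        for (node = 0; node < nr_node_ids; node++)
                node_to_cpumask[node] = calloc(NR_CPUS / 8, 1);

        printf("Node to cpumask map for %d nodes\n", nr_node_ids);
}

int main(void)
{
        setup_map();    /* analogous to the call from do_init_bootmem() above */
        return 0;
}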
135 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 135 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi) 136 struct numa_meminfo *mi)
137 { 137 {
138 /* ignore zero length blks */ 138 /* ignore zero length blks */
139 if (start == end) 139 if (start == end)
140 return 0; 140 return 0;
141 141
142 /* whine about and ignore invalid blks */ 142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", 144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end); 145 nid, start, end);
146 return 0; 146 return 0;
147 } 147 }
148 148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) { 149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n"); 150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL; 151 return -EINVAL;
152 } 152 }
153 153
154 mi->blk[mi->nr_blks].start = start; 154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end; 155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid; 156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++; 157 mi->nr_blks++;
158 return 0; 158 return 0;
159 } 159 }
160 160
161 /** 161 /**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo 162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove 163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from 164 * @mi: numa_meminfo to remove memblk from
165 * 165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and 166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks. 167 * decrementing @mi->nr_blks.
168 */ 168 */
169 void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 169 void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170 { 170 {
171 mi->nr_blks--; 171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1], 172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0])); 173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174 } 174 }
175 175
176 /** 176 /**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo 177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk 178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk 179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk 180 * @end: End address of the new memblk
181 * 181 *
182 * Add a new memblk to the default numa_meminfo. 182 * Add a new memblk to the default numa_meminfo.
183 * 183 *
184 * RETURNS: 184 * RETURNS:
185 * 0 on success, -errno on failure. 185 * 0 on success, -errno on failure.
186 */ 186 */
187 int __init numa_add_memblk(int nid, u64 start, u64 end) 187 int __init numa_add_memblk(int nid, u64 start, u64 end)
188 { 188 {
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo); 189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190 } 190 }
191 191
192 /* Initialize NODE_DATA for a node on the local memory */ 192 /* Initialize NODE_DATA for a node on the local memory */
193 static void __init setup_node_data(int nid, u64 start, u64 end) 193 static void __init setup_node_data(int nid, u64 start, u64 end)
194 { 194 {
195 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 195 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
196 bool remapped = false; 196 bool remapped = false;
197 u64 nd_pa; 197 u64 nd_pa;
198 void *nd; 198 void *nd;
199 int tnid; 199 int tnid;
200 200
201 /* 201 /*
202 * Don't confuse VM with a node that doesn't have the 202 * Don't confuse VM with a node that doesn't have the
203 * minimum amount of memory: 203 * minimum amount of memory:
204 */ 204 */
205 if (end && (end - start) < NODE_MIN_SIZE) 205 if (end && (end - start) < NODE_MIN_SIZE)
206 return; 206 return;
207 207
208 /* initialize remap allocator before aligning to ZONE_ALIGN */ 208 /* initialize remap allocator before aligning to ZONE_ALIGN */
209 init_alloc_remap(nid, start, end); 209 init_alloc_remap(nid, start, end);
210 210
211 start = roundup(start, ZONE_ALIGN); 211 start = roundup(start, ZONE_ALIGN);
212 212
213 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", 213 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
214 nid, start, end); 214 nid, start, end);
215 215
216 /* 216 /*
217 * Allocate node data. Try remap allocator first, node-local 217 * Allocate node data. Try remap allocator first, node-local
218 * memory and then any node. Never allocate in DMA zone. 218 * memory and then any node. Never allocate in DMA zone.
219 */ 219 */
220 nd = alloc_remap(nid, nd_size); 220 nd = alloc_remap(nid, nd_size);
221 if (nd) { 221 if (nd) {
222 nd_pa = __pa(nd); 222 nd_pa = __pa(nd);
223 remapped = true; 223 remapped = true;
224 } else { 224 } else {
225 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); 225 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
226 if (!nd_pa) { 226 if (!nd_pa) {
227 pr_err("Cannot find %zu bytes in node %d\n", 227 pr_err("Cannot find %zu bytes in node %d\n",
228 nd_size, nid); 228 nd_size, nid);
229 return; 229 return;
230 } 230 }
231 nd = __va(nd_pa); 231 nd = __va(nd_pa);
232 } 232 }
233 233
234 /* report and initialize */ 234 /* report and initialize */
235 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", 235 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
236 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); 236 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
237 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 237 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
238 if (!remapped && tnid != nid) 238 if (!remapped && tnid != nid)
239 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); 239 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
240 240
241 node_data[nid] = nd; 241 node_data[nid] = nd;
242 memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); 242 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
243 NODE_DATA(nid)->node_id = nid; 243 NODE_DATA(nid)->node_id = nid;
244 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; 244 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
245 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; 245 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
246 246
247 node_set_online(nid); 247 node_set_online(nid);
248 } 248 }
249 249
250 /** 250 /**
251 * numa_cleanup_meminfo - Cleanup a numa_meminfo 251 * numa_cleanup_meminfo - Cleanup a numa_meminfo
252 * @mi: numa_meminfo to clean up 252 * @mi: numa_meminfo to clean up
253 * 253 *
254 * Sanitize @mi by merging and removing unnecessary memblks. Also check for 254 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
255 * conflicts and clear unused memblks. 255 * conflicts and clear unused memblks.
256 * 256 *
257 * RETURNS: 257 * RETURNS:
258 * 0 on success, -errno on failure. 258 * 0 on success, -errno on failure.
259 */ 259 */
260 int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 260 int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
261 { 261 {
262 const u64 low = 0; 262 const u64 low = 0;
263 const u64 high = PFN_PHYS(max_pfn); 263 const u64 high = PFN_PHYS(max_pfn);
264 int i, j, k; 264 int i, j, k;
265 265
266 /* first, trim all entries */ 266 /* first, trim all entries */
267 for (i = 0; i < mi->nr_blks; i++) { 267 for (i = 0; i < mi->nr_blks; i++) {
268 struct numa_memblk *bi = &mi->blk[i]; 268 struct numa_memblk *bi = &mi->blk[i];
269 269
270 /* make sure all blocks are inside the limits */ 270 /* make sure all blocks are inside the limits */
271 bi->start = max(bi->start, low); 271 bi->start = max(bi->start, low);
272 bi->end = min(bi->end, high); 272 bi->end = min(bi->end, high);
273 273
274 /* and there's no empty block */ 274 /* and there's no empty block */
275 if (bi->start >= bi->end) 275 if (bi->start >= bi->end)
276 numa_remove_memblk_from(i--, mi); 276 numa_remove_memblk_from(i--, mi);
277 } 277 }
278 278
279 /* merge neighboring / overlapping entries */ 279 /* merge neighboring / overlapping entries */
280 for (i = 0; i < mi->nr_blks; i++) { 280 for (i = 0; i < mi->nr_blks; i++) {
281 struct numa_memblk *bi = &mi->blk[i]; 281 struct numa_memblk *bi = &mi->blk[i];
282 282
283 for (j = i + 1; j < mi->nr_blks; j++) { 283 for (j = i + 1; j < mi->nr_blks; j++) {
284 struct numa_memblk *bj = &mi->blk[j]; 284 struct numa_memblk *bj = &mi->blk[j];
285 u64 start, end; 285 u64 start, end;
286 286
287 /* 287 /*
288 * See whether there are overlapping blocks. Whine 288 * See whether there are overlapping blocks. Whine
289 * about but allow overlaps of the same nid. They 289 * about but allow overlaps of the same nid. They
290 * will be merged below. 290 * will be merged below.
291 */ 291 */
292 if (bi->end > bj->start && bi->start < bj->end) { 292 if (bi->end > bj->start && bi->start < bj->end) {
293 if (bi->nid != bj->nid) { 293 if (bi->nid != bj->nid) {
294 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", 294 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
295 bi->nid, bi->start, bi->end, 295 bi->nid, bi->start, bi->end,
296 bj->nid, bj->start, bj->end); 296 bj->nid, bj->start, bj->end);
297 return -EINVAL; 297 return -EINVAL;
298 } 298 }
299 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", 299 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
300 bi->nid, bi->start, bi->end, 300 bi->nid, bi->start, bi->end,
301 bj->start, bj->end); 301 bj->start, bj->end);
302 } 302 }
303 303
304 /* 304 /*
305 * Join together blocks on the same node, holes 305 * Join together blocks on the same node, holes
306 * between which don't overlap with memory on other 306 * between which don't overlap with memory on other
307 * nodes. 307 * nodes.
308 */ 308 */
309 if (bi->nid != bj->nid) 309 if (bi->nid != bj->nid)
310 continue; 310 continue;
311 start = min(bi->start, bj->start); 311 start = min(bi->start, bj->start);
312 end = max(bi->end, bj->end); 312 end = max(bi->end, bj->end);
313 for (k = 0; k < mi->nr_blks; k++) { 313 for (k = 0; k < mi->nr_blks; k++) {
314 struct numa_memblk *bk = &mi->blk[k]; 314 struct numa_memblk *bk = &mi->blk[k];
315 315
316 if (bi->nid == bk->nid) 316 if (bi->nid == bk->nid)
317 continue; 317 continue;
318 if (start < bk->end && end > bk->start) 318 if (start < bk->end && end > bk->start)
319 break; 319 break;
320 } 320 }
321 if (k < mi->nr_blks) 321 if (k < mi->nr_blks)
322 continue; 322 continue;
323 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", 323 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
324 bi->nid, bi->start, bi->end, bj->start, bj->end, 324 bi->nid, bi->start, bi->end, bj->start, bj->end,
325 start, end); 325 start, end);
326 bi->start = start; 326 bi->start = start;
327 bi->end = end; 327 bi->end = end;
328 numa_remove_memblk_from(j--, mi); 328 numa_remove_memblk_from(j--, mi);
329 } 329 }
330 } 330 }
331 331
332 /* clear unused ones */ 332 /* clear unused ones */
333 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { 333 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
334 mi->blk[i].start = mi->blk[i].end = 0; 334 mi->blk[i].start = mi->blk[i].end = 0;
335 mi->blk[i].nid = NUMA_NO_NODE; 335 mi->blk[i].nid = NUMA_NO_NODE;
336 } 336 }
337 337
338 return 0; 338 return 0;
339 } 339 }
340 340
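numa_cleanup_meminfo() first clamps every memblk to [0, PFN_PHYS(max_pfn)) and drops the ones that become empty, then merges same-node neighbours whose gap does not overlap other nodes. A userspace sketch of just the trim pass, with a fixed limit in place of max_pfn and the merge pass omitted to keep it short (illustration only):

#include <stdio.h>
#include <stdint.h>

struct blk { uint64_t start, end; int nid; };

/* Clamp blocks to [low, high) and compact away the ones that end up empty. */
static int trim(struct blk *b, int n, uint64_t low, uint64_t high)
{
        int i, out = 0;

        for (i = 0; i < n; i++) {
                uint64_t s = b[i].start > low  ? b[i].start : low;
                uint64_t e = b[i].end   < high ? b[i].end   : high;

                if (s >= e)
                        continue;               /* empty after trimming */
                b[out].start = s;
                b[out].end = e;
                b[out].nid = b[i].nid;
                out++;
        }
        return out;
}

int main(void)
{
        struct blk blks[] = {
                { 0x0,         0x80000000,  0 },
                { 0x80000000,  0x180000000, 1 },  /* partly above the limit */
                { 0x200000000, 0x240000000, 1 },  /* entirely above the limit */
        };
        int n = trim(blks, 3, 0, 0x100000000ULL);

        for (int i = 0; i < n; i++)
                printf("nid %d: %#llx-%#llx\n", blks[i].nid,
                       (unsigned long long)blks[i].start,
                       (unsigned long long)blks[i].end);
        return 0;
}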
341 /* 341 /*
342 * Set nodes, which have memory in @mi, in *@nodemask. 342 * Set nodes, which have memory in @mi, in *@nodemask.
343 */ 343 */
344 static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, 344 static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
345 const struct numa_meminfo *mi) 345 const struct numa_meminfo *mi)
346 { 346 {
347 int i; 347 int i;
348 348
349 for (i = 0; i < ARRAY_SIZE(mi->blk); i++) 349 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
350 if (mi->blk[i].start != mi->blk[i].end && 350 if (mi->blk[i].start != mi->blk[i].end &&
351 mi->blk[i].nid != NUMA_NO_NODE) 351 mi->blk[i].nid != NUMA_NO_NODE)
352 node_set(mi->blk[i].nid, *nodemask); 352 node_set(mi->blk[i].nid, *nodemask);
353 } 353 }
354 354
355 /** 355 /**
356 * numa_reset_distance - Reset NUMA distance table 356 * numa_reset_distance - Reset NUMA distance table
357 * 357 *
358 * The current table is freed. The next numa_set_distance() call will 358 * The current table is freed. The next numa_set_distance() call will
359 * create a new one. 359 * create a new one.
360 */ 360 */
361 void __init numa_reset_distance(void) 361 void __init numa_reset_distance(void)
362 { 362 {
363 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); 363 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
364 364
365 /* numa_distance could be 1LU marking allocation failure, test cnt */ 365 /* numa_distance could be 1LU marking allocation failure, test cnt */
366 if (numa_distance_cnt) 366 if (numa_distance_cnt)
367 memblock_free(__pa(numa_distance), size); 367 memblock_free(__pa(numa_distance), size);
368 numa_distance_cnt = 0; 368 numa_distance_cnt = 0;
369 numa_distance = NULL; /* enable table creation */ 369 numa_distance = NULL; /* enable table creation */
370 } 370 }
371 371
372 static int __init numa_alloc_distance(void) 372 static int __init numa_alloc_distance(void)
373 { 373 {
374 nodemask_t nodes_parsed; 374 nodemask_t nodes_parsed;
375 size_t size; 375 size_t size;
376 int i, j, cnt = 0; 376 int i, j, cnt = 0;
377 u64 phys; 377 u64 phys;
378 378
379 /* size the new table and allocate it */ 379 /* size the new table and allocate it */
380 nodes_parsed = numa_nodes_parsed; 380 nodes_parsed = numa_nodes_parsed;
381 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); 381 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
382 382
383 for_each_node_mask(i, nodes_parsed) 383 for_each_node_mask(i, nodes_parsed)
384 cnt = i; 384 cnt = i;
385 cnt++; 385 cnt++;
386 size = cnt * cnt * sizeof(numa_distance[0]); 386 size = cnt * cnt * sizeof(numa_distance[0]);
387 387
388 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 388 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
389 size, PAGE_SIZE); 389 size, PAGE_SIZE);
390 if (!phys) { 390 if (!phys) {
391 pr_warning("NUMA: Warning: can't allocate distance table!\n"); 391 pr_warning("NUMA: Warning: can't allocate distance table!\n");
392 /* don't retry until explicitly reset */ 392 /* don't retry until explicitly reset */
393 numa_distance = (void *)1LU; 393 numa_distance = (void *)1LU;
394 return -ENOMEM; 394 return -ENOMEM;
395 } 395 }
396 memblock_reserve(phys, size); 396 memblock_reserve(phys, size);
397 397
398 numa_distance = __va(phys); 398 numa_distance = __va(phys);
399 numa_distance_cnt = cnt; 399 numa_distance_cnt = cnt;
400 400
401 /* fill with the default distances */ 401 /* fill with the default distances */
402 for (i = 0; i < cnt; i++) 402 for (i = 0; i < cnt; i++)
403 for (j = 0; j < cnt; j++) 403 for (j = 0; j < cnt; j++)
404 numa_distance[i * cnt + j] = i == j ? 404 numa_distance[i * cnt + j] = i == j ?
405 LOCAL_DISTANCE : REMOTE_DISTANCE; 405 LOCAL_DISTANCE : REMOTE_DISTANCE;
406 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); 406 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
407 407
408 return 0; 408 return 0;
409 } 409 }
410 410
411 /** 411 /**
412 * numa_set_distance - Set NUMA distance from one NUMA to another 412 * numa_set_distance - Set NUMA distance from one NUMA to another
413 * @from: the 'from' node to set distance 413 * @from: the 'from' node to set distance
414 * @to: the 'to' node to set distance 414 * @to: the 'to' node to set distance
415 * @distance: NUMA distance 415 * @distance: NUMA distance
416 * 416 *
417 * Set the distance from node @from to @to to @distance. If distance table 417 * Set the distance from node @from to @to to @distance. If distance table
418 * doesn't exist, one which is large enough to accommodate all the currently 418 * doesn't exist, one which is large enough to accommodate all the currently
419 * known nodes will be created. 419 * known nodes will be created.
420 * 420 *
421 * If such table cannot be allocated, a warning is printed and further 421 * If such table cannot be allocated, a warning is printed and further
422 * calls are ignored until the distance table is reset with 422 * calls are ignored until the distance table is reset with
423 * numa_reset_distance(). 423 * numa_reset_distance().
424 * 424 *
425 * If @from or @to is higher than the highest known node or lower than zero 425 * If @from or @to is higher than the highest known node or lower than zero
426 * at the time of table creation or @distance doesn't make sense, the call 426 * at the time of table creation or @distance doesn't make sense, the call
427 * is ignored. 427 * is ignored.
428 * This is to allow simplification of specific NUMA config implementations. 428 * This is to allow simplification of specific NUMA config implementations.
429 */ 429 */
430 void __init numa_set_distance(int from, int to, int distance) 430 void __init numa_set_distance(int from, int to, int distance)
431 { 431 {
432 if (!numa_distance && numa_alloc_distance() < 0) 432 if (!numa_distance && numa_alloc_distance() < 0)
433 return; 433 return;
434 434
435 if (from >= numa_distance_cnt || to >= numa_distance_cnt || 435 if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
436 from < 0 || to < 0) { 436 from < 0 || to < 0) {
437 pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", 437 pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
438 from, to, distance); 438 from, to, distance);
439 return; 439 return;
440 } 440 }
441 441
442 if ((u8)distance != distance || 442 if ((u8)distance != distance ||
443 (from == to && distance != LOCAL_DISTANCE)) { 443 (from == to && distance != LOCAL_DISTANCE)) {
444 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", 444 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
445 from, to, distance); 445 from, to, distance);
446 return; 446 return;
447 } 447 }
448 448
449 numa_distance[from * numa_distance_cnt + to] = distance; 449 numa_distance[from * numa_distance_cnt + to] = distance;
450 } 450 }
451 451
452 int __node_distance(int from, int to) 452 int __node_distance(int from, int to)
453 { 453 {
454 if (from >= numa_distance_cnt || to >= numa_distance_cnt) 454 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
455 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; 455 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
456 return numa_distance[from * numa_distance_cnt + to]; 456 return numa_distance[from * numa_distance_cnt + to];
457 } 457 }
458 EXPORT_SYMBOL(__node_distance); 458 EXPORT_SYMBOL(__node_distance);
459 459
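numa_set_distance() and __node_distance() above treat the distance table as a flat cnt x cnt array of u8, indexed row-major as from * cnt + to. A self-contained userspace model of that table, using malloc in place of memblock and assuming the usual LOCAL_DISTANCE=10 / REMOTE_DISTANCE=20 defaults:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20

static uint8_t *dist;
static int dist_cnt;

/* Allocate an N x N row-major table filled with the defaults. */
static int alloc_distance(int cnt)
{
        dist = malloc((size_t)cnt * cnt);
        if (!dist)
                return -1;
        dist_cnt = cnt;
        for (int i = 0; i < cnt; i++)
                for (int j = 0; j < cnt; j++)
                        dist[i * cnt + j] = (i == j) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return 0;
}

static void set_distance(int from, int to, int d)
{
        if (from < 0 || to < 0 || from >= dist_cnt || to >= dist_cnt)
                return;                         /* ignored, as in the kernel */
        if ((uint8_t)d != d || (from == to && d != LOCAL_DISTANCE))
                return;                         /* nonsense distance, ignored */
        dist[from * dist_cnt + to] = (uint8_t)d;
}

static int node_distance(int from, int to)
{
        if (from >= dist_cnt || to >= dist_cnt)
                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return dist[from * dist_cnt + to];
}

int main(void)
{
        alloc_distance(2);
        set_distance(0, 1, 21);                 /* e.g. from a firmware table entry */
        printf("%d %d %d\n", node_distance(0, 0),
               node_distance(0, 1), node_distance(1, 0));
        return 0;
}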
460 /* 460 /*
461 * Sanity check to catch more bad NUMA configurations (they are amazingly 461 * Sanity check to catch more bad NUMA configurations (they are amazingly
462 * common). Make sure the nodes cover all memory. 462 * common). Make sure the nodes cover all memory.
463 */ 463 */
464 static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 464 static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
465 { 465 {
466 u64 numaram, e820ram; 466 u64 numaram, e820ram;
467 int i; 467 int i;
468 468
469 numaram = 0; 469 numaram = 0;
470 for (i = 0; i < mi->nr_blks; i++) { 470 for (i = 0; i < mi->nr_blks; i++) {
471 u64 s = mi->blk[i].start >> PAGE_SHIFT; 471 u64 s = mi->blk[i].start >> PAGE_SHIFT;
472 u64 e = mi->blk[i].end >> PAGE_SHIFT; 472 u64 e = mi->blk[i].end >> PAGE_SHIFT;
473 numaram += e - s; 473 numaram += e - s;
474 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 474 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
475 if ((s64)numaram < 0) 475 if ((s64)numaram < 0)
476 numaram = 0; 476 numaram = 0;
477 } 477 }
478 478
479 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 479 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
480 480
481 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 481 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
482 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 482 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
483 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", 483 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
484 (numaram << PAGE_SHIFT) >> 20, 484 (numaram << PAGE_SHIFT) >> 20,
485 (e820ram << PAGE_SHIFT) >> 20); 485 (e820ram << PAGE_SHIFT) >> 20);
486 return false; 486 return false;
487 } 487 }
488 return true; 488 return true;
489 } 489 }
490 490
491 static int __init numa_register_memblks(struct numa_meminfo *mi) 491 static int __init numa_register_memblks(struct numa_meminfo *mi)
492 { 492 {
493 unsigned long uninitialized_var(pfn_align); 493 unsigned long uninitialized_var(pfn_align);
494 int i, nid; 494 int i, nid;
495 495
496 /* Account for nodes with cpus and no memory */ 496 /* Account for nodes with cpus and no memory */
497 node_possible_map = numa_nodes_parsed; 497 node_possible_map = numa_nodes_parsed;
498 numa_nodemask_from_meminfo(&node_possible_map, mi); 498 numa_nodemask_from_meminfo(&node_possible_map, mi);
499 if (WARN_ON(nodes_empty(node_possible_map))) 499 if (WARN_ON(nodes_empty(node_possible_map)))
500 return -EINVAL; 500 return -EINVAL;
501 501
502 for (i = 0; i < mi->nr_blks; i++) { 502 for (i = 0; i < mi->nr_blks; i++) {
503 struct numa_memblk *mb = &mi->blk[i]; 503 struct numa_memblk *mb = &mi->blk[i];
504 memblock_set_node(mb->start, mb->end - mb->start, mb->nid); 504 memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
505 } 505 }
506 506
507 /* 507 /*
508 * If sections array is gonna be used for pfn -> nid mapping, check 508 * If sections array is gonna be used for pfn -> nid mapping, check
509 * whether its granularity is fine enough. 509 * whether its granularity is fine enough.
510 */ 510 */
511 #ifdef NODE_NOT_IN_PAGE_FLAGS 511 #ifdef NODE_NOT_IN_PAGE_FLAGS
512 pfn_align = node_map_pfn_alignment(); 512 pfn_align = node_map_pfn_alignment();
513 if (pfn_align && pfn_align < PAGES_PER_SECTION) { 513 if (pfn_align && pfn_align < PAGES_PER_SECTION) {
514 printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", 514 printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
515 PFN_PHYS(pfn_align) >> 20, 515 PFN_PHYS(pfn_align) >> 20,
516 PFN_PHYS(PAGES_PER_SECTION) >> 20); 516 PFN_PHYS(PAGES_PER_SECTION) >> 20);
517 return -EINVAL; 517 return -EINVAL;
518 } 518 }
519 #endif 519 #endif
520 if (!numa_meminfo_cover_memory(mi)) 520 if (!numa_meminfo_cover_memory(mi))
521 return -EINVAL; 521 return -EINVAL;
522 522
523 /* Finally register nodes. */ 523 /* Finally register nodes. */
524 for_each_node_mask(nid, node_possible_map) { 524 for_each_node_mask(nid, node_possible_map) {
525 u64 start = PFN_PHYS(max_pfn); 525 u64 start = PFN_PHYS(max_pfn);
526 u64 end = 0; 526 u64 end = 0;
527 527
528 for (i = 0; i < mi->nr_blks; i++) { 528 for (i = 0; i < mi->nr_blks; i++) {
529 if (nid != mi->blk[i].nid) 529 if (nid != mi->blk[i].nid)
530 continue; 530 continue;
531 start = min(mi->blk[i].start, start); 531 start = min(mi->blk[i].start, start);
532 end = max(mi->blk[i].end, end); 532 end = max(mi->blk[i].end, end);
533 } 533 }
534 534
535 if (start < end) 535 if (start < end)
536 setup_node_data(nid, start, end); 536 setup_node_data(nid, start, end);
537 } 537 }
538 538
539 /* Dump memblock with node info and return. */ 539 /* Dump memblock with node info and return. */
540 memblock_dump_all(); 540 memblock_dump_all();
541 return 0; 541 return 0;
542 } 542 }
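
To make the per-node extent computation above concrete, here is a small userspace model with an invented two-node block layout (the local struct and the addresses are illustrative only, not the kernel's struct numa_memblk):

#include <stdio.h>

struct blk { unsigned long long start, end; int nid; };

int main(void)
{
        struct blk blk[] = {
                { 0x000000000ULL, 0x080000000ULL, 0 },  /* 0-2G -> node 0 */
                { 0x100000000ULL, 0x180000000ULL, 0 },  /* 4-6G -> node 0 */
                { 0x180000000ULL, 0x200000000ULL, 1 },  /* 6-8G -> node 1 */
        };
        int nr = sizeof(blk) / sizeof(blk[0]);

        for (int nid = 0; nid < 2; nid++) {
                unsigned long long start = ~0ULL, end = 0;

                for (int i = 0; i < nr; i++) {
                        if (blk[i].nid != nid)
                                continue;
                        if (blk[i].start < start)
                                start = blk[i].start;
                        if (blk[i].end > end)
                                end = blk[i].end;
                }
                printf("node %d: %#llx-%#llx\n", nid, start, end);
        }
        return 0;
}

Each node's span is the minimum start and maximum end over its own blocks, holes included; setup_node_data() is then handed that span.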
543 543
544 /* 544 /*
545 * There are unfortunately some poorly designed mainboards around that 545 * There are unfortunately some poorly designed mainboards around that
546 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 546 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
547 * mapping. To avoid this fill in the mapping for all possible CPUs, 547 * mapping. To avoid this fill in the mapping for all possible CPUs,
548 * as the number of CPUs is not known yet. We round robin the existing 548 * as the number of CPUs is not known yet. We round robin the existing
549 * nodes. 549 * nodes.
550 */ 550 */
551 static void __init numa_init_array(void) 551 static void __init numa_init_array(void)
552 { 552 {
553 int rr, i; 553 int rr, i;
554 554
555 rr = first_node(node_online_map); 555 rr = first_node(node_online_map);
556 for (i = 0; i < nr_cpu_ids; i++) { 556 for (i = 0; i < nr_cpu_ids; i++) {
557 if (early_cpu_to_node(i) != NUMA_NO_NODE) 557 if (early_cpu_to_node(i) != NUMA_NO_NODE)
558 continue; 558 continue;
559 numa_set_node(i, rr); 559 numa_set_node(i, rr);
560 rr = next_node(rr, node_online_map); 560 rr = next_node(rr, node_online_map);
561 if (rr == MAX_NUMNODES) 561 if (rr == MAX_NUMNODES)
562 rr = first_node(node_online_map); 562 rr = first_node(node_online_map);
563 } 563 }
564 } 564 }
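
A toy userspace model of the round-robin fallback above, with a made-up set of four CPUs and two online nodes (NUMA_NO_NODE is redefined locally just for the sketch):

#include <stdio.h>

#define NUMA_NO_NODE    (-1)

int main(void)
{
        int online_nodes[] = { 0, 1 };  /* pretend nodes 0 and 1 are online */
        int cpu_node[] = { 0, NUMA_NO_NODE, NUMA_NO_NODE, 1 };
        int nr_cpus = 4, nr_nodes = 2, rr = 0;

        for (int cpu = 0; cpu < nr_cpus; cpu++) {
                if (cpu_node[cpu] != NUMA_NO_NODE)
                        continue;
                cpu_node[cpu] = online_nodes[rr];
                rr = (rr + 1) % nr_nodes;  /* wrap like first_node()/next_node() */
        }

        for (int cpu = 0; cpu < nr_cpus; cpu++)
                printf("cpu %d -> node %d\n", cpu, cpu_node[cpu]);
        return 0;
}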
565 565
566 static int __init numa_init(int (*init_func)(void)) 566 static int __init numa_init(int (*init_func)(void))
567 { 567 {
568 int i; 568 int i;
569 int ret; 569 int ret;
570 570
571 for (i = 0; i < MAX_LOCAL_APIC; i++) 571 for (i = 0; i < MAX_LOCAL_APIC; i++)
572 set_apicid_to_node(i, NUMA_NO_NODE); 572 set_apicid_to_node(i, NUMA_NO_NODE);
573 573
574 nodes_clear(numa_nodes_parsed); 574 nodes_clear(numa_nodes_parsed);
575 nodes_clear(node_possible_map); 575 nodes_clear(node_possible_map);
576 nodes_clear(node_online_map); 576 nodes_clear(node_online_map);
577 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 577 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
578 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 578 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
579 numa_reset_distance(); 579 numa_reset_distance();
580 580
581 ret = init_func(); 581 ret = init_func();
582 if (ret < 0) 582 if (ret < 0)
583 return ret; 583 return ret;
584 ret = numa_cleanup_meminfo(&numa_meminfo); 584 ret = numa_cleanup_meminfo(&numa_meminfo);
585 if (ret < 0) 585 if (ret < 0)
586 return ret; 586 return ret;
587 587
588 numa_emulation(&numa_meminfo, numa_distance_cnt); 588 numa_emulation(&numa_meminfo, numa_distance_cnt);
589 589
590 ret = numa_register_memblks(&numa_meminfo); 590 ret = numa_register_memblks(&numa_meminfo);
591 if (ret < 0) 591 if (ret < 0)
592 return ret; 592 return ret;
593 593
594 for (i = 0; i < nr_cpu_ids; i++) { 594 for (i = 0; i < nr_cpu_ids; i++) {
595 int nid = early_cpu_to_node(i); 595 int nid = early_cpu_to_node(i);
596 596
597 if (nid == NUMA_NO_NODE) 597 if (nid == NUMA_NO_NODE)
598 continue; 598 continue;
599 if (!node_online(nid)) 599 if (!node_online(nid))
600 numa_clear_node(i); 600 numa_clear_node(i);
601 } 601 }
602 numa_init_array(); 602 numa_init_array();
603 return 0; 603 return 0;
604 } 604 }
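
For reference, an init_func is expected to mark whatever nodes it discovered in numa_nodes_parsed and describe their memory via numa_add_memblk(), exactly as dummy_numa_init() does below. A hypothetical two-node variant, with invented addresses and a function name that does not exist in the tree, might look like:

/* Hypothetical example only -- not part of the kernel.  An init_func
 * reports the parsed nodes and their memory ranges; numa_init() does
 * the rest.  Addresses below are invented for illustration. */
static int __init my_two_node_init(void)
{
        node_set(0, numa_nodes_parsed);
        node_set(1, numa_nodes_parsed);

        numa_add_memblk(0, 0, 0x100000000ULL);                  /* node 0: 0-4G */
        numa_add_memblk(1, 0x100000000ULL, 0x200000000ULL);     /* node 1: 4-8G */

        return 0;
}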
605 605
606 /** 606 /**
607 * dummy_numa_init - Fallback dummy NUMA init 607 * dummy_numa_init - Fallback dummy NUMA init
608 * 608 *
609 * Used if there's no underlying NUMA architecture, NUMA initialization 609 * Used if there's no underlying NUMA architecture, NUMA initialization
610 * fails, or NUMA is disabled on the command line. 610 * fails, or NUMA is disabled on the command line.
611 * 611 *
612 * Must online at least one node and add memory blocks that cover all 612 * Must online at least one node and add memory blocks that cover all
613 * allowed memory. This function must not fail. 613 * allowed memory. This function must not fail.
614 */ 614 */
615 static int __init dummy_numa_init(void) 615 static int __init dummy_numa_init(void)
616 { 616 {
617 printk(KERN_INFO "%s\n", 617 printk(KERN_INFO "%s\n",
618 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 618 numa_off ? "NUMA turned off" : "No NUMA configuration found");
619 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", 619 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
620 0LLU, PFN_PHYS(max_pfn)); 620 0LLU, PFN_PHYS(max_pfn));
621 621
622 node_set(0, numa_nodes_parsed); 622 node_set(0, numa_nodes_parsed);
623 numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); 623 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
624 624
625 return 0; 625 return 0;
626 } 626 }
627 627
628 /** 628 /**
629 * x86_numa_init - Initialize NUMA 629 * x86_numa_init - Initialize NUMA
630 * 630 *
631 * Try each configured NUMA initialization method until one succeeds. The 631 * Try each configured NUMA initialization method until one succeeds. The
632 * last fallback is dummy single node config encompassing whole memory and 632 * last fallback is dummy single node config encompassing whole memory and
633 * never fails. 633 * never fails.
634 */ 634 */
635 void __init x86_numa_init(void) 635 void __init x86_numa_init(void)
636 { 636 {
637 if (!numa_off) { 637 if (!numa_off) {
638 #ifdef CONFIG_X86_NUMAQ 638 #ifdef CONFIG_X86_NUMAQ
639 if (!numa_init(numaq_numa_init)) 639 if (!numa_init(numaq_numa_init))
640 return; 640 return;
641 #endif 641 #endif
642 #ifdef CONFIG_ACPI_NUMA 642 #ifdef CONFIG_ACPI_NUMA
643 if (!numa_init(x86_acpi_numa_init)) 643 if (!numa_init(x86_acpi_numa_init))
644 return; 644 return;
645 #endif 645 #endif
646 #ifdef CONFIG_AMD_NUMA 646 #ifdef CONFIG_AMD_NUMA
647 if (!numa_init(amd_numa_init)) 647 if (!numa_init(amd_numa_init))
648 return; 648 return;
649 #endif 649 #endif
650 } 650 }
651 651
652 numa_init(dummy_numa_init); 652 numa_init(dummy_numa_init);
653 } 653 }
654 654
655 static __init int find_near_online_node(int node) 655 static __init int find_near_online_node(int node)
656 { 656 {
657 int n, val; 657 int n, val;
658 int min_val = INT_MAX; 658 int min_val = INT_MAX;
659 int best_node = -1; 659 int best_node = -1;
660 660
661 for_each_online_node(n) { 661 for_each_online_node(n) {
662 val = node_distance(node, n); 662 val = node_distance(node, n);
663 663
664 if (val < min_val) { 664 if (val < min_val) {
665 min_val = val; 665 min_val = val;
666 best_node = n; 666 best_node = n;
667 } 667 }
668 } 668 }
669 669
670 return best_node; 670 return best_node;
671 } 671 }
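
A toy model of the fallback above: given a SLIT-style distance table (values invented for this sketch), the nearest online node is simply the one with the smallest distance entry:

#include <stdio.h>
#include <limits.h>

/* Invented 4-node distance table; node 3 is pretended to be offline. */
static int distance[4][4] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        int online[] = { 0, 1, 2 };
        int node = 3;           /* firmware-reported node that never came online */
        int best = -1, min_val = INT_MAX;

        for (int i = 0; i < 3; i++) {
                int val = distance[node][online[i]];
                if (val < min_val) {
                        min_val = val;
                        best = online[i];
                }
        }
        printf("node %d falls back to node %d (distance %d)\n",
               node, best, min_val);
        return 0;
}

init_cpu_to_node() below relies on this when a CPU's reported node ended up with no memory and was therefore never onlined.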
672 672
673 /* 673 /*
674 * Setup early cpu_to_node. 674 * Setup early cpu_to_node.
675 * 675 *
676 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] 676 * Populate cpu_to_node[] only if x86_cpu_to_apicid[]
677 * and apicid_to_node[] tables have valid entries for a CPU. 677 * and apicid_to_node[] tables have valid entries for a CPU.
678 * This means we skip cpu_to_node[] initialisation for NUMA 678 * This means we skip cpu_to_node[] initialisation for NUMA
679 * emulation and faking node case (when running a kernel compiled 679 * emulation and faking node case (when running a kernel compiled
680 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 680 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
681 * is already initialized in a round robin manner at numa_init_array, 681 * is already initialized in a round robin manner at numa_init_array,
682 * prior to this call, and this initialization is good enough 682 * prior to this call, and this initialization is good enough
683 * for the fake NUMA cases. 683 * for the fake NUMA cases.
684 * 684 *
685 * Called before the per_cpu areas are setup. 685 * Called before the per_cpu areas are setup.
686 */ 686 */
687 void __init init_cpu_to_node(void) 687 void __init init_cpu_to_node(void)
688 { 688 {
689 int cpu; 689 int cpu;
690 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 690 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
691 691
692 BUG_ON(cpu_to_apicid == NULL); 692 BUG_ON(cpu_to_apicid == NULL);
693 693
694 for_each_possible_cpu(cpu) { 694 for_each_possible_cpu(cpu) {
695 int node = numa_cpu_node(cpu); 695 int node = numa_cpu_node(cpu);
696 696
697 if (node == NUMA_NO_NODE) 697 if (node == NUMA_NO_NODE)
698 continue; 698 continue;
699 if (!node_online(node)) 699 if (!node_online(node))
700 node = find_near_online_node(node); 700 node = find_near_online_node(node);
701 numa_set_node(cpu, node); 701 numa_set_node(cpu, node);
702 } 702 }
703 } 703 }
704 704
705 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 705 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
706 706
707 # ifndef CONFIG_NUMA_EMU 707 # ifndef CONFIG_NUMA_EMU
708 void __cpuinit numa_add_cpu(int cpu) 708 void __cpuinit numa_add_cpu(int cpu)
709 { 709 {
710 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 710 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
711 } 711 }
712 712
713 void __cpuinit numa_remove_cpu(int cpu) 713 void __cpuinit numa_remove_cpu(int cpu)
714 { 714 {
715 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 715 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
716 } 716 }
717 # endif /* !CONFIG_NUMA_EMU */ 717 # endif /* !CONFIG_NUMA_EMU */
718 718
719 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 719 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */
720 720
721 int __cpu_to_node(int cpu) 721 int __cpu_to_node(int cpu)
722 { 722 {
723 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 723 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
724 printk(KERN_WARNING 724 printk(KERN_WARNING
725 "cpu_to_node(%d): usage too early!\n", cpu); 725 "cpu_to_node(%d): usage too early!\n", cpu);
726 dump_stack(); 726 dump_stack();
727 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 727 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
728 } 728 }
729 return per_cpu(x86_cpu_to_node_map, cpu); 729 return per_cpu(x86_cpu_to_node_map, cpu);
730 } 730 }
731 EXPORT_SYMBOL(__cpu_to_node); 731 EXPORT_SYMBOL(__cpu_to_node);
732 732
733 /* 733 /*
734 * Same function as cpu_to_node() but used if called before the 734 * Same function as cpu_to_node() but used if called before the
735 * per_cpu areas are setup. 735 * per_cpu areas are setup.
736 */ 736 */
737 int early_cpu_to_node(int cpu) 737 int early_cpu_to_node(int cpu)
738 { 738 {
739 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 739 if (early_per_cpu_ptr(x86_cpu_to_node_map))
740 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 740 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
741 741
742 if (!cpu_possible(cpu)) { 742 if (!cpu_possible(cpu)) {
743 printk(KERN_WARNING 743 printk(KERN_WARNING
744 "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 744 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
745 dump_stack(); 745 dump_stack();
746 return NUMA_NO_NODE; 746 return NUMA_NO_NODE;
747 } 747 }
748 return per_cpu(x86_cpu_to_node_map, cpu); 748 return per_cpu(x86_cpu_to_node_map, cpu);
749 } 749 }
750 750
751 void debug_cpumask_set_cpu(int cpu, int node, bool enable) 751 void debug_cpumask_set_cpu(int cpu, int node, bool enable)
752 { 752 {
753 struct cpumask *mask; 753 struct cpumask *mask;
754 char buf[64]; 754 char buf[64];
755 755
756 if (node == NUMA_NO_NODE) { 756 if (node == NUMA_NO_NODE) {
757 /* early_cpu_to_node() already emits a warning and trace */ 757 /* early_cpu_to_node() already emits a warning and trace */
758 return; 758 return;
759 } 759 }
760 mask = node_to_cpumask_map[node]; 760 mask = node_to_cpumask_map[node];
761 if (!mask) { 761 if (!mask) {
762 pr_err("node_to_cpumask_map[%i] NULL\n", node); 762 pr_err("node_to_cpumask_map[%i] NULL\n", node);
763 dump_stack(); 763 dump_stack();
764 return; 764 return;
765 } 765 }
766 766
767 if (enable) 767 if (enable)
768 cpumask_set_cpu(cpu, mask); 768 cpumask_set_cpu(cpu, mask);
769 else 769 else
770 cpumask_clear_cpu(cpu, mask); 770 cpumask_clear_cpu(cpu, mask);
771 771
772 cpulist_scnprintf(buf, sizeof(buf), mask); 772 cpulist_scnprintf(buf, sizeof(buf), mask);
773 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 773 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
774 enable ? "numa_add_cpu" : "numa_remove_cpu", 774 enable ? "numa_add_cpu" : "numa_remove_cpu",
775 cpu, node, buf); 775 cpu, node, buf);
776 return; 776 return;
777 } 777 }
778 778
779 # ifndef CONFIG_NUMA_EMU 779 # ifndef CONFIG_NUMA_EMU
780 static void __cpuinit numa_set_cpumask(int cpu, bool enable) 780 static void __cpuinit numa_set_cpumask(int cpu, bool enable)
781 { 781 {
782 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); 782 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
783 } 783 }
784 784
785 void __cpuinit numa_add_cpu(int cpu) 785 void __cpuinit numa_add_cpu(int cpu)
786 { 786 {
787 numa_set_cpumask(cpu, true); 787 numa_set_cpumask(cpu, true);
788 } 788 }
789 789
790 void __cpuinit numa_remove_cpu(int cpu) 790 void __cpuinit numa_remove_cpu(int cpu)
791 { 791 {
792 numa_set_cpumask(cpu, false); 792 numa_set_cpumask(cpu, false);
793 } 793 }
794 # endif /* !CONFIG_NUMA_EMU */ 794 # endif /* !CONFIG_NUMA_EMU */
795 795
796 /* 796 /*
797 * Returns a pointer to the bitmask of CPUs on Node 'node'. 797 * Returns a pointer to the bitmask of CPUs on Node 'node'.
798 */ 798 */
799 const struct cpumask *cpumask_of_node(int node) 799 const struct cpumask *cpumask_of_node(int node)
800 { 800 {
801 if (node >= nr_node_ids) { 801 if (node >= nr_node_ids) {
802 printk(KERN_WARNING 802 printk(KERN_WARNING
803 "cpumask_of_node(%d): node > nr_node_ids(%d)\n", 803 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
804 node, nr_node_ids); 804 node, nr_node_ids);
805 dump_stack(); 805 dump_stack();
806 return cpu_none_mask; 806 return cpu_none_mask;
807 } 807 }
808 if (node_to_cpumask_map[node] == NULL) { 808 if (node_to_cpumask_map[node] == NULL) {
809 printk(KERN_WARNING 809 printk(KERN_WARNING
810 "cpumask_of_node(%d): no node_to_cpumask_map!\n", 810 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
811 node); 811 node);
812 dump_stack(); 812 dump_stack();
813 return cpu_online_mask; 813 return cpu_online_mask;
814 } 814 }
815 return node_to_cpumask_map[node]; 815 return node_to_cpumask_map[node];
816 } 816 }
817 EXPORT_SYMBOL(cpumask_of_node); 817 EXPORT_SYMBOL(cpumask_of_node);
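
As a usage sketch, a caller would normally fetch the mask with cpumask_of_node() and walk it with for_each_cpu(); the helper name below is invented for illustration:

/* Kernel-style sketch, not from the tree: report the CPUs the map
 * associates with a node.  print_node_cpus() is an invented name. */
static void print_node_cpus(int node)
{
        const struct cpumask *mask = cpumask_of_node(node);
        int cpu;

        for_each_cpu(cpu, mask)
                pr_info("node %d owns cpu %d\n", node, cpu);
}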
818 818
819 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 819 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
820 820
821 #ifdef CONFIG_MEMORY_HOTPLUG 821 #ifdef CONFIG_MEMORY_HOTPLUG
822 int memory_add_physaddr_to_nid(u64 start) 822 int memory_add_physaddr_to_nid(u64 start)
823 { 823 {
824 struct numa_meminfo *mi = &numa_meminfo; 824 struct numa_meminfo *mi = &numa_meminfo;
825 int nid = mi->blk[0].nid; 825 int nid = mi->blk[0].nid;
826 int i; 826 int i;
827 827
828 for (i = 0; i < mi->nr_blks; i++) 828 for (i = 0; i < mi->nr_blks; i++)
829 if (mi->blk[i].start <= start && mi->blk[i].end > start) 829 if (mi->blk[i].start <= start && mi->blk[i].end > start)
830 nid = mi->blk[i].nid; 830 nid = mi->blk[i].nid;
831 return nid; 831 return nid;
832 } 832 }
833 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 833 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
834 #endif 834 #endif
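
A hypothetical caller of memory_add_physaddr_to_nid(), sketching how a hotplug path could map a newly added physical range to a node (the address and function name are made up, and this is not the kernel's own add_memory() path):

/* Hypothetical sketch only.  The 8G address is invented. */
static int hotplug_example(void)
{
        u64 start = 0x200000000ULL;     /* pretend 8G is being hot-added */
        int nid = memory_add_physaddr_to_nid(start);

        pr_info("would online %#llx on node %d\n", start, nid);
        return 0;
}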
835 835