Commit 6ec6e0d9f2fd7cb6ca6bc3bfab5ae7b5cdd8c36f

Authored by Suresh Siddha
Committed by Ingo Molnar
1 parent 8705a49c35

srat, x86: add support for nodes spanning other nodes

For example, if the physical address layout on a two-node system with 8 GB
of memory is something like:
node 0: 0-2GB, 4-6GB
node 1: 2-4GB, 6-8GB

then current kernels fail to detect this NUMA topology and may fail to boot.

ACPI SRAT tables can expose such a topology, which needs to be supported.
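
With such a layout, an address-to-node lookup keyed only on each node's
start/end cannot work, because a single node owns several non-adjacent
ranges. A minimal sketch of the shift-based lookup this patch generalizes,
with simplified constants and made-up names (the real logic is in
populate_memnodemap() below):

/*
 * Illustration only: a physical-address-to-node table at 2GB granularity
 * for the 8GB example above.  Each 2GB slot records the owning node, so
 * one node can own several non-adjacent slots.
 */
#define EXAMPLE_SHIFT	31UL			/* 2GB per table slot */

static int example_memnodemap[4];		/* covers 0-8GB */

static void map_range(unsigned long start, unsigned long end, int nid)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += 1UL << EXAMPLE_SHIFT)
		example_memnodemap[addr >> EXAMPLE_SHIFT] = nid;
}

static void setup_example(void)
{
	map_range(0x000000000UL, 0x080000000UL, 0);	/* node 0: 0-2GB */
	map_range(0x100000000UL, 0x180000000UL, 0);	/* node 0: 4-6GB */
	map_range(0x080000000UL, 0x100000000UL, 1);	/* node 1: 2-4GB */
	map_range(0x180000000UL, 0x200000000UL, 1);	/* node 1: 6-8GB */
}

/* example_memnodemap[addr >> EXAMPLE_SHIFT] then yields the owning node. */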

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 5 changed files with 44 additions and 18 deletions

arch/x86/Kconfig
... ... @@ -903,6 +903,15 @@
903 903 help
904 904 Enable ACPI SRAT based node topology detection.
905 905  
  906 +# Some NUMA nodes have memory ranges that span
  907 +# other nodes. Even though a pfn is valid and
  908 +# between a node's start and end pfns, it may not
  909 +# reside on that node. See memmap_init_zone()
  910 +# for details.
  911 +config NODES_SPAN_OTHER_NODES
  912 + def_bool y
  913 + depends on X86_64_ACPI_NUMA
  914 +
906 915 config NUMA_EMU
907 916 bool "NUMA emulation"
908 917 depends on X86_64 && NUMA
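
The new option feeds the early_pfn_in_nid() check that memmap_init_zone()
performs while setting up struct pages. Roughly (the exact definitions live
in the generic mm headers and vary between kernel versions), enabling it
turns that check from a constant into a real ownership test:

#ifdef CONFIG_NODES_SPAN_OTHER_NODES
/* the pfn may sit in a hole of this node that belongs to another node */
#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
#else
/* nodes are assumed not to overlap, so no per-pfn check is needed */
#define early_pfn_in_nid(pfn, nid)	(1)
#endif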
arch/x86/mm/k8topology_64.c
... ... @@ -164,7 +164,7 @@
164 164 if (!found)
165 165 return -1;
166 166  
167   - memnode_shift = compute_hash_shift(nodes, 8);
  167 + memnode_shift = compute_hash_shift(nodes, 8, NULL);
168 168 if (memnode_shift < 0) {
169 169 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
170 170 return -1;
arch/x86/mm/numa_64.c
... ... @@ -60,7 +60,7 @@
60 60 * -1 if node overlap or lost ram (shift too big)
61 61 */
62 62 static int __init populate_memnodemap(const struct bootnode *nodes,
63   - int numnodes, int shift)
  63 + int numnodes, int shift, int *nodeids)
64 64 {
65 65 unsigned long addr, end;
66 66 int i, res = -1;
... ... @@ -76,7 +76,12 @@
76 76 do {
77 77 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
78 78 return -1;
79   - memnodemap[addr >> shift] = i;
  79 +
  80 + if (!nodeids)
  81 + memnodemap[addr >> shift] = i;
  82 + else
  83 + memnodemap[addr >> shift] = nodeids[i];
  84 +
80 85 addr += (1UL << shift);
81 86 } while (addr < end);
82 87 res = 1;
... ... @@ -139,7 +144,8 @@
139 144 return i;
140 145 }
141 146  
142   -int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
  147 +int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
  148 + int *nodeids)
143 149 {
144 150 int shift;
145 151  
... ... @@ -149,7 +155,7 @@
149 155 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
150 156 shift);
151 157  
152   - if (populate_memnodemap(nodes, numnodes, shift) != 1) {
  158 + if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
153 159 printk(KERN_INFO "Your memory is not aligned you need to "
154 160 "rebuild your kernel with a bigger NODEMAPSIZE "
155 161 "shift=%d\n", shift);
... ... @@ -462,7 +468,7 @@
462 468 }
463 469 }
464 470 out:
465   - memnode_shift = compute_hash_shift(nodes, num_nodes);
  471 + memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
466 472 if (memnode_shift < 0) {
467 473 memnode_shift = 0;
468 474 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
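
With the extra nodeids argument, the index into the bootnode array no longer
has to equal the node id, so several entries can point at the same node. A
hypothetical caller for the changelog's layout (names invented for
illustration; bootnode and compute_hash_shift() are declared in
include/asm-x86/numa_64.h, shown at the end of this diff) could look like:

static struct bootnode example_memblks[] __initdata = {
	{ .start = 0x000000000UL, .end = 0x080000000UL },	/* node 0 */
	{ .start = 0x080000000UL, .end = 0x100000000UL },	/* node 1 */
	{ .start = 0x100000000UL, .end = 0x180000000UL },	/* node 0 */
	{ .start = 0x180000000UL, .end = 0x200000000UL },	/* node 1 */
};
static int example_nodeids[] __initdata = { 0, 1, 0, 1 };

static int __init example_hash_setup(void)
{
	/*
	 * Passing NULL instead of example_nodeids would fill slot i of the
	 * map with i itself, wrongly inventing nodes 2 and 3 for the last
	 * two memblks.
	 */
	return compute_hash_shift(example_memblks, 4, example_nodeids);
}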
arch/x86/mm/srat_64.c
... ... @@ -32,6 +32,10 @@
32 32 static int found_add_area __initdata;
33 33 int hotadd_percent __initdata = 0;
34 34  
  35 +static int num_node_memblks __initdata;
  36 +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
  37 +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
  38 +
35 39 /* Too small nodes confuse the VM badly. Usually they result
36 40 from BIOS bugs. */
37 41 #define NODE_MIN_SIZE (4*1024*1024)
38 42  
39 43  
40 44  
... ... @@ -41,17 +45,17 @@
41 45 return acpi_map_pxm_to_node(pxm);
42 46 }
43 47  
44   -static __init int conflicting_nodes(unsigned long start, unsigned long end)
  48 +static __init int conflicting_memblks(unsigned long start, unsigned long end)
45 49 {
46 50 int i;
47   - for_each_node_mask(i, nodes_parsed) {
48   - struct bootnode *nd = &nodes[i];
  51 + for (i = 0; i < num_node_memblks; i++) {
  52 + struct bootnode *nd = &node_memblk_range[i];
49 53 if (nd->start == nd->end)
50 54 continue;
51 55 if (nd->end > start && nd->start < end)
52   - return i;
  56 + return memblk_nodeid[i];
53 57 if (nd->end == end && nd->start == start)
54   - return i;
  58 + return memblk_nodeid[i];
55 59 }
56 60 return -1;
57 61 }
... ... @@ -258,7 +262,7 @@
258 262 bad_srat();
259 263 return;
260 264 }
261   - i = conflicting_nodes(start, end);
  265 + i = conflicting_memblks(start, end);
262 266 if (i == node) {
263 267 printk(KERN_WARNING
264 268 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
... ... @@ -283,10 +287,10 @@
283 287 nd->end = end;
284 288 }
285 289  
286   - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
287   - nd->start, nd->end);
288   - e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
289   - nd->end >> PAGE_SHIFT);
  290 + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
  291 + start, end);
  292 + e820_register_active_regions(node, start >> PAGE_SHIFT,
  293 + end >> PAGE_SHIFT);
290 294 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
291 295 nd->end >> PAGE_SHIFT);
292 296  
... ... @@ -298,6 +302,11 @@
298 302 if ((nd->start | nd->end) == 0)
299 303 node_clear(node, nodes_parsed);
300 304 }
  305 +
  306 + node_memblk_range[num_node_memblks].start = start;
  307 + node_memblk_range[num_node_memblks].end = end;
  308 + memblk_nodeid[num_node_memblks] = node;
  309 + num_node_memblks++;
301 310 }
302 311  
303 312 /* Sanity check to catch more bad SRATs (they are amazingly common).
... ... @@ -368,7 +377,8 @@
368 377 return -1;
369 378 }
370 379  
371   - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
  380 + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
  381 + memblk_nodeid);
372 382 if (memnode_shift < 0) {
373 383 printk(KERN_ERR
374 384 "SRAT: No NUMA node hash function found. Contact maintainer\n");
include/asm-x86/numa_64.h
... ... @@ -9,7 +9,8 @@
9 9 u64 end;
10 10 };
11 11  
12   -extern int compute_hash_shift(struct bootnode *nodes, int numnodes);
  12 +extern int compute_hash_shift(struct bootnode *nodes, int numblks,
  13 + int *nodeids);
13 14  
14 15 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
15 16