Commit 53fee04f318222a3179ca5933d8bda82c1eef17a

Authored by Rohit Seth
Committed by Andi Kleen
1 parent 3b3d5e1db6

[PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole

This patch resolves the issue of running with numa=fake=X on the kernel command
line on x86_64 machines that have a big IO hole.  While calculating the size
of each node, we now look at the total hole size in that range.

Previously there were nodes that only had IO holes in them, causing kernel
boot problems.  We now use FAKE_NODE_MIN_SIZE (64MB) as the minimum size of
memory that any node must have.  We reduce the number of allocated nodes if
the number of nodes specified on the kernel command line results in any node
getting less memory than FAKE_NODE_MIN_SIZE.

This change allows the extra memory to be handed out in FAKE_NODE_MIN_SIZE
granules and distributed uniformly among as many nodes (called big nodes) as
possible.

[akpm@osdl.org: build fix]
Signed-off-by: David Rientjes <reintjes@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>

Showing 4 changed files with 133 additions and 14 deletions Side-by-side Diff

arch/x86_64/kernel/e820.c
... ... @@ -191,6 +191,37 @@
191 191 }
192 192  
193 193 /*
  194 + * Find the hole size in the range: the number of bytes in [start, end) not covered by E820_RAM entries (each entry trimmed inward to page boundaries).
  195 + */
  196 +unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
  197 +{
  198 + unsigned long ram = 0; /* RAM bytes found inside [start, end) so far */
  199 + int i;
  200 +
  201 + for (i = 0; i < e820.nr_map; i++) {
  202 + struct e820entry *ei = &e820.map[i];
  203 + unsigned long last, addr;
  204 +
  205 + if (ei->type != E820_RAM || /* skip non-RAM entries and entries that do not overlap the range */
  206 + ei->addr+ei->size <= start ||
  207 + ei->addr >= end)
  208 + continue;
  209 +
  210 + addr = round_up(ei->addr, PAGE_SIZE); /* a partial leading page is not counted as RAM */
  211 + if (addr < start)
  212 + addr = start;
  213 +
  214 + last = round_down(ei->addr + ei->size, PAGE_SIZE); /* a partial trailing page is not counted as RAM */
  215 + if (last >= end)
  216 + last = end;
  217 +
  218 + if (last > addr) /* the entry may shrink to nothing after rounding and clamping */
  219 + ram += last - addr;
  220 + }
  221 + return ((end - start) - ram); /* whatever was not counted as RAM is reported as hole */
  222 +}
  223 +
  224 +/*
194 225 * Mark e820 reserved areas as busy for the resource manager.
195 226 */
196 227 void __init e820_reserve_resources(void)
arch/x86_64/mm/numa.c
... ... @@ -272,31 +272,113 @@
272 272 }
273 273  
274 274 #ifdef CONFIG_NUMA_EMU
  275 +/* Numa emulation */
275 276 int numa_fake __initdata = 0;
276 277  
277   -/* Numa emulation */
  278 +/*
  279 + * This function is used to find out if the start and end correspond to
  280 + * different zones. Only the DMA32 limit is checked: returns 1 when start is below (MAX_DMA32_PFN << PAGE_SHIFT) and end is at or above it, 0 otherwise.
  281 + */
  282 +int zone_cross_over(unsigned long start, unsigned long end)
  283 +{
  284 + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
  285 + (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
  286 + return 1;
  287 + return 0;
  288 +}
  289 +
278 290 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
279 291 {
280   - int i;
  292 + int i, big;
281 293 struct bootnode nodes[MAX_NUMNODES];
282   - unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
  294 + unsigned long sz, old_sz;
  295 + unsigned long hole_size;
  296 + unsigned long start, end;
  297 + unsigned long max_addr = (end_pfn << PAGE_SHIFT);
283 298  
  299 + start = (start_pfn << PAGE_SHIFT);
  300 + hole_size = e820_hole_size(start, max_addr);
  301 + sz = (max_addr - start - hole_size) / numa_fake;
  302 +
284 303 /* Kludge needed for the hash function */
285   - if (hweight64(sz) > 1) {
286   - unsigned long x = 1;
287   - while ((x << 1) < sz)
288   - x <<= 1;
289   - if (x < sz/2)
290   - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
291   - sz = x;
292   - }
293 304  
  305 + old_sz = sz;
  306 + /*
  307 + * Round down to the nearest FAKE_NODE_MIN_SIZE.
  308 + */
  309 + sz &= FAKE_NODE_MIN_HASH_MASK;
  310 +
  311 + /*
  312 + * We ensure that each node is at least 64MB big. Smaller than this
  313 + * size can cause VM hiccups.
  314 + */
  315 + if (sz == 0) {
  316 + printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
  317 + "the number of nodes\n", numa_fake);
  318 + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
  319 + printk(KERN_INFO "Number of fake nodes will be = %d\n",
  320 + numa_fake);
  321 + sz = FAKE_NODE_MIN_SIZE;
  322 + }
  323 + /*
  324 + * Find out how many nodes can get an extra FAKE_NODE_MIN_SIZE granule.
  325 + * This logic ensures the extra memory gets distributed among as many
  326 + * nodes as possible (as compared to one single node getting all that
  327 + * extra memory).
  328 + */
  329 + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
  330 + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
  331 + "%d\n",
  332 + (sz >> 20), (hole_size >> 20), big);
294 333 memset(&nodes,0,sizeof(nodes));
  334 + end = start;
295 335 for (i = 0; i < numa_fake; i++) {
296   - nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
  336 + /*
  337 + * In case we are not able to allocate enough memory for all
  338 + * the nodes, we reduce the number of fake nodes.
  339 + */
  340 + if (end >= max_addr) {
  341 + numa_fake = i - 1;
  342 + break;
  343 + }
  344 + start = nodes[i].start = end;
  345 + /*
  346 + * Final node can have all the remaining memory.
  347 + */
297 348 if (i == numa_fake-1)
298   - sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
299   - nodes[i].end = nodes[i].start + sz;
  349 + sz = max_addr - start;
  350 + end = nodes[i].start + sz;
  351 + /*
  352 + * The first "big" nodes get an extra granule.
  353 + */
  354 + if (i < big)
  355 + end += FAKE_NODE_MIN_SIZE;
  356 + /*
  357 + * Iterate over the range to ensure that this node gets at
  358 + * least sz amount of RAM (excluding holes)
  359 + */
  360 + while ((end - start - e820_hole_size(start, end)) < sz) {
  361 + end += FAKE_NODE_MIN_SIZE;
  362 + if (end >= max_addr)
  363 + break;
  364 + }
  365 + /*
  366 + * Look at the next node to make sure there is some real memory
  367 + * to map. Bad things happen when the only memory present
  368 + * in a zone on a fake node is IO hole.
  369 + */
  370 + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
  371 + if (zone_cross_over(start, end + sz)) {
  372 + end = (MAX_DMA32_PFN << PAGE_SHIFT);
  373 + break;
  374 + }
  375 + if (end >= max_addr)
  376 + break;
  377 + end += FAKE_NODE_MIN_SIZE;
  378 + }
  379 + if (end > max_addr)
  380 + end = max_addr;
  381 + nodes[i].end = end;
300 382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
301 383 i,
302 384 nodes[i].start, nodes[i].end,
include/asm-x86_64/e820.h
... ... @@ -46,6 +46,7 @@
46 46 extern void e820_print_map(char *who);
47 47 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
48 48 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
  49 +extern unsigned long e820_hole_size(unsigned long start, unsigned long end); /* bytes in [start, end) not counted as E820_RAM */
49 50  
50 51 extern void e820_setup_gap(void);
51 52 extern void e820_register_active_regions(int nid,
include/asm-x86_64/mmzone.h
... ... @@ -47,6 +47,11 @@
47 47 extern int pfn_valid(unsigned long pfn);
48 48 #endif
49 49  
  50 +#ifdef CONFIG_NUMA_EMU
  51 +#define FAKE_NODE_MIN_SIZE (64*1024*1024) /* 64MB, in bytes */
  52 +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) /* AND with a size to round it down to a multiple of FAKE_NODE_MIN_SIZE */
  53 +#endif
  54 +
50 55 #endif
51 56 #endif