Commit 79442ed189acb8b949662676e750eda173c06f9b

Authored by Tang Chen
Committed by Linus Torvalds
1 parent 1402899e43

mm/memblock.c: introduce bottom-up allocation mode

The Linux kernel cannot migrate pages used by the kernel.  As a result,
kernel pages cannot be hot-removed.  So we cannot allocate hotpluggable
memory for the kernel.

ACPI SRAT (System Resource Affinity Table) contains the memory hotplug
info.  But before SRAT is parsed, memblock has already started to allocate
memory for the kernel.  So we need to prevent memblock from doing this.

In a memory hotplug system, any NUMA node the kernel resides in should be
unhotpluggable.  And for a modern server, each node could have at least
16GB of memory.  So memory around the kernel image is highly likely to be
unhotpluggable.

So the basic idea is: allocate memory starting from the end of the kernel
image and moving toward higher memory.  Since little memory is allocated
before SRAT is parsed, it is highly likely to be in the same node as the
kernel image.

The current memblock can only allocate memory top-down.  So this patch
introduces a new bottom-up allocation mode to allocate memory bottom-up.
And later when we use this allocation direction to allocate memory, we
will limit the start address above the kernel.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 108 additions and 3 deletions Side-by-side Diff

include/linux/memblock.h
... ... @@ -35,6 +35,7 @@
35 35 };
36 36  
37 37 struct memblock {
  38 + bool bottom_up; /* is bottom up direction? */
38 39 phys_addr_t current_limit;
39 40 struct memblock_type memory;
40 41 struct memblock_type reserved;
... ... @@ -147,6 +148,29 @@
147 148 phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
148 149  
149 150 phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
  151 +
  152 +#ifdef CONFIG_MOVABLE_NODE
  153 +/*
  154 + * Set the allocation direction: true selects bottom-up, false top-down.
  155 + */
  156 +static inline void memblock_set_bottom_up(bool enable)
  157 +{
  158 + memblock.bottom_up = enable;
  159 +}
  160 +
  161 +/*
  162 + * Check whether the allocation direction is bottom-up.
  163 + * Returns true if memblock will try to allocate memory in the
  164 + * bottom-up direction first, false otherwise.
  165 + */
  166 +static inline bool memblock_bottom_up(void)
  167 +{
  168 + return memblock.bottom_up;
  169 +}
  170 +#else /* !CONFIG_MOVABLE_NODE */
  171 +static inline void memblock_set_bottom_up(bool enable) {}
  172 +static inline bool memblock_bottom_up(void) { return false; }
  173 +#endif /* CONFIG_MOVABLE_NODE */
150 174  
151 175 /* Flags for memblock_alloc_base() and __memblock_alloc_base() */
152 176 #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
... ... @@ -50,6 +50,10 @@
50 50 #include <asm/pgtable.h>
51 51 #include <asm/processor.h>
52 52  
  53 +#ifndef __pa_symbol /* generic fallback, used only if not already defined */
  54 +#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0))
  55 +#endif /* __pa_symbol */
  56 +
53 57 extern unsigned long sysctl_user_reserve_kbytes;
54 58 extern unsigned long sysctl_admin_reserve_kbytes;
55 59  
... ... @@ -20,6 +20,8 @@
20 20 #include <linux/seq_file.h>
21 21 #include <linux/memblock.h>
22 22  
  23 +#include <asm-generic/sections.h>
  24 +
23 25 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24 26 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25 27  
... ... @@ -32,6 +34,7 @@
32 34 .reserved.cnt = 1, /* empty dummy entry */
33 35 .reserved.max = INIT_MEMBLOCK_REGIONS,
34 36  
  37 + .bottom_up = false,
35 38 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36 39 };
37 40  
... ... @@ -82,6 +85,38 @@
82 85 return (i < type->cnt) ? i : -1;
83 86 }
84 87  
  88 +/**
  89 + * __memblock_find_range_bottom_up - find free area utility, in bottom-up
  90 + * @start: start of candidate range
  91 + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
  92 + * @size: size of free area to find
  93 + * @align: alignment of free area to find
  94 + * @nid: nid of the free area to find, %MAX_NUMNODES for any node
  95 + *
  96 + * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  97 + *
  98 + * RETURNS:
  99 + * Aligned address of the first fitting free range found, 0 on failure.
  100 + */
  101 +static phys_addr_t __init_memblock
  102 +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  103 + phys_addr_t size, phys_addr_t align, int nid)
  104 +{
  105 + phys_addr_t this_start, this_end, cand;
  106 + u64 i;
  107 +
  108 + for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
  109 + this_start = clamp(this_start, start, end); /* restrict to [start, end] */
  110 + this_end = clamp(this_end, start, end);
  111 +
  112 + cand = round_up(this_start, align); /* lowest aligned address in range */
  113 + if (cand < this_end && this_end - cand >= size)
  114 + return cand;
  115 + }
  116 +
  117 + return 0;
  118 +}
  119 +
85 120 /**
86 121 * __memblock_find_range_top_down - find free area utility, in top-down
87 122 * @start: start of candidate range
... ... @@ -93,7 +128,7 @@
93 128 * Utility called from memblock_find_in_range_node(), find free area top-down.
94 129 *
95 130 * RETURNS:
96   - * Found address on success, %0 on failure.
  131 + * Found address on success, 0 on failure.
97 132 */
98 133 static phys_addr_t __init_memblock
99 134 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
100 135  
101 136  
... ... @@ -127,13 +162,24 @@
127 162 *
128 163 * Find @size free area aligned to @align in the specified range and node.
129 164 *
  165 + * When allocation direction is bottom-up, @start should be greater
  166 + * than the end of the kernel image. Otherwise, it is raised to the
  167 + * end of the kernel image. The reason is that we want bottom-up
  168 + * allocations to stay near the kernel image, so that the allocated
  169 + * memory and the kernel highly likely reside in the same node.
  170 + *
  171 + * If bottom-up allocation fails, top-down allocation is tried instead.
  172 + *
130 173 * RETURNS:
131   - * Found address on success, %0 on failure.
  174 + * Found address on success, 0 on failure.
132 175 */
133 176 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
134 177 phys_addr_t end, phys_addr_t size,
135 178 phys_addr_t align, int nid)
136 179 {
  180 + phys_addr_t ret; /* holds a physical address; int would truncate it */
  181 + phys_addr_t kernel_end;
  182 +
137 183 /* pump up @end */
138 184 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
139 185 end = memblock.current_limit;
140 186  
... ... @@ -141,7 +187,38 @@
141 187 /* avoid allocating the first page */
142 188 start = max_t(phys_addr_t, start, PAGE_SIZE);
143 189 end = max(start, end);
  190 + kernel_end = __pa_symbol(_end);
144 191  
  192 + /*
  193 + * try bottom-up allocation only when bottom-up mode
  194 + * is set and @end is above the kernel image.
  195 + */
  196 + if (memblock_bottom_up() && end > kernel_end) {
  197 + phys_addr_t bottom_up_start;
  198 +
  199 + /* make sure we will allocate above the kernel */
  200 + bottom_up_start = max(start, kernel_end);
  201 +
  202 + /* ok, try bottom-up allocation first */
  203 + ret = __memblock_find_range_bottom_up(bottom_up_start, end,
  204 + size, align, nid);
  205 + if (ret)
  206 + return ret;
  207 +
  208 + /*
  209 + * we always limit bottom-up allocation above the kernel,
  210 + * but top-down allocation doesn't have the limit, so
  211 + * retrying top-down allocation may succeed when bottom-up
  212 + * allocation failed.
  213 + *
  214 + * bottom-up allocation is expected to fail very rarely,
  215 + * so we use WARN_ONCE() here to see the stack trace if
  216 + * a failure happens.
  217 + */
  218 + WARN_ONCE(1, "memblock: bottom-up allocation failed, "
  219 + "memory hotunplug may be affected\n");
  220 + }
  221 +
145 222 return __memblock_find_range_top_down(start, end, size, align, nid);
146 223 }
147 224  
... ... @@ -155,7 +232,7 @@
155 232 * Find @size free area aligned to @align in the specified range.
156 233 *
157 234 * RETURNS:
158   - * Found address on success, %0 on failure.
  235 + * Found address on success, 0 on failure.
159 236 */
160 237 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
161 238 phys_addr_t end, phys_addr_t size,