Commit 511c2aba8f07fc45bdcba548cb63f7b8a450c6dc
Committed by
Linus Torvalds
1 parent
fcf07d22f0
Exists in
master
and in
20 other branches
mm, memory-hotplug: dynamic configure movable memory and portion memory
Add online_movable and online_kernel for logical memory hotplug. This is the dynamic version of "movablecore" & "kernelcore". We have the same reasons to introduce it as to introduce "movablecore" & "kernelcore": it has the same motive as "movablecore" & "kernelcore", but it is dynamic/run-time: o We can configure memory as kernelcore or movablecore after boot. When the userspace workload increases and we need more hugepages, we can use "online_movable" to add memory and allow the system to use more THP (transparent huge pages); vice versa when the kernel workload increases. This also helps virtualization to dynamically configure the host/guest's memory, to save/(reduce wasted) memory — memory capacity on demand. o When a new node is physically onlined after boot, we need to use "online_movable" or "online_kernel" to configure/portion it as we expect when we logically online it. This configuration also helps physical memory migration. o All the same benefits as the existing "movablecore" & "kernelcore". o Preparation for movable-node, which is very important for power saving, hardware partitioning and highly-available systems (hardware fault management). (Note: we don't introduce movable-node here.) Action behavior: When a memory block/memory section is onlined by "online_movable", the kernel will not have direct references to the pages of the memory block, thus we can remove that memory at any time when needed. When it is onlined by "online_kernel", the kernel can use it. When it is onlined by "online", the zone type is not changed. Current constraint: Only a memory block which is adjacent to ZONE_MOVABLE can be onlined from ZONE_NORMAL to ZONE_MOVABLE. 
[akpm@linux-foundation.org: use min_t, cleanups] Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Greg KH <greg@kroah.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 4 changed files with 146 additions and 14 deletions Side-by-side Diff
Documentation/memory-hotplug.txt
... | ... | @@ -161,7 +161,8 @@ |
161 | 161 | in the memory block. |
162 | 162 | 'state' : read-write |
163 | 163 | at read: contains online/offline state of memory. |
164 | - at write: user can specify "online", "offline" command | |
164 | + at write: user can specify "online_kernel", | |
165 | + "online_movable", "online", "offline" command | |
165 | 166 | which will be performed on al sections in the block. |
166 | 167 | 'phys_device' : read-only: designed to show the name of physical memory |
167 | 168 | device. This is not well implemented now. |
... | ... | @@ -254,6 +255,17 @@ |
254 | 255 | For onlining, you have to write "online" to the section's state file as: |
255 | 256 | |
256 | 257 | % echo online > /sys/devices/system/memory/memoryXXX/state |
258 | + | |
259 | +This onlining will not change the ZONE type of the target memory section. | 
260 | +If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: | |
261 | + | |
262 | +% echo online_movable > /sys/devices/system/memory/memoryXXX/state | |
263 | +(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) | |
264 | + | |
265 | +And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: | |
266 | + | |
267 | +% echo online_kernel > /sys/devices/system/memory/memoryXXX/state | |
268 | +(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) | |
257 | 269 | |
258 | 270 | After this, section memoryXXX's state will be 'online' and the amount of |
259 | 271 | available memory will be increased. |
drivers/base/memory.c
... | ... | @@ -254,7 +254,7 @@ |
254 | 254 | * OK to have direct references to sparsemem variables in here. |
255 | 255 | */ |
256 | 256 | static int |
257 | -memory_block_action(unsigned long phys_index, unsigned long action) | |
257 | +memory_block_action(unsigned long phys_index, unsigned long action, int online_type) | |
258 | 258 | { |
259 | 259 | unsigned long start_pfn; |
260 | 260 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; |
... | ... | @@ -269,7 +269,7 @@ |
269 | 269 | if (!pages_correctly_reserved(start_pfn, nr_pages)) |
270 | 270 | return -EBUSY; |
271 | 271 | |
272 | - ret = online_pages(start_pfn, nr_pages); | |
272 | + ret = online_pages(start_pfn, nr_pages, online_type); | |
273 | 273 | break; |
274 | 274 | case MEM_OFFLINE: |
275 | 275 | ret = offline_pages(start_pfn, nr_pages); |
... | ... | @@ -284,7 +284,8 @@ |
284 | 284 | } |
285 | 285 | |
286 | 286 | static int __memory_block_change_state(struct memory_block *mem, |
287 | - unsigned long to_state, unsigned long from_state_req) | |
287 | + unsigned long to_state, unsigned long from_state_req, | |
288 | + int online_type) | |
288 | 289 | { |
289 | 290 | int ret = 0; |
290 | 291 | |
... | ... | @@ -296,7 +297,7 @@ |
296 | 297 | if (to_state == MEM_OFFLINE) |
297 | 298 | mem->state = MEM_GOING_OFFLINE; |
298 | 299 | |
299 | - ret = memory_block_action(mem->start_section_nr, to_state); | |
300 | + ret = memory_block_action(mem->start_section_nr, to_state, online_type); | |
300 | 301 | |
301 | 302 | if (ret) { |
302 | 303 | mem->state = from_state_req; |
303 | 304 | |
... | ... | @@ -319,12 +320,14 @@ |
319 | 320 | } |
320 | 321 | |
321 | 322 | static int memory_block_change_state(struct memory_block *mem, |
322 | - unsigned long to_state, unsigned long from_state_req) | |
323 | + unsigned long to_state, unsigned long from_state_req, | |
324 | + int online_type) | |
323 | 325 | { |
324 | 326 | int ret; |
325 | 327 | |
326 | 328 | mutex_lock(&mem->state_mutex); |
327 | - ret = __memory_block_change_state(mem, to_state, from_state_req); | |
329 | + ret = __memory_block_change_state(mem, to_state, from_state_req, | |
330 | + online_type); | |
328 | 331 | mutex_unlock(&mem->state_mutex); |
329 | 332 | |
330 | 333 | return ret; |
... | ... | @@ -338,10 +341,18 @@ |
338 | 341 | |
339 | 342 | mem = container_of(dev, struct memory_block, dev); |
340 | 343 | |
341 | - if (!strncmp(buf, "online", min((int)count, 6))) | |
342 | - ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | |
343 | - else if(!strncmp(buf, "offline", min((int)count, 7))) | |
344 | - ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | |
344 | + if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) | |
345 | + ret = memory_block_change_state(mem, MEM_ONLINE, | |
346 | + MEM_OFFLINE, ONLINE_KERNEL); | |
347 | + else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) | |
348 | + ret = memory_block_change_state(mem, MEM_ONLINE, | |
349 | + MEM_OFFLINE, ONLINE_MOVABLE); | |
350 | + else if (!strncmp(buf, "online", min_t(int, count, 6))) | |
351 | + ret = memory_block_change_state(mem, MEM_ONLINE, | |
352 | + MEM_OFFLINE, ONLINE_KEEP); | |
353 | + else if(!strncmp(buf, "offline", min_t(int, count, 7))) | |
354 | + ret = memory_block_change_state(mem, MEM_OFFLINE, | |
355 | + MEM_ONLINE, -1); | |
345 | 356 | |
346 | 357 | if (ret) |
347 | 358 | return ret; |
... | ... | @@ -676,7 +687,7 @@ |
676 | 687 | |
677 | 688 | mutex_lock(&mem->state_mutex); |
678 | 689 | if (mem->state != MEM_OFFLINE) |
679 | - ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | |
690 | + ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1); | |
680 | 691 | mutex_unlock(&mem->state_mutex); |
681 | 692 | |
682 | 693 | return ret; |
include/linux/memory_hotplug.h
... | ... | @@ -26,6 +26,13 @@ |
26 | 26 | MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, |
27 | 27 | }; |
28 | 28 | |
29 | +/* Types for controlling the zone type of onlined memory */ | 
30 | +enum { | |
31 | + ONLINE_KEEP, | |
32 | + ONLINE_KERNEL, | |
33 | + ONLINE_MOVABLE, | |
34 | +}; | |
35 | + | |
29 | 36 | /* |
30 | 37 | * pgdat resizing functions |
31 | 38 | */ |
... | ... | @@ -46,6 +53,10 @@ |
46 | 53 | } |
47 | 54 | /* |
48 | 55 | * Zone resizing functions |
56 | + * | |
57 | + * Note: any attempt to resize a zone should have pgdat_resize_lock() | 
58 | + * and zone_span_writelock() both held. This ensures the size of a zone | 
59 | + * can't be changed while pgdat_resize_lock() is held. | 
49 | 60 | */ |
50 | 61 | static inline unsigned zone_span_seqbegin(struct zone *zone) |
51 | 62 | { |
... | ... | @@ -71,7 +82,7 @@ |
71 | 82 | extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); |
72 | 83 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); |
73 | 84 | /* VM interface that may be used by firmware interface */ |
74 | -extern int online_pages(unsigned long, unsigned long); | |
85 | +extern int online_pages(unsigned long, unsigned long, int); | |
75 | 86 | extern void __offline_isolated_pages(unsigned long, unsigned long); |
76 | 87 | |
77 | 88 | typedef void (*online_page_callback_t)(struct page *page); |
mm/memory_hotplug.c
... | ... | @@ -214,6 +214,88 @@ |
214 | 214 | zone_span_writeunlock(zone); |
215 | 215 | } |
216 | 216 | |
217 | +static void resize_zone(struct zone *zone, unsigned long start_pfn, | |
218 | + unsigned long end_pfn) | |
219 | +{ | |
220 | + zone_span_writelock(zone); | |
221 | + | |
222 | + zone->zone_start_pfn = start_pfn; | |
223 | + zone->spanned_pages = end_pfn - start_pfn; | |
224 | + | |
225 | + zone_span_writeunlock(zone); | |
226 | +} | |
227 | + | |
228 | +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |
229 | + unsigned long end_pfn) | |
230 | +{ | |
231 | + enum zone_type zid = zone_idx(zone); | |
232 | + int nid = zone->zone_pgdat->node_id; | |
233 | + unsigned long pfn; | |
234 | + | |
235 | + for (pfn = start_pfn; pfn < end_pfn; pfn++) | |
236 | + set_page_links(pfn_to_page(pfn), zid, nid, pfn); | |
237 | +} | |
238 | + | |
239 | +static int move_pfn_range_left(struct zone *z1, struct zone *z2, | |
240 | + unsigned long start_pfn, unsigned long end_pfn) | |
241 | +{ | |
242 | + unsigned long flags; | |
243 | + | |
244 | + pgdat_resize_lock(z1->zone_pgdat, &flags); | |
245 | + | |
246 | + /* can't move pfns which are higher than @z2 */ | |
247 | + if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | |
248 | + goto out_fail; | |
249 | + /* the moved-out part must be at the leftmost of @z2 */ | 
250 | + if (start_pfn > z2->zone_start_pfn) | |
251 | + goto out_fail; | |
252 | + /* must include/overlap */ | 
253 | + if (end_pfn <= z2->zone_start_pfn) | |
254 | + goto out_fail; | |
255 | + | |
256 | + resize_zone(z1, z1->zone_start_pfn, end_pfn); | |
257 | + resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | |
258 | + | |
259 | + pgdat_resize_unlock(z1->zone_pgdat, &flags); | |
260 | + | |
261 | + fix_zone_id(z1, start_pfn, end_pfn); | |
262 | + | |
263 | + return 0; | |
264 | +out_fail: | |
265 | + pgdat_resize_unlock(z1->zone_pgdat, &flags); | |
266 | + return -1; | |
267 | +} | |
268 | + | |
269 | +static int move_pfn_range_right(struct zone *z1, struct zone *z2, | |
270 | + unsigned long start_pfn, unsigned long end_pfn) | |
271 | +{ | |
272 | + unsigned long flags; | |
273 | + | |
274 | + pgdat_resize_lock(z1->zone_pgdat, &flags); | |
275 | + | |
276 | + /* can't move pfns which are lower than @z1 */ | |
277 | + if (z1->zone_start_pfn > start_pfn) | |
278 | + goto out_fail; | |
279 | + /* the moved-out part must be at the rightmost of @z1 */ | 
280 | + if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | |
281 | + goto out_fail; | |
282 | + /* must include/overlap */ | 
283 | + if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | |
284 | + goto out_fail; | |
285 | + | |
286 | + resize_zone(z1, z1->zone_start_pfn, start_pfn); | |
287 | + resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); | |
288 | + | |
289 | + pgdat_resize_unlock(z1->zone_pgdat, &flags); | |
290 | + | |
291 | + fix_zone_id(z2, start_pfn, end_pfn); | |
292 | + | |
293 | + return 0; | |
294 | +out_fail: | |
295 | + pgdat_resize_unlock(z1->zone_pgdat, &flags); | |
296 | + return -1; | |
297 | +} | |
298 | + | |
217 | 299 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
218 | 300 | unsigned long end_pfn) |
219 | 301 | { |
... | ... | @@ -508,7 +590,7 @@ |
508 | 590 | } |
509 | 591 | |
510 | 592 | |
511 | -int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |
593 | +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | |
512 | 594 | { |
513 | 595 | unsigned long onlined_pages = 0; |
514 | 596 | struct zone *zone; |
... | ... | @@ -523,6 +605,22 @@ |
523 | 605 | * The section can't be removed here because of the |
524 | 606 | * memory_block->state_mutex. |
525 | 607 | */ |
608 | + zone = page_zone(pfn_to_page(pfn)); | |
609 | + | |
610 | + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | |
611 | + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | |
612 | + unlock_memory_hotplug(); | |
613 | + return -1; | |
614 | + } | |
615 | + } | |
616 | + if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | |
617 | + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | |
618 | + unlock_memory_hotplug(); | |
619 | + return -1; | |
620 | + } | |
621 | + } | |
622 | + | |
623 | + /* The previous code may have changed the zone of the pfn range */ | 
526 | 624 | zone = page_zone(pfn_to_page(pfn)); |
527 | 625 | |
528 | 626 | arg.start_pfn = pfn; |