Commit 0c0e6195896535481173df98935ad8db174f4d45

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent a5d76b54a3

memory unplug: page offline

Logic:
 - Set all pages in [start, end) to the isolated migrate type; with this,
   all free pages in the range become unavailable for allocation.
 - Migrate all LRU pages in the range.
 - Test that every page in the range has a refcount of zero.
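The three steps above, condensed into one sketch using the helpers this
patch introduces (error handling, retries, and the pagevec/pcp draining
are elided; see offline_pages() in the diff for the real flow):

	/* Sketch only: all helpers are defined later in this patch. */
	static int offline_range_sketch(unsigned long start_pfn,
					unsigned long end_pfn)
	{
		unsigned long pfn;
		int retry = 5;

		/* 1. Isolate: free pages in the range become not-for-use. */
		if (start_isolate_page_range(start_pfn, end_pfn))
			return -EBUSY;

		/* 2. Migrate: move LRU pages out of the range. */
		while ((pfn = scan_lru_pages(start_pfn, end_pfn)) && retry--)
			do_migrate_range(pfn, end_pfn);

		/* 3. Check: every remaining page must be free and isolated. */
		if (check_pages_isolated(start_pfn, end_pfn) < 0) {
			undo_isolate_page_range(start_pfn, end_pfn);
			return -EBUSY;
		}
		offline_isolated_pages(start_pfn, end_pfn);
		return 0;
	}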

Todo:
 - Allocate migration destination pages from a better area.
 - Confirm that a page with page_count(page) == 0 && PageReserved(page) is
   safe to be freed. (I don't like this kind of page, but...)
 - Find out which pages cannot be migrated.
 - Run more tests.
 - Use reclaim for unplugging other memory-type areas.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 6 changed files with 314 additions and 1 deletion

arch/ia64/Kconfig
... ... @@ -305,6 +305,9 @@
305 305 config ARCH_ENABLE_MEMORY_HOTPLUG
306 306 def_bool y
307 307  
  308 +config ARCH_ENABLE_MEMORY_HOTREMOVE
  309 + def_bool y
  310 +
308 311 config SCHED_SMT
309 312 bool "SMT scheduler support"
310 313 depends on SMP
include/linux/kernel.h
... ... @@ -35,6 +35,7 @@
35 35 #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
36 36 #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
37 37 #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
  38 +#define IS_ALIGNED(x,a) (((x) % ((typeof(x))(a))) == 0)
38 39  
39 40 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
40 41  
include/linux/memory_hotplug.h
... ... @@ -58,7 +58,10 @@
58 58 extern void online_page(struct page *page);
59 59 /* VM interface that may be used by firmware interface */
60 60 extern int online_pages(unsigned long, unsigned long);
61   -
  61 +#ifdef CONFIG_MEMORY_HOTREMOVE
  62 +extern int offline_pages(unsigned long, unsigned long, unsigned long);
  63 +extern void __offline_isolated_pages(unsigned long, unsigned long);
  64 +#endif
62 65 /* reasonably generic interface to expand the physical pages in a zone */
63 66 extern int __add_pages(struct zone *zone, unsigned long start_pfn,
64 67 unsigned long nr_pages);
mm/Kconfig
... ... @@ -139,6 +139,11 @@
139 139 def_bool y
140 140 depends on SPARSEMEM && MEMORY_HOTPLUG
141 141  
  142 +config MEMORY_HOTREMOVE
  143 + bool "Allow for memory hot remove"
  144 + depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
  145 + depends on MIGRATION
  146 +
142 147 # Heavily threaded applications may benefit from splitting the mm-wide
143 148 # page_table_lock, so that faults on different parts of the user address
144 149 # space can be handled with less contention: split it at this NR_CPUS.
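
With the above, memory hot-remove on an architecture that provides
ARCH_ENABLE_MEMORY_HOTREMOVE would be switched on by a configuration along
these lines (hypothetical .config fragment):

	CONFIG_MEMORY_HOTPLUG=y
	CONFIG_MIGRATION=y
	CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
	CONFIG_MEMORY_HOTREMOVE=y
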
mm/memory_hotplug.c
... ... @@ -23,6 +23,9 @@
23 23 #include <linux/vmalloc.h>
24 24 #include <linux/ioport.h>
25 25 #include <linux/cpuset.h>
  26 +#include <linux/delay.h>
  27 +#include <linux/migrate.h>
  28 +#include <linux/page-isolation.h>
26 29  
27 30 #include <asm/tlbflush.h>
28 31  
... ... @@ -302,4 +305,255 @@
302 305 return ret;
303 306 }
304 307 EXPORT_SYMBOL_GPL(add_memory);
  308 +
  309 +#ifdef CONFIG_MEMORY_HOTREMOVE
  310 +/*
  311 + * Confirm that all pages in a range [start, end) belong to the same zone.
  312 + */
  313 +static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
  314 +{
  315 + unsigned long pfn;
  316 + struct zone *zone = NULL;
  317 + struct page *page;
  318 + int i;
  319 + for (pfn = start_pfn;
  320 + pfn < end_pfn;
  321 + pfn += MAX_ORDER_NR_PAGES) {
  322 + i = 0;
  323 + /* This is just a CONFIG_HOLES_IN_ZONE check.*/
  324 + while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
  325 + i++;
  326 + if (i == MAX_ORDER_NR_PAGES)
  327 + continue;
  328 + page = pfn_to_page(pfn + i);
  329 + if (zone && page_zone(page) != zone)
  330 + return 0;
  331 + zone = page_zone(page);
  332 + }
  333 + return 1;
  334 +}
  335 +
  336 +/*
  337 + * Scanning pfns is much easier than scanning the LRU lists.
  338 + * Scan pfns from start to end and find the first LRU page.
  339 + */
  340 +static int scan_lru_pages(unsigned long start, unsigned long end)
  341 +{
  342 + unsigned long pfn;
  343 + struct page *page;
  344 + for (pfn = start; pfn < end; pfn++) {
  345 + if (pfn_valid(pfn)) {
  346 + page = pfn_to_page(pfn);
  347 + if (PageLRU(page))
  348 + return pfn;
  349 + }
  350 + }
  351 + return 0;
  352 +}
  353 +
  354 +static struct page *
  355 +hotremove_migrate_alloc(struct page *page,
  356 + unsigned long private,
  357 + int **x)
  358 +{
  359 + /* This should be improoooooved!! */
  360 + return alloc_page(GFP_HIGHUSER_PAGECACHE);
  361 +}
  362 +
  363 +
  364 +#define NR_OFFLINE_AT_ONCE_PAGES (256)
  365 +static int
  366 +do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
  367 +{
  368 + unsigned long pfn;
  369 + struct page *page;
  370 + int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
  371 + int not_managed = 0;
  372 + int ret = 0;
  373 + LIST_HEAD(source);
  374 +
  375 + for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
  376 + if (!pfn_valid(pfn))
  377 + continue;
  378 + page = pfn_to_page(pfn);
  379 + if (!page_count(page))
  380 + continue;
  381 + /*
  382 + * We can skip free pages. And we can only deal with pages on
  383 + * LRU.
  384 + */
  385 + ret = isolate_lru_page(page, &source);
  386 + if (!ret) { /* Success */
  387 + move_pages--;
  388 + } else {
  389 + /* Because we don't hold zone->lock across the whole
  390 + scan, we should check the page count again here. */
  391 + if (page_count(page))
  392 + not_managed++;
  393 +#ifdef CONFIG_DEBUG_VM
  394 + printk(KERN_INFO "removing from LRU failed"
  395 + " %lx/%d/%lx\n",
  396 + pfn, page_count(page), page->flags);
  397 +#endif
  398 + }
  399 + }
  400 + ret = -EBUSY;
  401 + if (not_managed) {
  402 + if (!list_empty(&source))
  403 + putback_lru_pages(&source);
  404 + goto out;
  405 + }
  406 + ret = 0;
  407 + if (list_empty(&source))
  408 + goto out;
  409 + /* this function returns # of failed pages */
  410 + ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
  411 +
  412 +out:
  413 + return ret;
  414 +}
  415 +
  416 +/*
  417 + * Remove the pages from free_area[] and mark them all as Reserved.
  418 + */
  419 +static int
  420 +offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
  421 + void *data)
  422 +{
  423 + __offline_isolated_pages(start, start + nr_pages);
  424 + return 0;
  425 +}
  426 +
  427 +static void
  428 +offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  429 +{
  430 + walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
  431 + offline_isolated_pages_cb);
  432 +}
  433 +
  434 +/*
  435 + * Check that all pages in the range, recorded as a memory resource, are isolated.
  436 + */
  437 +static int
  438 +check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
  439 + void *data)
  440 +{
  441 + int ret;
  442 + long offlined;
  443 + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
  444 + offlined = nr_pages;
  445 + if (!ret)
  446 + *(long *)data += offlined;
  447 + return ret;
  448 +}
  449 +
  450 +static long
  451 +check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
  452 +{
  453 + long offlined = 0;
  454 + int ret;
  455 +
  456 + ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
  457 + check_pages_isolated_cb);
  458 + if (ret < 0)
  459 + offlined = (long)ret;
  460 + return offlined;
  461 +}
  462 +
  463 +extern void drain_all_local_pages(void);
  464 +
  465 +int offline_pages(unsigned long start_pfn,
  466 + unsigned long end_pfn, unsigned long timeout)
  467 +{
  468 + unsigned long pfn, nr_pages, expire;
  469 + long offlined_pages;
  470 + int ret, drain, retry_max;
  471 + struct zone *zone;
  472 +
  473 + BUG_ON(start_pfn >= end_pfn);
  474 + /* at least, alignment against pageblock is necessary */
  475 + if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
  476 + return -EINVAL;
  477 + if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
  478 + return -EINVAL;
  479 + /* This makes hot-unplug much easier and more readable;
  480 + we assume this for now. */
  481 + if (!test_pages_in_a_zone(start_pfn, end_pfn))
  482 + return -EINVAL;
  483 + /* set above range as isolated */
  484 + ret = start_isolate_page_range(start_pfn, end_pfn);
  485 + if (ret)
  486 + return ret;
  487 + nr_pages = end_pfn - start_pfn;
  488 + pfn = start_pfn;
  489 + expire = jiffies + timeout;
  490 + drain = 0;
  491 + retry_max = 5;
  492 +repeat:
  493 + /* start memory hot removal */
  494 + ret = -EAGAIN;
  495 + if (time_after(jiffies, expire))
  496 + goto failed_removal;
  497 + ret = -EINTR;
  498 + if (signal_pending(current))
  499 + goto failed_removal;
  500 + ret = 0;
  501 + if (drain) {
  502 + lru_add_drain_all();
  503 + flush_scheduled_work();
  504 + cond_resched();
  505 + drain_all_local_pages();
  506 + }
  507 +
  508 + pfn = scan_lru_pages(start_pfn, end_pfn);
  509 + if (pfn) { /* We have page on LRU */
  510 + ret = do_migrate_range(pfn, end_pfn);
  511 + if (!ret) {
  512 + drain = 1;
  513 + goto repeat;
  514 + } else {
  515 + if (ret < 0)
  516 + if (--retry_max == 0)
  517 + goto failed_removal;
  518 + yield();
  519 + drain = 1;
  520 + goto repeat;
  521 + }
  522 + }
  523 + /* drain every zone's lru pagevecs; this is asynchronous... */
  524 + lru_add_drain_all();
  525 + flush_scheduled_work();
  526 + yield();
  527 + /* drain pcp pages; this is synchronous. */
  528 + drain_all_local_pages();
  529 + /* check again */
  530 + offlined_pages = check_pages_isolated(start_pfn, end_pfn);
  531 + if (offlined_pages < 0) {
  532 + ret = -EBUSY;
  533 + goto failed_removal;
  534 + }
  535 + printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
  536 + /* Ok, all of our target range is isolated.
  537 + We cannot do rollback at this point. */
  538 + offline_isolated_pages(start_pfn, end_pfn);
  539 + /* reset pagetype flags back from isolated */
  540 + undo_isolate_page_range(start_pfn, end_pfn);
  541 + /* removal success */
  542 + zone = page_zone(pfn_to_page(start_pfn));
  543 + zone->present_pages -= offlined_pages;
  544 + zone->zone_pgdat->node_present_pages -= offlined_pages;
  545 + totalram_pages -= offlined_pages;
  546 + num_physpages -= offlined_pages;
  547 + vm_total_pages = nr_free_pagecache_pages();
  548 + writeback_set_ratelimit();
  549 + return 0;
  550 +
  551 +failed_removal:
  552 + printk(KERN_INFO "memory offlining %lx to %lx failed\n",
  553 + start_pfn, end_pfn);
  554 + /* pushback to free area */
  555 + undo_isolate_page_range(start_pfn, end_pfn);
  556 + return ret;
  557 +}
  558 +#endif /* CONFIG_MEMORY_HOTREMOVE */
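
Nothing in this patch actually calls offline_pages() yet; wiring it up to a
firmware/memory-hotplug driver is left for later. A hypothetical caller,
offlining one 128MB-worth range of pfns with a two-minute timeout (the
section size and timeout here are made up for illustration):

	/* Hypothetical caller sketch; not part of this patch. */
	static int try_offline_section(unsigned long start_pfn)
	{
		unsigned long nr_pages = 1UL << (27 - PAGE_SHIFT); /* 128MB */

		/* start and end pfns must be pageblock aligned, or
		 * offline_pages() returns -EINVAL (see above). */
		return offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
	}
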
mm/page_alloc.c
... ... @@ -4477,4 +4477,51 @@
4477 4477 out:
4478 4478 spin_unlock_irqrestore(&zone->lock, flags);
4479 4479 }
  4480 +
  4481 +#ifdef CONFIG_MEMORY_HOTREMOVE
  4482 +/*
  4483 + * All pages in the range must be isolated before calling this.
  4484 + */
  4485 +void
  4486 +__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  4487 +{
  4488 + struct page *page;
  4489 + struct zone *zone;
  4490 + int order, i;
  4491 + unsigned long pfn;
  4492 + unsigned long flags;
  4493 + /* find the first valid pfn */
  4494 + for (pfn = start_pfn; pfn < end_pfn; pfn++)
  4495 + if (pfn_valid(pfn))
  4496 + break;
  4497 + if (pfn == end_pfn)
  4498 + return;
  4499 + zone = page_zone(pfn_to_page(pfn));
  4500 + spin_lock_irqsave(&zone->lock, flags);
  4501 + pfn = start_pfn;
  4502 + while (pfn < end_pfn) {
  4503 + if (!pfn_valid(pfn)) {
  4504 + pfn++;
  4505 + continue;
  4506 + }
  4507 + page = pfn_to_page(pfn);
  4508 + BUG_ON(page_count(page));
  4509 + BUG_ON(!PageBuddy(page));
  4510 + order = page_order(page);
  4511 +#ifdef CONFIG_DEBUG_VM
  4512 + printk(KERN_INFO "remove from free list %lx %d %lx\n",
  4513 + pfn, 1 << order, end_pfn);
  4514 +#endif
  4515 + list_del(&page->lru);
  4516 + rmv_page_order(page);
  4517 + zone->free_area[order].nr_free--;
  4518 + __mod_zone_page_state(zone, NR_FREE_PAGES,
  4519 + - (1UL << order));
  4520 + for (i = 0; i < (1 << order); i++)
  4521 + SetPageReserved((page+i));
  4522 + pfn += (1 << order);
  4523 + }
  4524 + spin_unlock_irqrestore(&zone->lock, flags);
  4525 +}
  4526 +#endif