Commit c8721bbbdd36382de51cd6b7a56322e0acca2414

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent 71ea2efb1e

mm: memory-hotplug: enable memory hotplug to handle hugepage

Until now we could not offline memory blocks which contain hugepages,
because a hugepage was considered an unmovable page.  With this patch
series a hugepage becomes movable, so by using hugepage migration we can
offline such memory blocks.

What is different from other users of hugepage migration is that we need
to decompose all the hugepages inside the target memory block into free
buddy pages after hugepage migration, because otherwise free hugepages
remaining in the memory block interfere with memory offlining.  For this
reason we introduce the new functions dissolve_free_huge_page() and
dissolve_free_huge_pages().
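
Condensed from the mm/memory_hotplug.c and mm/hugetlb.c hunks below (a
sketch of how the pieces fit together, not the literal diff): the
offlining loop calls the new helper right before the final isolation
check, and the per-page helper only touches hugepages whose refcount is
zero.

    /* mm/memory_hotplug.c, offlining loop: after migration and the final
     * pcp drain, hand any remaining free hugepages back to the buddy
     * allocator so that check_pages_isolated() only sees free buddy pages.
     */
    dissolve_free_huge_pages(start_pfn, end_pfn);
    offlined_pages = check_pages_isolated(start_pfn, end_pfn);

    /* mm/hugetlb.c: a free hugepage (refcount zero) is unlinked from its
     * hstate freelist and freed to the buddy allocator; in-use and surplus
     * hugepages are left for the migration path to handle.
     */
    static void dissolve_free_huge_page(struct page *page)
    {
        spin_lock(&hugetlb_lock);
        if (PageHuge(page) && !page_count(page)) {
            struct hstate *h = page_hstate(page);
            int nid = page_to_nid(page);

            list_del(&page->lru);
            h->free_huge_pages--;
            h->free_huge_pages_node[nid]--;
            update_and_free_page(h, page);
        }
        spin_unlock(&hugetlb_lock);
    }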

Other than that, this patch straightforwardly adds the hugepage migration
code itself: it adds hugepage handling to the functions which scan over
pfns and collect the pages to be migrated, and adds a hugepage allocation
path to alloc_migrate_target().
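
For example (condensed from the mm/memory_hotplug.c and mm/page_isolation.c
hunks below; a sketch, not the literal diff), the range scanner isolates a
hugepage through its head page onto the same list as the LRU pages, and
alloc_migrate_target() allocates the destination from another node's
hugepage pool:

    /* mm/memory_hotplug.c, do_migrate_range(): treat the hugepage as one
     * unit, isolating it via its head page and skipping its tail pfns.
     */
    if (PageHuge(page)) {
        struct page *head = compound_head(page);

        pfn = page_to_pfn(head) + (1 << compound_order(head)) - 1;
        if (isolate_huge_page(page, &source))
            move_pages -= 1 << compound_order(head);
        continue;
    }

    /* mm/page_isolation.c, alloc_migrate_target(): for now the destination
     * hugepage simply comes from the next node's hstate pool.
     */
    if (PageHuge(page)) {
        nodemask_t src = nodemask_of_node(page_to_nid(page));
        nodemask_t dst;

        nodes_complement(dst, src);
        return alloc_huge_page_node(page_hstate(compound_head(page)),
                                    next_node(page_to_nid(page), dst));
    }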

As for larger hugepages (1GB on x86_64), hot-removing them is not easy
because such a hugepage is larger than a memory block, so for now we
simply let the operation fail.
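
Concretely (condensed from the hunks below), do_migrate_range() bails out
when it meets a hugepage wider than a memory section, and
is_hugepage_active() reports the tail pages of such gigantic pages as not
migratable:

    /* mm/memory_hotplug.c, do_migrate_range(), inside the PageHuge()
     * branch (head is compound_head(page)): a hugepage wider than a
     * memory section cannot be handled here, so let offlining fail.
     */
    if (compound_order(head) > PFN_SECTION_SHIFT) {
        ret = -EBUSY;
        break;
    }

    /* mm/hugetlb.c, is_hugepage_active(): the pfn scan can land on a tail
     * page of a gigantic hugepage; report it as inactive because migrating
     * such large hugepages is not supported yet.
     */
    if (PageTail(page))
        return false;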

[yongjun_wei@trendmicro.com.cn: remove duplicated include]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 135 additions and 9 deletions

include/linux/hugetlb.h
... ... @@ -68,6 +68,7 @@
68 68 int dequeue_hwpoisoned_huge_page(struct page *page);
69 69 bool isolate_huge_page(struct page *page, struct list_head *list);
70 70 void putback_active_hugepage(struct page *page);
  71 +bool is_hugepage_active(struct page *page);
71 72 void copy_huge_page(struct page *dst, struct page *src);
72 73  
73 74 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
... ... @@ -138,6 +139,7 @@
138 139  
139 140 #define isolate_huge_page(p, l) false
140 141 #define putback_active_hugepage(p) do {} while (0)
  142 +#define is_hugepage_active(x) false
141 143 static inline void copy_huge_page(struct page *dst, struct page *src)
142 144 {
143 145 }
... ... @@ -377,6 +379,9 @@
377 379 return __basepage_index(page);
378 380 }
379 381  
  382 +extern void dissolve_free_huge_pages(unsigned long start_pfn,
  383 + unsigned long end_pfn);
  384 +
380 385 #else /* CONFIG_HUGETLB_PAGE */
381 386 struct hstate {};
382 387 #define alloc_huge_page_node(h, nid) NULL
... ... @@ -403,6 +408,7 @@
403 408 {
404 409 return page->index;
405 410 }
  411 +#define dissolve_free_huge_pages(s, e) do {} while (0)
406 412 #endif /* CONFIG_HUGETLB_PAGE */
407 413  
408 414 #endif /* _LINUX_HUGETLB_H */

mm/hugetlb.c
... ... @@ -21,6 +21,7 @@
21 21 #include <linux/rmap.h>
22 22 #include <linux/swap.h>
23 23 #include <linux/swapops.h>
  24 +#include <linux/page-isolation.h>
24 25  
25 26 #include <asm/page.h>
26 27 #include <asm/pgtable.h>
27 28  
... ... @@ -522,9 +523,15 @@
522 523 {
523 524 struct page *page;
524 525  
525   - if (list_empty(&h->hugepage_freelists[nid]))
  526 + list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
  527 + if (!is_migrate_isolate_page(page))
  528 + break;
  529 + /*
  530 + * If no non-isolated free hugepage is found on the list,
  531 + * the allocation fails.
  532 + */
  533 + if (&h->hugepage_freelists[nid] == &page->lru)
526 534 return NULL;
527   - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
528 535 list_move(&page->lru, &h->hugepage_activelist);
529 536 set_page_refcounted(page);
530 537 h->free_huge_pages--;
... ... @@ -878,6 +885,44 @@
878 885 return ret;
879 886 }
880 887  
  888 +/*
  889 + * Dissolve a given free hugepage into free buddy pages. This function does
  890 + * nothing for in-use (including surplus) hugepages.
  891 + */
  892 +static void dissolve_free_huge_page(struct page *page)
  893 +{
  894 + spin_lock(&hugetlb_lock);
  895 + if (PageHuge(page) && !page_count(page)) {
  896 + struct hstate *h = page_hstate(page);
  897 + int nid = page_to_nid(page);
  898 + list_del(&page->lru);
  899 + h->free_huge_pages--;
  900 + h->free_huge_pages_node[nid]--;
  901 + update_and_free_page(h, page);
  902 + }
  903 + spin_unlock(&hugetlb_lock);
  904 +}
  905 +
  906 +/*
  907 + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  908 + * make specified memory blocks removable from the system.
  909 + * Note that start_pfn should be aligned with (minimum) hugepage size.
  910 + */
  911 +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
  912 +{
  913 + unsigned int order = 8 * sizeof(void *);
  914 + unsigned long pfn;
  915 + struct hstate *h;
  916 +
  917 + /* Set scan step to minimum hugepage size */
  918 + for_each_hstate(h)
  919 + if (order > huge_page_order(h))
  920 + order = huge_page_order(h);
  921 + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
  922 + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
  923 + dissolve_free_huge_page(pfn_to_page(pfn));
  924 +}
  925 +
881 926 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
882 927 {
883 928 struct page *page;
... ... @@ -3456,5 +3501,27 @@
3456 3501 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3457 3502 spin_unlock(&hugetlb_lock);
3458 3503 put_page(page);
  3504 +}
  3505 +
  3506 +bool is_hugepage_active(struct page *page)
  3507 +{
  3508 + VM_BUG_ON(!PageHuge(page));
  3509 + /*
  3510 + * This function can be called for a tail page because the caller,
  3511 + * scan_movable_pages, scans through a given pfn-range which typically
  3512 + * covers one memory block. In systems using gigantic hugepages (1GB
  3513 + * for x86_64), a hugepage is larger than a memory block, and we don't
  3514 + * support migrating such large hugepages for now, so return false
  3515 + * when called for tail pages.
  3516 + */
  3517 + if (PageTail(page))
  3518 + return false;
  3519 + /*
  3520 + * Refcount of a hwpoisoned hugepage is 1, but it is not active,
  3521 + * so we should return false for it.
  3522 + */
  3523 + if (unlikely(PageHWPoison(page)))
  3524 + return false;
  3525 + return page_count(page) > 0;
3459 3526 }

mm/memory_hotplug.c
... ... @@ -30,6 +30,7 @@
30 30 #include <linux/mm_inline.h>
31 31 #include <linux/firmware-map.h>
32 32 #include <linux/stop_machine.h>
  33 +#include <linux/hugetlb.h>
33 34  
34 35 #include <asm/tlbflush.h>
35 36  
36 37  
... ... @@ -1230,10 +1231,12 @@
1230 1231 }
1231 1232  
1232 1233 /*
1233   - * Scanning pfn is much easier than scanning lru list.
1234   - * Scan pfn from start to end and Find LRU page.
  1234 + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
  1235 + * and hugepages). We scan the pfn range because that is much easier
  1236 + * than walking a linked list. This function returns the pfn of the
  1237 + * first movable page found, or 0 if none is found.
1235 1238 */
1236   -static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
  1239 +static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1237 1240 {
1238 1241 unsigned long pfn;
1239 1242 struct page *page;
... ... @@ -1242,6 +1245,13 @@
1242 1245 page = pfn_to_page(pfn);
1243 1246 if (PageLRU(page))
1244 1247 return pfn;
  1248 + if (PageHuge(page)) {
  1249 + if (is_hugepage_active(page))
  1250 + return pfn;
  1251 + else
  1252 + pfn = round_up(pfn + 1,
  1253 + 1 << compound_order(page)) - 1;
  1254 + }
1245 1255 }
1246 1256 }
1247 1257 return 0;
... ... @@ -1262,6 +1272,19 @@
1262 1272 if (!pfn_valid(pfn))
1263 1273 continue;
1264 1274 page = pfn_to_page(pfn);
  1275 +
  1276 + if (PageHuge(page)) {
  1277 + struct page *head = compound_head(page);
  1278 + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
  1279 + if (compound_order(head) > PFN_SECTION_SHIFT) {
  1280 + ret = -EBUSY;
  1281 + break;
  1282 + }
  1283 + if (isolate_huge_page(page, &source))
  1284 + move_pages -= 1 << compound_order(head);
  1285 + continue;
  1286 + }
  1287 +
1265 1288 if (!get_page_unless_zero(page))
1266 1289 continue;
1267 1290 /*
... ... @@ -1294,7 +1317,7 @@
1294 1317 }
1295 1318 if (!list_empty(&source)) {
1296 1319 if (not_managed) {
1297   - putback_lru_pages(&source);
  1320 + putback_movable_pages(&source);
1298 1321 goto out;
1299 1322 }
1300 1323  
... ... @@ -1305,7 +1328,7 @@
1305 1328 ret = migrate_pages(&source, alloc_migrate_target, 0,
1306 1329 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1307 1330 if (ret)
1308   - putback_lru_pages(&source);
  1331 + putback_movable_pages(&source);
1309 1332 }
1310 1333 out:
1311 1334 return ret;
... ... @@ -1548,8 +1571,8 @@
1548 1571 drain_all_pages();
1549 1572 }
1550 1573  
1551   - pfn = scan_lru_pages(start_pfn, end_pfn);
1552   - if (pfn) { /* We have page on LRU */
  1574 + pfn = scan_movable_pages(start_pfn, end_pfn);
  1575 + if (pfn) { /* We have movable pages */
1553 1576 ret = do_migrate_range(pfn, end_pfn);
1554 1577 if (!ret) {
1555 1578 drain = 1;
... ... @@ -1568,6 +1591,11 @@
1568 1591 yield();
1569 1592 /* drain pcp pages, this is synchronous. */
1570 1593 drain_all_pages();
  1594 + /*
  1595 + * Dissolve free hugepages in the memory block before actually doing
  1596 + * the offlining, in order to keep hugetlbfs's object counting consistent.
  1597 + */
  1598 + dissolve_free_huge_pages(start_pfn, end_pfn);
1571 1599 /* check again */
1572 1600 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1573 1601 if (offlined_pages < 0) {

mm/page_alloc.c
... ... @@ -6008,6 +6008,17 @@
6008 6008 continue;
6009 6009  
6010 6010 page = pfn_to_page(check);
  6011 +
  6012 + /*
  6013 + * Hugepages are not in LRU lists, but they're movable.
  6014 + * We need not scan over tail pages because we don't
  6015 + * handle each tail page individually in migration.
  6016 + */
  6017 + if (PageHuge(page)) {
  6018 + iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
  6019 + continue;
  6020 + }
  6021 +
6011 6022 /*
6012 6023 * We can't use page_count without pin a page
6013 6024 * because another CPU can free compound page.

mm/page_isolation.c
... ... @@ -6,6 +6,7 @@
6 6 #include <linux/page-isolation.h>
7 7 #include <linux/pageblock-flags.h>
8 8 #include <linux/memory.h>
  9 +#include <linux/hugetlb.h>
9 10 #include "internal.h"
10 11  
11 12 int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
... ... @@ -251,6 +252,19 @@
251 252 int **resultp)
252 253 {
253 254 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
  255 +
  256 + /*
  257 + * TODO: allocate a destination hugepage from the nearest neighbor node,
  258 + * in accordance with the memory policy of the user process if possible.
  259 + * For now, as a simple workaround, we use the next node as the destination.
  260 + */
  261 + if (PageHuge(page)) {
  262 + nodemask_t src = nodemask_of_node(page_to_nid(page));
  263 + nodemask_t dst;
  264 + nodes_complement(dst, src);
  265 + return alloc_huge_page_node(page_hstate(compound_head(page)),
  266 + next_node(page_to_nid(page), dst));
  267 + }
254 268  
255 269 if (PageHighMem(page))
256 270 gfp_mask |= __GFP_HIGHMEM;