Commit 46e387bbd82d438b9131e237e6e2cb55a825da49

Authored by Andi Kleen

Merge branch 'hwpoison-hugepages' into hwpoison

Conflicts:
	mm/memory-failure.c

Showing 10 changed files

arch/x86/mm/fault.c
... ... @@ -11,6 +11,7
11 11 #include <linux/kprobes.h> /* __kprobes, ... */
12 12 #include <linux/mmiotrace.h> /* kmmio_handler, ... */
13 13 #include <linux/perf_event.h> /* perf_sw_event */
  14 +#include <linux/hugetlb.h> /* hstate_index_to_shift */
14 15  
15 16 #include <asm/traps.h> /* dotraplinkage, ... */
16 17 #include <asm/pgalloc.h> /* pgd_*(), ... */
17 18  
18 19  
... ... @@ -160,15 +161,20 @@
160 161  
161 162 static void
162 163 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163   - struct task_struct *tsk)
  164 + struct task_struct *tsk, int fault)
164 165 {
  166 + unsigned lsb = 0;
165 167 siginfo_t info;
166 168  
167 169 info.si_signo = si_signo;
168 170 info.si_errno = 0;
169 171 info.si_code = si_code;
170 172 info.si_addr = (void __user *)address;
171   - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
  173 + if (fault & VM_FAULT_HWPOISON_LARGE)
  174 + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
  175 + if (fault & VM_FAULT_HWPOISON)
  176 + lsb = PAGE_SHIFT;
  177 + info.si_addr_lsb = lsb;
172 178  
173 179 force_sig_info(si_signo, &info, tsk);
174 180 }
... ... @@ -722,7 +728,7 @@
722 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
723 729 tsk->thread.trap_no = 14;
724 730  
725   - force_sig_info_fault(SIGSEGV, si_code, address, tsk);
  731 + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
726 732  
727 733 return;
728 734 }
729 735  
... ... @@ -807,14 +813,14 @@
807 813 tsk->thread.trap_no = 14;
808 814  
809 815 #ifdef CONFIG_MEMORY_FAILURE
810   - if (fault & VM_FAULT_HWPOISON) {
  816 + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
811 817 printk(KERN_ERR
812 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
813 819 tsk->comm, tsk->pid, address);
814 820 code = BUS_MCEERR_AR;
815 821 }
816 822 #endif
817   - force_sig_info_fault(SIGBUS, code, address, tsk);
  823 + force_sig_info_fault(SIGBUS, code, address, tsk, fault);
818 824 }
819 825  
820 826 static noinline void
... ... @@ -824,7 +830,8 @@
824 830 if (fault & VM_FAULT_OOM) {
825 831 out_of_memory(regs, error_code, address);
826 832 } else {
827   - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
  833 + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
  834 + VM_FAULT_HWPOISON_LARGE))
828 835 do_sigbus(regs, error_code, address, fault);
829 836 else
830 837 BUG();
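
A note on the arch/x86/mm/fault.c hunks above: si_addr_lsb tells the SIGBUS recipient how wide the poisoned region is (address bits below that shift are not meaningful). The stand-alone sketch below mirrors the new lsb selection in user space; PAGE_SHIFT, the VM_FAULT_* values and the single 2MB hstate are assumptions for illustration (the flag values come from the include/linux/mm.h hunk further down), not reusable kernel code.

#include <assert.h>
#include <stdio.h>

/* Values mirrored from the include/linux/mm.h hunk below; illustration only. */
#define PAGE_SHIFT              12
#define VM_FAULT_HWPOISON       0x0010
#define VM_FAULT_HWPOISON_LARGE 0x0020
#define VM_FAULT_SET_HINDEX(x)  ((x) << 12)
#define VM_FAULT_GET_HINDEX(x)  (((x) >> 12) & 0xf)

/* Assumed hstate table: only index 0 (2MB hugepages on x86_64, shift 21) is modeled. */
static const unsigned hstate_shift[] = { 21 };

/* Mirrors the lsb selection added to force_sig_info_fault(). */
static unsigned addr_lsb(unsigned fault)
{
	unsigned lsb = 0;

	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_shift[VM_FAULT_GET_HINDEX(fault)];
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	return lsb;
}

int main(void)
{
	/* Poisoned base page: the kill granularity is one 4KB page. */
	assert(addr_lsb(VM_FAULT_HWPOISON) == 12);
	/* Poisoned 2MB hugepage (hstate index 0): the whole hugepage. */
	assert(addr_lsb(VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(0)) == 21);
	printf("si_addr_lsb: base page %u, 2MB hugepage %u\n",
	       addr_lsb(VM_FAULT_HWPOISON),
	       addr_lsb(VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(0)));
	return 0;
}

Built with a plain cc invocation this prints 12 for a poisoned base page and 21 for a poisoned 2MB hugepage, which is the granularity the hwpoison code intends to report to the signal handler.
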
fs/hugetlbfs/inode.c
... ... @@ -31,6 +31,7 @@
31 31 #include <linux/statfs.h>
32 32 #include <linux/security.h>
33 33 #include <linux/magic.h>
  34 +#include <linux/migrate.h>
34 35  
35 36 #include <asm/uaccess.h>
36 37  
... ... @@ -573,6 +574,19 @@
573 574 return 0;
574 575 }
575 576  
  577 +static int hugetlbfs_migrate_page(struct address_space *mapping,
  578 + struct page *newpage, struct page *page)
  579 +{
  580 + int rc;
  581 +
  582 + rc = migrate_huge_page_move_mapping(mapping, newpage, page);
  583 + if (rc)
  584 + return rc;
  585 + migrate_page_copy(newpage, page);
  586 +
  587 + return 0;
  588 +}
  589 +
576 590 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577 591 {
578 592 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
... ... @@ -659,6 +673,7 @@
659 673 .write_begin = hugetlbfs_write_begin,
660 674 .write_end = hugetlbfs_write_end,
661 675 .set_page_dirty = hugetlbfs_set_page_dirty,
  676 + .migratepage = hugetlbfs_migrate_page,
662 677 };
663 678  
664 679  
include/linux/hugetlb.h
... ... @@ -43,7 +43,8 @@
43 43 struct vm_area_struct *vma,
44 44 int acctflags);
45 45 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46   -void __isolate_hwpoisoned_huge_page(struct page *page);
  46 +int dequeue_hwpoisoned_huge_page(struct page *page);
  47 +void copy_huge_page(struct page *dst, struct page *src);
47 48  
48 49 extern unsigned long hugepages_treat_as_movable;
49 50 extern const unsigned long hugetlb_zero, hugetlb_infinity;
... ... @@ -101,7 +102,10 @@
101 102 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
102 103 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
103 104 #define huge_pte_offset(mm, address) 0
104   -#define __isolate_hwpoisoned_huge_page(page) 0
  105 +#define dequeue_hwpoisoned_huge_page(page) 0
  106 +static inline void copy_huge_page(struct page *dst, struct page *src)
  107 +{
  108 +}
105 109  
106 110 #define hugetlb_change_protection(vma, address, end, newprot)
107 111  
... ... @@ -228,6 +232,8 @@
228 232 struct hstate *hstate;
229 233 };
230 234  
  235 +struct page *alloc_huge_page_node(struct hstate *h, int nid);
  236 +
231 237 /* arch callback */
232 238 int __init alloc_bootmem_huge_page(struct hstate *h);
233 239  
234 240  
... ... @@ -301,8 +307,14 @@
301 307 return size_to_hstate(PAGE_SIZE << compound_order(page));
302 308 }
303 309  
  310 +static inline unsigned hstate_index_to_shift(unsigned index)
  311 +{
  312 + return hstates[index].order + PAGE_SHIFT;
  313 +}
  314 +
304 315 #else
305 316 struct hstate {};
  317 +#define alloc_huge_page_node(h, nid) NULL
306 318 #define alloc_bootmem_huge_page(h) NULL
307 319 #define hstate_file(f) NULL
308 320 #define hstate_vma(v) NULL
... ... @@ -317,6 +329,7 @@
317 329 {
318 330 return 1;
319 331 }
  332 +#define hstate_index_to_shift(index) 0
320 333 #endif
321 334  
322 335 #endif /* _LINUX_HUGETLB_H */
include/linux/migrate.h
... ... @@ -14,6 +14,8 @@
14 14 struct page *, struct page *);
15 15 extern int migrate_pages(struct list_head *l, new_page_t x,
16 16 unsigned long private, int offlining);
  17 +extern int migrate_huge_pages(struct list_head *l, new_page_t x,
  18 + unsigned long private, int offlining);
17 19  
18 20 extern int fail_migrate_page(struct address_space *,
19 21 struct page *, struct page *);
20 22  
... ... @@ -23,12 +25,17 @@
23 25 extern int migrate_vmas(struct mm_struct *mm,
24 26 const nodemask_t *from, const nodemask_t *to,
25 27 unsigned long flags);
  28 +extern void migrate_page_copy(struct page *newpage, struct page *page);
  29 +extern int migrate_huge_page_move_mapping(struct address_space *mapping,
  30 + struct page *newpage, struct page *page);
26 31 #else
27 32 #define PAGE_MIGRATION 0
28 33  
29 34 static inline void putback_lru_pages(struct list_head *l) {}
30 35 static inline int migrate_pages(struct list_head *l, new_page_t x,
31 36 unsigned long private, int offlining) { return -ENOSYS; }
  37 +static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
  38 + unsigned long private, int offlining) { return -ENOSYS; }
32 39  
33 40 static inline int migrate_prep(void) { return -ENOSYS; }
34 41 static inline int migrate_prep_local(void) { return -ENOSYS; }
... ... @@ -36,6 +43,15 @@
36 43 static inline int migrate_vmas(struct mm_struct *mm,
37 44 const nodemask_t *from, const nodemask_t *to,
38 45 unsigned long flags)
  46 +{
  47 + return -ENOSYS;
  48 +}
  49 +
  50 +static inline void migrate_page_copy(struct page *newpage,
  51 + struct page *page) {}
  52 +
  53 +static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
  54 + struct page *newpage, struct page *page)
39 55 {
40 56 return -ENOSYS;
41 57 }
include/linux/mm.h
... ... @@ -718,12 +718,20
718 718 #define VM_FAULT_SIGBUS 0x0002
719 719 #define VM_FAULT_MAJOR 0x0004
720 720 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
721   -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
  721 +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
  722 +#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
722 723  
723 724 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
724 725 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
725 726  
726   -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
  727 +#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
  728 +
  729 +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
  730 + VM_FAULT_HWPOISON_LARGE)
  731 +
  732 +/* Encode hstate index for a hwpoisoned large page */
  733 +#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
  734 +#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
727 735  
728 736 /*
729 737 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
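
The hstate index added by VM_FAULT_SET_HINDEX() lives in bits 12-15 (VM_FAULT_HWPOISON_LARGE_MASK), well clear of the ordinary VM_FAULT_* status bits, so at most 16 hstates can be encoded. A minimal round-trip sketch, with the macro values copied from the hunk above (illustration only, not kernel code):

#include <assert.h>

/* Copied from the include/linux/mm.h hunk above. */
#define VM_FAULT_HWPOISON_LARGE      0x0020
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000
#define VM_FAULT_SET_HINDEX(x)       ((x) << 12)
#define VM_FAULT_GET_HINDEX(x)       (((x) >> 12) & 0xf)

int main(void)
{
	unsigned i;

	for (i = 0; i < 16; i++) {
		unsigned fault = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(i);

		/* The hstate index survives the round trip... */
		assert(VM_FAULT_GET_HINDEX(fault) == i);
		/* ...and stays inside the reserved mask. */
		assert((VM_FAULT_SET_HINDEX(i) & ~VM_FAULT_HWPOISON_LARGE_MASK) == 0);
	}
	return 0;
}
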
mm/hugetlb.c
... ... @@ -423,14 +423,14
423 423 }
424 424 }
425 425  
426   -static void copy_gigantic_page(struct page *dst, struct page *src,
  426 +static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 427 unsigned long addr, struct vm_area_struct *vma)
428 428 {
429 429 int i;
430 430 struct hstate *h = hstate_vma(vma);
431 431 struct page *dst_base = dst;
432 432 struct page *src_base = src;
433   - might_sleep();
  433 +
434 434 for (i = 0; i < pages_per_huge_page(h); ) {
435 435 cond_resched();
436 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437 437  
... ... @@ -440,14 +440,15 @@
440 440 src = mem_map_next(src, src_base, i);
441 441 }
442 442 }
443   -static void copy_huge_page(struct page *dst, struct page *src,
  443 +
  444 +static void copy_user_huge_page(struct page *dst, struct page *src,
444 445 unsigned long addr, struct vm_area_struct *vma)
445 446 {
446 447 int i;
447 448 struct hstate *h = hstate_vma(vma);
448 449  
449 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450   - copy_gigantic_page(dst, src, addr, vma);
  451 + copy_user_gigantic_page(dst, src, addr, vma);
451 452 return;
452 453 }
453 454  
... ... @@ -458,6 +459,40 @@
458 459 }
459 460 }
460 461  
  462 +static void copy_gigantic_page(struct page *dst, struct page *src)
  463 +{
  464 + int i;
  465 + struct hstate *h = page_hstate(src);
  466 + struct page *dst_base = dst;
  467 + struct page *src_base = src;
  468 +
  469 + for (i = 0; i < pages_per_huge_page(h); ) {
  470 + cond_resched();
  471 + copy_highpage(dst, src);
  472 +
  473 + i++;
  474 + dst = mem_map_next(dst, dst_base, i);
  475 + src = mem_map_next(src, src_base, i);
  476 + }
  477 +}
  478 +
  479 +void copy_huge_page(struct page *dst, struct page *src)
  480 +{
  481 + int i;
  482 + struct hstate *h = page_hstate(src);
  483 +
  484 + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
  485 + copy_gigantic_page(dst, src);
  486 + return;
  487 + }
  488 +
  489 + might_sleep();
  490 + for (i = 0; i < pages_per_huge_page(h); i++) {
  491 + cond_resched();
  492 + copy_highpage(dst + i, src + i);
  493 + }
  494 +}
  495 +
461 496 static void enqueue_huge_page(struct hstate *h, struct page *page)
462 497 {
463 498 int nid = page_to_nid(page);
464 499  
... ... @@ -466,11 +501,24 @@
466 501 h->free_huge_pages_node[nid]++;
467 502 }
468 503  
  504 +static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
  505 +{
  506 + struct page *page;
  507 +
  508 + if (list_empty(&h->hugepage_freelists[nid]))
  509 + return NULL;
  510 + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
  511 + list_del(&page->lru);
  512 + set_page_refcounted(page);
  513 + h->free_huge_pages--;
  514 + h->free_huge_pages_node[nid]--;
  515 + return page;
  516 +}
  517 +
469 518 static struct page *dequeue_huge_page_vma(struct hstate *h,
470 519 struct vm_area_struct *vma,
471 520 unsigned long address, int avoid_reserve)
472 521 {
473   - int nid;
474 522 struct page *page = NULL;
475 523 struct mempolicy *mpol;
476 524 nodemask_t *nodemask;
... ... @@ -496,19 +544,13 @@
496 544  
497 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 546 MAX_NR_ZONES - 1, nodemask) {
499   - nid = zone_to_nid(zone);
500   - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
501   - !list_empty(&h->hugepage_freelists[nid])) {
502   - page = list_entry(h->hugepage_freelists[nid].next,
503   - struct page, lru);
504   - list_del(&page->lru);
505   - h->free_huge_pages--;
506   - h->free_huge_pages_node[nid]--;
507   -
508   - if (!avoid_reserve)
509   - decrement_hugepage_resv_vma(h, vma);
510   -
511   - break;
  547 + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
  548 + page = dequeue_huge_page_node(h, zone_to_nid(zone));
  549 + if (page) {
  550 + if (!avoid_reserve)
  551 + decrement_hugepage_resv_vma(h, vma);
  552 + break;
  553 + }
512 554 }
513 555 }
514 556 err:
515 557  
... ... @@ -770,11 +812,10 @@
770 812 return ret;
771 813 }
772 814  
773   -static struct page *alloc_buddy_huge_page(struct hstate *h,
774   - struct vm_area_struct *vma, unsigned long address)
  815 +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
775 816 {
776 817 struct page *page;
777   - unsigned int nid;
  818 + unsigned int r_nid;
778 819  
779 820 if (h->order >= MAX_ORDER)
780 821 return NULL;
... ... @@ -812,9 +853,14 @@
812 853 }
813 854 spin_unlock(&hugetlb_lock);
814 855  
815   - page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
816   - __GFP_REPEAT|__GFP_NOWARN,
817   - huge_page_order(h));
  856 + if (nid == NUMA_NO_NODE)
  857 + page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
  858 + __GFP_REPEAT|__GFP_NOWARN,
  859 + huge_page_order(h));
  860 + else
  861 + page = alloc_pages_exact_node(nid,
  862 + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  863 + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 864  
819 865 if (page && arch_prepare_hugepage(page)) {
820 866 __free_pages(page, huge_page_order(h));
821 867  
... ... @@ -823,19 +869,13 @@
823 869  
824 870 spin_lock(&hugetlb_lock);
825 871 if (page) {
826   - /*
827   - * This page is now managed by the hugetlb allocator and has
828   - * no users -- drop the buddy allocator's reference.
829   - */
830   - put_page_testzero(page);
831   - VM_BUG_ON(page_count(page));
832   - nid = page_to_nid(page);
  872 + r_nid = page_to_nid(page);
833 873 set_compound_page_dtor(page, free_huge_page);
834 874 /*
835 875 * We incremented the global counters already
836 876 */
837   - h->nr_huge_pages_node[nid]++;
838   - h->surplus_huge_pages_node[nid]++;
  877 + h->nr_huge_pages_node[r_nid]++;
  878 + h->surplus_huge_pages_node[r_nid]++;
839 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 880 } else {
841 881 h->nr_huge_pages--;
... ... @@ -848,6 +888,25 @@
848 888 }
849 889  
850 890 /*
  891 + * This allocation function is useful in the context where vma is irrelevant.
  892 + * E.g. soft-offlining uses this function because it only cares physical
  893 + * address of error page.
  894 + */
  895 +struct page *alloc_huge_page_node(struct hstate *h, int nid)
  896 +{
  897 + struct page *page;
  898 +
  899 + spin_lock(&hugetlb_lock);
  900 + page = dequeue_huge_page_node(h, nid);
  901 + spin_unlock(&hugetlb_lock);
  902 +
  903 + if (!page)
  904 + page = alloc_buddy_huge_page(h, nid);
  905 +
  906 + return page;
  907 +}
  908 +
  909 +/*
851 910 * Increase the hugetlb pool such that it can accomodate a reservation
852 911 * of size 'delta'.
853 912 */
854 913  
855 914  
... ... @@ -871,17 +930,14 @@
871 930 retry:
872 931 spin_unlock(&hugetlb_lock);
873 932 for (i = 0; i < needed; i++) {
874   - page = alloc_buddy_huge_page(h, NULL, 0);
875   - if (!page) {
  933 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
  934 + if (!page)
876 935 /*
877 936 * We were not able to allocate enough pages to
878 937 * satisfy the entire reservation so we free what
879 938 * we've allocated so far.
880 939 */
881   - spin_lock(&hugetlb_lock);
882   - needed = 0;
883 940 goto free;
884   - }
885 941  
886 942 list_add(&page->lru, &surplus_list);
887 943 }
888 944  
889 945  
890 946  
891 947  
892 948  
893 949  
... ... @@ -908,31 +964,31 @@
908 964 needed += allocated;
909 965 h->resv_huge_pages += delta;
910 966 ret = 0;
911   -free:
  967 +
  968 + spin_unlock(&hugetlb_lock);
912 969 /* Free the needed pages to the hugetlb pool */
913 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 971 if ((--needed) < 0)
915 972 break;
916 973 list_del(&page->lru);
  974 + /*
  975 + * This page is now managed by the hugetlb allocator and has
  976 + * no users -- drop the buddy allocator's reference.
  977 + */
  978 + put_page_testzero(page);
  979 + VM_BUG_ON(page_count(page));
917 980 enqueue_huge_page(h, page);
918 981 }
919 982  
920 983 /* Free unnecessary surplus pages to the buddy allocator */
  984 +free:
921 985 if (!list_empty(&surplus_list)) {
922   - spin_unlock(&hugetlb_lock);
923 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 987 list_del(&page->lru);
925   - /*
926   - * The page has a reference count of zero already, so
927   - * call free_huge_page directly instead of using
928   - * put_page. This must be done with hugetlb_lock
929   - * unlocked which is safe because free_huge_page takes
930   - * hugetlb_lock before deciding how to free the page.
931   - */
932   - free_huge_page(page);
  988 + put_page(page);
933 989 }
934   - spin_lock(&hugetlb_lock);
935 990 }
  991 + spin_lock(&hugetlb_lock);
936 992  
937 993 return ret;
938 994 }
939 995  
... ... @@ -1052,14 +1108,13 @@
1052 1108 spin_unlock(&hugetlb_lock);
1053 1109  
1054 1110 if (!page) {
1055   - page = alloc_buddy_huge_page(h, vma, addr);
  1111 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 1112 if (!page) {
1057 1113 hugetlb_put_quota(inode->i_mapping, chg);
1058 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 1115 }
1060 1116 }
1061 1117  
1062   - set_page_refcounted(page);
1063 1118 set_page_private(page, (unsigned long) mapping);
1064 1119  
1065 1120 vma_commit_reservation(h, vma, addr);
... ... @@ -2153,6 +2208,19 @@
2153 2208 return -ENOMEM;
2154 2209 }
2155 2210  
  2211 +static int is_hugetlb_entry_migration(pte_t pte)
  2212 +{
  2213 + swp_entry_t swp;
  2214 +
  2215 + if (huge_pte_none(pte) || pte_present(pte))
  2216 + return 0;
  2217 + swp = pte_to_swp_entry(pte);
  2218 + if (non_swap_entry(swp) && is_migration_entry(swp)) {
  2219 + return 1;
  2220 + } else
  2221 + return 0;
  2222 +}
  2223 +
2156 2224 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157 2225 {
2158 2226 swp_entry_t swp;
... ... @@ -2383,7 +2451,7 @@
2383 2451 if (unlikely(anon_vma_prepare(vma)))
2384 2452 return VM_FAULT_OOM;
2385 2453  
2386   - copy_huge_page(new_page, old_page, address, vma);
  2454 + copy_user_huge_page(new_page, old_page, address, vma);
2387 2455 __SetPageUptodate(new_page);
2388 2456  
2389 2457 /*
2390 2458  
... ... @@ -2515,22 +2583,20 @@
2515 2583 hugepage_add_new_anon_rmap(page, vma, address);
2516 2584 }
2517 2585 } else {
  2586 + /*
  2587 + * If memory error occurs between mmap() and fault, some process
  2588 + * don't have hwpoisoned swap entry for errored virtual address.
  2589 + * So we need to block hugepage fault by PG_hwpoison bit check.
  2590 + */
  2591 + if (unlikely(PageHWPoison(page))) {
  2592 + ret = VM_FAULT_HWPOISON |
  2593 + VM_FAULT_SET_HINDEX(h - hstates);
  2594 + goto backout_unlocked;
  2595 + }
2518 2596 page_dup_rmap(page);
2519 2597 }
2520 2598  
2521 2599 /*
2522   - * Since memory error handler replaces pte into hwpoison swap entry
2523   - * at the time of error handling, a process which reserved but not have
2524   - * the mapping to the error hugepage does not have hwpoison swap entry.
2525   - * So we need to block accesses from such a process by checking
2526   - * PG_hwpoison bit here.
2527   - */
2528   - if (unlikely(PageHWPoison(page))) {
2529   - ret = VM_FAULT_HWPOISON;
2530   - goto backout_unlocked;
2531   - }
2532   -
2533   - /*
2534 2600 * If we are going to COW a private mapping later, we examine the
2535 2601 * pending reservations for this page now. This will ensure that
2536 2602 * any allocations necessary to record that reservation occur outside
... ... @@ -2587,8 +2653,12 @@
2587 2653 ptep = huge_pte_offset(mm, address);
2588 2654 if (ptep) {
2589 2655 entry = huge_ptep_get(ptep);
2590   - if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2591   - return VM_FAULT_HWPOISON;
  2656 + if (unlikely(is_hugetlb_entry_migration(entry))) {
  2657 + migration_entry_wait(mm, (pmd_t *)ptep, address);
  2658 + return 0;
  2659 + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
  2660 + return VM_FAULT_HWPOISON_LARGE |
  2661 + VM_FAULT_SET_HINDEX(h - hstates);
2592 2662 }
2593 2663  
2594 2664 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2595 2665  
2596 2666  
2597 2667  
2598 2668  
2599 2669  
... ... @@ -2878,19 +2948,42 @@
2878 2948 hugetlb_acct_memory(h, -(chg - freed));
2879 2949 }
2880 2950  
  2951 +#ifdef CONFIG_MEMORY_FAILURE
  2952 +
  2953 +/* Should be called in hugetlb_lock */
  2954 +static int is_hugepage_on_freelist(struct page *hpage)
  2955 +{
  2956 + struct page *page;
  2957 + struct page *tmp;
  2958 + struct hstate *h = page_hstate(hpage);
  2959 + int nid = page_to_nid(hpage);
  2960 +
  2961 + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
  2962 + if (page == hpage)
  2963 + return 1;
  2964 + return 0;
  2965 +}
  2966 +
2881 2967 /*
2882 2968 * This function is called from memory failure code.
2883 2969 * Assume the caller holds page lock of the head page.
2884 2970 */
2885   -void __isolate_hwpoisoned_huge_page(struct page *hpage)
  2971 +int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886 2972 {
2887 2973 struct hstate *h = page_hstate(hpage);
2888 2974 int nid = page_to_nid(hpage);
  2975 + int ret = -EBUSY;
2889 2976  
2890 2977 spin_lock(&hugetlb_lock);
2891   - list_del(&hpage->lru);
2892   - h->free_huge_pages--;
2893   - h->free_huge_pages_node[nid]--;
  2978 + if (is_hugepage_on_freelist(hpage)) {
  2979 + list_del(&hpage->lru);
  2980 + set_page_refcounted(hpage);
  2981 + h->free_huge_pages--;
  2982 + h->free_huge_pages_node[nid]--;
  2983 + ret = 0;
  2984 + }
2894 2985 spin_unlock(&hugetlb_lock);
  2986 + return ret;
2895 2987 }
  2988 +#endif
mm/memory-failure.c
... ... @@ -697,11 +697,10
697 697 * Issues:
698 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
699 699 * To narrow down kill region to one page, we need to break up pmd.
700   - * - To support soft-offlining for hugepage, we need to support hugepage
701   - * migration.
702 700 */
703 701 static int me_huge_page(struct page *p, unsigned long pfn)
704 702 {
  703 + int res = 0;
705 704 struct page *hpage = compound_head(p);
706 705 /*
707 706 * We can safely recover from error on free or reserved (i.e.
... ... @@ -714,8 +713,9 @@
714 713 * so there is no race between isolation and mapping/unmapping.
715 714 */
716 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
717   - __isolate_hwpoisoned_huge_page(hpage);
718   - return RECOVERED;
  716 + res = dequeue_hwpoisoned_huge_page(hpage);
  717 + if (!res)
  718 + return RECOVERED;
719 719 }
720 720 return DELAYED;
721 721 }
... ... @@ -972,7 +972,10 @@
972 972 * We need/can do nothing about count=0 pages.
973 973 * 1) it's a free page, and therefore in safe hand:
974 974 * prep_new_page() will be the gate keeper.
975   - * 2) it's part of a non-compound high order page.
  975 + * 2) it's a free hugepage, which is also safe:
  976 + * an affected hugepage will be dequeued from hugepage freelist,
  977 + * so there's no concern about reusing it ever after.
  978 + * 3) it's part of a non-compound high order page.
976 979 * Implies some kernel user: cannot stop them from
977 980 * R/W the page; let's pray that the page has been
978 981 * used and will be freed some time later.
... ... @@ -984,6 +987,24 @@
984 987 if (is_free_buddy_page(p)) {
985 988 action_result(pfn, "free buddy", DELAYED);
986 989 return 0;
  990 + } else if (PageHuge(hpage)) {
  991 + /*
  992 + * Check "just unpoisoned", "filter hit", and
  993 + * "race with other subpage."
  994 + */
  995 + lock_page_nosync(hpage);
  996 + if (!PageHWPoison(hpage)
  997 + || (hwpoison_filter(p) && TestClearPageHWPoison(p))
  998 + || (p != hpage && TestSetPageHWPoison(hpage))) {
  999 + atomic_long_sub(nr_pages, &mce_bad_pages);
  1000 + return 0;
  1001 + }
  1002 + set_page_hwpoison_huge_page(hpage);
  1003 + res = dequeue_hwpoisoned_huge_page(hpage);
  1004 + action_result(pfn, "free huge",
  1005 + res ? IGNORED : DELAYED);
  1006 + unlock_page(hpage);
  1007 + return res;
987 1008 } else {
988 1009 action_result(pfn, "high order kernel", IGNORED);
989 1010 return -EBUSY;
... ... @@ -1145,6 +1166,16 @@
1145 1166 nr_pages = 1 << compound_order(page);
1146 1167  
1147 1168 if (!get_page_unless_zero(page)) {
  1169 + /*
  1170 + * Since HWPoisoned hugepage should have non-zero refcount,
  1171 + * race between memory failure and unpoison seems to happen.
  1172 + * In such case unpoison fails and memory failure runs
  1173 + * to the end.
  1174 + */
  1175 + if (PageHuge(page)) {
  1176 + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
  1177 + return 0;
  1178 + }
1148 1179 if (TestClearPageHWPoison(p))
1149 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1150 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1151 1182  
... ... @@ -1162,9 +1193,9 @@
1162 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1163 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1164 1195 freeit = 1;
  1196 + if (PageHuge(page))
  1197 + clear_page_hwpoison_huge_page(page);
1165 1198 }
1166   - if (PageHuge(p))
1167   - clear_page_hwpoison_huge_page(page);
1168 1199 unlock_page(page);
1169 1200  
1170 1201 put_page(page);
... ... @@ -1178,7 +1209,11 @@
1178 1209 static struct page *new_page(struct page *p, unsigned long private, int **x)
1179 1210 {
1180 1211 int nid = page_to_nid(p);
1181   - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
  1212 + if (PageHuge(p))
  1213 + return alloc_huge_page_node(page_hstate(compound_head(p)),
  1214 + nid);
  1215 + else
  1216 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1182 1217 }
1183 1218  
1184 1219 /*
1185 1220  
... ... @@ -1206,8 +1241,15 @@
1206 1241 * was free.
1207 1242 */
1208 1243 set_migratetype_isolate(p);
  1244 + /*
  1245 + * When the target page is a free hugepage, just remove it
  1246 + * from free hugepage list.
  1247 + */
1209 1248 if (!get_page_unless_zero(compound_head(p))) {
1210   - if (is_free_buddy_page(p)) {
  1249 + if (PageHuge(p)) {
  1250 + pr_info("get_any_page: %#lx free huge page\n", pfn);
  1251 + ret = dequeue_hwpoisoned_huge_page(compound_head(p));
  1252 + } else if (is_free_buddy_page(p)) {
1211 1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1212 1254 /* Set hwpoison bit while page is still isolated */
1213 1255 SetPageHWPoison(p);
... ... @@ -1226,6 +1268,45 @@
1226 1268 return ret;
1227 1269 }
1228 1270  
  1271 +static int soft_offline_huge_page(struct page *page, int flags)
  1272 +{
  1273 + int ret;
  1274 + unsigned long pfn = page_to_pfn(page);
  1275 + struct page *hpage = compound_head(page);
  1276 + LIST_HEAD(pagelist);
  1277 +
  1278 + ret = get_any_page(page, pfn, flags);
  1279 + if (ret < 0)
  1280 + return ret;
  1281 + if (ret == 0)
  1282 + goto done;
  1283 +
  1284 + if (PageHWPoison(hpage)) {
  1285 + put_page(hpage);
  1286 + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
  1287 + return -EBUSY;
  1288 + }
  1289 +
  1290 + /* Keep page count to indicate a given hugepage is isolated. */
  1291 +
  1292 + list_add(&hpage->lru, &pagelist);
  1293 + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
  1294 + if (ret) {
  1295 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
  1296 + pfn, ret, page->flags);
  1297 + if (ret > 0)
  1298 + ret = -EIO;
  1299 + return ret;
  1300 + }
  1301 +done:
  1302 + if (!PageHWPoison(hpage))
  1303 + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
  1304 + set_page_hwpoison_huge_page(hpage);
  1305 + dequeue_hwpoisoned_huge_page(hpage);
  1306 + /* keep elevated page count for bad page */
  1307 + return ret;
  1308 +}
  1309 +
1229 1310 /**
1230 1311 * soft_offline_page - Soft offline a page.
1231 1312 * @page: page to offline
... ... @@ -1252,6 +1333,9 @@
1252 1333 {
1253 1334 int ret;
1254 1335 unsigned long pfn = page_to_pfn(page);
  1336 +
  1337 + if (PageHuge(page))
  1338 + return soft_offline_huge_page(page, flags);
1255 1339  
1256 1340 ret = get_any_page(page, pfn, flags);
1257 1341 if (ret < 0)
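
With soft_offline_huge_page() added above, soft offlining now covers hugetlb pages as well. As a rough idea of how this path can be exercised from user space, here is a hedged sketch using madvise(MADV_SOFT_OFFLINE); it assumes CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE and reserved 2MB hugepages, and the fallback constant values are assumptions taken from the kernel UAPI in case the libc headers lack them:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* assumed value from the kernel UAPI headers */
#endif
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000     /* assumed x86 value */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024;   /* one 2MB hugepage, assuming x86_64 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");   /* needs hugepages reserved via vm.nr_hugepages */
		return 1;
	}
	memset(p, 0, len);   /* fault the hugepage in */

	/* Ask the kernel to migrate the data and retire the backing hugepage. */
	if (madvise(p, len, MADV_SOFT_OFFLINE) != 0)
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("hugepage at %p soft-offlined (contents migrated)\n", p);

	munmap(p, len);
	return 0;
}
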
mm/memory.c
... ... @@ -1450,7 +1450,8
1450 1450 if (ret & VM_FAULT_OOM)
1451 1451 return i ? i : -ENOMEM;
1452 1452 if (ret &
1453   - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
  1453 + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
  1454 + VM_FAULT_SIGBUS))
1454 1455 return i ? i : -EFAULT;
1455 1456 BUG();
1456 1457 }
mm/migrate.c
... ... @@ -32,6 +32,7
32 32 #include <linux/security.h>
33 33 #include <linux/memcontrol.h>
34 34 #include <linux/syscalls.h>
  35 +#include <linux/hugetlb.h>
35 36 #include <linux/gfp.h>
36 37  
37 38 #include "internal.h"
38 39  
39 40  
40 41  
41 42  
42 43  
... ... @@ -95,26 +96,34 @@
95 96 pte_t *ptep, pte;
96 97 spinlock_t *ptl;
97 98  
98   - pgd = pgd_offset(mm, addr);
99   - if (!pgd_present(*pgd))
100   - goto out;
  99 + if (unlikely(PageHuge(new))) {
  100 + ptep = huge_pte_offset(mm, addr);
  101 + if (!ptep)
  102 + goto out;
  103 + ptl = &mm->page_table_lock;
  104 + } else {
  105 + pgd = pgd_offset(mm, addr);
  106 + if (!pgd_present(*pgd))
  107 + goto out;
101 108  
102   - pud = pud_offset(pgd, addr);
103   - if (!pud_present(*pud))
104   - goto out;
  109 + pud = pud_offset(pgd, addr);
  110 + if (!pud_present(*pud))
  111 + goto out;
105 112  
106   - pmd = pmd_offset(pud, addr);
107   - if (!pmd_present(*pmd))
108   - goto out;
  113 + pmd = pmd_offset(pud, addr);
  114 + if (!pmd_present(*pmd))
  115 + goto out;
109 116  
110   - ptep = pte_offset_map(pmd, addr);
  117 + ptep = pte_offset_map(pmd, addr);
111 118  
112   - if (!is_swap_pte(*ptep)) {
113   - pte_unmap(ptep);
114   - goto out;
115   - }
  119 + if (!is_swap_pte(*ptep)) {
  120 + pte_unmap(ptep);
  121 + goto out;
  122 + }
116 123  
117   - ptl = pte_lockptr(mm, pmd);
  124 + ptl = pte_lockptr(mm, pmd);
  125 + }
  126 +
118 127 spin_lock(ptl);
119 128 pte = *ptep;
120 129 if (!is_swap_pte(pte))
121 130  
... ... @@ -130,10 +139,19 @@
130 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 140 if (is_write_migration_entry(entry))
132 141 pte = pte_mkwrite(pte);
  142 +#ifdef CONFIG_HUGETLB_PAGE
  143 + if (PageHuge(new))
  144 + pte = pte_mkhuge(pte);
  145 +#endif
133 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 147 set_pte_at(mm, addr, ptep, pte);
135 148  
136   - if (PageAnon(new))
  149 + if (PageHuge(new)) {
  150 + if (PageAnon(new))
  151 + hugepage_add_anon_rmap(new, vma, addr);
  152 + else
  153 + page_dup_rmap(new);
  154 + } else if (PageAnon(new))
137 155 page_add_anon_rmap(new, vma, addr);
138 156 else
139 157 page_add_file_rmap(new);
140 158  
141 159  
... ... @@ -276,11 +294,59 @@
276 294 }
277 295  
278 296 /*
  297 + * The expected number of remaining references is the same as that
  298 + * of migrate_page_move_mapping().
  299 + */
  300 +int migrate_huge_page_move_mapping(struct address_space *mapping,
  301 + struct page *newpage, struct page *page)
  302 +{
  303 + int expected_count;
  304 + void **pslot;
  305 +
  306 + if (!mapping) {
  307 + if (page_count(page) != 1)
  308 + return -EAGAIN;
  309 + return 0;
  310 + }
  311 +
  312 + spin_lock_irq(&mapping->tree_lock);
  313 +
  314 + pslot = radix_tree_lookup_slot(&mapping->page_tree,
  315 + page_index(page));
  316 +
  317 + expected_count = 2 + page_has_private(page);
  318 + if (page_count(page) != expected_count ||
  319 + (struct page *)radix_tree_deref_slot(pslot) != page) {
  320 + spin_unlock_irq(&mapping->tree_lock);
  321 + return -EAGAIN;
  322 + }
  323 +
  324 + if (!page_freeze_refs(page, expected_count)) {
  325 + spin_unlock_irq(&mapping->tree_lock);
  326 + return -EAGAIN;
  327 + }
  328 +
  329 + get_page(newpage);
  330 +
  331 + radix_tree_replace_slot(pslot, newpage);
  332 +
  333 + page_unfreeze_refs(page, expected_count);
  334 +
  335 + __put_page(page);
  336 +
  337 + spin_unlock_irq(&mapping->tree_lock);
  338 + return 0;
  339 +}
  340 +
  341 +/*
279 342 * Copy the page to its new location
280 343 */
281   -static void migrate_page_copy(struct page *newpage, struct page *page)
  344 +void migrate_page_copy(struct page *newpage, struct page *page)
282 345 {
283   - copy_highpage(newpage, page);
  346 + if (PageHuge(page))
  347 + copy_huge_page(newpage, page);
  348 + else
  349 + copy_highpage(newpage, page);
284 350  
285 351 if (PageError(page))
286 352 SetPageError(newpage);
... ... @@ -724,6 +790,92 @@
724 790 }
725 791  
726 792 /*
  793 + * Counterpart of unmap_and_move_page() for hugepage migration.
  794 + *
  795 + * This function doesn't wait the completion of hugepage I/O
  796 + * because there is no race between I/O and migration for hugepage.
  797 + * Note that currently hugepage I/O occurs only in direct I/O
  798 + * where no lock is held and PG_writeback is irrelevant,
  799 + * and writeback status of all subpages are counted in the reference
  800 + * count of the head page (i.e. if all subpages of a 2MB hugepage are
  801 + * under direct I/O, the reference of the head page is 512 and a bit more.)
  802 + * This means that when we try to migrate hugepage whose subpages are
  803 + * doing direct I/O, some references remain after try_to_unmap() and
  804 + * hugepage migration fails without data corruption.
  805 + *
  806 + * There is also no race when direct I/O is issued on the page under migration,
  807 + * because then pte is replaced with migration swap entry and direct I/O code
  808 + * will wait in the page fault for migration to complete.
  809 + */
  810 +static int unmap_and_move_huge_page(new_page_t get_new_page,
  811 + unsigned long private, struct page *hpage,
  812 + int force, int offlining)
  813 +{
  814 + int rc = 0;
  815 + int *result = NULL;
  816 + struct page *new_hpage = get_new_page(hpage, private, &result);
  817 + int rcu_locked = 0;
  818 + struct anon_vma *anon_vma = NULL;
  819 +
  820 + if (!new_hpage)
  821 + return -ENOMEM;
  822 +
  823 + rc = -EAGAIN;
  824 +
  825 + if (!trylock_page(hpage)) {
  826 + if (!force)
  827 + goto out;
  828 + lock_page(hpage);
  829 + }
  830 +
  831 + if (PageAnon(hpage)) {
  832 + rcu_read_lock();
  833 + rcu_locked = 1;
  834 +
  835 + if (page_mapped(hpage)) {
  836 + anon_vma = page_anon_vma(hpage);
  837 + atomic_inc(&anon_vma->external_refcount);
  838 + }
  839 + }
  840 +
  841 + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  842 +
  843 + if (!page_mapped(hpage))
  844 + rc = move_to_new_page(new_hpage, hpage, 1);
  845 +
  846 + if (rc)
  847 + remove_migration_ptes(hpage, hpage);
  848 +
  849 + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
  850 + &anon_vma->lock)) {
  851 + int empty = list_empty(&anon_vma->head);
  852 + spin_unlock(&anon_vma->lock);
  853 + if (empty)
  854 + anon_vma_free(anon_vma);
  855 + }
  856 +
  857 + if (rcu_locked)
  858 + rcu_read_unlock();
  859 +out:
  860 + unlock_page(hpage);
  861 +
  862 + if (rc != -EAGAIN) {
  863 + list_del(&hpage->lru);
  864 + put_page(hpage);
  865 + }
  866 +
  867 + put_page(new_hpage);
  868 +
  869 + if (result) {
  870 + if (rc)
  871 + *result = rc;
  872 + else
  873 + *result = page_to_nid(new_hpage);
  874 + }
  875 + return rc;
  876 +}
  877 +
  878 +/*
727 879 * migrate_pages
728 880 *
729 881 * The function takes one list of pages to migrate and a function
... ... @@ -781,6 +933,52 @@
781 933 current->flags &= ~PF_SWAPWRITE;
782 934  
783 935 putback_lru_pages(from);
  936 +
  937 + if (rc)
  938 + return rc;
  939 +
  940 + return nr_failed + retry;
  941 +}
  942 +
  943 +int migrate_huge_pages(struct list_head *from,
  944 + new_page_t get_new_page, unsigned long private, int offlining)
  945 +{
  946 + int retry = 1;
  947 + int nr_failed = 0;
  948 + int pass = 0;
  949 + struct page *page;
  950 + struct page *page2;
  951 + int rc;
  952 +
  953 + for (pass = 0; pass < 10 && retry; pass++) {
  954 + retry = 0;
  955 +
  956 + list_for_each_entry_safe(page, page2, from, lru) {
  957 + cond_resched();
  958 +
  959 + rc = unmap_and_move_huge_page(get_new_page,
  960 + private, page, pass > 2, offlining);
  961 +
  962 + switch(rc) {
  963 + case -ENOMEM:
  964 + goto out;
  965 + case -EAGAIN:
  966 + retry++;
  967 + break;
  968 + case 0:
  969 + break;
  970 + default:
  971 + /* Permanent failure */
  972 + nr_failed++;
  973 + break;
  974 + }
  975 + }
  976 + }
  977 + rc = 0;
  978 +out:
  979 +
  980 + list_for_each_entry_safe(page, page2, from, lru)
  981 + put_page(page);
784 982  
785 983 if (rc)
786 984 return rc;
mm/rmap.c
... ... @@ -780,10 +780,10
780 780 }
781 781  
782 782 /**
783   - * __page_set_anon_rmap - setup new anonymous rmap
784   - * @page: the page to add the mapping to
785   - * @vma: the vm area in which the mapping is added
786   - * @address: the user virtual address mapped
  783 + * __page_set_anon_rmap - set up new anonymous rmap
  784 + * @page: Page to add to rmap
  785 + * @vma: VM area to add page to.
  786 + * @address: User virtual address of the mapping
787 787 * @exclusive: the page is exclusively owned by the current process
788 788 */
789 789 static void __page_set_anon_rmap(struct page *page,
790 790  
791 791  
... ... @@ -793,25 +793,16 @@
793 793  
794 794 BUG_ON(!anon_vma);
795 795  
  796 + if (PageAnon(page))
  797 + return;
  798 +
796 799 /*
797 800 * If the page isn't exclusively mapped into this vma,
798 801 * we must use the _oldest_ possible anon_vma for the
799 802 * page mapping!
800 803 */
801   - if (!exclusive) {
802   - if (PageAnon(page))
803   - return;
  804 + if (!exclusive)
804 805 anon_vma = anon_vma->root;
805   - } else {
806   - /*
807   - * In this case, swapped-out-but-not-discarded swap-cache
808   - * is remapped. So, no need to update page->mapping here.
809   - * We convice anon_vma poitned by page->mapping is not obsolete
810   - * because vma->anon_vma is necessary to be a family of it.
811   - */
812   - if (PageAnon(page))
813   - return;
814   - }
815 806  
816 807 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 808 page->mapping = (struct address_space *) anon_vma;