Commit 46e387bbd82d438b9131e237e6e2cb55a825da49
Exists in master and in 4 other branches
Merge branch 'hwpoison-hugepages' into hwpoison
Conflicts: mm/memory-failure.c
Showing 10 changed files
arch/x86/mm/fault.c
... | ... | @@ -11,6 +11,7 @@ |
11 | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | +#include <linux/hugetlb.h> /* hstate_index_to_shift */ | |
14 | 15 | |
15 | 16 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | 17 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
17 | 18 | |
18 | 19 | |
... | ... | @@ -160,15 +161,20 @@ |
160 | 161 | |
161 | 162 | static void |
162 | 163 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
163 | - struct task_struct *tsk) | |
164 | + struct task_struct *tsk, int fault) | |
164 | 165 | { |
166 | + unsigned lsb = 0; | |
165 | 167 | siginfo_t info; |
166 | 168 | |
167 | 169 | info.si_signo = si_signo; |
168 | 170 | info.si_errno = 0; |
169 | 171 | info.si_code = si_code; |
170 | 172 | info.si_addr = (void __user *)address; |
171 | - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | |
173 | + if (fault & VM_FAULT_HWPOISON_LARGE) | |
174 | + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | |
175 | + if (fault & VM_FAULT_HWPOISON) | |
176 | + lsb = PAGE_SHIFT; | |
177 | + info.si_addr_lsb = lsb; | |
172 | 178 | |
173 | 179 | force_sig_info(si_signo, &info, tsk); |
174 | 180 | } |
... | ... | @@ -722,7 +728,7 @@ |
722 | 728 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); |
723 | 729 | tsk->thread.trap_no = 14; |
724 | 730 | |
725 | - force_sig_info_fault(SIGSEGV, si_code, address, tsk); | |
731 | + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); | |
726 | 732 | |
727 | 733 | return; |
728 | 734 | } |
729 | 735 | |
... | ... | @@ -807,14 +813,14 @@ |
807 | 813 | tsk->thread.trap_no = 14; |
808 | 814 | |
809 | 815 | #ifdef CONFIG_MEMORY_FAILURE |
810 | - if (fault & VM_FAULT_HWPOISON) { | |
816 | + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { | |
811 | 817 | printk(KERN_ERR |
812 | 818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
813 | 819 | tsk->comm, tsk->pid, address); |
814 | 820 | code = BUS_MCEERR_AR; |
815 | 821 | } |
816 | 822 | #endif |
817 | - force_sig_info_fault(SIGBUS, code, address, tsk); | |
823 | + force_sig_info_fault(SIGBUS, code, address, tsk, fault); | |
818 | 824 | } |
819 | 825 | |
820 | 826 | static noinline void |
... | ... | @@ -824,7 +830,8 @@ |
824 | 830 | if (fault & VM_FAULT_OOM) { |
825 | 831 | out_of_memory(regs, error_code, address); |
826 | 832 | } else { |
827 | - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) | |
833 | + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| | |
834 | + VM_FAULT_HWPOISON_LARGE)) | |
828 | 835 | do_sigbus(regs, error_code, address, fault); |
829 | 836 | else |
830 | 837 | BUG(); |
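
The new fault-code plumbing lets force_sig_info_fault() report the true granularity of the poisoned mapping in si_addr_lsb: PAGE_SHIFT for a poisoned base page, and the hugepage shift recovered from the encoded hstate index for a poisoned huge page. A minimal userspace sketch of the same computation (illustration only, not part of the patch; values assume 4 KB base pages and a single 2 MB hstate):

#include <stdio.h>

/* Illustrative values only: 4 KB base pages, one hstate of 2 MB pages. */
#define PAGE_SHIFT              12
#define VM_FAULT_HWPOISON       0x0010
#define VM_FAULT_HWPOISON_LARGE 0x0020
#define VM_FAULT_SET_HINDEX(x)  ((x) << 12)
#define VM_FAULT_GET_HINDEX(x)  (((x) >> 12) & 0xf)

/* Stand-in for hstate_index_to_shift(): hstates[index].order + PAGE_SHIFT. */
static const unsigned int hstate_shift[] = { 9 + PAGE_SHIFT };  /* 2 MB */

/* Same logic as the new lsb computation in force_sig_info_fault(). */
static unsigned int addr_lsb(int fault)
{
	unsigned int lsb = 0;

	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_shift[VM_FAULT_GET_HINDEX(fault)];
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	return lsb;
}

int main(void)
{
	int small = VM_FAULT_HWPOISON;
	int large = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(0);

	printf("small page si_addr_lsb: %u\n", addr_lsb(small)); /* 12 -> 4 KB */
	printf("huge page  si_addr_lsb: %u\n", addr_lsb(large)); /* 21 -> 2 MB */
	return 0;
}

With these inputs the base-page case reports an lsb of 12 (4 KB) and the hugepage case reports 21 (2 MB), which is what the recovering process sees in its siginfo.
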
fs/hugetlbfs/inode.c
... | ... | @@ -31,6 +31,7 @@ |
31 | 31 | #include <linux/statfs.h> |
32 | 32 | #include <linux/security.h> |
33 | 33 | #include <linux/magic.h> |
34 | +#include <linux/migrate.h> | |
34 | 35 | |
35 | 36 | #include <asm/uaccess.h> |
36 | 37 | |
... | ... | @@ -573,6 +574,19 @@ |
573 | 574 | return 0; |
574 | 575 | } |
575 | 576 | |
577 | +static int hugetlbfs_migrate_page(struct address_space *mapping, | |
578 | + struct page *newpage, struct page *page) | |
579 | +{ | |
580 | + int rc; | |
581 | + | |
582 | + rc = migrate_huge_page_move_mapping(mapping, newpage, page); | |
583 | + if (rc) | |
584 | + return rc; | |
585 | + migrate_page_copy(newpage, page); | |
586 | + | |
587 | + return 0; | |
588 | +} | |
589 | + | |
576 | 590 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
577 | 591 | { |
578 | 592 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); |
... | ... | @@ -659,6 +673,7 @@ |
659 | 673 | .write_begin = hugetlbfs_write_begin, |
660 | 674 | .write_end = hugetlbfs_write_end, |
661 | 675 | .set_page_dirty = hugetlbfs_set_page_dirty, |
676 | + .migratepage = hugetlbfs_migrate_page, | |
662 | 677 | }; |
663 | 678 | |
664 | 679 |
include/linux/hugetlb.h
... | ... | @@ -43,7 +43,8 @@ |
43 | 43 | struct vm_area_struct *vma, |
44 | 44 | int acctflags); |
45 | 45 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); |
46 | -void __isolate_hwpoisoned_huge_page(struct page *page); | |
46 | +int dequeue_hwpoisoned_huge_page(struct page *page); | |
47 | +void copy_huge_page(struct page *dst, struct page *src); | |
47 | 48 | |
48 | 49 | extern unsigned long hugepages_treat_as_movable; |
49 | 50 | extern const unsigned long hugetlb_zero, hugetlb_infinity; |
... | ... | @@ -101,7 +102,10 @@ |
101 | 102 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) |
102 | 103 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) |
103 | 104 | #define huge_pte_offset(mm, address) 0 |
104 | -#define __isolate_hwpoisoned_huge_page(page) 0 | |
105 | +#define dequeue_hwpoisoned_huge_page(page) 0 | |
106 | +static inline void copy_huge_page(struct page *dst, struct page *src) | |
107 | +{ | |
108 | +} | |
105 | 109 | |
106 | 110 | #define hugetlb_change_protection(vma, address, end, newprot) |
107 | 111 | |
... | ... | @@ -228,6 +232,8 @@ |
228 | 232 | struct hstate *hstate; |
229 | 233 | }; |
230 | 234 | |
235 | +struct page *alloc_huge_page_node(struct hstate *h, int nid); | |
236 | + | |
231 | 237 | /* arch callback */ |
232 | 238 | int __init alloc_bootmem_huge_page(struct hstate *h); |
233 | 239 | |
234 | 240 | |
... | ... | @@ -301,8 +307,14 @@ |
301 | 307 | return size_to_hstate(PAGE_SIZE << compound_order(page)); |
302 | 308 | } |
303 | 309 | |
310 | +static inline unsigned hstate_index_to_shift(unsigned index) | |
311 | +{ | |
312 | + return hstates[index].order + PAGE_SHIFT; | |
313 | +} | |
314 | + | |
304 | 315 | #else |
305 | 316 | struct hstate {}; |
317 | +#define alloc_huge_page_node(h, nid) NULL | |
306 | 318 | #define alloc_bootmem_huge_page(h) NULL |
307 | 319 | #define hstate_file(f) NULL |
308 | 320 | #define hstate_vma(v) NULL |
... | ... | @@ -317,6 +329,7 @@ |
317 | 329 | { |
318 | 330 | return 1; |
319 | 331 | } |
332 | +#define hstate_index_to_shift(index) 0 | |
320 | 333 | #endif |
321 | 334 | |
322 | 335 | #endif /* _LINUX_HUGETLB_H */ |
include/linux/migrate.h
... | ... | @@ -14,6 +14,8 @@ |
14 | 14 | struct page *, struct page *); |
15 | 15 | extern int migrate_pages(struct list_head *l, new_page_t x, |
16 | 16 | unsigned long private, int offlining); |
17 | +extern int migrate_huge_pages(struct list_head *l, new_page_t x, | |
18 | + unsigned long private, int offlining); | |
17 | 19 | |
18 | 20 | extern int fail_migrate_page(struct address_space *, |
19 | 21 | struct page *, struct page *); |
20 | 22 | |
... | ... | @@ -23,12 +25,17 @@ |
23 | 25 | extern int migrate_vmas(struct mm_struct *mm, |
24 | 26 | const nodemask_t *from, const nodemask_t *to, |
25 | 27 | unsigned long flags); |
28 | +extern void migrate_page_copy(struct page *newpage, struct page *page); | |
29 | +extern int migrate_huge_page_move_mapping(struct address_space *mapping, | |
30 | + struct page *newpage, struct page *page); | |
26 | 31 | #else |
27 | 32 | #define PAGE_MIGRATION 0 |
28 | 33 | |
29 | 34 | static inline void putback_lru_pages(struct list_head *l) {} |
30 | 35 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
31 | 36 | unsigned long private, int offlining) { return -ENOSYS; } |
37 | +static inline int migrate_huge_pages(struct list_head *l, new_page_t x, | |
38 | + unsigned long private, int offlining) { return -ENOSYS; } | |
32 | 39 | |
33 | 40 | static inline int migrate_prep(void) { return -ENOSYS; } |
34 | 41 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
... | ... | @@ -36,6 +43,15 @@ |
36 | 43 | static inline int migrate_vmas(struct mm_struct *mm, |
37 | 44 | const nodemask_t *from, const nodemask_t *to, |
38 | 45 | unsigned long flags) |
46 | +{ | |
47 | + return -ENOSYS; | |
48 | +} | |
49 | + | |
50 | +static inline void migrate_page_copy(struct page *newpage, | |
51 | + struct page *page) {} | |
52 | + | |
53 | +static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |
54 | + struct page *newpage, struct page *page) | |
39 | 55 | { |
40 | 56 | return -ENOSYS; |
41 | 57 | } |
include/linux/mm.h
... | ... | @@ -718,12 +718,20 @@ |
718 | 718 | #define VM_FAULT_SIGBUS 0x0002 |
719 | 719 | #define VM_FAULT_MAJOR 0x0004 |
720 | 720 | #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ |
721 | -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ | |
721 | +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ | |
722 | +#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ | |
722 | 723 | |
723 | 724 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
724 | 725 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
725 | 726 | |
726 | -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) | |
727 | +#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ | |
728 | + | |
729 | +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ | |
730 | + VM_FAULT_HWPOISON_LARGE) | |
731 | + | |
732 | +/* Encode hstate index for a hwpoisoned large page */ | |
733 | +#define VM_FAULT_SET_HINDEX(x) ((x) << 12) | |
734 | +#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) | |
727 | 735 | |
728 | 736 | /* |
729 | 737 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. |
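
VM_FAULT_HWPOISON_LARGE_MASK reserves a 4-bit field (bits 12-15) of the fault code for the hstate index, so up to 16 hugepage sizes can be reported back to the fault handler. A small sanity-check sketch of the round trip, outside the patch, using the macros from the hunk above:

#include <assert.h>
#include <stdio.h>

/* Macros as added to include/linux/mm.h above. */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)

int main(void)
{
	unsigned int idx;

	for (idx = 0; idx < 16; idx++) {
		/* Encoding and decoding are inverse for indices 0..15 ... */
		assert(VM_FAULT_GET_HINDEX(VM_FAULT_SET_HINDEX(idx)) == idx);
		/* ... and the encoded index stays inside the reserved mask. */
		assert(!(VM_FAULT_SET_HINDEX(idx) & ~VM_FAULT_HWPOISON_LARGE_MASK));
	}
	printf("all 16 hstate indices round-trip through the fault code\n");
	return 0;
}
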
mm/hugetlb.c
... | ... | @@ -423,14 +423,14 @@ |
423 | 423 | } |
424 | 424 | } |
425 | 425 | |
426 | -static void copy_gigantic_page(struct page *dst, struct page *src, | |
426 | +static void copy_user_gigantic_page(struct page *dst, struct page *src, | |
427 | 427 | unsigned long addr, struct vm_area_struct *vma) |
428 | 428 | { |
429 | 429 | int i; |
430 | 430 | struct hstate *h = hstate_vma(vma); |
431 | 431 | struct page *dst_base = dst; |
432 | 432 | struct page *src_base = src; |
433 | - might_sleep(); | |
433 | + | |
434 | 434 | for (i = 0; i < pages_per_huge_page(h); ) { |
435 | 435 | cond_resched(); |
436 | 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); |
437 | 437 | |
... | ... | @@ -440,14 +440,15 @@ |
440 | 440 | src = mem_map_next(src, src_base, i); |
441 | 441 | } |
442 | 442 | } |
443 | -static void copy_huge_page(struct page *dst, struct page *src, | |
443 | + | |
444 | +static void copy_user_huge_page(struct page *dst, struct page *src, | |
444 | 445 | unsigned long addr, struct vm_area_struct *vma) |
445 | 446 | { |
446 | 447 | int i; |
447 | 448 | struct hstate *h = hstate_vma(vma); |
448 | 449 | |
449 | 450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
450 | - copy_gigantic_page(dst, src, addr, vma); | |
451 | + copy_user_gigantic_page(dst, src, addr, vma); | |
451 | 452 | return; |
452 | 453 | } |
453 | 454 | |
... | ... | @@ -458,6 +459,40 @@ |
458 | 459 | } |
459 | 460 | } |
460 | 461 | |
462 | +static void copy_gigantic_page(struct page *dst, struct page *src) | |
463 | +{ | |
464 | + int i; | |
465 | + struct hstate *h = page_hstate(src); | |
466 | + struct page *dst_base = dst; | |
467 | + struct page *src_base = src; | |
468 | + | |
469 | + for (i = 0; i < pages_per_huge_page(h); ) { | |
470 | + cond_resched(); | |
471 | + copy_highpage(dst, src); | |
472 | + | |
473 | + i++; | |
474 | + dst = mem_map_next(dst, dst_base, i); | |
475 | + src = mem_map_next(src, src_base, i); | |
476 | + } | |
477 | +} | |
478 | + | |
479 | +void copy_huge_page(struct page *dst, struct page *src) | |
480 | +{ | |
481 | + int i; | |
482 | + struct hstate *h = page_hstate(src); | |
483 | + | |
484 | + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | |
485 | + copy_gigantic_page(dst, src); | |
486 | + return; | |
487 | + } | |
488 | + | |
489 | + might_sleep(); | |
490 | + for (i = 0; i < pages_per_huge_page(h); i++) { | |
491 | + cond_resched(); | |
492 | + copy_highpage(dst + i, src + i); | |
493 | + } | |
494 | +} | |
495 | + | |
461 | 496 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
462 | 497 | { |
463 | 498 | int nid = page_to_nid(page); |
464 | 499 | |
... | ... | @@ -466,11 +501,24 @@ |
466 | 501 | h->free_huge_pages_node[nid]++; |
467 | 502 | } |
468 | 503 | |
504 | +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |
505 | +{ | |
506 | + struct page *page; | |
507 | + | |
508 | + if (list_empty(&h->hugepage_freelists[nid])) | |
509 | + return NULL; | |
510 | + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | |
511 | + list_del(&page->lru); | |
512 | + set_page_refcounted(page); | |
513 | + h->free_huge_pages--; | |
514 | + h->free_huge_pages_node[nid]--; | |
515 | + return page; | |
516 | +} | |
517 | + | |
469 | 518 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
470 | 519 | struct vm_area_struct *vma, |
471 | 520 | unsigned long address, int avoid_reserve) |
472 | 521 | { |
473 | - int nid; | |
474 | 522 | struct page *page = NULL; |
475 | 523 | struct mempolicy *mpol; |
476 | 524 | nodemask_t *nodemask; |
... | ... | @@ -496,19 +544,13 @@ |
496 | 544 | |
497 | 545 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
498 | 546 | MAX_NR_ZONES - 1, nodemask) { |
499 | - nid = zone_to_nid(zone); | |
500 | - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | |
501 | - !list_empty(&h->hugepage_freelists[nid])) { | |
502 | - page = list_entry(h->hugepage_freelists[nid].next, | |
503 | - struct page, lru); | |
504 | - list_del(&page->lru); | |
505 | - h->free_huge_pages--; | |
506 | - h->free_huge_pages_node[nid]--; | |
507 | - | |
508 | - if (!avoid_reserve) | |
509 | - decrement_hugepage_resv_vma(h, vma); | |
510 | - | |
511 | - break; | |
547 | + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { | |
548 | + page = dequeue_huge_page_node(h, zone_to_nid(zone)); | |
549 | + if (page) { | |
550 | + if (!avoid_reserve) | |
551 | + decrement_hugepage_resv_vma(h, vma); | |
552 | + break; | |
553 | + } | |
512 | 554 | } |
513 | 555 | } |
514 | 556 | err: |
515 | 557 | |
... | ... | @@ -770,11 +812,10 @@ |
770 | 812 | return ret; |
771 | 813 | } |
772 | 814 | |
773 | -static struct page *alloc_buddy_huge_page(struct hstate *h, | |
774 | - struct vm_area_struct *vma, unsigned long address) | |
815 | +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |
775 | 816 | { |
776 | 817 | struct page *page; |
777 | - unsigned int nid; | |
818 | + unsigned int r_nid; | |
778 | 819 | |
779 | 820 | if (h->order >= MAX_ORDER) |
780 | 821 | return NULL; |
... | ... | @@ -812,9 +853,14 @@ |
812 | 853 | } |
813 | 854 | spin_unlock(&hugetlb_lock); |
814 | 855 | |
815 | - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | |
816 | - __GFP_REPEAT|__GFP_NOWARN, | |
817 | - huge_page_order(h)); | |
856 | + if (nid == NUMA_NO_NODE) | |
857 | + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | |
858 | + __GFP_REPEAT|__GFP_NOWARN, | |
859 | + huge_page_order(h)); | |
860 | + else | |
861 | + page = alloc_pages_exact_node(nid, | |
862 | + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | |
863 | + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | |
818 | 864 | |
819 | 865 | if (page && arch_prepare_hugepage(page)) { |
820 | 866 | __free_pages(page, huge_page_order(h)); |
821 | 867 | |
... | ... | @@ -823,19 +869,13 @@ |
823 | 869 | |
824 | 870 | spin_lock(&hugetlb_lock); |
825 | 871 | if (page) { |
826 | - /* | |
827 | - * This page is now managed by the hugetlb allocator and has | |
828 | - * no users -- drop the buddy allocator's reference. | |
829 | - */ | |
830 | - put_page_testzero(page); | |
831 | - VM_BUG_ON(page_count(page)); | |
832 | - nid = page_to_nid(page); | |
872 | + r_nid = page_to_nid(page); | |
833 | 873 | set_compound_page_dtor(page, free_huge_page); |
834 | 874 | /* |
835 | 875 | * We incremented the global counters already |
836 | 876 | */ |
837 | - h->nr_huge_pages_node[nid]++; | |
838 | - h->surplus_huge_pages_node[nid]++; | |
877 | + h->nr_huge_pages_node[r_nid]++; | |
878 | + h->surplus_huge_pages_node[r_nid]++; | |
839 | 879 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
840 | 880 | } else { |
841 | 881 | h->nr_huge_pages--; |
... | ... | @@ -848,6 +888,25 @@ |
848 | 888 | } |
849 | 889 | |
850 | 890 | /* |
891 | + * This allocation function is useful in the context where vma is irrelevant. | |
892 | + * E.g. soft-offlining uses this function because it only cares physical | |
893 | + * address of error page. | |
894 | + */ | |
895 | +struct page *alloc_huge_page_node(struct hstate *h, int nid) | |
896 | +{ | |
897 | + struct page *page; | |
898 | + | |
899 | + spin_lock(&hugetlb_lock); | |
900 | + page = dequeue_huge_page_node(h, nid); | |
901 | + spin_unlock(&hugetlb_lock); | |
902 | + | |
903 | + if (!page) | |
904 | + page = alloc_buddy_huge_page(h, nid); | |
905 | + | |
906 | + return page; | |
907 | +} | |
908 | + | |
909 | +/* | |
851 | 910 | * Increase the hugetlb pool such that it can accomodate a reservation |
852 | 911 | * of size 'delta'. |
853 | 912 | */ |
854 | 913 | |
855 | 914 | |
... | ... | @@ -871,17 +930,14 @@ |
871 | 930 | retry: |
872 | 931 | spin_unlock(&hugetlb_lock); |
873 | 932 | for (i = 0; i < needed; i++) { |
874 | - page = alloc_buddy_huge_page(h, NULL, 0); | |
875 | - if (!page) { | |
933 | + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | |
934 | + if (!page) | |
876 | 935 | /* |
877 | 936 | * We were not able to allocate enough pages to |
878 | 937 | * satisfy the entire reservation so we free what |
879 | 938 | * we've allocated so far. |
880 | 939 | */ |
881 | - spin_lock(&hugetlb_lock); | |
882 | - needed = 0; | |
883 | 940 | goto free; |
884 | - } | |
885 | 941 | |
886 | 942 | list_add(&page->lru, &surplus_list); |
887 | 943 | } |
888 | 944 | |
889 | 945 | |
890 | 946 | |
891 | 947 | |
892 | 948 | |
893 | 949 | |
... | ... | @@ -908,31 +964,31 @@ |
908 | 964 | needed += allocated; |
909 | 965 | h->resv_huge_pages += delta; |
910 | 966 | ret = 0; |
911 | -free: | |
967 | + | |
968 | + spin_unlock(&hugetlb_lock); | |
912 | 969 | /* Free the needed pages to the hugetlb pool */ |
913 | 970 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
914 | 971 | if ((--needed) < 0) |
915 | 972 | break; |
916 | 973 | list_del(&page->lru); |
974 | + /* | |
975 | + * This page is now managed by the hugetlb allocator and has | |
976 | + * no users -- drop the buddy allocator's reference. | |
977 | + */ | |
978 | + put_page_testzero(page); | |
979 | + VM_BUG_ON(page_count(page)); | |
917 | 980 | enqueue_huge_page(h, page); |
918 | 981 | } |
919 | 982 | |
920 | 983 | /* Free unnecessary surplus pages to the buddy allocator */ |
984 | +free: | |
921 | 985 | if (!list_empty(&surplus_list)) { |
922 | - spin_unlock(&hugetlb_lock); | |
923 | 986 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
924 | 987 | list_del(&page->lru); |
925 | - /* | |
926 | - * The page has a reference count of zero already, so | |
927 | - * call free_huge_page directly instead of using | |
928 | - * put_page. This must be done with hugetlb_lock | |
929 | - * unlocked which is safe because free_huge_page takes | |
930 | - * hugetlb_lock before deciding how to free the page. | |
931 | - */ | |
932 | - free_huge_page(page); | |
988 | + put_page(page); | |
933 | 989 | } |
934 | - spin_lock(&hugetlb_lock); | |
935 | 990 | } |
991 | + spin_lock(&hugetlb_lock); | |
936 | 992 | |
937 | 993 | return ret; |
938 | 994 | } |
939 | 995 | |
... | ... | @@ -1052,14 +1108,13 @@ |
1052 | 1108 | spin_unlock(&hugetlb_lock); |
1053 | 1109 | |
1054 | 1110 | if (!page) { |
1055 | - page = alloc_buddy_huge_page(h, vma, addr); | |
1111 | + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | |
1056 | 1112 | if (!page) { |
1057 | 1113 | hugetlb_put_quota(inode->i_mapping, chg); |
1058 | 1114 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1059 | 1115 | } |
1060 | 1116 | } |
1061 | 1117 | |
1062 | - set_page_refcounted(page); | |
1063 | 1118 | set_page_private(page, (unsigned long) mapping); |
1064 | 1119 | |
1065 | 1120 | vma_commit_reservation(h, vma, addr); |
... | ... | @@ -2153,6 +2208,19 @@ |
2153 | 2208 | return -ENOMEM; |
2154 | 2209 | } |
2155 | 2210 | |
2211 | +static int is_hugetlb_entry_migration(pte_t pte) | |
2212 | +{ | |
2213 | + swp_entry_t swp; | |
2214 | + | |
2215 | + if (huge_pte_none(pte) || pte_present(pte)) | |
2216 | + return 0; | |
2217 | + swp = pte_to_swp_entry(pte); | |
2218 | + if (non_swap_entry(swp) && is_migration_entry(swp)) { | |
2219 | + return 1; | |
2220 | + } else | |
2221 | + return 0; | |
2222 | +} | |
2223 | + | |
2156 | 2224 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) |
2157 | 2225 | { |
2158 | 2226 | swp_entry_t swp; |
... | ... | @@ -2383,7 +2451,7 @@ |
2383 | 2451 | if (unlikely(anon_vma_prepare(vma))) |
2384 | 2452 | return VM_FAULT_OOM; |
2385 | 2453 | |
2386 | - copy_huge_page(new_page, old_page, address, vma); | |
2454 | + copy_user_huge_page(new_page, old_page, address, vma); | |
2387 | 2455 | __SetPageUptodate(new_page); |
2388 | 2456 | |
2389 | 2457 | /* |
2390 | 2458 | |
... | ... | @@ -2515,22 +2583,20 @@ |
2515 | 2583 | hugepage_add_new_anon_rmap(page, vma, address); |
2516 | 2584 | } |
2517 | 2585 | } else { |
2586 | + /* | |
2587 | + * If memory error occurs between mmap() and fault, some process | |
2588 | + * don't have hwpoisoned swap entry for errored virtual address. | |
2589 | + * So we need to block hugepage fault by PG_hwpoison bit check. | |
2590 | + */ | |
2591 | + if (unlikely(PageHWPoison(page))) { | |
2592 | + ret = VM_FAULT_HWPOISON | | |
2593 | + VM_FAULT_SET_HINDEX(h - hstates); | |
2594 | + goto backout_unlocked; | |
2595 | + } | |
2518 | 2596 | page_dup_rmap(page); |
2519 | 2597 | } |
2520 | 2598 | |
2521 | 2599 | /* |
2522 | - * Since memory error handler replaces pte into hwpoison swap entry | |
2523 | - * at the time of error handling, a process which reserved but not have | |
2524 | - * the mapping to the error hugepage does not have hwpoison swap entry. | |
2525 | - * So we need to block accesses from such a process by checking | |
2526 | - * PG_hwpoison bit here. | |
2527 | - */ | |
2528 | - if (unlikely(PageHWPoison(page))) { | |
2529 | - ret = VM_FAULT_HWPOISON; | |
2530 | - goto backout_unlocked; | |
2531 | - } | |
2532 | - | |
2533 | - /* | |
2534 | 2600 | * If we are going to COW a private mapping later, we examine the |
2535 | 2601 | * pending reservations for this page now. This will ensure that |
2536 | 2602 | * any allocations necessary to record that reservation occur outside |
... | ... | @@ -2587,8 +2653,12 @@ |
2587 | 2653 | ptep = huge_pte_offset(mm, address); |
2588 | 2654 | if (ptep) { |
2589 | 2655 | entry = huge_ptep_get(ptep); |
2590 | - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | |
2591 | - return VM_FAULT_HWPOISON; | |
2656 | + if (unlikely(is_hugetlb_entry_migration(entry))) { | |
2657 | + migration_entry_wait(mm, (pmd_t *)ptep, address); | |
2658 | + return 0; | |
2659 | + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | |
2660 | + return VM_FAULT_HWPOISON_LARGE | | |
2661 | + VM_FAULT_SET_HINDEX(h - hstates); | |
2592 | 2662 | } |
2593 | 2663 | |
2594 | 2664 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
2595 | 2665 | |
2596 | 2666 | |
2597 | 2667 | |
2598 | 2668 | |
2599 | 2669 | |
... | ... | @@ -2878,19 +2948,42 @@ |
2878 | 2948 | hugetlb_acct_memory(h, -(chg - freed)); |
2879 | 2949 | } |
2880 | 2950 | |
2951 | +#ifdef CONFIG_MEMORY_FAILURE | |
2952 | + | |
2953 | +/* Should be called in hugetlb_lock */ | |
2954 | +static int is_hugepage_on_freelist(struct page *hpage) | |
2955 | +{ | |
2956 | + struct page *page; | |
2957 | + struct page *tmp; | |
2958 | + struct hstate *h = page_hstate(hpage); | |
2959 | + int nid = page_to_nid(hpage); | |
2960 | + | |
2961 | + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | |
2962 | + if (page == hpage) | |
2963 | + return 1; | |
2964 | + return 0; | |
2965 | +} | |
2966 | + | |
2881 | 2967 | /* |
2882 | 2968 | * This function is called from memory failure code. |
2883 | 2969 | * Assume the caller holds page lock of the head page. |
2884 | 2970 | */ |
2885 | -void __isolate_hwpoisoned_huge_page(struct page *hpage) | |
2971 | +int dequeue_hwpoisoned_huge_page(struct page *hpage) | |
2886 | 2972 | { |
2887 | 2973 | struct hstate *h = page_hstate(hpage); |
2888 | 2974 | int nid = page_to_nid(hpage); |
2975 | + int ret = -EBUSY; | |
2889 | 2976 | |
2890 | 2977 | spin_lock(&hugetlb_lock); |
2891 | - list_del(&hpage->lru); | |
2892 | - h->free_huge_pages--; | |
2893 | - h->free_huge_pages_node[nid]--; | |
2978 | + if (is_hugepage_on_freelist(hpage)) { | |
2979 | + list_del(&hpage->lru); | |
2980 | + set_page_refcounted(hpage); | |
2981 | + h->free_huge_pages--; | |
2982 | + h->free_huge_pages_node[nid]--; | |
2983 | + ret = 0; | |
2984 | + } | |
2894 | 2985 | spin_unlock(&hugetlb_lock); |
2986 | + return ret; | |
2895 | 2987 | } |
2988 | +#endif |
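
dequeue_hwpoisoned_huge_page() now walks the per-node free list and unlinks the page only if it is actually there, returning -EBUSY otherwise, so the memory-failure code can tell a genuinely free hugepage from one that is merely unmapped. A toy userspace sketch of that check-then-unlink pattern (illustration only, not part of the patch):

#include <stdio.h>
#include <errno.h>

struct node {
	struct node *prev, *next;
};

/* Unlink n only if it really is on the list headed by head; -EBUSY otherwise. */
static int dequeue_if_on_list(struct node *head, struct node *n)
{
	struct node *p;

	for (p = head->next; p != head; p = p->next) {
		if (p == n) {
			n->prev->next = n->next;
			n->next->prev = n->prev;
			return 0;
		}
	}
	return -EBUSY;
}

int main(void)
{
	struct node head, on_list, off_list;

	/* Circular list containing only on_list. */
	head.next = head.prev = &on_list;
	on_list.next = on_list.prev = &head;
	off_list.next = off_list.prev = NULL;

	printf("on the free list: %d\n", dequeue_if_on_list(&head, &on_list));  /* 0 */
	printf("not on the list:  %d\n", dequeue_if_on_list(&head, &off_list)); /* -16 */
	return 0;
}
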
mm/memory-failure.c
... | ... | @@ -697,11 +697,10 @@ |
697 | 697 | * Issues: |
698 | 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
699 | 699 | * To narrow down kill region to one page, we need to break up pmd. |
700 | - * - To support soft-offlining for hugepage, we need to support hugepage | |
701 | - * migration. | |
702 | 700 | */ |
703 | 701 | static int me_huge_page(struct page *p, unsigned long pfn) |
704 | 702 | { |
703 | + int res = 0; | |
705 | 704 | struct page *hpage = compound_head(p); |
706 | 705 | /* |
707 | 706 | * We can safely recover from error on free or reserved (i.e. |
... | ... | @@ -714,8 +713,9 @@ |
714 | 713 | * so there is no race between isolation and mapping/unmapping. |
715 | 714 | */ |
716 | 715 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
717 | - __isolate_hwpoisoned_huge_page(hpage); | |
718 | - return RECOVERED; | |
716 | + res = dequeue_hwpoisoned_huge_page(hpage); | |
717 | + if (!res) | |
718 | + return RECOVERED; | |
719 | 719 | } |
720 | 720 | return DELAYED; |
721 | 721 | } |
... | ... | @@ -972,7 +972,10 @@ |
972 | 972 | * We need/can do nothing about count=0 pages. |
973 | 973 | * 1) it's a free page, and therefore in safe hand: |
974 | 974 | * prep_new_page() will be the gate keeper. |
975 | - * 2) it's part of a non-compound high order page. | |
975 | + * 2) it's a free hugepage, which is also safe: | |
976 | + * an affected hugepage will be dequeued from hugepage freelist, | |
977 | + * so there's no concern about reusing it ever after. | |
978 | + * 3) it's part of a non-compound high order page. | |
976 | 979 | * Implies some kernel user: cannot stop them from |
977 | 980 | * R/W the page; let's pray that the page has been |
978 | 981 | * used and will be freed some time later. |
... | ... | @@ -984,6 +987,24 @@ |
984 | 987 | if (is_free_buddy_page(p)) { |
985 | 988 | action_result(pfn, "free buddy", DELAYED); |
986 | 989 | return 0; |
990 | + } else if (PageHuge(hpage)) { | |
991 | + /* | |
992 | + * Check "just unpoisoned", "filter hit", and | |
993 | + * "race with other subpage." | |
994 | + */ | |
995 | + lock_page_nosync(hpage); | |
996 | + if (!PageHWPoison(hpage) | |
997 | + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | |
998 | + || (p != hpage && TestSetPageHWPoison(hpage))) { | |
999 | + atomic_long_sub(nr_pages, &mce_bad_pages); | |
1000 | + return 0; | |
1001 | + } | |
1002 | + set_page_hwpoison_huge_page(hpage); | |
1003 | + res = dequeue_hwpoisoned_huge_page(hpage); | |
1004 | + action_result(pfn, "free huge", | |
1005 | + res ? IGNORED : DELAYED); | |
1006 | + unlock_page(hpage); | |
1007 | + return res; | |
987 | 1008 | } else { |
988 | 1009 | action_result(pfn, "high order kernel", IGNORED); |
989 | 1010 | return -EBUSY; |
... | ... | @@ -1145,6 +1166,16 @@ |
1145 | 1166 | nr_pages = 1 << compound_order(page); |
1146 | 1167 | |
1147 | 1168 | if (!get_page_unless_zero(page)) { |
1169 | + /* | |
1170 | + * Since HWPoisoned hugepage should have non-zero refcount, | |
1171 | + * race between memory failure and unpoison seems to happen. | |
1172 | + * In such case unpoison fails and memory failure runs | |
1173 | + * to the end. | |
1174 | + */ | |
1175 | + if (PageHuge(page)) { | |
1176 | + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | |
1177 | + return 0; | |
1178 | + } | |
1148 | 1179 | if (TestClearPageHWPoison(p)) |
1149 | 1180 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1150 | 1181 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1151 | 1182 | |
... | ... | @@ -1162,9 +1193,9 @@ |
1162 | 1193 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1163 | 1194 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1164 | 1195 | freeit = 1; |
1196 | + if (PageHuge(page)) | |
1197 | + clear_page_hwpoison_huge_page(page); | |
1165 | 1198 | } |
1166 | - if (PageHuge(p)) | |
1167 | - clear_page_hwpoison_huge_page(page); | |
1168 | 1199 | unlock_page(page); |
1169 | 1200 | |
1170 | 1201 | put_page(page); |
... | ... | @@ -1178,7 +1209,11 @@ |
1178 | 1209 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
1179 | 1210 | { |
1180 | 1211 | int nid = page_to_nid(p); |
1181 | - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | |
1212 | + if (PageHuge(p)) | |
1213 | + return alloc_huge_page_node(page_hstate(compound_head(p)), | |
1214 | + nid); | |
1215 | + else | |
1216 | + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | |
1182 | 1217 | } |
1183 | 1218 | |
1184 | 1219 | /* |
1185 | 1220 | |
... | ... | @@ -1206,8 +1241,15 @@ |
1206 | 1241 | * was free. |
1207 | 1242 | */ |
1208 | 1243 | set_migratetype_isolate(p); |
1244 | + /* | |
1245 | + * When the target page is a free hugepage, just remove it | |
1246 | + * from free hugepage list. | |
1247 | + */ | |
1209 | 1248 | if (!get_page_unless_zero(compound_head(p))) { |
1210 | - if (is_free_buddy_page(p)) { | |
1249 | + if (PageHuge(p)) { | |
1250 | + pr_info("get_any_page: %#lx free huge page\n", pfn); | |
1251 | + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | |
1252 | + } else if (is_free_buddy_page(p)) { | |
1211 | 1253 | pr_info("get_any_page: %#lx free buddy page\n", pfn); |
1212 | 1254 | /* Set hwpoison bit while page is still isolated */ |
1213 | 1255 | SetPageHWPoison(p); |
... | ... | @@ -1226,6 +1268,45 @@ |
1226 | 1268 | return ret; |
1227 | 1269 | } |
1228 | 1270 | |
1271 | +static int soft_offline_huge_page(struct page *page, int flags) | |
1272 | +{ | |
1273 | + int ret; | |
1274 | + unsigned long pfn = page_to_pfn(page); | |
1275 | + struct page *hpage = compound_head(page); | |
1276 | + LIST_HEAD(pagelist); | |
1277 | + | |
1278 | + ret = get_any_page(page, pfn, flags); | |
1279 | + if (ret < 0) | |
1280 | + return ret; | |
1281 | + if (ret == 0) | |
1282 | + goto done; | |
1283 | + | |
1284 | + if (PageHWPoison(hpage)) { | |
1285 | + put_page(hpage); | |
1286 | + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | |
1287 | + return -EBUSY; | |
1288 | + } | |
1289 | + | |
1290 | + /* Keep page count to indicate a given hugepage is isolated. */ | |
1291 | + | |
1292 | + list_add(&hpage->lru, &pagelist); | |
1293 | + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | |
1294 | + if (ret) { | |
1295 | + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | |
1296 | + pfn, ret, page->flags); | |
1297 | + if (ret > 0) | |
1298 | + ret = -EIO; | |
1299 | + return ret; | |
1300 | + } | |
1301 | +done: | |
1302 | + if (!PageHWPoison(hpage)) | |
1303 | + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | |
1304 | + set_page_hwpoison_huge_page(hpage); | |
1305 | + dequeue_hwpoisoned_huge_page(hpage); | |
1306 | + /* keep elevated page count for bad page */ | |
1307 | + return ret; | |
1308 | +} | |
1309 | + | |
1229 | 1310 | /** |
1230 | 1311 | * soft_offline_page - Soft offline a page. |
1231 | 1312 | * @page: page to offline |
... | ... | @@ -1252,6 +1333,9 @@ |
1252 | 1333 | { |
1253 | 1334 | int ret; |
1254 | 1335 | unsigned long pfn = page_to_pfn(page); |
1336 | + | |
1337 | + if (PageHuge(page)) | |
1338 | + return soft_offline_huge_page(page, flags); | |
1255 | 1339 | |
1256 | 1340 | ret = get_any_page(page, pfn, flags); |
1257 | 1341 | if (ret < 0) |
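
With soft_offline_huge_page() in place, a hugepage can be drained and retired from userspace through the existing soft-offline entry points. A rough sketch using madvise(MADV_SOFT_OFFLINE), not part of the patch; it assumes a kernel with CONFIG_MEMORY_FAILURE, preallocated 2 MB hugepages (vm.nr_hugepages), and CAP_SYS_ADMIN:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101           /* assumed value, asm-generic/mman-common.h */
#endif
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000             /* assumed x86 value */
#endif

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumes 2 MB hugepages */

int main(void)
{
	void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0x5a, HPAGE_SIZE);    /* fault the hugepage in and dirty it */

	/* One base page is enough: the kernel offlines the containing hugepage. */
	if (madvise(p, 4096, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("hugepage at %p soft-offlined\n", p);

	munmap(p, HPAGE_SIZE);
	return 0;
}

Offlining a single base page of the mapping suffices here because soft_offline_page() detects PageHuge() and hands the whole containing hugepage to soft_offline_huge_page().
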
mm/memory.c
... | ... | @@ -1450,7 +1450,8 @@ |
1450 | 1450 | if (ret & VM_FAULT_OOM) |
1451 | 1451 | return i ? i : -ENOMEM; |
1452 | 1452 | if (ret & |
1453 | - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | |
1453 | + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| | |
1454 | + VM_FAULT_SIGBUS)) | |
1454 | 1455 | return i ? i : -EFAULT; |
1455 | 1456 | BUG(); |
1456 | 1457 | } |
mm/migrate.c
... | ... | @@ -32,6 +32,7 @@ |
32 | 32 | #include <linux/security.h> |
33 | 33 | #include <linux/memcontrol.h> |
34 | 34 | #include <linux/syscalls.h> |
35 | +#include <linux/hugetlb.h> | |
35 | 36 | #include <linux/gfp.h> |
36 | 37 | |
37 | 38 | #include "internal.h" |
38 | 39 | |
39 | 40 | |
40 | 41 | |
41 | 42 | |
42 | 43 | |
... | ... | @@ -95,26 +96,34 @@ |
95 | 96 | pte_t *ptep, pte; |
96 | 97 | spinlock_t *ptl; |
97 | 98 | |
98 | - pgd = pgd_offset(mm, addr); | |
99 | - if (!pgd_present(*pgd)) | |
100 | - goto out; | |
99 | + if (unlikely(PageHuge(new))) { | |
100 | + ptep = huge_pte_offset(mm, addr); | |
101 | + if (!ptep) | |
102 | + goto out; | |
103 | + ptl = &mm->page_table_lock; | |
104 | + } else { | |
105 | + pgd = pgd_offset(mm, addr); | |
106 | + if (!pgd_present(*pgd)) | |
107 | + goto out; | |
101 | 108 | |
102 | - pud = pud_offset(pgd, addr); | |
103 | - if (!pud_present(*pud)) | |
104 | - goto out; | |
109 | + pud = pud_offset(pgd, addr); | |
110 | + if (!pud_present(*pud)) | |
111 | + goto out; | |
105 | 112 | |
106 | - pmd = pmd_offset(pud, addr); | |
107 | - if (!pmd_present(*pmd)) | |
108 | - goto out; | |
113 | + pmd = pmd_offset(pud, addr); | |
114 | + if (!pmd_present(*pmd)) | |
115 | + goto out; | |
109 | 116 | |
110 | - ptep = pte_offset_map(pmd, addr); | |
117 | + ptep = pte_offset_map(pmd, addr); | |
111 | 118 | |
112 | - if (!is_swap_pte(*ptep)) { | |
113 | - pte_unmap(ptep); | |
114 | - goto out; | |
115 | - } | |
119 | + if (!is_swap_pte(*ptep)) { | |
120 | + pte_unmap(ptep); | |
121 | + goto out; | |
122 | + } | |
116 | 123 | |
117 | - ptl = pte_lockptr(mm, pmd); | |
124 | + ptl = pte_lockptr(mm, pmd); | |
125 | + } | |
126 | + | |
118 | 127 | spin_lock(ptl); |
119 | 128 | pte = *ptep; |
120 | 129 | if (!is_swap_pte(pte)) |
121 | 130 | |
... | ... | @@ -130,10 +139,19 @@ |
130 | 139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
131 | 140 | if (is_write_migration_entry(entry)) |
132 | 141 | pte = pte_mkwrite(pte); |
142 | +#ifdef CONFIG_HUGETLB_PAGE | |
143 | + if (PageHuge(new)) | |
144 | + pte = pte_mkhuge(pte); | |
145 | +#endif | |
133 | 146 | flush_cache_page(vma, addr, pte_pfn(pte)); |
134 | 147 | set_pte_at(mm, addr, ptep, pte); |
135 | 148 | |
136 | - if (PageAnon(new)) | |
149 | + if (PageHuge(new)) { | |
150 | + if (PageAnon(new)) | |
151 | + hugepage_add_anon_rmap(new, vma, addr); | |
152 | + else | |
153 | + page_dup_rmap(new); | |
154 | + } else if (PageAnon(new)) | |
137 | 155 | page_add_anon_rmap(new, vma, addr); |
138 | 156 | else |
139 | 157 | page_add_file_rmap(new); |
140 | 158 | |
141 | 159 | |
... | ... | @@ -276,11 +294,59 @@ |
276 | 294 | } |
277 | 295 | |
278 | 296 | /* |
297 | + * The expected number of remaining references is the same as that | |
298 | + * of migrate_page_move_mapping(). | |
299 | + */ | |
300 | +int migrate_huge_page_move_mapping(struct address_space *mapping, | |
301 | + struct page *newpage, struct page *page) | |
302 | +{ | |
303 | + int expected_count; | |
304 | + void **pslot; | |
305 | + | |
306 | + if (!mapping) { | |
307 | + if (page_count(page) != 1) | |
308 | + return -EAGAIN; | |
309 | + return 0; | |
310 | + } | |
311 | + | |
312 | + spin_lock_irq(&mapping->tree_lock); | |
313 | + | |
314 | + pslot = radix_tree_lookup_slot(&mapping->page_tree, | |
315 | + page_index(page)); | |
316 | + | |
317 | + expected_count = 2 + page_has_private(page); | |
318 | + if (page_count(page) != expected_count || | |
319 | + (struct page *)radix_tree_deref_slot(pslot) != page) { | |
320 | + spin_unlock_irq(&mapping->tree_lock); | |
321 | + return -EAGAIN; | |
322 | + } | |
323 | + | |
324 | + if (!page_freeze_refs(page, expected_count)) { | |
325 | + spin_unlock_irq(&mapping->tree_lock); | |
326 | + return -EAGAIN; | |
327 | + } | |
328 | + | |
329 | + get_page(newpage); | |
330 | + | |
331 | + radix_tree_replace_slot(pslot, newpage); | |
332 | + | |
333 | + page_unfreeze_refs(page, expected_count); | |
334 | + | |
335 | + __put_page(page); | |
336 | + | |
337 | + spin_unlock_irq(&mapping->tree_lock); | |
338 | + return 0; | |
339 | +} | |
340 | + | |
341 | +/* | |
279 | 342 | * Copy the page to its new location |
280 | 343 | */ |
281 | -static void migrate_page_copy(struct page *newpage, struct page *page) | |
344 | +void migrate_page_copy(struct page *newpage, struct page *page) | |
282 | 345 | { |
283 | - copy_highpage(newpage, page); | |
346 | + if (PageHuge(page)) | |
347 | + copy_huge_page(newpage, page); | |
348 | + else | |
349 | + copy_highpage(newpage, page); | |
284 | 350 | |
285 | 351 | if (PageError(page)) |
286 | 352 | SetPageError(newpage); |
... | ... | @@ -724,6 +790,92 @@ |
724 | 790 | } |
725 | 791 | |
726 | 792 | /* |
793 | + * Counterpart of unmap_and_move_page() for hugepage migration. | |
794 | + * | |
795 | + * This function doesn't wait the completion of hugepage I/O | |
796 | + * because there is no race between I/O and migration for hugepage. | |
797 | + * Note that currently hugepage I/O occurs only in direct I/O | |
798 | + * where no lock is held and PG_writeback is irrelevant, | |
799 | + * and writeback status of all subpages are counted in the reference | |
800 | + * count of the head page (i.e. if all subpages of a 2MB hugepage are | |
801 | + * under direct I/O, the reference of the head page is 512 and a bit more.) | |
802 | + * This means that when we try to migrate hugepage whose subpages are | |
803 | + * doing direct I/O, some references remain after try_to_unmap() and | |
804 | + * hugepage migration fails without data corruption. | |
805 | + * | |
806 | + * There is also no race when direct I/O is issued on the page under migration, | |
807 | + * because then pte is replaced with migration swap entry and direct I/O code | |
808 | + * will wait in the page fault for migration to complete. | |
809 | + */ | |
810 | +static int unmap_and_move_huge_page(new_page_t get_new_page, | |
811 | + unsigned long private, struct page *hpage, | |
812 | + int force, int offlining) | |
813 | +{ | |
814 | + int rc = 0; | |
815 | + int *result = NULL; | |
816 | + struct page *new_hpage = get_new_page(hpage, private, &result); | |
817 | + int rcu_locked = 0; | |
818 | + struct anon_vma *anon_vma = NULL; | |
819 | + | |
820 | + if (!new_hpage) | |
821 | + return -ENOMEM; | |
822 | + | |
823 | + rc = -EAGAIN; | |
824 | + | |
825 | + if (!trylock_page(hpage)) { | |
826 | + if (!force) | |
827 | + goto out; | |
828 | + lock_page(hpage); | |
829 | + } | |
830 | + | |
831 | + if (PageAnon(hpage)) { | |
832 | + rcu_read_lock(); | |
833 | + rcu_locked = 1; | |
834 | + | |
835 | + if (page_mapped(hpage)) { | |
836 | + anon_vma = page_anon_vma(hpage); | |
837 | + atomic_inc(&anon_vma->external_refcount); | |
838 | + } | |
839 | + } | |
840 | + | |
841 | + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | |
842 | + | |
843 | + if (!page_mapped(hpage)) | |
844 | + rc = move_to_new_page(new_hpage, hpage, 1); | |
845 | + | |
846 | + if (rc) | |
847 | + remove_migration_ptes(hpage, hpage); | |
848 | + | |
849 | + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | |
850 | + &anon_vma->lock)) { | |
851 | + int empty = list_empty(&anon_vma->head); | |
852 | + spin_unlock(&anon_vma->lock); | |
853 | + if (empty) | |
854 | + anon_vma_free(anon_vma); | |
855 | + } | |
856 | + | |
857 | + if (rcu_locked) | |
858 | + rcu_read_unlock(); | |
859 | +out: | |
860 | + unlock_page(hpage); | |
861 | + | |
862 | + if (rc != -EAGAIN) { | |
863 | + list_del(&hpage->lru); | |
864 | + put_page(hpage); | |
865 | + } | |
866 | + | |
867 | + put_page(new_hpage); | |
868 | + | |
869 | + if (result) { | |
870 | + if (rc) | |
871 | + *result = rc; | |
872 | + else | |
873 | + *result = page_to_nid(new_hpage); | |
874 | + } | |
875 | + return rc; | |
876 | +} | |
877 | + | |
878 | +/* | |
727 | 879 | * migrate_pages |
728 | 880 | * |
729 | 881 | * The function takes one list of pages to migrate and a function |
... | ... | @@ -781,6 +933,52 @@ |
781 | 933 | current->flags &= ~PF_SWAPWRITE; |
782 | 934 | |
783 | 935 | putback_lru_pages(from); |
936 | + | |
937 | + if (rc) | |
938 | + return rc; | |
939 | + | |
940 | + return nr_failed + retry; | |
941 | +} | |
942 | + | |
943 | +int migrate_huge_pages(struct list_head *from, | |
944 | + new_page_t get_new_page, unsigned long private, int offlining) | |
945 | +{ | |
946 | + int retry = 1; | |
947 | + int nr_failed = 0; | |
948 | + int pass = 0; | |
949 | + struct page *page; | |
950 | + struct page *page2; | |
951 | + int rc; | |
952 | + | |
953 | + for (pass = 0; pass < 10 && retry; pass++) { | |
954 | + retry = 0; | |
955 | + | |
956 | + list_for_each_entry_safe(page, page2, from, lru) { | |
957 | + cond_resched(); | |
958 | + | |
959 | + rc = unmap_and_move_huge_page(get_new_page, | |
960 | + private, page, pass > 2, offlining); | |
961 | + | |
962 | + switch(rc) { | |
963 | + case -ENOMEM: | |
964 | + goto out; | |
965 | + case -EAGAIN: | |
966 | + retry++; | |
967 | + break; | |
968 | + case 0: | |
969 | + break; | |
970 | + default: | |
971 | + /* Permanent failure */ | |
972 | + nr_failed++; | |
973 | + break; | |
974 | + } | |
975 | + } | |
976 | + } | |
977 | + rc = 0; | |
978 | +out: | |
979 | + | |
980 | + list_for_each_entry_safe(page, page2, from, lru) | |
981 | + put_page(page); | |
784 | 982 | |
785 | 983 | if (rc) |
786 | 984 | return rc; |
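
migrate_huge_pages() reuses the retry discipline of migrate_pages(): up to 10 passes over the list, where -EAGAIN requeues a page for the next pass, -ENOMEM aborts the run, and anything else counts as a permanent failure; the return value is nr_failed plus the pages still retrying. A stripped-down sketch of that control flow over a plain array (illustration only, not part of the patch):

#include <stdio.h>
#include <errno.h>

/* Fake per-page migration: succeeds once its remaining -EAGAIN budget hits 0. */
static int try_migrate(int *eagain_left)
{
	if (*eagain_left > 0) {
		(*eagain_left)--;
		return -EAGAIN;
	}
	return 0;
}

int main(void)
{
	int eagain_left[3] = { 0, 2, 5 };       /* per-page busy counts */
	int done[3] = { 0, 0, 0 };
	int nr_failed = 0, retry = 1, pass, i;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;
		for (i = 0; i < 3; i++) {
			if (done[i])
				continue;
			switch (try_migrate(&eagain_left[i])) {
			case -ENOMEM:           /* would abort the whole run */
				goto out;
			case -EAGAIN:           /* busy: try again next pass */
				retry++;
				break;
			case 0:                 /* migrated */
				done[i] = 1;
				break;
			default:                /* permanent failure */
				nr_failed++;
				done[i] = 1;
				break;
			}
		}
	}
out:
	printf("passes: %d, result: %d\n", pass, nr_failed + retry);
	return 0;
}
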
mm/rmap.c
... | ... | @@ -780,10 +780,10 @@ |
780 | 780 | } |
781 | 781 | |
782 | 782 | /** |
783 | - * __page_set_anon_rmap - setup new anonymous rmap | |
784 | - * @page: the page to add the mapping to | |
785 | - * @vma: the vm area in which the mapping is added | |
786 | - * @address: the user virtual address mapped | |
783 | + * __page_set_anon_rmap - set up new anonymous rmap | |
784 | + * @page: Page to add to rmap | |
785 | + * @vma: VM area to add page to. | |
786 | + * @address: User virtual address of the mapping | |
787 | 787 | * @exclusive: the page is exclusively owned by the current process |
788 | 788 | */ |
789 | 789 | static void __page_set_anon_rmap(struct page *page, |
790 | 790 | |
791 | 791 | |
... | ... | @@ -793,25 +793,16 @@ |
793 | 793 | |
794 | 794 | BUG_ON(!anon_vma); |
795 | 795 | |
796 | + if (PageAnon(page)) | |
797 | + return; | |
798 | + | |
796 | 799 | /* |
797 | 800 | * If the page isn't exclusively mapped into this vma, |
798 | 801 | * we must use the _oldest_ possible anon_vma for the |
799 | 802 | * page mapping! |
800 | 803 | */ |
801 | - if (!exclusive) { | |
802 | - if (PageAnon(page)) | |
803 | - return; | |
804 | + if (!exclusive) | |
804 | 805 | anon_vma = anon_vma->root; |
805 | - } else { | |
806 | - /* | |
807 | - * In this case, swapped-out-but-not-discarded swap-cache | |
808 | - * is remapped. So, no need to update page->mapping here. | |
809 | - * We convice anon_vma poitned by page->mapping is not obsolete | |
810 | - * because vma->anon_vma is necessary to be a family of it. | |
811 | - */ | |
812 | - if (PageAnon(page)) | |
813 | - return; | |
814 | - } | |
815 | 806 | |
816 | 807 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
817 | 808 | page->mapping = (struct address_space *) anon_vma; |