Commit 46e387bbd82d438b9131e237e6e2cb55a825da49

Authored by Andi Kleen

Merge branch 'hwpoison-hugepages' into hwpoison

Conflicts:
	mm/memory-failure.c

Showing 10 changed files

arch/x86/mm/fault.c
... ... @@ -11,6 +11,7
11 11 #include <linux/kprobes.h> /* __kprobes, ... */
12 12 #include <linux/mmiotrace.h> /* kmmio_handler, ... */
13 13 #include <linux/perf_event.h> /* perf_sw_event */
  14 +#include <linux/hugetlb.h> /* hstate_index_to_shift */
14 15  
15 16 #include <asm/traps.h> /* dotraplinkage, ... */
16 17 #include <asm/pgalloc.h> /* pgd_*(), ... */
17 18  
18 19  
... ... @@ -160,15 +161,20 @@
160 161  
161 162 static void
162 163 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163   - struct task_struct *tsk)
  164 + struct task_struct *tsk, int fault)
164 165 {
  166 + unsigned lsb = 0;
165 167 siginfo_t info;
166 168  
167 169 info.si_signo = si_signo;
168 170 info.si_errno = 0;
169 171 info.si_code = si_code;
170 172 info.si_addr = (void __user *)address;
171   - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
  173 + if (fault & VM_FAULT_HWPOISON_LARGE)
  174 + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
  175 + if (fault & VM_FAULT_HWPOISON)
  176 + lsb = PAGE_SHIFT;
  177 + info.si_addr_lsb = lsb;
172 178  
173 179 force_sig_info(si_signo, &info, tsk);
174 180 }
... ... @@ -722,7 +728,7 @@
722 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
723 729 tsk->thread.trap_no = 14;
724 730  
725   - force_sig_info_fault(SIGSEGV, si_code, address, tsk);
  731 + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
726 732  
727 733 return;
728 734 }
729 735  
... ... @@ -807,14 +813,14 @@
807 813 tsk->thread.trap_no = 14;
808 814  
809 815 #ifdef CONFIG_MEMORY_FAILURE
810   - if (fault & VM_FAULT_HWPOISON) {
  816 + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
811 817 printk(KERN_ERR
812 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
813 819 tsk->comm, tsk->pid, address);
814 820 code = BUS_MCEERR_AR;
815 821 }
816 822 #endif
817   - force_sig_info_fault(SIGBUS, code, address, tsk);
  823 + force_sig_info_fault(SIGBUS, code, address, tsk, fault);
818 824 }
819 825  
820 826 static noinline void
... ... @@ -824,7 +830,8 @@
824 830 if (fault & VM_FAULT_OOM) {
825 831 out_of_memory(regs, error_code, address);
826 832 } else {
827   - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
  833 + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
  834 + VM_FAULT_HWPOISON_LARGE))
828 835 do_sigbus(regs, error_code, address, fault);
829 836 else
830 837 BUG();
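
A note on the arch/x86/mm/fault.c hunks above: si_addr_lsb tells the SIGBUS recipient how wide the poisoned region is (address bits below that shift are not meaningful). The stand-alone sketch below mirrors the new lsb selection in user space; PAGE_SHIFT, the VM_FAULT_* values and the single 2MB hstate are assumptions for illustration (the flag values come from the include/linux/mm.h hunk further down), not reusable kernel code.

#include <assert.h>
#include <stdio.h>

/* Values mirrored from the include/linux/mm.h hunk below; illustration only. */
#define PAGE_SHIFT              12
#define VM_FAULT_HWPOISON       0x0010
#define VM_FAULT_HWPOISON_LARGE 0x0020
#define VM_FAULT_SET_HINDEX(x)  ((x) << 12)
#define VM_FAULT_GET_HINDEX(x)  (((x) >> 12) & 0xf)

/* Assumed hstate table: only index 0 (2MB hugepages on x86_64, shift 21) is modeled. */
static const unsigned hstate_shift[] = { 21 };

/* Mirrors the lsb selection added to force_sig_info_fault(). */
static unsigned addr_lsb(unsigned fault)
{
	unsigned lsb = 0;

	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_shift[VM_FAULT_GET_HINDEX(fault)];
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	return lsb;
}

int main(void)
{
	/* Poisoned base page: the kill granularity is one 4KB page. */
	assert(addr_lsb(VM_FAULT_HWPOISON) == 12);
	/* Poisoned 2MB hugepage (hstate index 0): the whole hugepage. */
	assert(addr_lsb(VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(0)) == 21);
	printf("si_addr_lsb: base page %u, 2MB hugepage %u\n",
	       addr_lsb(VM_FAULT_HWPOISON),
	       addr_lsb(VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(0)));
	return 0;
}

Built with a plain cc invocation this prints 12 for a poisoned base page and 21 for a poisoned 2MB hugepage, which is the granularity the hwpoison code intends to report to the signal handler.
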
fs/hugetlbfs/inode.c
... ... @@ -31,6 +31,7 @@
31 31 #include <linux/statfs.h>
32 32 #include <linux/security.h>
33 33 #include <linux/magic.h>
  34 +#include <linux/migrate.h>
34 35  
35 36 #include <asm/uaccess.h>
36 37  
... ... @@ -573,6 +574,19 @@
573 574 return 0;
574 575 }
575 576  
  577 +static int hugetlbfs_migrate_page(struct address_space *mapping,
  578 + struct page *newpage, struct page *page)
  579 +{
  580 + int rc;
  581 +
  582 + rc = migrate_huge_page_move_mapping(mapping, newpage, page);
  583 + if (rc)
  584 + return rc;
  585 + migrate_page_copy(newpage, page);
  586 +
  587 + return 0;
  588 +}
  589 +
576 590 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577 591 {
578 592 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
... ... @@ -659,6 +673,7 @@
659 673 .write_begin = hugetlbfs_write_begin,
660 674 .write_end = hugetlbfs_write_end,
661 675 .set_page_dirty = hugetlbfs_set_page_dirty,
  676 + .migratepage = hugetlbfs_migrate_page,
662 677 };
663 678  
664 679  
include/linux/hugetlb.h
... ... @@ -43,7 +43,8 @@
43 43 struct vm_area_struct *vma,
44 44 int acctflags);
45 45 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46   -void __isolate_hwpoisoned_huge_page(struct page *page);
  46 +int dequeue_hwpoisoned_huge_page(struct page *page);
  47 +void copy_huge_page(struct page *dst, struct page *src);
47 48  
48 49 extern unsigned long hugepages_treat_as_movable;
49 50 extern const unsigned long hugetlb_zero, hugetlb_infinity;
... ... @@ -101,7 +102,10 @@
101 102 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
102 103 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
103 104 #define huge_pte_offset(mm, address) 0
104   -#define __isolate_hwpoisoned_huge_page(page) 0
  105 +#define dequeue_hwpoisoned_huge_page(page) 0
  106 +static inline void copy_huge_page(struct page *dst, struct page *src)
  107 +{
  108 +}
105 109  
106 110 #define hugetlb_change_protection(vma, address, end, newprot)
107 111  
... ... @@ -228,6 +232,8 @@
228 232 struct hstate *hstate;
229 233 };
230 234  
  235 +struct page *alloc_huge_page_node(struct hstate *h, int nid);
  236 +
231 237 /* arch callback */
232 238 int __init alloc_bootmem_huge_page(struct hstate *h);
233 239  
234 240  
... ... @@ -301,8 +307,14 @@
301 307 return size_to_hstate(PAGE_SIZE << compound_order(page));
302 308 }
303 309  
  310 +static inline unsigned hstate_index_to_shift(unsigned index)
  311 +{
  312 + return hstates[index].order + PAGE_SHIFT;
  313 +}
  314 +
304 315 #else
305 316 struct hstate {};
  317 +#define alloc_huge_page_node(h, nid) NULL
306 318 #define alloc_bootmem_huge_page(h) NULL
307 319 #define hstate_file(f) NULL
308 320 #define hstate_vma(v) NULL
... ... @@ -317,6 +329,7 @@
317 329 {
318 330 return 1;
319 331 }
  332 +#define hstate_index_to_shift(index) 0
320 333 #endif
321 334  
322 335 #endif /* _LINUX_HUGETLB_H */
include/linux/migrate.h
... ... @@ -14,6 +14,8 @@
14 14 struct page *, struct page *);
15 15 extern int migrate_pages(struct list_head *l, new_page_t x,
16 16 unsigned long private, int offlining);
  17 +extern int migrate_huge_pages(struct list_head *l, new_page_t x,
  18 + unsigned long private, int offlining);
17 19  
18 20 extern int fail_migrate_page(struct address_space *,
19 21 struct page *, struct page *);
20 22  
... ... @@ -23,12 +25,17 @@
23 25 extern int migrate_vmas(struct mm_struct *mm,
24 26 const nodemask_t *from, const nodemask_t *to,
25 27 unsigned long flags);
  28 +extern void migrate_page_copy(struct page *newpage, struct page *page);
  29 +extern int migrate_huge_page_move_mapping(struct address_space *mapping,
  30 + struct page *newpage, struct page *page);
26 31 #else
27 32 #define PAGE_MIGRATION 0
28 33  
29 34 static inline void putback_lru_pages(struct list_head *l) {}
30 35 static inline int migrate_pages(struct list_head *l, new_page_t x,
31 36 unsigned long private, int offlining) { return -ENOSYS; }
  37 +static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
  38 + unsigned long private, int offlining) { return -ENOSYS; }
32 39  
33 40 static inline int migrate_prep(void) { return -ENOSYS; }
34 41 static inline int migrate_prep_local(void) { return -ENOSYS; }
... ... @@ -36,6 +43,15 @@
36 43 static inline int migrate_vmas(struct mm_struct *mm,
37 44 const nodemask_t *from, const nodemask_t *to,
38 45 unsigned long flags)
  46 +{
  47 + return -ENOSYS;
  48 +}
  49 +
  50 +static inline void migrate_page_copy(struct page *newpage,
  51 + struct page *page) {}
  52 +
  53 +static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
  54 + struct page *newpage, struct page *page)
39 55 {
40 56 return -ENOSYS;
41 57 }
include/linux/mm.h
... ... @@ -718,12 +718,20
718 718 #define VM_FAULT_SIGBUS 0x0002
719 719 #define VM_FAULT_MAJOR 0x0004
720 720 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
721   -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
  721 +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
  722 +#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
722 723  
723 724 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
724 725 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
725 726  
726   -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
  727 +#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
  728 +
  729 +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
  730 + VM_FAULT_HWPOISON_LARGE)
  731 +
  732 +/* Encode hstate index for a hwpoisoned large page */
  733 +#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
  734 +#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
727 735  
728 736 /*
729 737 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
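
The hstate index added by VM_FAULT_SET_HINDEX() lives in bits 12-15 (VM_FAULT_HWPOISON_LARGE_MASK), well clear of the ordinary VM_FAULT_* status bits, so at most 16 hstates can be encoded. A minimal round-trip sketch, with the macro values copied from the hunk above (illustration only, not kernel code):

#include <assert.h>

/* Copied from the include/linux/mm.h hunk above. */
#define VM_FAULT_HWPOISON_LARGE      0x0020
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000
#define VM_FAULT_SET_HINDEX(x)       ((x) << 12)
#define VM_FAULT_GET_HINDEX(x)       (((x) >> 12) & 0xf)

int main(void)
{
	unsigned i;

	for (i = 0; i < 16; i++) {
		unsigned fault = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(i);

		/* The hstate index survives the round trip... */
		assert(VM_FAULT_GET_HINDEX(fault) == i);
		/* ...and stays inside the reserved mask. */
		assert((VM_FAULT_SET_HINDEX(i) & ~VM_FAULT_HWPOISON_LARGE_MASK) == 0);
	}
	return 0;
}
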
mm/hugetlb.c
... ... @@ -423,14 +423,14
423 423 }
424 424 }
425 425  
426   -static void copy_gigantic_page(struct page *dst, struct page *src,
  426 +static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 427 unsigned long addr, struct vm_area_struct *vma)
428 428 {
429 429 int i;
430 430 struct hstate *h = hstate_vma(vma);
431 431 struct page *dst_base = dst;
432 432 struct page *src_base = src;
433   - might_sleep();
  433 +
434 434 for (i = 0; i < pages_per_huge_page(h); ) {
435 435 cond_resched();
436 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437 437  
... ... @@ -440,14 +440,15 @@
440 440 src = mem_map_next(src, src_base, i);
441 441 }
442 442 }
443   -static void copy_huge_page(struct page *dst, struct page *src,
  443 +
  444 +static void copy_user_huge_page(struct page *dst, struct page *src,
444 445 unsigned long addr, struct vm_area_struct *vma)
445 446 {
446 447 int i;
447 448 struct hstate *h = hstate_vma(vma);
448 449  
449 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450   - copy_gigantic_page(dst, src, addr, vma);
  451 + copy_user_gigantic_page(dst, src, addr, vma);
451 452 return;
452 453 }
453 454  
... ... @@ -458,6 +459,40 @@
458 459 }
459 460 }
460 461  
  462 +static void copy_gigantic_page(struct page *dst, struct page *src)
  463 +{
  464 + int i;
  465 + struct hstate *h = page_hstate(src);
  466 + struct page *dst_base = dst;
  467 + struct page *src_base = src;
  468 +
  469 + for (i = 0; i < pages_per_huge_page(h); ) {
  470 + cond_resched();
  471 + copy_highpage(dst, src);
  472 +
  473 + i++;
  474 + dst = mem_map_next(dst, dst_base, i);
  475 + src = mem_map_next(src, src_base, i);
  476 + }
  477 +}
  478 +
  479 +void copy_huge_page(struct page *dst, struct page *src)
  480 +{
  481 + int i;
  482 + struct hstate *h = page_hstate(src);
  483 +
  484 + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
  485 + copy_gigantic_page(dst, src);
  486 + return;
  487 + }
  488 +
  489 + might_sleep();
  490 + for (i = 0; i < pages_per_huge_page(h); i++) {
  491 + cond_resched();
  492 + copy_highpage(dst + i, src + i);
  493 + }
  494 +}
  495 +
461 496 static void enqueue_huge_page(struct hstate *h, struct page *page)
462 497 {
463 498 int nid = page_to_nid(page);
464 499  
... ... @@ -466,11 +501,24 @@
466 501 h->free_huge_pages_node[nid]++;
467 502 }
468 503  
  504 +static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
  505 +{
  506 + struct page *page;
  507 +
  508 + if (list_empty(&h->hugepage_freelists[nid]))
  509 + return NULL;
  510 + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
  511 + list_del(&page->lru);
  512 + set_page_refcounted(page);
  513 + h->free_huge_pages--;
  514 + h->free_huge_pages_node[nid]--;
  515 + return page;
  516 +}
  517 +
469 518 static struct page *dequeue_huge_page_vma(struct hstate *h,
470 519 struct vm_area_struct *vma,
471 520 unsigned long address, int avoid_reserve)
472 521 {
473   - int nid;
474 522 struct page *page = NULL;
475 523 struct mempolicy *mpol;
476 524 nodemask_t *nodemask;
... ... @@ -496,19 +544,13 @@
496 544  
497 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 546 MAX_NR_ZONES - 1, nodemask) {
499   - nid = zone_to_nid(zone);
500   - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
501   - !list_empty(&h->hugepage_freelists[nid])) {
502   - page = list_entry(h->hugepage_freelists[nid].next,
503   - struct page, lru);
504   - list_del(&page->lru);
505   - h->free_huge_pages--;
506   - h->free_huge_pages_node[nid]--;
507   -
508   - if (!avoid_reserve)
509   - decrement_hugepage_resv_vma(h, vma);
510   -
511   - break;
  547 + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
  548 + page = dequeue_huge_page_node(h, zone_to_nid(zone));
  549 + if (page) {
  550 + if (!avoid_reserve)
  551 + decrement_hugepage_resv_vma(h, vma);
  552 + break;
  553 + }
512 554 }
513 555 }
514 556 err:
515 557  
... ... @@ -770,11 +812,10 @@
770 812 return ret;
771 813 }
772 814  
773   -static struct page *alloc_buddy_huge_page(struct hstate *h,
774   - struct vm_area_struct *vma, unsigned long address)
  815 +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
775 816 {
776 817 struct page *page;
777   - unsigned int nid;
  818 + unsigned int r_nid;
778 819  
779 820 if (h->order >= MAX_ORDER)
780 821 return NULL;
... ... @@ -812,9 +853,14 @@
812 853 }
813 854 spin_unlock(&hugetlb_lock);
814 855  
815   - page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
816   - __GFP_REPEAT|__GFP_NOWARN,
817   - huge_page_order(h));
  856 + if (nid == NUMA_NO_NODE)
  857 + page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
  858 + __GFP_REPEAT|__GFP_NOWARN,
  859 + huge_page_order(h));
  860 + else
  861 + page = alloc_pages_exact_node(nid,
  862 + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  863 + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 864  
819 865 if (page && arch_prepare_hugepage(page)) {
820 866 __free_pages(page, huge_page_order(h));
821 867  
... ... @@ -823,19 +869,13 @@
823 869  
824 870 spin_lock(&hugetlb_lock);
825 871 if (page) {
826   - /*
827   - * This page is now managed by the hugetlb allocator and has
828   - * no users -- drop the buddy allocator's reference.
829   - */
830   - put_page_testzero(page);
831   - VM_BUG_ON(page_count(page));
832   - nid = page_to_nid(page);
  872 + r_nid = page_to_nid(page);
833 873 set_compound_page_dtor(page, free_huge_page);
834 874 /*
835 875 * We incremented the global counters already
836 876 */
837   - h->nr_huge_pages_node[nid]++;
838   - h->surplus_huge_pages_node[nid]++;
  877 + h->nr_huge_pages_node[r_nid]++;
  878 + h->surplus_huge_pages_node[r_nid]++;
839 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 880 } else {
841 881 h->nr_huge_pages--;
... ... @@ -848,6 +888,25 @@
848 888 }
849 889  
850 890 /*
  891 + * This allocation function is useful in the context where vma is irrelevant.
  892 + * E.g. soft-offlining uses this function because it only cares physical
  893 + * address of error page.
  894 + */
  895 +struct page *alloc_huge_page_node(struct hstate *h, int nid)
  896 +{
  897 + struct page *page;
  898 +
  899 + spin_lock(&hugetlb_lock);
  900 + page = dequeue_huge_page_node(h, nid);
  901 + spin_unlock(&hugetlb_lock);
  902 +
  903 + if (!page)
  904 + page = alloc_buddy_huge_page(h, nid);
  905 +
  906 + return page;
  907 +}
  908 +
  909 +/*
851 910 * Increase the hugetlb pool such that it can accomodate a reservation
852 911 * of size 'delta'.
853 912 */
854 913  
855 914  
... ... @@ -871,17 +930,14 @@
871 930 retry:
872 931 spin_unlock(&hugetlb_lock);
873 932 for (i = 0; i < needed; i++) {
874   - page = alloc_buddy_huge_page(h, NULL, 0);
875   - if (!page) {
  933 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
  934 + if (!page)
876 935 /*
877 936 * We were not able to allocate enough pages to
878 937 * satisfy the entire reservation so we free what
879 938 * we've allocated so far.
880 939 */
881   - spin_lock(&hugetlb_lock);
882   - needed = 0;
883 940 goto free;
884   - }
885 941  
886 942 list_add(&page->lru, &surplus_list);
887 943 }
888 944  
889 945  
890 946  
891 947  
892 948  
893 949  
... ... @@ -908,31 +964,31 @@
908 964 needed += allocated;
909 965 h->resv_huge_pages += delta;
910 966 ret = 0;
911   -free:
  967 +
  968 + spin_unlock(&hugetlb_lock);
912 969 /* Free the needed pages to the hugetlb pool */
913 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 971 if ((--needed) < 0)
915 972 break;
916 973 list_del(&page->lru);
  974 + /*
  975 + * This page is now managed by the hugetlb allocator and has
  976 + * no users -- drop the buddy allocator's reference.
  977 + */
  978 + put_page_testzero(page);
  979 + VM_BUG_ON(page_count(page));
917 980 enqueue_huge_page(h, page);
918 981 }
919 982  
920 983 /* Free unnecessary surplus pages to the buddy allocator */
  984 +free:
921 985 if (!list_empty(&surplus_list)) {
922   - spin_unlock(&hugetlb_lock);
923 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 987 list_del(&page->lru);
925   - /*
926   - * The page has a reference count of zero already, so
927   - * call free_huge_page directly instead of using
928   - * put_page. This must be done with hugetlb_lock
929   - * unlocked which is safe because free_huge_page takes
930   - * hugetlb_lock before deciding how to free the page.
931   - */
932   - free_huge_page(page);
  988 + put_page(page);
933 989 }
934   - spin_lock(&hugetlb_lock);
935 990 }
  991 + spin_lock(&hugetlb_lock);
936 992  
937 993 return ret;
938 994 }
939 995  
... ... @@ -1052,14 +1108,13 @@
1052 1108 spin_unlock(&hugetlb_lock);
1053 1109  
1054 1110 if (!page) {
1055   - page = alloc_buddy_huge_page(h, vma, addr);
  1111 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 1112 if (!page) {
1057 1113 hugetlb_put_quota(inode->i_mapping, chg);
1058 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 1115 }
1060 1116 }
1061 1117  
1062   - set_page_refcounted(page);
1063 1118 set_page_private(page, (unsigned long) mapping);
1064 1119  
1065 1120 vma_commit_reservation(h, vma, addr);
... ... @@ -2153,6 +2208,19 @@
2153 2208 return -ENOMEM;
2154 2209 }
2155 2210  
  2211 +static int is_hugetlb_entry_migration(pte_t pte)
  2212 +{
  2213 + swp_entry_t swp;
  2214 +
  2215 + if (huge_pte_none(pte) || pte_present(pte))
  2216 + return 0;
  2217 + swp = pte_to_swp_entry(pte);
  2218 + if (non_swap_entry(swp) && is_migration_entry(swp)) {
  2219 + return 1;
  2220 + } else
  2221 + return 0;
  2222 +}
  2223 +
2156 2224 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157 2225 {
2158 2226 swp_entry_t swp;
... ... @@ -2383,7 +2451,7 @@
2383 2451 if (unlikely(anon_vma_prepare(vma)))
2384 2452 return VM_FAULT_OOM;
2385 2453  
2386   - copy_huge_page(new_page, old_page, address, vma);
  2454 + copy_user_huge_page(new_page, old_page, address, vma);
2387 2455 __SetPageUptodate(new_page);
2388 2456  
2389 2457 /*
2390 2458  
... ... @@ -2515,22 +2583,20 @@
2515 2583 hugepage_add_new_anon_rmap(page, vma, address);
2516 2584 }
2517 2585 } else {
  2586 + /*
  2587 + * If memory error occurs between mmap() and fault, some process
  2588 + * don't have hwpoisoned swap entry for errored virtual address.
  2589 + * So we need to block hugepage fault by PG_hwpoison bit check.
  2590 + */
  2591 + if (unlikely(PageHWPoison(page))) {
  2592 + ret = VM_FAULT_HWPOISON |
  2593 + VM_FAULT_SET_HINDEX(h - hstates);
  2594 + goto backout_unlocked;
  2595 + }
2518 2596 page_dup_rmap(page);
2519 2597 }
2520 2598  
2521 2599 /*
2522   - * Since memory error handler replaces pte into hwpoison swap entry
2523   - * at the time of error handling, a process which reserved but not have
2524   - * the mapping to the error hugepage does not have hwpoison swap entry.
2525   - * So we need to block accesses from such a process by checking
2526   - * PG_hwpoison bit here.
2527   - */
2528   - if (unlikely(PageHWPoison(page))) {
2529   - ret = VM_FAULT_HWPOISON;
2530   - goto backout_unlocked;
2531   - }
2532   -
2533   - /*
2534 2600 * If we are going to COW a private mapping later, we examine the
2535 2601 * pending reservations for this page now. This will ensure that
2536 2602 * any allocations necessary to record that reservation occur outside
... ... @@ -2587,8 +2653,12 @@
2587 2653 ptep = huge_pte_offset(mm, address);
2588 2654 if (ptep) {
2589 2655 entry = huge_ptep_get(ptep);
2590   - if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2591   - return VM_FAULT_HWPOISON;
  2656 + if (unlikely(is_hugetlb_entry_migration(entry))) {
  2657 + migration_entry_wait(mm, (pmd_t *)ptep, address);
  2658 + return 0;
  2659 + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
  2660 + return VM_FAULT_HWPOISON_LARGE |
  2661 + VM_FAULT_SET_HINDEX(h - hstates);
2592 2662 }
2593 2663  
2594 2664 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2595 2665  
2596 2666  
2597 2667  
2598 2668  
2599 2669  
... ... @@ -2878,19 +2948,42 @@
2878 2948 hugetlb_acct_memory(h, -(chg - freed));
2879 2949 }
2880 2950  
  2951 +#ifdef CONFIG_MEMORY_FAILURE
  2952 +
  2953 +/* Should be called in hugetlb_lock */
  2954 +static int is_hugepage_on_freelist(struct page *hpage)
  2955 +{
  2956 + struct page *page;
  2957 + struct page *tmp;
  2958 + struct hstate *h = page_hstate(hpage);
  2959 + int nid = page_to_nid(hpage);
  2960 +
  2961 + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
  2962 + if (page == hpage)
  2963 + return 1;
  2964 + return 0;
  2965 +}
  2966 +
2881 2967 /*
2882 2968 * This function is called from memory failure code.
2883 2969 * Assume the caller holds page lock of the head page.
2884 2970 */
2885   -void __isolate_hwpoisoned_huge_page(struct page *hpage)
  2971 +int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886 2972 {
2887 2973 struct hstate *h = page_hstate(hpage);
2888 2974 int nid = page_to_nid(hpage);
  2975 + int ret = -EBUSY;
2889 2976  
2890 2977 spin_lock(&hugetlb_lock);
2891   - list_del(&hpage->lru);
2892   - h->free_huge_pages--;
2893   - h->free_huge_pages_node[nid]--;
  2978 + if (is_hugepage_on_freelist(hpage)) {
  2979 + list_del(&hpage->lru);
  2980 + set_page_refcounted(hpage);
  2981 + h->free_huge_pages--;
  2982 + h->free_huge_pages_node[nid]--;
  2983 + ret = 0;
  2984 + }
2894 2985 spin_unlock(&hugetlb_lock);
  2986 + return ret;
2895 2987 }
  2988 +#endif
mm/memory-failure.c
... ... @@ -697,11 +697,10
697 697 * Issues:
698 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
699 699 * To narrow down kill region to one page, we need to break up pmd.
700   - * - To support soft-offlining for hugepage, we need to support hugepage
701   - * migration.
702 700 */
703 701 static int me_huge_page(struct page *p, unsigned long pfn)
704 702 {
  703 + int res = 0;
705 704 struct page *hpage = compound_head(p);
706 705 /*
707 706 * We can safely recover from error on free or reserved (i.e.
... ... @@ -714,8 +713,9 @@
714 713 * so there is no race between isolation and mapping/unmapping.
715 714 */
716 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
717   - __isolate_hwpoisoned_huge_page(hpage);
718   - return RECOVERED;
  716 + res = dequeue_hwpoisoned_huge_page(hpage);
  717 + if (!res)
  718 + return RECOVERED;
719 719 }
720 720 return DELAYED;
721 721 }
... ... @@ -972,7 +972,10 @@
972 972 * We need/can do nothing about count=0 pages.
973 973 * 1) it's a free page, and therefore in safe hand:
974 974 * prep_new_page() will be the gate keeper.
975   - * 2) it's part of a non-compound high order page.
  975 + * 2) it's a free hugepage, which is also safe:
  976 + * an affected hugepage will be dequeued from hugepage freelist,
  977 + * so there's no concern about reusing it ever after.
  978 + * 3) it's part of a non-compound high order page.
976 979 * Implies some kernel user: cannot stop them from
977 980 * R/W the page; let's pray that the page has been
978 981 * used and will be freed some time later.
... ... @@ -984,6 +987,24 @@
984 987 if (is_free_buddy_page(p)) {
985 988 action_result(pfn, "free buddy", DELAYED);
986 989 return 0;
  990 + } else if (PageHuge(hpage)) {
  991 + /*
  992 + * Check "just unpoisoned", "filter hit", and
  993 + * "race with other subpage."
  994 + */
  995 + lock_page_nosync(hpage);
  996 + if (!PageHWPoison(hpage)
  997 + || (hwpoison_filter(p) && TestClearPageHWPoison(p))
  998 + || (p != hpage && TestSetPageHWPoison(hpage))) {
  999 + atomic_long_sub(nr_pages, &mce_bad_pages);
  1000 + return 0;
  1001 + }
  1002 + set_page_hwpoison_huge_page(hpage);
  1003 + res = dequeue_hwpoisoned_huge_page(hpage);
  1004 + action_result(pfn, "free huge",
  1005 + res ? IGNORED : DELAYED);
  1006 + unlock_page(hpage);
  1007 + return res;
987 1008 } else {
988 1009 action_result(pfn, "high order kernel", IGNORED);
989 1010 return -EBUSY;
... ... @@ -1145,6 +1166,16 @@
1145 1166 nr_pages = 1 << compound_order(page);
1146 1167  
1147 1168 if (!get_page_unless_zero(page)) {
  1169 + /*
  1170 + * Since HWPoisoned hugepage should have non-zero refcount,
  1171 + * race between memory failure and unpoison seems to happen.
  1172 + * In such case unpoison fails and memory failure runs
  1173 + * to the end.
  1174 + */
  1175 + if (PageHuge(page)) {
  1176 + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
  1177 + return 0;
  1178 + }
1148 1179 if (TestClearPageHWPoison(p))
1149 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1150 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1151 1182  
... ... @@ -1162,9 +1193,9 @@
1162 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1163 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1164 1195 freeit = 1;
  1196 + if (PageHuge(page))
  1197 + clear_page_hwpoison_huge_page(page);
1165 1198 }
1166   - if (PageHuge(p))
1167   - clear_page_hwpoison_huge_page(page);
1168 1199 unlock_page(page);
1169 1200  
1170 1201 put_page(page);
... ... @@ -1178,7 +1209,11 @@
1178 1209 static struct page *new_page(struct page *p, unsigned long private, int **x)
1179 1210 {
1180 1211 int nid = page_to_nid(p);
1181   - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
  1212 + if (PageHuge(p))
  1213 + return alloc_huge_page_node(page_hstate(compound_head(p)),
  1214 + nid);
  1215 + else
  1216 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1182 1217 }
1183 1218  
1184 1219 /*
1185 1220  
... ... @@ -1206,8 +1241,15 @@
1206 1241 * was free.
1207 1242 */
1208 1243 set_migratetype_isolate(p);
  1244 + /*
  1245 + * When the target page is a free hugepage, just remove it
  1246 + * from free hugepage list.
  1247 + */
1209 1248 if (!get_page_unless_zero(compound_head(p))) {
1210   - if (is_free_buddy_page(p)) {
  1249 + if (PageHuge(p)) {
  1250 + pr_info("get_any_page: %#lx free huge page\n", pfn);
  1251 + ret = dequeue_hwpoisoned_huge_page(compound_head(p));
  1252 + } else if (is_free_buddy_page(p)) {
1211 1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1212 1254 /* Set hwpoison bit while page is still isolated */
1213 1255 SetPageHWPoison(p);
... ... @@ -1226,6 +1268,45 @@
1226 1268 return ret;
1227 1269 }
1228 1270  
  1271 +static int soft_offline_huge_page(struct page *page, int flags)
  1272 +{
  1273 + int ret;
  1274 + unsigned long pfn = page_to_pfn(page);
  1275 + struct page *hpage = compound_head(page);
  1276 + LIST_HEAD(pagelist);
  1277 +
  1278 + ret = get_any_page(page, pfn, flags);
  1279 + if (ret < 0)
  1280 + return ret;
  1281 + if (ret == 0)
  1282 + goto done;
  1283 +
  1284 + if (PageHWPoison(hpage)) {
  1285 + put_page(hpage);
  1286 + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
  1287 + return -EBUSY;
  1288 + }
  1289 +
  1290 + /* Keep page count to indicate a given hugepage is isolated. */
  1291 +
  1292 + list_add(&hpage->lru, &pagelist);
  1293 + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
  1294 + if (ret) {
  1295 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
  1296 + pfn, ret, page->flags);
  1297 + if (ret > 0)
  1298 + ret = -EIO;
  1299 + return ret;
  1300 + }
  1301 +done:
  1302 + if (!PageHWPoison(hpage))
  1303 + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
  1304 + set_page_hwpoison_huge_page(hpage);
  1305 + dequeue_hwpoisoned_huge_page(hpage);
  1306 + /* keep elevated page count for bad page */
  1307 + return ret;
  1308 +}
  1309 +
1229 1310 /**
1230 1311 * soft_offline_page - Soft offline a page.
1231 1312 * @page: page to offline
... ... @@ -1252,6 +1333,9 @@
1252 1333 {
1253 1334 int ret;
1254 1335 unsigned long pfn = page_to_pfn(page);
  1336 +
  1337 + if (PageHuge(page))
  1338 + return soft_offline_huge_page(page, flags);
1255 1339  
1256 1340 ret = get_any_page(page, pfn, flags);
1257 1341 if (ret < 0)
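
With soft_offline_huge_page() added above, soft offlining now covers hugetlb pages as well. As a rough idea of how this path can be exercised from user space, here is a hedged sketch using madvise(MADV_SOFT_OFFLINE); it assumes CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE and reserved 2MB hugepages, and the fallback constant values are assumptions taken from the kernel UAPI in case the libc headers lack them:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* assumed value from the kernel UAPI headers */
#endif
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000     /* assumed x86 value */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024;   /* one 2MB hugepage, assuming x86_64 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");   /* needs hugepages reserved via vm.nr_hugepages */
		return 1;
	}
	memset(p, 0, len);   /* fault the hugepage in */

	/* Ask the kernel to migrate the data and retire the backing hugepage. */
	if (madvise(p, len, MADV_SOFT_OFFLINE) != 0)
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("hugepage at %p soft-offlined (contents migrated)\n", p);

	munmap(p, len);
	return 0;
}
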
mm/memory.c
... ... @@ -1450,7 +1450,8
1450 1450 if (ret & VM_FAULT_OOM)
1451 1451 return i ? i : -ENOMEM;
1452 1452 if (ret &
1453   - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
  1453 + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
  1454 + VM_FAULT_SIGBUS))
1454 1455 return i ? i : -EFAULT;
1455 1456 BUG();
1456 1457 }
mm/migrate.c
... ... @@ -32,6 +32,7
32 32 #include <linux/security.h>
33 33 #include <linux/memcontrol.h>
34 34 #include <linux/syscalls.h>
  35 +#include <linux/hugetlb.h>
35 36 #include <linux/gfp.h>
36 37  
37 38 #include "internal.h"
38 39  
39 40  
40 41  
41 42  
42 43  
... ... @@ -95,26 +96,34 @@
95 96 pte_t *ptep, pte;
96 97 spinlock_t *ptl;
97 98  
98   - pgd = pgd_offset(mm, addr);
99   - if (!pgd_present(*pgd))
100   - goto out;
  99 + if (unlikely(PageHuge(new))) {
  100 + ptep = huge_pte_offset(mm, addr);
  101 + if (!ptep)
  102 + goto out;
  103 + ptl = &mm->page_table_lock;
  104 + } else {
  105 + pgd = pgd_offset(mm, addr);
  106 + if (!pgd_present(*pgd))
  107 + goto out;
101 108  
102   - pud = pud_offset(pgd, addr);
103   - if (!pud_present(*pud))
104   - goto out;
  109 + pud = pud_offset(pgd, addr);
  110 + if (!pud_present(*pud))
  111 + goto out;
105 112  
106   - pmd = pmd_offset(pud, addr);
107   - if (!pmd_present(*pmd))
108   - goto out;
  113 + pmd = pmd_offset(pud, addr);
  114 + if (!pmd_present(*pmd))
  115 + goto out;
109 116  
110   - ptep = pte_offset_map(pmd, addr);
  117 + ptep = pte_offset_map(pmd, addr);
111 118  
112   - if (!is_swap_pte(*ptep)) {
113   - pte_unmap(ptep);
114   - goto out;
115   - }
  119 + if (!is_swap_pte(*ptep)) {
  120 + pte_unmap(ptep);
  121 + goto out;
  122 + }
116 123  
117   - ptl = pte_lockptr(mm, pmd);
  124 + ptl = pte_lockptr(mm, pmd);
  125 + }
  126 +
118 127 spin_lock(ptl);
119 128 pte = *ptep;
120 129 if (!is_swap_pte(pte))
121 130  
... ... @@ -130,10 +139,19 @@
130 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 140 if (is_write_migration_entry(entry))
132 141 pte = pte_mkwrite(pte);
  142 +#ifdef CONFIG_HUGETLB_PAGE
  143 + if (PageHuge(new))
  144 + pte = pte_mkhuge(pte);
  145 +#endif
133 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 147 set_pte_at(mm, addr, ptep, pte);
135 148  
136   - if (PageAnon(new))
  149 + if (PageHuge(new)) {
  150 + if (PageAnon(new))
  151 + hugepage_add_anon_rmap(new, vma, addr);
  152 + else
  153 + page_dup_rmap(new);
  154 + } else if (PageAnon(new))
137 155 page_add_anon_rmap(new, vma, addr);
138 156 else
139 157 page_add_file_rmap(new);
140 158  
141 159  
... ... @@ -276,11 +294,59 @@
276 294 }
277 295  
278 296 /*
  297 + * The expected number of remaining references is the same as that
  298 + * of migrate_page_move_mapping().
  299 + */
  300 +int migrate_huge_page_move_mapping(struct address_space *mapping,
  301 + struct page *newpage, struct page *page)
  302 +{
  303 + int expected_count;
  304 + void **pslot;
  305 +
  306 + if (!mapping) {
  307 + if (page_count(page) != 1)
  308 + return -EAGAIN;
  309 + return 0;
  310 + }
  311 +
  312 + spin_lock_irq(&mapping->tree_lock);
  313 +
  314 + pslot = radix_tree_lookup_slot(&mapping->page_tree,
  315 + page_index(page));
  316 +
  317 + expected_count = 2 + page_has_private(page);
  318 + if (page_count(page) != expected_count ||
  319 + (struct page *)radix_tree_deref_slot(pslot) != page) {
  320 + spin_unlock_irq(&mapping->tree_lock);
  321 + return -EAGAIN;
  322 + }
  323 +
  324 + if (!page_freeze_refs(page, expected_count)) {
  325 + spin_unlock_irq(&mapping->tree_lock);
  326 + return -EAGAIN;
  327 + }
  328 +
  329 + get_page(newpage);
  330 +
  331 + radix_tree_replace_slot(pslot, newpage);
  332 +
  333 + page_unfreeze_refs(page, expected_count);
  334 +
  335 + __put_page(page);
  336 +
  337 + spin_unlock_irq(&mapping->tree_lock);
  338 + return 0;
  339 +}
  340 +
  341 +/*
279 342 * Copy the page to its new location
280 343 */
281   -static void migrate_page_copy(struct page *newpage, struct page *page)
  344 +void migrate_page_copy(struct page *newpage, struct page *page)
282 345 {
283   - copy_highpage(newpage, page);
  346 + if (PageHuge(page))
  347 + copy_huge_page(newpage, page);
  348 + else
  349 + copy_highpage(newpage, page);
284 350  
285 351 if (PageError(page))
286 352 SetPageError(newpage);
... ... @@ -724,6 +790,92 @@
724 790 }
725 791  
726 792 /*
  793 + * Counterpart of unmap_and_move_page() for hugepage migration.
  794 + *
  795 + * This function doesn't wait the completion of hugepage I/O
  796 + * because there is no race between I/O and migration for hugepage.
  797 + * Note that currently hugepage I/O occurs only in direct I/O
  798 + * where no lock is held and PG_writeback is irrelevant,
  799 + * and writeback status of all subpages are counted in the reference
  800 + * count of the head page (i.e. if all subpages of a 2MB hugepage are
  801 + * under direct I/O, the reference of the head page is 512 and a bit more.)
  802 + * This means that when we try to migrate hugepage whose subpages are
  803 + * doing direct I/O, some references remain after try_to_unmap() and
  804 + * hugepage migration fails without data corruption.
  805 + *
  806 + * There is also no race when direct I/O is issued on the page under migration,
  807 + * because then pte is replaced with migration swap entry and direct I/O code
  808 + * will wait in the page fault for migration to complete.
  809 + */
  810 +static int unmap_and_move_huge_page(new_page_t get_new_page,
  811 + unsigned long private, struct page *hpage,
  812 + int force, int offlining)
  813 +{
  814 + int rc = 0;
  815 + int *result = NULL;
  816 + struct page *new_hpage = get_new_page(hpage, private, &result);
  817 + int rcu_locked = 0;
  818 + struct anon_vma *anon_vma = NULL;
  819 +
  820 + if (!new_hpage)
  821 + return -ENOMEM;
  822 +
  823 + rc = -EAGAIN;
  824 +
  825 + if (!trylock_page(hpage)) {
  826 + if (!force)
  827 + goto out;
  828 + lock_page(hpage);
  829 + }
  830 +
  831 + if (PageAnon(hpage)) {
  832 + rcu_read_lock();
  833 + rcu_locked = 1;
  834 +
  835 + if (page_mapped(hpage)) {
  836 + anon_vma = page_anon_vma(hpage);
  837 + atomic_inc(&anon_vma->external_refcount);
  838 + }
  839 + }
  840 +
  841 + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  842 +
  843 + if (!page_mapped(hpage))
  844 + rc = move_to_new_page(new_hpage, hpage, 1);
  845 +
  846 + if (rc)
  847 + remove_migration_ptes(hpage, hpage);
  848 +
  849 + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
  850 + &anon_vma->lock)) {
  851 + int empty = list_empty(&anon_vma->head);
  852 + spin_unlock(&anon_vma->lock);
  853 + if (empty)
  854 + anon_vma_free(anon_vma);
  855 + }
  856 +
  857 + if (rcu_locked)
  858 + rcu_read_unlock();
  859 +out:
  860 + unlock_page(hpage);
  861 +
  862 + if (rc != -EAGAIN) {
  863 + list_del(&hpage->lru);
  864 + put_page(hpage);
  865 + }
  866 +
  867 + put_page(new_hpage);
  868 +
  869 + if (result) {
  870 + if (rc)
  871 + *result = rc;
  872 + else
  873 + *result = page_to_nid(new_hpage);
  874 + }
  875 + return rc;
  876 +}
  877 +
  878 +/*
727 879 * migrate_pages
728 880 *
729 881 * The function takes one list of pages to migrate and a function
... ... @@ -781,6 +933,52 @@
781 933 current->flags &= ~PF_SWAPWRITE;
782 934  
783 935 putback_lru_pages(from);
  936 +
  937 + if (rc)
  938 + return rc;
  939 +
  940 + return nr_failed + retry;
  941 +}
  942 +
  943 +int migrate_huge_pages(struct list_head *from,
  944 + new_page_t get_new_page, unsigned long private, int offlining)
  945 +{
  946 + int retry = 1;
  947 + int nr_failed = 0;
  948 + int pass = 0;
  949 + struct page *page;
  950 + struct page *page2;
  951 + int rc;
  952 +
  953 + for (pass = 0; pass < 10 && retry; pass++) {
  954 + retry = 0;
  955 +
  956 + list_for_each_entry_safe(page, page2, from, lru) {
  957 + cond_resched();
  958 +
  959 + rc = unmap_and_move_huge_page(get_new_page,
  960 + private, page, pass > 2, offlining);
  961 +
  962 + switch(rc) {
  963 + case -ENOMEM:
  964 + goto out;
  965 + case -EAGAIN:
  966 + retry++;
  967 + break;
  968 + case 0:
  969 + break;
  970 + default:
  971 + /* Permanent failure */
  972 + nr_failed++;
  973 + break;
  974 + }
  975 + }
  976 + }
  977 + rc = 0;
  978 +out:
  979 +
  980 + list_for_each_entry_safe(page, page2, from, lru)
  981 + put_page(page);
784 982  
785 983 if (rc)
786 984 return rc;
mm/rmap.c
... ... @@ -780,10 +780,10
780 780 }
781 781  
782 782 /**
783   - * __page_set_anon_rmap - setup new anonymous rmap
784   - * @page: the page to add the mapping to
785   - * @vma: the vm area in which the mapping is added
786   - * @address: the user virtual address mapped
  783 + * __page_set_anon_rmap - set up new anonymous rmap
  784 + * @page: Page to add to rmap
  785 + * @vma: VM area to add page to.
  786 + * @address: User virtual address of the mapping
787 787 * @exclusive: the page is exclusively owned by the current process
788 788 */
789 789 static void __page_set_anon_rmap(struct page *page,
790 790  
791 791  
... ... @@ -793,25 +793,16 @@
793 793  
794 794 BUG_ON(!anon_vma);
795 795  
  796 + if (PageAnon(page))
  797 + return;
  798 +
796 799 /*
797 800 * If the page isn't exclusively mapped into this vma,
798 801 * we must use the _oldest_ possible anon_vma for the
799 802 * page mapping!
800 803 */
801   - if (!exclusive) {
802   - if (PageAnon(page))
803   - return;
  804 + if (!exclusive)
804 805 anon_vma = anon_vma->root;
805   - } else {
806   - /*
807   - * In this case, swapped-out-but-not-discarded swap-cache
808   - * is remapped. So, no need to update page->mapping here.
809   - * We convice anon_vma poitned by page->mapping is not obsolete
810   - * because vma->anon_vma is necessary to be a family of it.
811   - */
812   - if (PageAnon(page))
813   - return;
814   - }
815 806  
816 807 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 808 page->mapping = (struct address_space *) anon_vma;