Commit facb6011f3993947283fa15d039dacb4ad140230

Authored by Andi Kleen
Committed by Andi Kleen
1 parent 2326c467df

HWPOISON: Add soft page offline support

This is a simpler, gentler variant of memory_failure() for soft page
offlining controlled from user space.  It doesn't kill anything; it just
tries to invalidate the page, and if that doesn't work, migrates it
away.

This is useful for predictive failure analysis, where a page has
a high rate of corrected errors but hasn't actually gone bad yet.
Instead of waiting for that, it can be offlined early and avoided.

The offlining is controlled from sysfs; for symmetry, a new generic
entry point for hard page offlining is added as well.
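
For illustration only, not part of the patch: a minimal user-space sketch
of driving the new soft_offline_page file, assuming the hex physical-address
input format described in the ABI document below. The address value and the
soft_offline() helper name are made up for the example.

/*
 * Illustrative only: write a physical address (as a hex number, per the
 * ABI document) to the new sysfs file.  The address is made up; a real
 * tool would take it from the corrected-error reports it monitors.
 * Requires CAP_SYS_ADMIN.
 */
#include <stdio.h>

static int soft_offline(unsigned long long phys_addr)
{
        FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

        if (!f)
                return -1;
        fprintf(f, "%#llx\n", phys_addr);
        /* the store routine's error, if any, surfaces when the stream is flushed */
        return fclose(f) ? -1 : 0;
}

int main(void)
{
        if (soft_offline(0x12340000ULL))        /* made-up address */
                perror("soft_offline_page");
        return 0;
}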

We use the page isolation facility to prevent re-allocation
races. Normally this is only used by memory hotplug. To avoid
races with memory hotplug I am using lock_system_sleep().
This avoids the situation where memory hotplug is about
to isolate a page range and then hwpoison undoes that work.
This is a big hammer, but currently the simplest solution.

When the page is neither free nor on the LRU we try to free pages
from slab and other caches. The slab freeing is currently
quite dumb and does not try to focus on the specific slab
cache which might own the page. This could potentially be
improved later.

Thanks to Fengguang Wu and Haicheng Li for some fixes.

[Added fix from Andrew Morton to adapt to new migrate_pages prototype]
Signed-off-by: Andi Kleen <ak@linux.intel.com>

Showing 5 changed files with 297 additions and 7 deletions

Documentation/ABI/testing/sysfs-memory-page-offline
  1 +What: /sys/devices/system/memory/soft_offline_page
  2 +Date: Sep 2009
  3 +KernelVersion: 2.6.33
  4 +Contact: andi@firstfloor.org
  5 +Description:
  6 + Soft-offline the memory page containing the physical address
  7 + written into this file. Input is a hex number specifying the
  8 + physical address of the page. The kernel will then attempt
  9 + to soft-offline it, by moving the contents elsewhere or
  10 + dropping it if possible. The page will then be placed
  11 + on the bad page list and never be reused.
  12 +
  13 + The offlining is done at a kernel-specific granularity.
  14 + Normally it's the base page size of the kernel, but
  15 + this might change.
  16 +
  17 + The page must still be accessible, not poisoned. The
  18 + kernel will never kill anything for this, but rather
  19 + fail the offline. Return value is the size of the
  20 + input number, or an error when the offlining failed. Reading
  21 + the file is not allowed.
  22 +
  23 +What: /sys/devices/system/memory/hard_offline_page
  24 +Date: Sep 2009
  25 +KernelVersion: 2.6.33
  26 +Contact: andi@firstfloor.org
  27 +Description:
  28 + Hard-offline the memory page containing the physical
  29 + address written into this file. Input is a hex number
  30 + specifying the physical address of the page. The
  31 + kernel will then attempt to hard-offline the page, by
  32 + trying to drop the page, killing any owner, or
  33 + triggering IO errors if needed. Note this may kill
  34 + any processes owning the page. The kernel will avoid
  35 + accessing this page, assuming it has been poisoned by
  36 + the hardware.
  37 +
  38 + The offlining is done at a kernel-specific granularity.
  39 + Normally it's the base page size of the kernel, but
  40 + this might change.
  41 +
  42 + Return value is the size of the input number, or an error when
  43 + the offlining failed.
  44 + Reading the file is not allowed.
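
Not part of the commit, just for illustration: both files above take a raw
physical address, which a monitoring daemon normally already has from the
decoded corrected-error report. As a hedged sketch, one way to obtain such
an address for a page the caller owns is /proc/self/pagemap, whose 64-bit
entries carry the PFN in bits 0-54 and a "present" flag in bit 63; the
virt_to_phys_addr() helper below is made up and root privileges are assumed.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Look up the physical address backing a mapped virtual address. */
static long long virt_to_phys_addr(void *vaddr)
{
        long page_size = sysconf(_SC_PAGESIZE);
        uint64_t entry;
        int fd = open("/proc/self/pagemap", O_RDONLY);

        if (fd < 0)
                return -1;
        /* one 64-bit entry per virtual page */
        if (pread(fd, &entry, sizeof(entry),
                  (off_t)((uintptr_t)vaddr / page_size) * sizeof(entry))
            != sizeof(entry)) {
                close(fd);
                return -1;
        }
        close(fd);
        if (!(entry & (1ULL << 63)))            /* page not present */
                return -1;
        /* PFN is in bits 0-54; add the offset within the page back in. */
        return (long long)((entry & ((1ULL << 55) - 1)) * page_size
                           + (uintptr_t)vaddr % page_size);
}

int main(void)
{
        int x = 42;                             /* any mapped page will do */
        long long phys = virt_to_phys_addr(&x);

        if (phys < 0)
                fprintf(stderr, "pagemap lookup failed\n");
        else
                printf("%#llx\n", phys);        /* suitable input for the files above */
        return 0;
}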
drivers/base/memory.c
... ... @@ -341,7 +341,65 @@
341 341 }
342 342 #endif
343 343  
  344 +#ifdef CONFIG_MEMORY_FAILURE
344 345 /*
  346 + * Support for offlining pages of memory
  347 + */
  348 +
  349 +/* Soft offline a page */
  350 +static ssize_t
  351 +store_soft_offline_page(struct class *class, const char *buf, size_t count)
  352 +{
  353 + int ret;
  354 + u64 pfn;
  355 + if (!capable(CAP_SYS_ADMIN))
  356 + return -EPERM;
  357 + if (strict_strtoull(buf, 0, &pfn) < 0)
  358 + return -EINVAL;
  359 + pfn >>= PAGE_SHIFT;
  360 + if (!pfn_valid(pfn))
  361 + return -ENXIO;
  362 + ret = soft_offline_page(pfn_to_page(pfn), 0);
  363 + return ret == 0 ? count : ret;
  364 +}
  365 +
  366 +/* Forcibly offline a page, including killing processes. */
  367 +static ssize_t
  368 +store_hard_offline_page(struct class *class, const char *buf, size_t count)
  369 +{
  370 + int ret;
  371 + u64 pfn;
  372 + if (!capable(CAP_SYS_ADMIN))
  373 + return -EPERM;
  374 + if (strict_strtoull(buf, 0, &pfn) < 0)
  375 + return -EINVAL;
  376 + pfn >>= PAGE_SHIFT;
  377 + ret = __memory_failure(pfn, 0, 0);
  378 + return ret ? ret : count;
  379 +}
  380 +
  381 +static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
  382 +static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
  383 +
  384 +static __init int memory_fail_init(void)
  385 +{
  386 + int err;
  387 +
  388 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
  389 + &class_attr_soft_offline_page.attr);
  390 + if (!err)
  391 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
  392 + &class_attr_hard_offline_page.attr);
  393 + return err;
  394 +}
  395 +#else
  396 +static inline int memory_fail_init(void)
  397 +{
  398 + return 0;
  399 +}
  400 +#endif
  401 +
  402 +/*
345 403 * Note that phys_device is optional. It is here to allow for
346 404 * differentiation between which *physical* devices each
347 405 * section belongs to...
... ... @@ -471,6 +529,9 @@
471 529 }
472 530  
473 531 err = memory_probe_init();
  532 + if (!ret)
  533 + ret = err;
  534 + err = memory_fail_init();
474 535 if (!ret)
475 536 ret = err;
476 537 err = block_size_init();
include/linux/mm.h
... ... @@ -1339,8 +1339,9 @@
1339 1339 extern int unpoison_memory(unsigned long pfn);
1340 1340 extern int sysctl_memory_failure_early_kill;
1341 1341 extern int sysctl_memory_failure_recovery;
1342   -extern void shake_page(struct page *p);
  1342 +extern void shake_page(struct page *p, int access);
1343 1343 extern atomic_long_t mce_bad_pages;
  1344 +extern int soft_offline_page(struct page *page, int flags);
1344 1345  
1345 1346 #endif /* __KERNEL__ */
1346 1347 #endif /* _LINUX_MM_H */
mm/hwpoison-inject.c
... ... @@ -29,7 +29,7 @@
29 29 return 0;
30 30  
31 31 if (!PageLRU(p))
32   - shake_page(p);
  32 + shake_page(p, 0);
33 33 /*
34 34 * This implies unable to support non-LRU pages.
35 35 */
mm/memory-failure.c
... ... @@ -41,6 +41,9 @@
41 41 #include <linux/pagemap.h>
42 42 #include <linux/swap.h>
43 43 #include <linux/backing-dev.h>
  44 +#include <linux/migrate.h>
  45 +#include <linux/page-isolation.h>
  46 +#include <linux/suspend.h>
44 47 #include "internal.h"
45 48  
46 49 int sysctl_memory_failure_early_kill __read_mostly = 0;
... ... @@ -201,7 +204,7 @@
201 204 * When a unknown page type is encountered drain as many buffers as possible
202 205 * in the hope to turn the page into a LRU or free page, which we can handle.
203 206 */
204   -void shake_page(struct page *p)
  207 +void shake_page(struct page *p, int access)
205 208 {
206 209 if (!PageSlab(p)) {
207 210 lru_add_drain_all();
208 211  
209 212  
... ... @@ -211,11 +214,19 @@
211 214 if (PageLRU(p) || is_free_buddy_page(p))
212 215 return;
213 216 }
  217 +
214 218 /*
215   - * Could call shrink_slab here (which would also
216   - * shrink other caches). Unfortunately that might
217   - * also access the corrupted page, which could be fatal.
  219 + * Only call shrink_slab here (which would also
  220 + * shrink other caches) if access is not potentially fatal.
218 221 */
  222 + if (access) {
  223 + int nr;
  224 + do {
  225 + nr = shrink_slab(1000, GFP_KERNEL, 1000);
  226 + if (page_count(p) == 0)
  227 + break;
  228 + } while (nr > 10);
  229 + }
219 230 }
220 231 EXPORT_SYMBOL_GPL(shake_page);
221 232  
... ... @@ -949,7 +960,7 @@
949 960 * walked by the page reclaim code, however that's not a big loss.
950 961 */
951 962 if (!PageLRU(p))
952   - shake_page(p);
  963 + shake_page(p, 0);
953 964 if (!PageLRU(p)) {
954 965 /*
955 966 * shake_page could have turned it free.
... ... @@ -1099,4 +1110,177 @@
1099 1110 return 0;
1100 1111 }
1101 1112 EXPORT_SYMBOL(unpoison_memory);
  1113 +
  1114 +static struct page *new_page(struct page *p, unsigned long private, int **x)
  1115 +{
  1116 + return alloc_pages(GFP_HIGHUSER_MOVABLE, 0);
  1117 +}
  1118 +
  1119 +/*
  1120 + * Safely get reference count of an arbitrary page.
  1121 + * Returns 0 for a free page, -EIO for a zero refcount page
  1122 + * that is not free, and 1 for any other page type.
  1123 + * For 1 the page is returned with increased page count, otherwise not.
  1124 + */
  1125 +static int get_any_page(struct page *p, unsigned long pfn, int flags)
  1126 +{
  1127 + int ret;
  1128 +
  1129 + if (flags & MF_COUNT_INCREASED)
  1130 + return 1;
  1131 +
  1132 + /*
  1133 + * The lock_system_sleep prevents a race with memory hotplug,
  1134 + * because the isolation assumes there's only a single user.
  1135 + * This is a big hammer, a better fix would be nicer.
  1136 + */
  1137 + lock_system_sleep();
  1138 +
  1139 + /*
  1140 + * Isolate the page, so that it doesn't get reallocated if it
  1141 + * was free.
  1142 + */
  1143 + set_migratetype_isolate(p);
  1144 + if (!get_page_unless_zero(compound_head(p))) {
  1145 + if (is_free_buddy_page(p)) {
  1146 + pr_debug("get_any_page: %#lx free buddy page\n", pfn);
  1147 + /* Set hwpoison bit while page is still isolated */
  1148 + SetPageHWPoison(p);
  1149 + ret = 0;
  1150 + } else {
  1151 + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
  1152 + pfn, p->flags);
  1153 + ret = -EIO;
  1154 + }
  1155 + } else {
  1156 + /* Not a free page */
  1157 + ret = 1;
  1158 + }
  1159 + unset_migratetype_isolate(p);
  1160 + unlock_system_sleep();
  1161 + return ret;
  1162 +}
  1163 +
  1164 +/**
  1165 + * soft_offline_page - Soft offline a page.
  1166 + * @page: page to offline
  1167 + * @flags: flags. Same as memory_failure().
  1168 + *
  1169 + * Returns 0 on success, otherwise negated errno.
  1170 + *
  1171 + * Soft offline a page, by migration or invalidation,
  1172 + * without killing anything. This is for the case when
  1173 + * a page is not corrupted yet (so it's still valid to access),
  1174 + * but has had a number of corrected errors and is better taken
  1175 + * out.
  1176 + *
  1177 + * The actual policy on when to do that is maintained by
  1178 + * user space.
  1179 + *
  1180 + * This should never impact any application or cause data loss,
  1181 + * however it might take some time.
  1182 + *
  1183 + * This is not a 100% solution for all memory, but tries to be
  1184 + * ``good enough'' for the majority of memory.
  1185 + */
  1186 +int soft_offline_page(struct page *page, int flags)
  1187 +{
  1188 + int ret;
  1189 + unsigned long pfn = page_to_pfn(page);
  1190 +
  1191 + ret = get_any_page(page, pfn, flags);
  1192 + if (ret < 0)
  1193 + return ret;
  1194 + if (ret == 0)
  1195 + goto done;
  1196 +
  1197 + /*
  1198 + * Page cache page we can handle?
  1199 + */
  1200 + if (!PageLRU(page)) {
  1201 + /*
  1202 + * Try to free it.
  1203 + */
  1204 + put_page(page);
  1205 + shake_page(page, 1);
  1206 +
  1207 + /*
  1208 + * Did it turn free?
  1209 + */
  1210 + ret = get_any_page(page, pfn, 0);
  1211 + if (ret < 0)
  1212 + return ret;
  1213 + if (ret == 0)
  1214 + goto done;
  1215 + }
  1216 + if (!PageLRU(page)) {
  1217 + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
  1218 + pfn, page->flags);
  1219 + return -EIO;
  1220 + }
  1221 +
  1222 + lock_page(page);
  1223 + wait_on_page_writeback(page);
  1224 +
  1225 + /*
  1226 + * Synchronized using the page lock with memory_failure()
  1227 + */
  1228 + if (PageHWPoison(page)) {
  1229 + unlock_page(page);
  1230 + put_page(page);
  1231 + pr_debug("soft offline: %#lx page already poisoned\n", pfn);
  1232 + return -EBUSY;
  1233 + }
  1234 +
  1235 + /*
  1236 + * Try to invalidate first. This should work for
  1237 + * non dirty unmapped page cache pages.
  1238 + */
  1239 + ret = invalidate_inode_page(page);
  1240 + unlock_page(page);
  1241 +
  1242 + /*
  1243 + * Drop count because page migration doesn't like raised
  1244 + * counts. The page could get re-allocated, but if it becomes
  1245 + * LRU the isolation will just fail.
  1246 + * RED-PEN would be better to keep it isolated here, but we
  1247 + * would need to fix isolation locking first.
  1248 + */
  1249 + put_page(page);
  1250 + if (ret == 1) {
  1251 + ret = 0;
  1252 + pr_debug("soft_offline: %#lx: invalidated\n", pfn);
  1253 + goto done;
  1254 + }
  1255 +
  1256 + /*
  1257 + * Simple invalidation didn't work.
  1258 + * Try to migrate to a new page instead. migrate.c
  1259 + * handles a large number of cases for us.
  1260 + */
  1261 + ret = isolate_lru_page(page);
  1262 + if (!ret) {
  1263 + LIST_HEAD(pagelist);
  1264 +
  1265 + list_add(&page->lru, &pagelist);
  1266 + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
  1267 + if (ret) {
  1268 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
  1269 + pfn, ret, page->flags);
  1270 + if (ret > 0)
  1271 + ret = -EIO;
  1272 + }
  1273 + } else {
  1274 + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
  1275 + pfn, ret, page_count(page), page->flags);
  1276 + }
  1277 + if (ret)
  1278 + return ret;
  1279 +
  1280 +done:
  1281 + atomic_long_add(1, &mce_bad_pages);
  1282 + SetPageHWPoison(page);
  1283 + /* keep elevated page count for bad page */
  1284 + return ret;
  1285 +}