Commit d4220f987cf473c65a342ca69e3eb13dea919a49

Authored by Linus Torvalds

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (34 commits)
  HWPOISON: Remove stray phrase in a comment
  HWPOISON: Try to allocate migration page on the same node
  HWPOISON: Don't do early filtering if filter is disabled
  HWPOISON: Add a madvise() injector for soft page offlining
  HWPOISON: Add soft page offline support
  HWPOISON: Undefine short-hand macros after use to avoid namespace conflict
  HWPOISON: Use new shake_page in memory_failure
  HWPOISON: Use correct name for MADV_HWPOISON in documentation
  HWPOISON: mention HWPoison in Kconfig entry
  HWPOISON: Use get_user_page_fast in hwpoison madvise
  HWPOISON: add an interface to switch off/on all the page filters
  HWPOISON: add memory cgroup filter
  memcg: add accessor to mem_cgroup.css
  memcg: rename and export try_get_mem_cgroup_from_page()
  HWPOISON: add page flags filter
  mm: export stable page flags
  HWPOISON: limit hwpoison injector to known page types
  HWPOISON: add fs/device filters
  HWPOISON: return 0 to indicate success reliably
  HWPOISON: make semantics of IGNORED/DELAYED clear
  ...

Showing 19 changed files

Documentation/ABI/testing/sysfs-memory-page-offline
  1 +What: /sys/devices/system/memory/soft_offline_page
  2 +Date: Sep 2009
  3 +KernelVersion: 2.6.33
  4 +Contact: andi@firstfloor.org
  5 +Description:
  6 + Soft-offline the memory page containing the physical address
  7 + written into this file. Input is a hex number specifying the
  8 + physical address of the page. The kernel will then attempt
  9 + to soft-offline it, by moving the contents elsewhere or
  10 + dropping it if possible. The page will then be placed
  11 + on the bad page list and never be reused.
  12 +
  13 + The offlining is done in a kernel-specific granularity.
  14 + Normally it's the base page size of the kernel, but
  15 + this might change.
  16 +
  17 + The page must still be accessible, not poisoned. The
  18 + kernel will never kill anything for this, but rather
  19 + fail the offline. The return value is the size of the
  20 + input, or an error when the offlining failed. Reading
  21 + the file is not allowed.
  22 +
  23 +What: /sys/devices/system/memory/hard_offline_page
  24 +Date: Sep 2009
  25 +KernelVersion: 2.6.33
  26 +Contact: andi@firstfloor.org
  27 +Description:
  28 + Hard-offline the memory page containing the physical
  29 + address written into this file. Input is a hex number
  30 + specifying the physical address of the page. The
  31 + kernel will then attempt to hard-offline the page, by
  32 + trying to drop the page, killing any owner, or
  33 + triggering I/O errors if needed. Note this may kill
  34 + any processes owning the page. The kernel will avoid
  35 + accessing this page, assuming it has been poisoned by the
  36 + hardware.
  37 +
  38 + The offlining is done in a kernel-specific granularity.
  39 + Normally it's the base page size of the kernel, but
  40 + this might change.
  41 +
  42 + The return value is the size of the input, or an error when
  43 + the offlining failed.
  44 + Reading the file is not allowed.
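A quick usage sketch of the two new files (the physical address here is hypothetical; both files take a hex physical address and return the write size on success):

  # soft-offline the page containing physical address 0x2f54000
  echo 0x2f54000 > /sys/devices/system/memory/soft_offline_page
  # hard-offline the same page, possibly killing processes that own it
  echo 0x2f54000 > /sys/devices/system/memory/hard_offline_page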
Documentation/vm/hwpoison.txt
... ... @@ -92,16 +92,62 @@
92 92  
93 93 Testing:
94 94  
95   -madvise(MADV_POISON, ....)
  95 +madvise(MADV_HWPOISON, ....)
96 96 (as root)
97 97 Poison a page in the process for testing
98 98  
99 99  
100 100 hwpoison-inject module through debugfs
101   - /sys/debug/hwpoison/corrupt-pfn
102 101  
103   -Inject hwpoison fault at PFN echoed into this file
  102 +/sys/debug/hwpoison/
104 103  
  104 +corrupt-pfn
  105 +
  106 +Inject hwpoison fault at PFN echoed into this file. This does
  107 +some early filtering to avoid corrupting unintended pages in test suites.
  108 +
  109 +unpoison-pfn
  110 +
  111 +Software-unpoison the page at the PFN echoed into this file,
  112 +so that the page can be reused again.
  113 +This only works for Linux-injected failures, not for real
  114 +memory failures.
  115 +
  116 +Note that these injection interfaces are not stable and might change
  117 +between kernel versions.
  118 +
  119 +corrupt-filter-dev-major
  120 +corrupt-filter-dev-minor
  121 +
  122 +Only handle memory failures on pages associated with the file system
  123 +defined by the block device major/minor numbers. -1U is the wildcard value.
  124 +This should only be used for testing with artificial injection.
  125 +
  126 +corrupt-filter-memcg
  127 +
  128 +Limit injection to pages owned by the given memory cgroup (memcg),
  129 +specified by the inode number of the memcg.
  130 +
  131 +Example:
  132 + mkdir /cgroup/hwpoison
  133 +
  134 + usemem -m 100 -s 1000 &
  135 + echo `jobs -p` > /cgroup/hwpoison/tasks
  136 +
  137 + memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ')
  138 + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
  139 +
  140 + page-types -p `pidof init` --hwpoison # shall do nothing
  141 + page-types -p `pidof usemem` --hwpoison # poison its pages
  142 +
  143 +corrupt-filter-flags-mask
  144 +corrupt-filter-flags-value
  145 +
  146 +When specified, only poison pages if ((page_flags & mask) == value).
  147 +This allows stress testing of many kinds of pages. The page_flags
  148 +are the same as in /proc/kpageflags. The flag bits are defined in
  149 +include/linux/kernel-page-flags.h and documented in
  150 +Documentation/vm/pagemap.txt.
105 151  
106 152 Architecture specific MCE injector
107 153  
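For example, the flags filter above can restrict injection to dirty LRU pages, and unpoison-pfn undoes the injection afterwards (a sketch; the PFN is hypothetical, and KPF_DIRTY|KPF_LRU = 0x30 per kernel-page-flags.h):

  echo 1      > /debug/hwpoison/corrupt-filter-enable
  echo 0x30   > /debug/hwpoison/corrupt-filter-flags-mask
  echo 0x30   > /debug/hwpoison/corrupt-filter-flags-value
  echo 0x1234 > /debug/hwpoison/corrupt-pfn      # inject
  echo 0x1234 > /debug/hwpoison/unpoison-pfn     # undo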
Documentation/vm/page-types.c
1 1 /*
2 2 * page-types: Tool for querying page flags
3 3 *
  4 + * This program is free software; you can redistribute it and/or modify it
  5 + * under the terms of the GNU General Public License as published by the Free
  6 + * Software Foundation; version 2.
  7 + *
  8 + * This program is distributed in the hope that it will be useful, but WITHOUT
  9 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  11 + * more details.
  12 + *
  13 + * You should find a copy of v2 of the GNU General Public License somewhere on
  14 + * your Linux system; if not, write to the Free Software Foundation, Inc., 59
  15 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  16 + *
4 17 * Copyright (C) 2009 Intel corporation
5 18 *
6 19 * Authors: Wu Fengguang <fengguang.wu@intel.com>
7   - *
8   - * Released under the General Public License (GPL).
9 20 */
10 21  
11 22 #define _LARGEFILE64_SOURCE
MAINTAINERS
... ... @@ -2377,6 +2377,15 @@
2377 2377 S: Maintained
2378 2378 F: drivers/hwmon/hdaps.c
2379 2379  
  2380 +HWPOISON MEMORY FAILURE HANDLING
  2381 +M: Andi Kleen <andi@firstfloor.org>
  2382 +L: linux-mm@kvack.org
  2383 +L: linux-kernel@vger.kernel.org
  2384 +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison
  2385 +S: Maintained
  2386 +F: mm/memory-failure.c
  2387 +F: mm/hwpoison-inject.c
  2388 +
2380 2389 HYPERVISOR VIRTUAL CONSOLE DRIVER
2381 2390 L: linuxppc-dev@ozlabs.org
2382 2391 S: Odd Fixes
drivers/base/memory.c
... ... @@ -341,7 +341,65 @@
341 341 }
342 342 #endif
343 343  
  344 +#ifdef CONFIG_MEMORY_FAILURE
344 345 /*
  346 + * Support for offlining pages of memory
  347 + */
  348 +
  349 +/* Soft offline a page */
  350 +static ssize_t
  351 +store_soft_offline_page(struct class *class, const char *buf, size_t count)
  352 +{
  353 + int ret;
  354 + u64 pfn;
  355 + if (!capable(CAP_SYS_ADMIN))
  356 + return -EPERM;
  357 + if (strict_strtoull(buf, 0, &pfn) < 0)
  358 + return -EINVAL;
  359 + pfn >>= PAGE_SHIFT;
  360 + if (!pfn_valid(pfn))
  361 + return -ENXIO;
  362 + ret = soft_offline_page(pfn_to_page(pfn), 0);
  363 + return ret == 0 ? count : ret;
  364 +}
  365 +
  366 +/* Forcibly offline a page, including killing processes. */
  367 +static ssize_t
  368 +store_hard_offline_page(struct class *class, const char *buf, size_t count)
  369 +{
  370 + int ret;
  371 + u64 pfn;
  372 + if (!capable(CAP_SYS_ADMIN))
  373 + return -EPERM;
  374 + if (strict_strtoull(buf, 0, &pfn) < 0)
  375 + return -EINVAL;
  376 + pfn >>= PAGE_SHIFT;
  377 + ret = __memory_failure(pfn, 0, 0);
  378 + return ret ? ret : count;
  379 +}
  380 +
  381 +static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
  382 +static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
  383 +
  384 +static __init int memory_fail_init(void)
  385 +{
  386 + int err;
  387 +
  388 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
  389 + &class_attr_soft_offline_page.attr);
  390 + if (!err)
  391 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
  392 + &class_attr_hard_offline_page.attr);
  393 + return err;
  394 +}
  395 +#else
  396 +static inline int memory_fail_init(void)
  397 +{
  398 + return 0;
  399 +}
  400 +#endif
  401 +
  402 +/*
345 403 * Note that phys_device is optional. It is here to allow for
346 404 * differentiation between which *physical* devices each
347 405 * section belongs to...
... ... @@ -471,6 +529,9 @@
471 529 }
472 530  
473 531 err = memory_probe_init();
  532 + if (!ret)
  533 + ret = err;
  534 + err = memory_fail_init();
474 535 if (!ret)
475 536 ret = err;
476 537 err = block_size_init();
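Note that both store functions above parse a physical address and shift it right by PAGE_SHIFT to get a PFN, so a caller that starts from a PFN must shift left first. A sketch, assuming 4K pages (PAGE_SHIFT == 12) and a hypothetical PFN:

  pfn=0x2f54
  printf '0x%x' $(( pfn << 12 )) > /sys/devices/system/memory/soft_offline_page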
fs/proc/page.c
... ... @@ -8,6 +8,7 @@
8 8 #include <linux/proc_fs.h>
9 9 #include <linux/seq_file.h>
10 10 #include <linux/hugetlb.h>
  11 +#include <linux/kernel-page-flags.h>
11 12 #include <asm/uaccess.h>
12 13 #include "internal.h"
13 14  
14 15  
... ... @@ -71,52 +72,12 @@
71 72 * physical page flags.
72 73 */
73 74  
74   -/* These macros are used to decouple internal flags from exported ones */
75   -
76   -#define KPF_LOCKED 0
77   -#define KPF_ERROR 1
78   -#define KPF_REFERENCED 2
79   -#define KPF_UPTODATE 3
80   -#define KPF_DIRTY 4
81   -#define KPF_LRU 5
82   -#define KPF_ACTIVE 6
83   -#define KPF_SLAB 7
84   -#define KPF_WRITEBACK 8
85   -#define KPF_RECLAIM 9
86   -#define KPF_BUDDY 10
87   -
88   -/* 11-20: new additions in 2.6.31 */
89   -#define KPF_MMAP 11
90   -#define KPF_ANON 12
91   -#define KPF_SWAPCACHE 13
92   -#define KPF_SWAPBACKED 14
93   -#define KPF_COMPOUND_HEAD 15
94   -#define KPF_COMPOUND_TAIL 16
95   -#define KPF_HUGE 17
96   -#define KPF_UNEVICTABLE 18
97   -#define KPF_HWPOISON 19
98   -#define KPF_NOPAGE 20
99   -
100   -#define KPF_KSM 21
101   -
102   -/* kernel hacking assistances
103   - * WARNING: subject to change, never rely on them!
104   - */
105   -#define KPF_RESERVED 32
106   -#define KPF_MLOCKED 33
107   -#define KPF_MAPPEDTODISK 34
108   -#define KPF_PRIVATE 35
109   -#define KPF_PRIVATE_2 36
110   -#define KPF_OWNER_PRIVATE 37
111   -#define KPF_ARCH 38
112   -#define KPF_UNCACHED 39
113   -
114 75 static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
115 76 {
116 77 return ((kflags >> kbit) & 1) << ubit;
117 78 }
118 79  
119   -static u64 get_uflags(struct page *page)
  80 +u64 stable_page_flags(struct page *page)
120 81 {
121 82 u64 k;
122 83 u64 u;
... ... @@ -219,7 +180,7 @@
219 180 else
220 181 ppage = NULL;
221 182  
222   - if (put_user(get_uflags(ppage), out)) {
  183 + if (put_user(stable_page_flags(ppage), out)) {
223 184 ret = -EFAULT;
224 185 break;
225 186 }
include/asm-generic/mman-common.h
... ... @@ -40,6 +40,7 @@
40 40 #define MADV_DONTFORK 10 /* don't inherit across fork */
41 41 #define MADV_DOFORK 11 /* do inherit across fork */
42 42 #define MADV_HWPOISON 100 /* poison a page for testing */
  43 +#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
43 44  
44 45 #define MADV_MERGEABLE 12 /* KSM may merge identical pages */
45 46 #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
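A minimal user-space test sketch for the new injector (assumes root with CAP_SYS_ADMIN, a kernel with CONFIG_MEMORY_FAILURE, and a libc that does not define the constant yet, hence the fallback define):

  #include <stdio.h>
  #include <unistd.h>
  #include <sys/mman.h>

  #ifndef MADV_SOFT_OFFLINE
  #define MADV_SOFT_OFFLINE 101	/* from the hunk above */
  #endif

  int main(void)
  {
  	long sz = sysconf(_SC_PAGESIZE);
  	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
  		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  	if (p == MAP_FAILED) {
  		perror("mmap");
  		return 1;
  	}
  	p[0] = 1;	/* fault the page in */
  	/* ask the kernel to migrate the backing page away */
  	if (madvise(p, sz, MADV_SOFT_OFFLINE))
  		perror("madvise");
  	return 0;
  }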
include/linux/kernel-page-flags.h
  1 +#ifndef LINUX_KERNEL_PAGE_FLAGS_H
  2 +#define LINUX_KERNEL_PAGE_FLAGS_H
  3 +
  4 +/*
  5 + * Stable page flag bits exported to user space
  6 + */
  7 +
  8 +#define KPF_LOCKED 0
  9 +#define KPF_ERROR 1
  10 +#define KPF_REFERENCED 2
  11 +#define KPF_UPTODATE 3
  12 +#define KPF_DIRTY 4
  13 +#define KPF_LRU 5
  14 +#define KPF_ACTIVE 6
  15 +#define KPF_SLAB 7
  16 +#define KPF_WRITEBACK 8
  17 +#define KPF_RECLAIM 9
  18 +#define KPF_BUDDY 10
  19 +
  20 +/* 11-20: new additions in 2.6.31 */
  21 +#define KPF_MMAP 11
  22 +#define KPF_ANON 12
  23 +#define KPF_SWAPCACHE 13
  24 +#define KPF_SWAPBACKED 14
  25 +#define KPF_COMPOUND_HEAD 15
  26 +#define KPF_COMPOUND_TAIL 16
  27 +#define KPF_HUGE 17
  28 +#define KPF_UNEVICTABLE 18
  29 +#define KPF_HWPOISON 19
  30 +#define KPF_NOPAGE 20
  31 +
  32 +#define KPF_KSM 21
  33 +
  34 +/* kernel hacking assistances
  35 + * WARNING: subject to change, never rely on them!
  36 + */
  37 +#define KPF_RESERVED 32
  38 +#define KPF_MLOCKED 33
  39 +#define KPF_MAPPEDTODISK 34
  40 +#define KPF_PRIVATE 35
  41 +#define KPF_PRIVATE_2 36
  42 +#define KPF_OWNER_PRIVATE 37
  43 +#define KPF_ARCH 38
  44 +#define KPF_UNCACHED 39
  45 +
  46 +#endif /* LINUX_KERNEL_PAGE_FLAGS_H */
include/linux/memcontrol.h
... ... @@ -73,6 +73,7 @@
73 73 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
74 74 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
75 75  
  76 +extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
76 77 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
77 78  
78 79 static inline
... ... @@ -85,6 +86,8 @@
85 86 return cgroup == mem;
86 87 }
87 88  
  89 +extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem);
  90 +
88 91 extern int
89 92 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr);
90 93 extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
... ... @@ -202,6 +205,11 @@
202 205 {
203 206 }
204 207  
  208 +static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
  209 +{
  210 + return NULL;
  211 +}
  212 +
205 213 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem)
206 214 {
207 215 return 1;
... ... @@ -211,6 +219,11 @@
211 219 const struct mem_cgroup *mem)
212 220 {
213 221 return 1;
  222 +}
  223 +
  224 +static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
  225 +{
  226 + return NULL;
214 227 }
215 228  
216 229 static inline int
include/linux/mm.h
... ... @@ -1331,11 +1331,17 @@
1331 1331 size_t size);
1332 1332 extern void refund_locked_memory(struct mm_struct *mm, size_t size);
1333 1333  
  1334 +enum mf_flags {
  1335 + MF_COUNT_INCREASED = 1 << 0,
  1336 +};
1334 1337 extern void memory_failure(unsigned long pfn, int trapno);
1335   -extern int __memory_failure(unsigned long pfn, int trapno, int ref);
  1338 +extern int __memory_failure(unsigned long pfn, int trapno, int flags);
  1339 +extern int unpoison_memory(unsigned long pfn);
1336 1340 extern int sysctl_memory_failure_early_kill;
1337 1341 extern int sysctl_memory_failure_recovery;
  1342 +extern void shake_page(struct page *p, int access);
1338 1343 extern atomic_long_t mce_bad_pages;
  1344 +extern int soft_offline_page(struct page *page, int flags);
1339 1345  
1340 1346 #endif /* __KERNEL__ */
1341 1347 #endif /* _LINUX_MM_H */
include/linux/page-flags.h
... ... @@ -275,12 +275,14 @@
275 275  
276 276 #ifdef CONFIG_MEMORY_FAILURE
277 277 PAGEFLAG(HWPoison, hwpoison)
278   -TESTSETFLAG(HWPoison, hwpoison)
  278 +TESTSCFLAG(HWPoison, hwpoison)
279 279 #define __PG_HWPOISON (1UL << PG_hwpoison)
280 280 #else
281 281 PAGEFLAG_FALSE(HWPoison)
282 282 #define __PG_HWPOISON 0
283 283 #endif
  284 +
  285 +u64 stable_page_flags(struct page *page);
284 286  
285 287 static inline int PageUptodate(struct page *page)
286 288 {
mm/Kconfig
... ... @@ -251,8 +251,9 @@
251 251 special hardware support and typically ECC memory.
252 252  
253 253 config HWPOISON_INJECT
254   - tristate "Poison pages injector"
  254 + tristate "HWPoison pages injector"
255 255 depends on MEMORY_FAILURE && DEBUG_KERNEL
  256 + select PROC_PAGE_MONITOR
256 257  
257 258 config NOMMU_INITIAL_TRIM_EXCESS
258 259 int "Turn on mmap() excess space trimming before booting"
mm/hwpoison-inject.c
... ... @@ -3,18 +3,68 @@
3 3 #include <linux/debugfs.h>
4 4 #include <linux/kernel.h>
5 5 #include <linux/mm.h>
  6 +#include <linux/swap.h>
  7 +#include <linux/pagemap.h>
  8 +#include "internal.h"
6 9  
7   -static struct dentry *hwpoison_dir, *corrupt_pfn;
  10 +static struct dentry *hwpoison_dir;
8 11  
9 12 static int hwpoison_inject(void *data, u64 val)
10 13 {
  14 + unsigned long pfn = val;
  15 + struct page *p;
  16 + int err;
  17 +
11 18 if (!capable(CAP_SYS_ADMIN))
12 19 return -EPERM;
13   - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14   - return __memory_failure(val, 18, 0);
  20 +
  21 + if (!hwpoison_filter_enable)
  22 + goto inject;
  23 + if (!pfn_valid(pfn))
  24 + return -ENXIO;
  25 +
  26 + p = pfn_to_page(pfn);
  27 + /*
   28 + * This implies we cannot support free buddy pages.
  29 + */
  30 + if (!get_page_unless_zero(p))
  31 + return 0;
  32 +
  33 + if (!PageLRU(p))
  34 + shake_page(p, 0);
  35 + /*
   36 + * This implies we cannot support non-LRU pages.
  37 + */
  38 + if (!PageLRU(p))
  39 + return 0;
  40 +
  41 + /*
  42 + * do a racy check with elevated page count, to make sure PG_hwpoison
  43 + * will only be set for the targeted owner (or on a free page).
  44 + * We temporarily take page lock for try_get_mem_cgroup_from_page().
  45 + * __memory_failure() will redo the check reliably inside page lock.
  46 + */
  47 + lock_page(p);
  48 + err = hwpoison_filter(p);
  49 + unlock_page(p);
  50 + if (err)
  51 + return 0;
  52 +
  53 +inject:
  54 + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
  55 + return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
15 56 }
16 57  
  58 +static int hwpoison_unpoison(void *data, u64 val)
  59 +{
  60 + if (!capable(CAP_SYS_ADMIN))
  61 + return -EPERM;
  62 +
  63 + return unpoison_memory(val);
  64 +}
  65 +
17 66 DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
  67 +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68  
19 69 static void pfn_inject_exit(void)
20 70 {
21 71  
22 72  
23 73  
... ... @@ -24,16 +74,63 @@
24 74  
25 75 static int pfn_inject_init(void)
26 76 {
  77 + struct dentry *dentry;
  78 +
27 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 80 if (hwpoison_dir == NULL)
29 81 return -ENOMEM;
30   - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
  82 +
  83 + /*
  84 + * Note that the below poison/unpoison interfaces do not involve
  85 + * hardware status change, hence do not require hardware support.
   86 + * They are mainly for testing hwpoison at the software level.
  87 + */
  88 + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 89 NULL, &hwpoison_fops);
32   - if (corrupt_pfn == NULL) {
33   - pfn_inject_exit();
34   - return -ENOMEM;
35   - }
  90 + if (!dentry)
  91 + goto fail;
  92 +
  93 + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
  94 + NULL, &unpoison_fops);
  95 + if (!dentry)
  96 + goto fail;
  97 +
  98 + dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
  99 + hwpoison_dir, &hwpoison_filter_enable);
  100 + if (!dentry)
  101 + goto fail;
  102 +
  103 + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
  104 + hwpoison_dir, &hwpoison_filter_dev_major);
  105 + if (!dentry)
  106 + goto fail;
  107 +
  108 + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
  109 + hwpoison_dir, &hwpoison_filter_dev_minor);
  110 + if (!dentry)
  111 + goto fail;
  112 +
  113 + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
  114 + hwpoison_dir, &hwpoison_filter_flags_mask);
  115 + if (!dentry)
  116 + goto fail;
  117 +
  118 + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
  119 + hwpoison_dir, &hwpoison_filter_flags_value);
  120 + if (!dentry)
  121 + goto fail;
  122 +
  123 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  124 + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
  125 + hwpoison_dir, &hwpoison_filter_memcg);
  126 + if (!dentry)
  127 + goto fail;
  128 +#endif
  129 +
36 130 return 0;
  131 +fail:
  132 + pfn_inject_exit();
  133 + return -ENOMEM;
37 134 }
38 135  
39 136 module_init(pfn_inject_init);
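With CONFIG_HWPOISON_INJECT=m, loading the module populates the debugfs tree sketched below (paths assume debugfs is mounted at /sys/kernel/debug; the memcg filter file only appears with CONFIG_CGROUP_MEM_RES_CTLR_SWAP):

  modprobe hwpoison-inject
  ls /sys/kernel/debug/hwpoison
  # corrupt-pfn, unpoison-pfn, corrupt-filter-enable,
  # corrupt-filter-dev-major, corrupt-filter-dev-minor,
  # corrupt-filter-flags-mask, corrupt-filter-flags-value,
  # corrupt-filter-memcg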
mm/internal.h
... ... @@ -50,6 +50,9 @@
50 50 */
51 51 extern void __free_pages_bootmem(struct page *page, unsigned int order);
52 52 extern void prep_compound_page(struct page *page, unsigned long order);
  53 +#ifdef CONFIG_MEMORY_FAILURE
  54 +extern bool is_free_buddy_page(struct page *page);
  55 +#endif
53 56  
54 57  
55 58 /*
... ... @@ -247,4 +250,13 @@
247 250 #define ZONE_RECLAIM_SOME 0
248 251 #define ZONE_RECLAIM_SUCCESS 1
249 252 #endif
  253 +
  254 +extern int hwpoison_filter(struct page *p);
  255 +
  256 +extern u32 hwpoison_filter_dev_major;
  257 +extern u32 hwpoison_filter_dev_minor;
  258 +extern u64 hwpoison_filter_flags_mask;
  259 +extern u64 hwpoison_filter_flags_value;
  260 +extern u64 hwpoison_filter_memcg;
  261 +extern u32 hwpoison_filter_enable;
mm/madvise.c
... ... @@ -9,6 +9,7 @@
9 9 #include <linux/pagemap.h>
10 10 #include <linux/syscalls.h>
11 11 #include <linux/mempolicy.h>
  12 +#include <linux/page-isolation.h>
12 13 #include <linux/hugetlb.h>
13 14 #include <linux/sched.h>
14 15 #include <linux/ksm.h>
... ... @@ -222,7 +223,7 @@
222 223 /*
223 224 * Error injection support for memory error handling.
224 225 */
225   -static int madvise_hwpoison(unsigned long start, unsigned long end)
  226 +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226 227 {
227 228 int ret = 0;
228 229  
229 230  
230 231  
... ... @@ -230,15 +231,21 @@
230 231 return -EPERM;
231 232 for (; start < end; start += PAGE_SIZE) {
232 233 struct page *p;
233   - int ret = get_user_pages(current, current->mm, start, 1,
234   - 0, 0, &p, NULL);
  234 + int ret = get_user_pages_fast(start, 1, 0, &p);
235 235 if (ret != 1)
236 236 return ret;
  237 + if (bhv == MADV_SOFT_OFFLINE) {
  238 + printk(KERN_INFO "Soft offlining page %lx at %lx\n",
  239 + page_to_pfn(p), start);
  240 + ret = soft_offline_page(p, MF_COUNT_INCREASED);
  241 + if (ret)
  242 + break;
  243 + continue;
  244 + }
237 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 246 page_to_pfn(p), start);
239 247 /* Ignore return value for now */
240   - __memory_failure(page_to_pfn(p), 0, 1);
241   - put_page(p);
  248 + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
242 249 }
243 250 return ret;
244 251 }
... ... @@ -335,8 +342,8 @@
335 342 size_t len;
336 343  
337 344 #ifdef CONFIG_MEMORY_FAILURE
338   - if (behavior == MADV_HWPOISON)
339   - return madvise_hwpoison(start, start+len_in);
  345 + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  346 + return madvise_hwpoison(behavior, start, start+len_in);
340 347 #endif
341 348 if (!madvise_behavior_valid(behavior))
342 349 return error;
mm/memcontrol.c
... ... @@ -283,6 +283,11 @@
283 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
284 284 }
285 285  
  286 +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
  287 +{
  288 + return &mem->css;
  289 +}
  290 +
286 291 static struct mem_cgroup_per_zone *
287 292 page_cgroup_zoneinfo(struct page_cgroup *pc)
288 293 {
289 294  
290 295  
291 296  
... ... @@ -1536,25 +1541,22 @@
1536 1541 return container_of(css, struct mem_cgroup, css);
1537 1542 }
1538 1543  
1539   -static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
  1544 +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1540 1545 {
1541   - struct mem_cgroup *mem;
  1546 + struct mem_cgroup *mem = NULL;
1542 1547 struct page_cgroup *pc;
1543 1548 unsigned short id;
1544 1549 swp_entry_t ent;
1545 1550  
1546 1551 VM_BUG_ON(!PageLocked(page));
1547 1552  
1548   - if (!PageSwapCache(page))
1549   - return NULL;
1550   -
1551 1553 pc = lookup_page_cgroup(page);
1552 1554 lock_page_cgroup(pc);
1553 1555 if (PageCgroupUsed(pc)) {
1554 1556 mem = pc->mem_cgroup;
1555 1557 if (mem && !css_tryget(&mem->css))
1556 1558 mem = NULL;
1557   - } else {
  1559 + } else if (PageSwapCache(page)) {
1558 1560 ent.val = page_private(page);
1559 1561 id = lookup_swap_cgroup(ent);
1560 1562 rcu_read_lock();
... ... @@ -1874,7 +1876,7 @@
1874 1876 */
1875 1877 if (!PageSwapCache(page))
1876 1878 goto charge_cur_mm;
1877   - mem = try_get_mem_cgroup_from_swapcache(page);
  1879 + mem = try_get_mem_cgroup_from_page(page);
1878 1880 if (!mem)
1879 1881 goto charge_cur_mm;
1880 1882 *ptr = mem;
mm/memory-failure.c
... ... @@ -34,12 +34,16 @@
34 34 #include <linux/kernel.h>
35 35 #include <linux/mm.h>
36 36 #include <linux/page-flags.h>
  37 +#include <linux/kernel-page-flags.h>
37 38 #include <linux/sched.h>
38 39 #include <linux/ksm.h>
39 40 #include <linux/rmap.h>
40 41 #include <linux/pagemap.h>
41 42 #include <linux/swap.h>
42 43 #include <linux/backing-dev.h>
  44 +#include <linux/migrate.h>
  45 +#include <linux/page-isolation.h>
  46 +#include <linux/suspend.h>
43 47 #include "internal.h"
44 48  
45 49 int sysctl_memory_failure_early_kill __read_mostly = 0;
46 50  
... ... @@ -48,7 +52,121 @@
48 52  
49 53 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 54  
  55 +u32 hwpoison_filter_enable = 0;
  56 +u32 hwpoison_filter_dev_major = ~0U;
  57 +u32 hwpoison_filter_dev_minor = ~0U;
  58 +u64 hwpoison_filter_flags_mask;
  59 +u64 hwpoison_filter_flags_value;
  60 +EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
  61 +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
  62 +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
  63 +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
  64 +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
  65 +
  66 +static int hwpoison_filter_dev(struct page *p)
  67 +{
  68 + struct address_space *mapping;
  69 + dev_t dev;
  70 +
  71 + if (hwpoison_filter_dev_major == ~0U &&
  72 + hwpoison_filter_dev_minor == ~0U)
  73 + return 0;
  74 +
  75 + /*
  76 + * page_mapping() does not accept slab page
  77 + */
  78 + if (PageSlab(p))
  79 + return -EINVAL;
  80 +
  81 + mapping = page_mapping(p);
  82 + if (mapping == NULL || mapping->host == NULL)
  83 + return -EINVAL;
  84 +
  85 + dev = mapping->host->i_sb->s_dev;
  86 + if (hwpoison_filter_dev_major != ~0U &&
  87 + hwpoison_filter_dev_major != MAJOR(dev))
  88 + return -EINVAL;
  89 + if (hwpoison_filter_dev_minor != ~0U &&
  90 + hwpoison_filter_dev_minor != MINOR(dev))
  91 + return -EINVAL;
  92 +
  93 + return 0;
  94 +}
  95 +
  96 +static int hwpoison_filter_flags(struct page *p)
  97 +{
  98 + if (!hwpoison_filter_flags_mask)
  99 + return 0;
  100 +
  101 + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
  102 + hwpoison_filter_flags_value)
  103 + return 0;
  104 + else
  105 + return -EINVAL;
  106 +}
  107 +
51 108 /*
  109 + * This allows stress tests to limit test scope to a collection of tasks
  110 + * by putting them under some memcg. This prevents killing unrelated/important
  111 + * processes such as /sbin/init. Note that the target task may share clean
  112 + * pages with init (eg. libc text), which is harmless. If the target task
   113 + * shares _dirty_ pages with another task B, the test scheme must make sure B
   114 + * is also included in the memcg. Lastly, due to race conditions this filter
  115 + * can only guarantee that the page either belongs to the memcg tasks, or is
  116 + * a freed page.
  117 + */
  118 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  119 +u64 hwpoison_filter_memcg;
  120 +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
  121 +static int hwpoison_filter_task(struct page *p)
  122 +{
  123 + struct mem_cgroup *mem;
  124 + struct cgroup_subsys_state *css;
  125 + unsigned long ino;
  126 +
  127 + if (!hwpoison_filter_memcg)
  128 + return 0;
  129 +
  130 + mem = try_get_mem_cgroup_from_page(p);
  131 + if (!mem)
  132 + return -EINVAL;
  133 +
  134 + css = mem_cgroup_css(mem);
  135 + /* root_mem_cgroup has NULL dentries */
  136 + if (!css->cgroup->dentry)
  137 + return -EINVAL;
  138 +
  139 + ino = css->cgroup->dentry->d_inode->i_ino;
  140 + css_put(css);
  141 +
  142 + if (ino != hwpoison_filter_memcg)
  143 + return -EINVAL;
  144 +
  145 + return 0;
  146 +}
  147 +#else
  148 +static int hwpoison_filter_task(struct page *p) { return 0; }
  149 +#endif
  150 +
  151 +int hwpoison_filter(struct page *p)
  152 +{
  153 + if (!hwpoison_filter_enable)
  154 + return 0;
  155 +
  156 + if (hwpoison_filter_dev(p))
  157 + return -EINVAL;
  158 +
  159 + if (hwpoison_filter_flags(p))
  160 + return -EINVAL;
  161 +
  162 + if (hwpoison_filter_task(p))
  163 + return -EINVAL;
  164 +
  165 + return 0;
  166 +}
  167 +EXPORT_SYMBOL_GPL(hwpoison_filter);
  168 +
  169 +/*
52 170 * Send all the processes who have the page mapped an ``action optional''
53 171 * signal.
54 172 */
... ... @@ -83,6 +201,36 @@
83 201 }
84 202  
85 203 /*
   204 + * When an unknown page type is encountered, drain as many buffers as possible
   205 + * in the hope of turning the page into an LRU or free page, which we can handle.
  206 + */
  207 +void shake_page(struct page *p, int access)
  208 +{
  209 + if (!PageSlab(p)) {
  210 + lru_add_drain_all();
  211 + if (PageLRU(p))
  212 + return;
  213 + drain_all_pages();
  214 + if (PageLRU(p) || is_free_buddy_page(p))
  215 + return;
  216 + }
  217 +
  218 + /*
   219 + * Only call shrink_slab here (which would also
  220 + * shrink other caches) if access is not potentially fatal.
  221 + */
  222 + if (access) {
  223 + int nr;
  224 + do {
  225 + nr = shrink_slab(1000, GFP_KERNEL, 1000);
  226 + if (page_count(p) == 0)
  227 + break;
  228 + } while (nr > 10);
  229 + }
  230 +}
  231 +EXPORT_SYMBOL_GPL(shake_page);
  232 +
  233 +/*
86 234 * Kill all processes that have a poisoned page mapped and then isolate
87 235 * the page.
88 236 *
... ... @@ -177,7 +325,6 @@
177 325 * In case something went wrong with munmapping
178 326 * make sure the process doesn't catch the
179 327 * signal and then access the memory. Just kill it.
180   - * the signal handlers
181 328 */
182 329 if (fail || tk->addr_valid == 0) {
183 330 printk(KERN_ERR
184 331  
185 332  
186 333  
187 334  
188 335  
189 336  
190 337  
191 338  
... ... @@ -314,33 +461,49 @@
314 461 */
315 462  
316 463 enum outcome {
317   - FAILED, /* Error handling failed */
  464 + IGNORED, /* Error: cannot be handled */
  465 + FAILED, /* Error: handling failed */
318 466 DELAYED, /* Will be handled later */
319   - IGNORED, /* Error safely ignored */
320 467 RECOVERED, /* Successfully recovered */
321 468 };
322 469  
323 470 static const char *action_name[] = {
  471 + [IGNORED] = "Ignored",
324 472 [FAILED] = "Failed",
325 473 [DELAYED] = "Delayed",
326   - [IGNORED] = "Ignored",
327 474 [RECOVERED] = "Recovered",
328 475 };
329 476  
330 477 /*
331   - * Error hit kernel page.
332   - * Do nothing, try to be lucky and not touch this instead. For a few cases we
333   - * could be more sophisticated.
  478 + * XXX: It is possible that a page is isolated from LRU cache,
   479 + * and then kept in the swap cache or fails to be removed from the page cache.
  480 + * The page count will stop it from being freed by unpoison.
  481 + * Stress tests should be aware of this memory leak problem.
334 482 */
335   -static int me_kernel(struct page *p, unsigned long pfn)
  483 +static int delete_from_lru_cache(struct page *p)
336 484 {
337   - return DELAYED;
  485 + if (!isolate_lru_page(p)) {
  486 + /*
  487 + * Clear sensible page flags, so that the buddy system won't
  488 + * complain when the page is unpoison-and-freed.
  489 + */
  490 + ClearPageActive(p);
  491 + ClearPageUnevictable(p);
  492 + /*
  493 + * drop the page count elevated by isolate_lru_page()
  494 + */
  495 + page_cache_release(p);
  496 + return 0;
  497 + }
  498 + return -EIO;
338 499 }
339 500  
340 501 /*
341   - * Already poisoned page.
  502 + * Error hit kernel page.
  503 + * Do nothing, try to be lucky and not touch this instead. For a few cases we
  504 + * could be more sophisticated.
342 505 */
343   -static int me_ignore(struct page *p, unsigned long pfn)
  506 +static int me_kernel(struct page *p, unsigned long pfn)
344 507 {
345 508 return IGNORED;
346 509 }
... ... @@ -355,14 +518,6 @@
355 518 }
356 519  
357 520 /*
358   - * Free memory
359   - */
360   -static int me_free(struct page *p, unsigned long pfn)
361   -{
362   - return DELAYED;
363   -}
364   -
365   -/*
366 521 * Clean (or cleaned) page cache page.
367 522 */
368 523 static int me_pagecache_clean(struct page *p, unsigned long pfn)
... ... @@ -371,6 +526,8 @@
371 526 int ret = FAILED;
372 527 struct address_space *mapping;
373 528  
  529 + delete_from_lru_cache(p);
  530 +
374 531 /*
375 532 * For anonymous pages we're done the only reference left
376 533 * should be the one m_f() holds.
377 534  
... ... @@ -500,14 +657,20 @@
500 657 /* Trigger EIO in shmem: */
501 658 ClearPageUptodate(p);
502 659  
503   - return DELAYED;
  660 + if (!delete_from_lru_cache(p))
  661 + return DELAYED;
  662 + else
  663 + return FAILED;
504 664 }
505 665  
506 666 static int me_swapcache_clean(struct page *p, unsigned long pfn)
507 667 {
508 668 delete_from_swap_cache(p);
509 669  
510   - return RECOVERED;
  670 + if (!delete_from_lru_cache(p))
  671 + return RECOVERED;
  672 + else
  673 + return FAILED;
511 674 }
512 675  
513 676 /*
... ... @@ -550,7 +713,6 @@
550 713 #define tail (1UL << PG_tail)
551 714 #define compound (1UL << PG_compound)
552 715 #define slab (1UL << PG_slab)
553   -#define buddy (1UL << PG_buddy)
554 716 #define reserved (1UL << PG_reserved)
555 717  
556 718 static struct page_state {
... ... @@ -559,8 +721,11 @@
559 721 char *msg;
560 722 int (*action)(struct page *p, unsigned long pfn);
561 723 } error_states[] = {
562   - { reserved, reserved, "reserved kernel", me_ignore },
563   - { buddy, buddy, "free kernel", me_free },
  724 + { reserved, reserved, "reserved kernel", me_kernel },
  725 + /*
  726 + * free pages are specially detected outside this table:
  727 + * PG_buddy pages only make a small fraction of all free pages.
  728 + */
564 729  
565 730 /*
566 731 * Could in theory check if slab page is free or if we can drop
... ... @@ -587,7 +752,6 @@
587 752  
588 753 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
589 754 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
590   - { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
591 755  
592 756 /*
593 757 * Catchall entry: must be at end.
594 758  
595 759  
596 760  
... ... @@ -595,20 +759,31 @@
595 759 { 0, 0, "unknown page state", me_unknown },
596 760 };
597 761  
  762 +#undef dirty
  763 +#undef sc
  764 +#undef unevict
  765 +#undef mlock
  766 +#undef writeback
  767 +#undef lru
  768 +#undef swapbacked
  769 +#undef head
  770 +#undef tail
  771 +#undef compound
  772 +#undef slab
  773 +#undef reserved
  774 +
598 775 static void action_result(unsigned long pfn, char *msg, int result)
599 776 {
600   - struct page *page = NULL;
601   - if (pfn_valid(pfn))
602   - page = pfn_to_page(pfn);
  777 + struct page *page = pfn_to_page(pfn);
603 778  
604 779 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
605 780 pfn,
606   - page && PageDirty(page) ? "dirty " : "",
  781 + PageDirty(page) ? "dirty " : "",
607 782 msg, action_name[result]);
608 783 }
609 784  
610 785 static int page_action(struct page_state *ps, struct page *p,
611   - unsigned long pfn, int ref)
  786 + unsigned long pfn)
612 787 {
613 788 int result;
614 789 int count;
615 790  
616 791  
... ... @@ -616,18 +791,22 @@
616 791 result = ps->action(p, pfn);
617 792 action_result(pfn, ps->msg, result);
618 793  
619   - count = page_count(p) - 1 - ref;
620   - if (count != 0)
  794 + count = page_count(p) - 1;
  795 + if (ps->action == me_swapcache_dirty && result == DELAYED)
  796 + count--;
  797 + if (count != 0) {
621 798 printk(KERN_ERR
622 799 "MCE %#lx: %s page still referenced by %d users\n",
623 800 pfn, ps->msg, count);
  801 + result = FAILED;
  802 + }
624 803  
625 804 /* Could do more checks here if page looks ok */
626 805 /*
627 806 * Could adjust zone counters here to correct for the missing page.
628 807 */
629 808  
630   - return result == RECOVERED ? 0 : -EBUSY;
  809 + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
631 810 }
632 811  
633 812 #define N_UNMAP_TRIES 5
... ... @@ -636,7 +815,7 @@
636 815 * Do all that is necessary to remove user space mappings. Unmap
637 816 * the pages and send SIGBUS to the processes if the data was dirty.
638 817 */
639   -static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
  818 +static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
640 819 int trapno)
641 820 {
642 821 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
643 822  
644 823  
... ... @@ -646,16 +825,19 @@
646 825 int i;
647 826 int kill = 1;
648 827  
649   - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
650   - return;
  828 + if (PageReserved(p) || PageSlab(p))
  829 + return SWAP_SUCCESS;
651 830  
652 831 /*
653 832 * This check implies we don't kill processes if their pages
654 833 * are in the swap cache early. Those are always late kills.
655 834 */
656 835 if (!page_mapped(p))
657   - return;
  836 + return SWAP_SUCCESS;
658 837  
  838 + if (PageCompound(p) || PageKsm(p))
  839 + return SWAP_FAIL;
  840 +
659 841 if (PageSwapCache(p)) {
660 842 printk(KERN_ERR
661 843 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
... ... @@ -665,6 +847,8 @@
665 847 /*
666 848 * Propagate the dirty bit from PTEs to struct page first, because we
667 849 * need this to decide if we should kill or just drop the page.
  850 + * XXX: the dirty test could be racy: set_page_dirty() may not always
  851 + * be called inside page lock (it's recommended but not enforced).
668 852 */
669 853 mapping = page_mapping(p);
670 854 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
671 855  
672 856  
... ... @@ -716,11 +900,12 @@
716 900 */
717 901 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
718 902 ret != SWAP_SUCCESS, pfn);
  903 +
  904 + return ret;
719 905 }
720 906  
721   -int __memory_failure(unsigned long pfn, int trapno, int ref)
  907 +int __memory_failure(unsigned long pfn, int trapno, int flags)
722 908 {
723   - unsigned long lru_flag;
724 909 struct page_state *ps;
725 910 struct page *p;
726 911 int res;
727 912  
... ... @@ -729,13 +914,15 @@
729 914 panic("Memory failure from trap %d on page %lx", trapno, pfn);
730 915  
731 916 if (!pfn_valid(pfn)) {
732   - action_result(pfn, "memory outside kernel control", IGNORED);
733   - return -EIO;
  917 + printk(KERN_ERR
  918 + "MCE %#lx: memory outside kernel control\n",
  919 + pfn);
  920 + return -ENXIO;
734 921 }
735 922  
736 923 p = pfn_to_page(pfn);
737 924 if (TestSetPageHWPoison(p)) {
738   - action_result(pfn, "already hardware poisoned", IGNORED);
  925 + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
739 926 return 0;
740 927 }
741 928  
... ... @@ -752,9 +939,15 @@
752 939 * In fact it's dangerous to directly bump up page count from 0,
753 940 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
754 941 */
755   - if (!get_page_unless_zero(compound_head(p))) {
756   - action_result(pfn, "free or high order kernel", IGNORED);
757   - return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
  942 + if (!(flags & MF_COUNT_INCREASED) &&
  943 + !get_page_unless_zero(compound_head(p))) {
  944 + if (is_free_buddy_page(p)) {
  945 + action_result(pfn, "free buddy", DELAYED);
  946 + return 0;
  947 + } else {
  948 + action_result(pfn, "high order kernel", IGNORED);
  949 + return -EBUSY;
  950 + }
758 951 }
759 952  
760 953 /*
761 954  
... ... @@ -766,14 +959,19 @@
766 959 * walked by the page reclaim code, however that's not a big loss.
767 960 */
768 961 if (!PageLRU(p))
769   - lru_add_drain_all();
770   - lru_flag = p->flags & lru;
771   - if (isolate_lru_page(p)) {
  962 + shake_page(p, 0);
  963 + if (!PageLRU(p)) {
  964 + /*
  965 + * shake_page could have turned it free.
  966 + */
  967 + if (is_free_buddy_page(p)) {
  968 + action_result(pfn, "free buddy, 2nd try", DELAYED);
  969 + return 0;
  970 + }
772 971 action_result(pfn, "non LRU", IGNORED);
773 972 put_page(p);
774 973 return -EBUSY;
775 974 }
776   - page_cache_release(p);
777 975  
778 976 /*
779 977 * Lock the page and wait for writeback to finish.
780 978  
781 979  
782 980  
783 981  
784 982  
... ... @@ -781,26 +979,48 @@
781 979 * and in many cases impossible, so we just avoid it here.
782 980 */
783 981 lock_page_nosync(p);
  982 +
  983 + /*
   984 + * unpoison always clears PG_hwpoison inside the page lock
  985 + */
  986 + if (!PageHWPoison(p)) {
  987 + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
  988 + res = 0;
  989 + goto out;
  990 + }
  991 + if (hwpoison_filter(p)) {
  992 + if (TestClearPageHWPoison(p))
  993 + atomic_long_dec(&mce_bad_pages);
  994 + unlock_page(p);
  995 + put_page(p);
  996 + return 0;
  997 + }
  998 +
784 999 wait_on_page_writeback(p);
785 1000  
786 1001 /*
787 1002 * Now take care of user space mappings.
  1003 + * Abort on fail: __remove_from_page_cache() assumes unmapped page.
788 1004 */
789   - hwpoison_user_mappings(p, pfn, trapno);
  1005 + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
  1006 + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
  1007 + res = -EBUSY;
  1008 + goto out;
  1009 + }
790 1010  
791 1011 /*
792 1012 * Torn down by someone else?
793 1013 */
794   - if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
  1014 + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
795 1015 action_result(pfn, "already truncated LRU", IGNORED);
796   - res = 0;
  1016 + res = -EBUSY;
797 1017 goto out;
798 1018 }
799 1019  
800 1020 res = -EBUSY;
801 1021 for (ps = error_states;; ps++) {
802   - if (((p->flags | lru_flag)& ps->mask) == ps->res) {
803   - res = page_action(ps, p, pfn, ref);
  1022 + if ((p->flags & ps->mask) == ps->res) {
  1023 + res = page_action(ps, p, pfn);
804 1024 break;
805 1025 }
806 1026 }
... ... @@ -830,5 +1050,237 @@
830 1050 void memory_failure(unsigned long pfn, int trapno)
831 1051 {
832 1052 __memory_failure(pfn, trapno, 0);
  1053 +}
  1054 +
  1055 +/**
  1056 + * unpoison_memory - Unpoison a previously poisoned page
  1057 + * @pfn: Page number of the to be unpoisoned page
  1058 + *
  1059 + * Software-unpoison a page that has been poisoned by
  1060 + * memory_failure() earlier.
  1061 + *
   1062 + * This is only done on the software level, so it only works
   1063 + * for Linux-injected failures, not for real hardware failures.
  1064 + *
  1065 + * Returns 0 for success, otherwise -errno.
  1066 + */
  1067 +int unpoison_memory(unsigned long pfn)
  1068 +{
  1069 + struct page *page;
  1070 + struct page *p;
  1071 + int freeit = 0;
  1072 +
  1073 + if (!pfn_valid(pfn))
  1074 + return -ENXIO;
  1075 +
  1076 + p = pfn_to_page(pfn);
  1077 + page = compound_head(p);
  1078 +
  1079 + if (!PageHWPoison(p)) {
  1080 + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
  1081 + return 0;
  1082 + }
  1083 +
  1084 + if (!get_page_unless_zero(page)) {
  1085 + if (TestClearPageHWPoison(p))
  1086 + atomic_long_dec(&mce_bad_pages);
  1087 + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
  1088 + return 0;
  1089 + }
  1090 +
  1091 + lock_page_nosync(page);
  1092 + /*
  1093 + * This test is racy because PG_hwpoison is set outside of page lock.
  1094 + * That's acceptable because that won't trigger kernel panic. Instead,
  1095 + * the PG_hwpoison page will be caught and isolated on the entrance to
  1096 + * the free buddy page pool.
  1097 + */
  1098 + if (TestClearPageHWPoison(p)) {
  1099 + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
  1100 + atomic_long_dec(&mce_bad_pages);
  1101 + freeit = 1;
  1102 + }
  1103 + unlock_page(page);
  1104 +
  1105 + put_page(page);
  1106 + if (freeit)
  1107 + put_page(page);
  1108 +
  1109 + return 0;
  1110 +}
  1111 +EXPORT_SYMBOL(unpoison_memory);
  1112 +
  1113 +static struct page *new_page(struct page *p, unsigned long private, int **x)
  1114 +{
  1115 + int nid = page_to_nid(p);
  1116 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
  1117 +}
  1118 +
  1119 +/*
  1120 + * Safely get reference count of an arbitrary page.
  1121 + * Returns 0 for a free page, -EIO for a zero refcount page
  1122 + * that is not free, and 1 for any other page type.
  1123 + * For 1 the page is returned with increased page count, otherwise not.
  1124 + */
  1125 +static int get_any_page(struct page *p, unsigned long pfn, int flags)
  1126 +{
  1127 + int ret;
  1128 +
  1129 + if (flags & MF_COUNT_INCREASED)
  1130 + return 1;
  1131 +
  1132 + /*
  1133 + * The lock_system_sleep prevents a race with memory hotplug,
  1134 + * because the isolation assumes there's only a single user.
   1135 + * This is a big hammer; a better solution would be nicer.
  1136 + */
  1137 + lock_system_sleep();
  1138 +
  1139 + /*
  1140 + * Isolate the page, so that it doesn't get reallocated if it
  1141 + * was free.
  1142 + */
  1143 + set_migratetype_isolate(p);
  1144 + if (!get_page_unless_zero(compound_head(p))) {
  1145 + if (is_free_buddy_page(p)) {
  1146 + pr_debug("get_any_page: %#lx free buddy page\n", pfn);
  1147 + /* Set hwpoison bit while page is still isolated */
  1148 + SetPageHWPoison(p);
  1149 + ret = 0;
  1150 + } else {
  1151 + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
  1152 + pfn, p->flags);
  1153 + ret = -EIO;
  1154 + }
  1155 + } else {
  1156 + /* Not a free page */
  1157 + ret = 1;
  1158 + }
  1159 + unset_migratetype_isolate(p);
  1160 + unlock_system_sleep();
  1161 + return ret;
  1162 +}
  1163 +
  1164 +/**
  1165 + * soft_offline_page - Soft offline a page.
  1166 + * @page: page to offline
  1167 + * @flags: flags. Same as memory_failure().
  1168 + *
  1169 + * Returns 0 on success, otherwise negated errno.
  1170 + *
  1171 + * Soft offline a page, by migration or invalidation,
  1172 + * without killing anything. This is for the case when
  1173 + * a page is not corrupted yet (so it's still valid to access),
  1174 + * but has had a number of corrected errors and is better taken
  1175 + * out.
  1176 + *
  1177 + * The actual policy on when to do that is maintained by
  1178 + * user space.
  1179 + *
   1180 + * This should never impact any application or cause data loss;
   1181 + * however, it might take some time.
  1182 + *
  1183 + * This is not a 100% solution for all memory, but tries to be
  1184 + * ``good enough'' for the majority of memory.
  1185 + */
  1186 +int soft_offline_page(struct page *page, int flags)
  1187 +{
  1188 + int ret;
  1189 + unsigned long pfn = page_to_pfn(page);
  1190 +
  1191 + ret = get_any_page(page, pfn, flags);
  1192 + if (ret < 0)
  1193 + return ret;
  1194 + if (ret == 0)
  1195 + goto done;
  1196 +
  1197 + /*
  1198 + * Page cache page we can handle?
  1199 + */
  1200 + if (!PageLRU(page)) {
  1201 + /*
  1202 + * Try to free it.
  1203 + */
  1204 + put_page(page);
  1205 + shake_page(page, 1);
  1206 +
  1207 + /*
  1208 + * Did it turn free?
  1209 + */
  1210 + ret = get_any_page(page, pfn, 0);
  1211 + if (ret < 0)
  1212 + return ret;
  1213 + if (ret == 0)
  1214 + goto done;
  1215 + }
  1216 + if (!PageLRU(page)) {
  1217 + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
  1218 + pfn, page->flags);
  1219 + return -EIO;
  1220 + }
  1221 +
  1222 + lock_page(page);
  1223 + wait_on_page_writeback(page);
  1224 +
  1225 + /*
  1226 + * Synchronized using the page lock with memory_failure()
  1227 + */
  1228 + if (PageHWPoison(page)) {
  1229 + unlock_page(page);
  1230 + put_page(page);
  1231 + pr_debug("soft offline: %#lx page already poisoned\n", pfn);
  1232 + return -EBUSY;
  1233 + }
  1234 +
  1235 + /*
  1236 + * Try to invalidate first. This should work for
  1237 + * non dirty unmapped page cache pages.
  1238 + */
  1239 + ret = invalidate_inode_page(page);
  1240 + unlock_page(page);
  1241 +
  1242 + /*
  1243 + * Drop count because page migration doesn't like raised
  1244 + * counts. The page could get re-allocated, but if it becomes
  1245 + * LRU the isolation will just fail.
  1246 + * RED-PEN would be better to keep it isolated here, but we
  1247 + * would need to fix isolation locking first.
  1248 + */
  1249 + put_page(page);
  1250 + if (ret == 1) {
  1251 + ret = 0;
  1252 + pr_debug("soft_offline: %#lx: invalidated\n", pfn);
  1253 + goto done;
  1254 + }
  1255 +
  1256 + /*
  1257 + * Simple invalidation didn't work.
  1258 + * Try to migrate to a new page instead. migrate.c
  1259 + * handles a large number of cases for us.
  1260 + */
  1261 + ret = isolate_lru_page(page);
  1262 + if (!ret) {
  1263 + LIST_HEAD(pagelist);
  1264 +
  1265 + list_add(&page->lru, &pagelist);
  1266 + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
  1267 + if (ret) {
  1268 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
  1269 + pfn, ret, page->flags);
  1270 + if (ret > 0)
  1271 + ret = -EIO;
  1272 + }
  1273 + } else {
  1274 + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
  1275 + pfn, ret, page_count(page), page->flags);
  1276 + }
  1277 + if (ret)
  1278 + return ret;
  1279 +
  1280 +done:
  1281 + atomic_long_add(1, &mce_bad_pages);
  1282 + SetPageHWPoison(page);
  1283 + /* keep elevated page count for bad page */
  1284 + return ret;
833 1285 }
mm/memory.c
... ... @@ -2555,6 +2555,10 @@
2555 2555 ret = VM_FAULT_MAJOR;
2556 2556 count_vm_event(PGMAJFAULT);
2557 2557 } else if (PageHWPoison(page)) {
  2558 + /*
  2559 + * hwpoisoned dirty swapcache pages are kept for killing
  2560 + * owner processes (which may be unknown at hwpoison time)
  2561 + */
2558 2562 ret = VM_FAULT_HWPOISON;
2559 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2560 2564 goto out_release;
mm/page_alloc.c
... ... @@ -5091,4 +5091,25 @@
5091 5091 spin_unlock_irqrestore(&zone->lock, flags);
5092 5092 }
5093 5093 #endif
  5094 +
  5095 +#ifdef CONFIG_MEMORY_FAILURE
  5096 +bool is_free_buddy_page(struct page *page)
  5097 +{
  5098 + struct zone *zone = page_zone(page);
  5099 + unsigned long pfn = page_to_pfn(page);
  5100 + unsigned long flags;
  5101 + int order;
  5102 +
  5103 + spin_lock_irqsave(&zone->lock, flags);
  5104 + for (order = 0; order < MAX_ORDER; order++) {
  5105 + struct page *page_head = page - (pfn & ((1 << order) - 1));
  5106 +
  5107 + if (PageBuddy(page_head) && page_order(page_head) >= order)
  5108 + break;
  5109 + }
  5110 + spin_unlock_irqrestore(&zone->lock, flags);
  5111 +
  5112 + return order < MAX_ORDER;
  5113 +}
  5114 +#endif