Commit 93a9eb39fad1b5fc9077776caa3af207883b254d
Committed by: Linus Torvalds
Parent: 0193ed8225
Exists in: ti-lsk-linux-4.1.y and 10 other branches
hwpoison: fix hugetlbfs/thp precheck in hwpoison_user_mappings()
A recent fix from Chen Yucong, commit 0bc1f8b0682c ("hwpoison: fix the
handling path of the victimized page frame that belong to non-LRU"),
rejects going into the unmapping operation for hugetlbfs/thp pages,
which results in failed error containment on such pages. This patch
fixes it.

With this patch, the hwpoison functional tests in the mce-test
testsuite pass.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chen Yucong <slaoub@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
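The changed hunk lives in hwpoison_user_mappings(), which is below the
portion of the file shown in this excerpt. As a hedged sketch only (not
the verbatim hunk), the precheck described above would change along
these lines, letting hugetlbfs/thp pages reach try_to_unmap() while
still skipping pages that can never be user-mapped:

	/*
	 * Sketch of the fixed precheck in hwpoison_user_mappings(),
	 * reconstructed from the commit message; the exact hunk is not
	 * shown in this excerpt. 'p' is the raw poisoned page, 'hpage'
	 * its compound head.
	 */

	/* Pages that are never user-mapped: nothing to unmap. */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/*
	 * The previous check effectively required PageLRU(p), which
	 * rejected hugetlbfs pages (never on the LRU) and thp tail
	 * pages; testing the compound head's LRU state plus PageHuge()
	 * lets them proceed to unmapping, so the error can be contained.
	 */
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;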
Showing 1 changed file with 7 additions and 1 deletion
mm/memory-failure.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2008, 2009 Intel Corporation | 2 | * Copyright (C) 2008, 2009 Intel Corporation |
3 | * Authors: Andi Kleen, Fengguang Wu | 3 | * Authors: Andi Kleen, Fengguang Wu |
4 | * | 4 | * |
5 | * This software may be redistributed and/or modified under the terms of | 5 | * This software may be redistributed and/or modified under the terms of |
6 | * the GNU General Public License ("GPL") version 2 only as published by the | 6 | * the GNU General Public License ("GPL") version 2 only as published by the |
7 | * Free Software Foundation. | 7 | * Free Software Foundation. |
8 | * | 8 | * |
9 | * High level machine check handler. Handles pages reported by the | 9 | * High level machine check handler. Handles pages reported by the |
10 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache | 10 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
11 | * failure. | 11 | * failure. |
12 | * | 12 | * |
13 | * In addition there is a "soft offline" entry point that allows stop using | 13 | * In addition there is a "soft offline" entry point that allows stop using |
14 | * not-yet-corrupted-by-suspicious pages without killing anything. | 14 | * not-yet-corrupted-by-suspicious pages without killing anything. |
15 | * | 15 | * |
16 | * Handles page cache pages in various states. The tricky part | 16 | * Handles page cache pages in various states. The tricky part |
17 | * here is that we can access any page asynchronously in respect to | 17 | * here is that we can access any page asynchronously in respect to |
18 | * other VM users, because memory failures could happen anytime and | 18 | * other VM users, because memory failures could happen anytime and |
19 | * anywhere. This could violate some of their assumptions. This is why | 19 | * anywhere. This could violate some of their assumptions. This is why |
20 | * this code has to be extremely careful. Generally it tries to use | 20 | * this code has to be extremely careful. Generally it tries to use |
21 | * normal locking rules, as in get the standard locks, even if that means | 21 | * normal locking rules, as in get the standard locks, even if that means |
22 | * the error handling takes potentially a long time. | 22 | * the error handling takes potentially a long time. |
23 | * | 23 | * |
24 | * There are several operations here with exponential complexity because | 24 | * There are several operations here with exponential complexity because |
25 | * of unsuitable VM data structures. For example the operation to map back | 25 | * of unsuitable VM data structures. For example the operation to map back |
26 | * from RMAP chains to processes has to walk the complete process list and | 26 | * from RMAP chains to processes has to walk the complete process list and |
27 | * has non linear complexity with the number. But since memory corruptions | 27 | * has non linear complexity with the number. But since memory corruptions |
28 | * are rare we hope to get away with this. This avoids impacting the core | 28 | * are rare we hope to get away with this. This avoids impacting the core |
29 | * VM. | 29 | * VM. |
30 | */ | 30 | */ |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Notebook: | 33 | * Notebook: |
34 | * - hugetlb needs more code | 34 | * - hugetlb needs more code |
35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | 35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages |
36 | * - pass bad pages to kdump next kernel | 36 | * - pass bad pages to kdump next kernel |
37 | */ | 37 | */ |
38 | #include <linux/kernel.h> | 38 | #include <linux/kernel.h> |
39 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
40 | #include <linux/page-flags.h> | 40 | #include <linux/page-flags.h> |
41 | #include <linux/kernel-page-flags.h> | 41 | #include <linux/kernel-page-flags.h> |
42 | #include <linux/sched.h> | 42 | #include <linux/sched.h> |
43 | #include <linux/ksm.h> | 43 | #include <linux/ksm.h> |
44 | #include <linux/rmap.h> | 44 | #include <linux/rmap.h> |
45 | #include <linux/export.h> | 45 | #include <linux/export.h> |
46 | #include <linux/pagemap.h> | 46 | #include <linux/pagemap.h> |
47 | #include <linux/swap.h> | 47 | #include <linux/swap.h> |
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/migrate.h> | 49 | #include <linux/migrate.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/suspend.h> | 51 | #include <linux/suspend.h> |
52 | #include <linux/slab.h> | 52 | #include <linux/slab.h> |
53 | #include <linux/swapops.h> | 53 | #include <linux/swapops.h> |
54 | #include <linux/hugetlb.h> | 54 | #include <linux/hugetlb.h> |
55 | #include <linux/memory_hotplug.h> | 55 | #include <linux/memory_hotplug.h> |
56 | #include <linux/mm_inline.h> | 56 | #include <linux/mm_inline.h> |
57 | #include <linux/kfifo.h> | 57 | #include <linux/kfifo.h> |
58 | #include "internal.h" | 58 | #include "internal.h" |
59 | 59 | ||
60 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 60 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
61 | 61 | ||
62 | int sysctl_memory_failure_recovery __read_mostly = 1; | 62 | int sysctl_memory_failure_recovery __read_mostly = 1; |
63 | 63 | ||
64 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); | 64 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
65 | 65 | ||
66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | 66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) |
67 | 67 | ||
68 | u32 hwpoison_filter_enable = 0; | 68 | u32 hwpoison_filter_enable = 0; |
69 | u32 hwpoison_filter_dev_major = ~0U; | 69 | u32 hwpoison_filter_dev_major = ~0U; |
70 | u32 hwpoison_filter_dev_minor = ~0U; | 70 | u32 hwpoison_filter_dev_minor = ~0U; |
71 | u64 hwpoison_filter_flags_mask; | 71 | u64 hwpoison_filter_flags_mask; |
72 | u64 hwpoison_filter_flags_value; | 72 | u64 hwpoison_filter_flags_value; |
73 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | 73 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); |
74 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | 74 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); |
75 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | 75 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); |
76 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | 76 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); |
77 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | 77 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); |
78 | 78 | ||
79 | static int hwpoison_filter_dev(struct page *p) | 79 | static int hwpoison_filter_dev(struct page *p) |
80 | { | 80 | { |
81 | struct address_space *mapping; | 81 | struct address_space *mapping; |
82 | dev_t dev; | 82 | dev_t dev; |
83 | 83 | ||
84 | if (hwpoison_filter_dev_major == ~0U && | 84 | if (hwpoison_filter_dev_major == ~0U && |
85 | hwpoison_filter_dev_minor == ~0U) | 85 | hwpoison_filter_dev_minor == ~0U) |
86 | return 0; | 86 | return 0; |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * page_mapping() does not accept slab pages. | 89 | * page_mapping() does not accept slab pages. |
90 | */ | 90 | */ |
91 | if (PageSlab(p)) | 91 | if (PageSlab(p)) |
92 | return -EINVAL; | 92 | return -EINVAL; |
93 | 93 | ||
94 | mapping = page_mapping(p); | 94 | mapping = page_mapping(p); |
95 | if (mapping == NULL || mapping->host == NULL) | 95 | if (mapping == NULL || mapping->host == NULL) |
96 | return -EINVAL; | 96 | return -EINVAL; |
97 | 97 | ||
98 | dev = mapping->host->i_sb->s_dev; | 98 | dev = mapping->host->i_sb->s_dev; |
99 | if (hwpoison_filter_dev_major != ~0U && | 99 | if (hwpoison_filter_dev_major != ~0U && |
100 | hwpoison_filter_dev_major != MAJOR(dev)) | 100 | hwpoison_filter_dev_major != MAJOR(dev)) |
101 | return -EINVAL; | 101 | return -EINVAL; |
102 | if (hwpoison_filter_dev_minor != ~0U && | 102 | if (hwpoison_filter_dev_minor != ~0U && |
103 | hwpoison_filter_dev_minor != MINOR(dev)) | 103 | hwpoison_filter_dev_minor != MINOR(dev)) |
104 | return -EINVAL; | 104 | return -EINVAL; |
105 | 105 | ||
106 | return 0; | 106 | return 0; |
107 | } | 107 | } |
108 | 108 | ||
109 | static int hwpoison_filter_flags(struct page *p) | 109 | static int hwpoison_filter_flags(struct page *p) |
110 | { | 110 | { |
111 | if (!hwpoison_filter_flags_mask) | 111 | if (!hwpoison_filter_flags_mask) |
112 | return 0; | 112 | return 0; |
113 | 113 | ||
114 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | 114 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == |
115 | hwpoison_filter_flags_value) | 115 | hwpoison_filter_flags_value) |
116 | return 0; | 116 | return 0; |
117 | else | 117 | else |
118 | return -EINVAL; | 118 | return -EINVAL; |
119 | } | 119 | } |
120 | 120 | ||
121 | /* | 121 | /* |
122 | * This allows stress tests to limit test scope to a collection of tasks | 122 | * This allows stress tests to limit test scope to a collection of tasks |
123 | * by putting them under some memcg. This prevents killing unrelated/important | 123 | * by putting them under some memcg. This prevents killing unrelated/important |
124 | * processes such as /sbin/init. Note that the target task may share clean | 124 | * processes such as /sbin/init. Note that the target task may share clean |
125 | * pages with init (eg. libc text), which is harmless. If the target task | 125 | * pages with init (eg. libc text), which is harmless. If the target task |
126 | * share _dirty_ pages with another task B, the test scheme must make sure B | 126 | * share _dirty_ pages with another task B, the test scheme must make sure B |
127 | * is also included in the memcg. At last, due to race conditions this filter | 127 | * is also included in the memcg. At last, due to race conditions this filter |
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_MEMCG_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
135 | { | 135 | { |
136 | struct mem_cgroup *mem; | 136 | struct mem_cgroup *mem; |
137 | struct cgroup_subsys_state *css; | 137 | struct cgroup_subsys_state *css; |
138 | unsigned long ino; | 138 | unsigned long ino; |
139 | 139 | ||
140 | if (!hwpoison_filter_memcg) | 140 | if (!hwpoison_filter_memcg) |
141 | return 0; | 141 | return 0; |
142 | 142 | ||
143 | mem = try_get_mem_cgroup_from_page(p); | 143 | mem = try_get_mem_cgroup_from_page(p); |
144 | if (!mem) | 144 | if (!mem) |
145 | return -EINVAL; | 145 | return -EINVAL; |
146 | 146 | ||
147 | css = mem_cgroup_css(mem); | 147 | css = mem_cgroup_css(mem); |
148 | ino = cgroup_ino(css->cgroup); | 148 | ino = cgroup_ino(css->cgroup); |
149 | css_put(css); | 149 | css_put(css); |
150 | 150 | ||
151 | if (!ino || ino != hwpoison_filter_memcg) | 151 | if (!ino || ino != hwpoison_filter_memcg) |
152 | return -EINVAL; | 152 | return -EINVAL; |
153 | 153 | ||
154 | return 0; | 154 | return 0; |
155 | } | 155 | } |
156 | #else | 156 | #else |
157 | static int hwpoison_filter_task(struct page *p) { return 0; } | 157 | static int hwpoison_filter_task(struct page *p) { return 0; } |
158 | #endif | 158 | #endif |
159 | 159 | ||
160 | int hwpoison_filter(struct page *p) | 160 | int hwpoison_filter(struct page *p) |
161 | { | 161 | { |
162 | if (!hwpoison_filter_enable) | 162 | if (!hwpoison_filter_enable) |
163 | return 0; | 163 | return 0; |
164 | 164 | ||
165 | if (hwpoison_filter_dev(p)) | 165 | if (hwpoison_filter_dev(p)) |
166 | return -EINVAL; | 166 | return -EINVAL; |
167 | 167 | ||
168 | if (hwpoison_filter_flags(p)) | 168 | if (hwpoison_filter_flags(p)) |
169 | return -EINVAL; | 169 | return -EINVAL; |
170 | 170 | ||
171 | if (hwpoison_filter_task(p)) | 171 | if (hwpoison_filter_task(p)) |
172 | return -EINVAL; | 172 | return -EINVAL; |
173 | 173 | ||
174 | return 0; | 174 | return 0; |
175 | } | 175 | } |
176 | #else | 176 | #else |
177 | int hwpoison_filter(struct page *p) | 177 | int hwpoison_filter(struct page *p) |
178 | { | 178 | { |
179 | return 0; | 179 | return 0; |
180 | } | 180 | } |
181 | #endif | 181 | #endif |
182 | 182 | ||
183 | EXPORT_SYMBOL_GPL(hwpoison_filter); | 183 | EXPORT_SYMBOL_GPL(hwpoison_filter); |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * Send all the processes who have the page mapped a signal. | 186 | * Send all the processes who have the page mapped a signal. |
187 | * ``action optional'' if they are not immediately affected by the error | 187 | * ``action optional'' if they are not immediately affected by the error |
188 | * ``action required'' if error happened in current execution context | 188 | * ``action required'' if error happened in current execution context |
189 | */ | 189 | */ |
190 | static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | 190 | static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, |
191 | unsigned long pfn, struct page *page, int flags) | 191 | unsigned long pfn, struct page *page, int flags) |
192 | { | 192 | { |
193 | struct siginfo si; | 193 | struct siginfo si; |
194 | int ret; | 194 | int ret; |
195 | 195 | ||
196 | printk(KERN_ERR | 196 | printk(KERN_ERR |
197 | "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", | 197 | "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", |
198 | pfn, t->comm, t->pid); | 198 | pfn, t->comm, t->pid); |
199 | si.si_signo = SIGBUS; | 199 | si.si_signo = SIGBUS; |
200 | si.si_errno = 0; | 200 | si.si_errno = 0; |
201 | si.si_addr = (void *)addr; | 201 | si.si_addr = (void *)addr; |
202 | #ifdef __ARCH_SI_TRAPNO | 202 | #ifdef __ARCH_SI_TRAPNO |
203 | si.si_trapno = trapno; | 203 | si.si_trapno = trapno; |
204 | #endif | 204 | #endif |
205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; |
206 | 206 | ||
207 | if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { | 207 | if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { |
208 | si.si_code = BUS_MCEERR_AR; | 208 | si.si_code = BUS_MCEERR_AR; |
209 | ret = force_sig_info(SIGBUS, &si, current); | 209 | ret = force_sig_info(SIGBUS, &si, current); |
210 | } else { | 210 | } else { |
211 | /* | 211 | /* |
212 | * Don't use force here, it's convenient if the signal | 212 | * Don't use force here, it's convenient if the signal |
213 | * can be temporarily blocked. | 213 | * can be temporarily blocked. |
214 | * This could cause a loop when the user sets SIGBUS | 214 | * This could cause a loop when the user sets SIGBUS |
215 | * to SIG_IGN, but hopefully no one will do that? | 215 | * to SIG_IGN, but hopefully no one will do that? |
216 | */ | 216 | */ |
217 | si.si_code = BUS_MCEERR_AO; | 217 | si.si_code = BUS_MCEERR_AO; |
218 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | 218 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ |
219 | } | 219 | } |
220 | if (ret < 0) | 220 | if (ret < 0) |
221 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", | 221 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", |
222 | t->comm, t->pid, ret); | 222 | t->comm, t->pid, ret); |
223 | return ret; | 223 | return ret; |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | 226 | /* |
227 | * When a unknown page type is encountered drain as many buffers as possible | 227 | * When a unknown page type is encountered drain as many buffers as possible |
228 | * in the hope to turn the page into a LRU or free page, which we can handle. | 228 | * in the hope to turn the page into a LRU or free page, which we can handle. |
229 | */ | 229 | */ |
230 | void shake_page(struct page *p, int access) | 230 | void shake_page(struct page *p, int access) |
231 | { | 231 | { |
232 | if (!PageSlab(p)) { | 232 | if (!PageSlab(p)) { |
233 | lru_add_drain_all(); | 233 | lru_add_drain_all(); |
234 | if (PageLRU(p)) | 234 | if (PageLRU(p)) |
235 | return; | 235 | return; |
236 | drain_all_pages(); | 236 | drain_all_pages(); |
237 | if (PageLRU(p) || is_free_buddy_page(p)) | 237 | if (PageLRU(p) || is_free_buddy_page(p)) |
238 | return; | 238 | return; |
239 | } | 239 | } |
240 | 240 | ||
241 | /* | 241 | /* |
242 | * Only call shrink_slab here (which would also shrink other caches) if | 242 | * Only call shrink_slab here (which would also shrink other caches) if |
243 | * access is not potentially fatal. | 243 | * access is not potentially fatal. |
244 | */ | 244 | */ |
245 | if (access) { | 245 | if (access) { |
246 | int nr; | 246 | int nr; |
247 | int nid = page_to_nid(p); | 247 | int nid = page_to_nid(p); |
248 | do { | 248 | do { |
249 | struct shrink_control shrink = { | 249 | struct shrink_control shrink = { |
250 | .gfp_mask = GFP_KERNEL, | 250 | .gfp_mask = GFP_KERNEL, |
251 | }; | 251 | }; |
252 | node_set(nid, shrink.nodes_to_scan); | 252 | node_set(nid, shrink.nodes_to_scan); |
253 | 253 | ||
254 | nr = shrink_slab(&shrink, 1000, 1000); | 254 | nr = shrink_slab(&shrink, 1000, 1000); |
255 | if (page_count(p) == 1) | 255 | if (page_count(p) == 1) |
256 | break; | 256 | break; |
257 | } while (nr > 10); | 257 | } while (nr > 10); |
258 | } | 258 | } |
259 | } | 259 | } |
260 | EXPORT_SYMBOL_GPL(shake_page); | 260 | EXPORT_SYMBOL_GPL(shake_page); |
261 | 261 | ||
262 | /* | 262 | /* |
263 | * Kill all processes that have a poisoned page mapped and then isolate | 263 | * Kill all processes that have a poisoned page mapped and then isolate |
264 | * the page. | 264 | * the page. |
265 | * | 265 | * |
266 | * General strategy: | 266 | * General strategy: |
267 | * Find all processes having the page mapped and kill them. | 267 | * Find all processes having the page mapped and kill them. |
268 | * But we keep a page reference around so that the page is not | 268 | * But we keep a page reference around so that the page is not |
269 | * actually freed yet. | 269 | * actually freed yet. |
270 | * Then stash the page away | 270 | * Then stash the page away |
271 | * | 271 | * |
272 | * There's no convenient way to get back to mapped processes | 272 | * There's no convenient way to get back to mapped processes |
273 | * from the VMAs. So do a brute-force search over all | 273 | * from the VMAs. So do a brute-force search over all |
274 | * running processes. | 274 | * running processes. |
275 | * | 275 | * |
276 | * Remember that machine checks are not common (or rather | 276 | * Remember that machine checks are not common (or rather |
277 | * if they are common you have other problems), so this shouldn't | 277 | * if they are common you have other problems), so this shouldn't |
278 | * be a performance issue. | 278 | * be a performance issue. |
279 | * | 279 | * |
280 | * Also there are some races possible while we get from the | 280 | * Also there are some races possible while we get from the |
281 | * error detection to actually handle it. | 281 | * error detection to actually handle it. |
282 | */ | 282 | */ |
283 | 283 | ||
284 | struct to_kill { | 284 | struct to_kill { |
285 | struct list_head nd; | 285 | struct list_head nd; |
286 | struct task_struct *tsk; | 286 | struct task_struct *tsk; |
287 | unsigned long addr; | 287 | unsigned long addr; |
288 | char addr_valid; | 288 | char addr_valid; |
289 | }; | 289 | }; |
290 | 290 | ||
291 | /* | 291 | /* |
292 | * Failure handling: if we can't find or can't kill a process there's | 292 | * Failure handling: if we can't find or can't kill a process there's |
293 | * not much we can do. We just print a message and ignore otherwise. | 293 | * not much we can do. We just print a message and ignore otherwise. |
294 | */ | 294 | */ |
295 | 295 | ||
296 | /* | 296 | /* |
297 | * Schedule a process for later kill. | 297 | * Schedule a process for later kill. |
298 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. | 298 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. |
299 | * TBD would GFP_NOIO be enough? | 299 | * TBD would GFP_NOIO be enough? |
300 | */ | 300 | */ |
301 | static void add_to_kill(struct task_struct *tsk, struct page *p, | 301 | static void add_to_kill(struct task_struct *tsk, struct page *p, |
302 | struct vm_area_struct *vma, | 302 | struct vm_area_struct *vma, |
303 | struct list_head *to_kill, | 303 | struct list_head *to_kill, |
304 | struct to_kill **tkc) | 304 | struct to_kill **tkc) |
305 | { | 305 | { |
306 | struct to_kill *tk; | 306 | struct to_kill *tk; |
307 | 307 | ||
308 | if (*tkc) { | 308 | if (*tkc) { |
309 | tk = *tkc; | 309 | tk = *tkc; |
310 | *tkc = NULL; | 310 | *tkc = NULL; |
311 | } else { | 311 | } else { |
312 | tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); | 312 | tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); |
313 | if (!tk) { | 313 | if (!tk) { |
314 | printk(KERN_ERR | 314 | printk(KERN_ERR |
315 | "MCE: Out of memory while machine check handling\n"); | 315 | "MCE: Out of memory while machine check handling\n"); |
316 | return; | 316 | return; |
317 | } | 317 | } |
318 | } | 318 | } |
319 | tk->addr = page_address_in_vma(p, vma); | 319 | tk->addr = page_address_in_vma(p, vma); |
320 | tk->addr_valid = 1; | 320 | tk->addr_valid = 1; |
321 | 321 | ||
322 | /* | 322 | /* |
323 | * In theory we don't have to kill when the page was | 323 | * In theory we don't have to kill when the page was |
324 | * munmaped. But it could be also a mremap. Since that's | 324 | * munmaped. But it could be also a mremap. Since that's |
325 | * likely very rare kill anyways just out of paranoia, but use | 325 | * likely very rare kill anyways just out of paranoia, but use |
326 | * a SIGKILL because the error is not contained anymore. | 326 | * a SIGKILL because the error is not contained anymore. |
327 | */ | 327 | */ |
328 | if (tk->addr == -EFAULT) { | 328 | if (tk->addr == -EFAULT) { |
329 | pr_info("MCE: Unable to find user space address %lx in %s\n", | 329 | pr_info("MCE: Unable to find user space address %lx in %s\n", |
330 | page_to_pfn(p), tsk->comm); | 330 | page_to_pfn(p), tsk->comm); |
331 | tk->addr_valid = 0; | 331 | tk->addr_valid = 0; |
332 | } | 332 | } |
333 | get_task_struct(tsk); | 333 | get_task_struct(tsk); |
334 | tk->tsk = tsk; | 334 | tk->tsk = tsk; |
335 | list_add_tail(&tk->nd, to_kill); | 335 | list_add_tail(&tk->nd, to_kill); |
336 | } | 336 | } |
337 | 337 | ||
338 | /* | 338 | /* |
339 | * Kill the processes that have been collected earlier. | 339 | * Kill the processes that have been collected earlier. |
340 | * | 340 | * |
341 | * Only do anything when DOIT is set, otherwise just free the list | 341 | * Only do anything when DOIT is set, otherwise just free the list |
342 | * (this is used for clean pages which do not need killing) | 342 | * (this is used for clean pages which do not need killing) |
343 | * Also when FAIL is set do a force kill because something went | 343 | * Also when FAIL is set do a force kill because something went |
344 | * wrong earlier. | 344 | * wrong earlier. |
345 | */ | 345 | */ |
346 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, | 346 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, |
347 | int fail, struct page *page, unsigned long pfn, | 347 | int fail, struct page *page, unsigned long pfn, |
348 | int flags) | 348 | int flags) |
349 | { | 349 | { |
350 | struct to_kill *tk, *next; | 350 | struct to_kill *tk, *next; |
351 | 351 | ||
352 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 352 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
353 | if (forcekill) { | 353 | if (forcekill) { |
354 | /* | 354 | /* |
355 | * In case something went wrong with munmapping | 355 | * In case something went wrong with munmapping |
356 | * make sure the process doesn't catch the | 356 | * make sure the process doesn't catch the |
357 | * signal and then access the memory. Just kill it. | 357 | * signal and then access the memory. Just kill it. |
358 | */ | 358 | */ |
359 | if (fail || tk->addr_valid == 0) { | 359 | if (fail || tk->addr_valid == 0) { |
360 | printk(KERN_ERR | 360 | printk(KERN_ERR |
361 | "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", | 361 | "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", |
362 | pfn, tk->tsk->comm, tk->tsk->pid); | 362 | pfn, tk->tsk->comm, tk->tsk->pid); |
363 | force_sig(SIGKILL, tk->tsk); | 363 | force_sig(SIGKILL, tk->tsk); |
364 | } | 364 | } |
365 | 365 | ||
366 | /* | 366 | /* |
367 | * In theory the process could have mapped | 367 | * In theory the process could have mapped |
368 | * something else on the address in-between. We could | 368 | * something else on the address in-between. We could |
369 | * check for that, but we need to tell the | 369 | * check for that, but we need to tell the |
370 | * process anyways. | 370 | * process anyways. |
371 | */ | 371 | */ |
372 | else if (kill_proc(tk->tsk, tk->addr, trapno, | 372 | else if (kill_proc(tk->tsk, tk->addr, trapno, |
373 | pfn, page, flags) < 0) | 373 | pfn, page, flags) < 0) |
374 | printk(KERN_ERR | 374 | printk(KERN_ERR |
375 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", | 375 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", |
376 | pfn, tk->tsk->comm, tk->tsk->pid); | 376 | pfn, tk->tsk->comm, tk->tsk->pid); |
377 | } | 377 | } |
378 | put_task_struct(tk->tsk); | 378 | put_task_struct(tk->tsk); |
379 | kfree(tk); | 379 | kfree(tk); |
380 | } | 380 | } |
381 | } | 381 | } |
382 | 382 | ||
383 | /* | 383 | /* |
384 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) | 384 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) |
385 | * on behalf of the thread group. Return task_struct of the (first found) | 385 | * on behalf of the thread group. Return task_struct of the (first found) |
386 | * dedicated thread if found, and return NULL otherwise. | 386 | * dedicated thread if found, and return NULL otherwise. |
387 | * | 387 | * |
388 | * We already hold read_lock(&tasklist_lock) in the caller, so we don't | 388 | * We already hold read_lock(&tasklist_lock) in the caller, so we don't |
389 | * have to call rcu_read_lock/unlock() in this function. | 389 | * have to call rcu_read_lock/unlock() in this function. |
390 | */ | 390 | */ |
391 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) | 391 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) |
392 | { | 392 | { |
393 | struct task_struct *t; | 393 | struct task_struct *t; |
394 | 394 | ||
395 | for_each_thread(tsk, t) | 395 | for_each_thread(tsk, t) |
396 | if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) | 396 | if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) |
397 | return t; | 397 | return t; |
398 | return NULL; | 398 | return NULL; |
399 | } | 399 | } |
400 | 400 | ||
401 | /* | 401 | /* |
402 | * Determine whether a given process is "early kill" process which expects | 402 | * Determine whether a given process is "early kill" process which expects |
403 | * to be signaled when some page under the process is hwpoisoned. | 403 | * to be signaled when some page under the process is hwpoisoned. |
404 | * Return task_struct of the dedicated thread (main thread unless explicitly | 404 | * Return task_struct of the dedicated thread (main thread unless explicitly |
405 | * specified) if the process is "early kill," and otherwise returns NULL. | 405 | * specified) if the process is "early kill," and otherwise returns NULL. |
406 | */ | 406 | */ |
407 | static struct task_struct *task_early_kill(struct task_struct *tsk, | 407 | static struct task_struct *task_early_kill(struct task_struct *tsk, |
408 | int force_early) | 408 | int force_early) |
409 | { | 409 | { |
410 | struct task_struct *t; | 410 | struct task_struct *t; |
411 | if (!tsk->mm) | 411 | if (!tsk->mm) |
412 | return NULL; | 412 | return NULL; |
413 | if (force_early) | 413 | if (force_early) |
414 | return tsk; | 414 | return tsk; |
415 | t = find_early_kill_thread(tsk); | 415 | t = find_early_kill_thread(tsk); |
416 | if (t) | 416 | if (t) |
417 | return t; | 417 | return t; |
418 | if (sysctl_memory_failure_early_kill) | 418 | if (sysctl_memory_failure_early_kill) |
419 | return tsk; | 419 | return tsk; |
420 | return NULL; | 420 | return NULL; |
421 | } | 421 | } |
422 | 422 | ||
423 | /* | 423 | /* |
424 | * Collect processes when the error hit an anonymous page. | 424 | * Collect processes when the error hit an anonymous page. |
425 | */ | 425 | */ |
426 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | 426 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
427 | struct to_kill **tkc, int force_early) | 427 | struct to_kill **tkc, int force_early) |
428 | { | 428 | { |
429 | struct vm_area_struct *vma; | 429 | struct vm_area_struct *vma; |
430 | struct task_struct *tsk; | 430 | struct task_struct *tsk; |
431 | struct anon_vma *av; | 431 | struct anon_vma *av; |
432 | pgoff_t pgoff; | 432 | pgoff_t pgoff; |
433 | 433 | ||
434 | av = page_lock_anon_vma_read(page); | 434 | av = page_lock_anon_vma_read(page); |
435 | if (av == NULL) /* Not actually mapped anymore */ | 435 | if (av == NULL) /* Not actually mapped anymore */ |
436 | return; | 436 | return; |
437 | 437 | ||
438 | pgoff = page_to_pgoff(page); | 438 | pgoff = page_to_pgoff(page); |
439 | read_lock(&tasklist_lock); | 439 | read_lock(&tasklist_lock); |
440 | for_each_process (tsk) { | 440 | for_each_process (tsk) { |
441 | struct anon_vma_chain *vmac; | 441 | struct anon_vma_chain *vmac; |
442 | struct task_struct *t = task_early_kill(tsk, force_early); | 442 | struct task_struct *t = task_early_kill(tsk, force_early); |
443 | 443 | ||
444 | if (!t) | 444 | if (!t) |
445 | continue; | 445 | continue; |
446 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, | 446 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
447 | pgoff, pgoff) { | 447 | pgoff, pgoff) { |
448 | vma = vmac->vma; | 448 | vma = vmac->vma; |
449 | if (!page_mapped_in_vma(page, vma)) | 449 | if (!page_mapped_in_vma(page, vma)) |
450 | continue; | 450 | continue; |
451 | if (vma->vm_mm == t->mm) | 451 | if (vma->vm_mm == t->mm) |
452 | add_to_kill(t, page, vma, to_kill, tkc); | 452 | add_to_kill(t, page, vma, to_kill, tkc); |
453 | } | 453 | } |
454 | } | 454 | } |
455 | read_unlock(&tasklist_lock); | 455 | read_unlock(&tasklist_lock); |
456 | page_unlock_anon_vma_read(av); | 456 | page_unlock_anon_vma_read(av); |
457 | } | 457 | } |
458 | 458 | ||
459 | /* | 459 | /* |
460 | * Collect processes when the error hit a file mapped page. | 460 | * Collect processes when the error hit a file mapped page. |
461 | */ | 461 | */ |
462 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | 462 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
463 | struct to_kill **tkc, int force_early) | 463 | struct to_kill **tkc, int force_early) |
464 | { | 464 | { |
465 | struct vm_area_struct *vma; | 465 | struct vm_area_struct *vma; |
466 | struct task_struct *tsk; | 466 | struct task_struct *tsk; |
467 | struct address_space *mapping = page->mapping; | 467 | struct address_space *mapping = page->mapping; |
468 | 468 | ||
469 | mutex_lock(&mapping->i_mmap_mutex); | 469 | mutex_lock(&mapping->i_mmap_mutex); |
470 | read_lock(&tasklist_lock); | 470 | read_lock(&tasklist_lock); |
471 | for_each_process(tsk) { | 471 | for_each_process(tsk) { |
472 | pgoff_t pgoff = page_to_pgoff(page); | 472 | pgoff_t pgoff = page_to_pgoff(page); |
473 | struct task_struct *t = task_early_kill(tsk, force_early); | 473 | struct task_struct *t = task_early_kill(tsk, force_early); |
474 | 474 | ||
475 | if (!t) | 475 | if (!t) |
476 | continue; | 476 | continue; |
477 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, | 477 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
478 | pgoff) { | 478 | pgoff) { |
479 | /* | 479 | /* |
480 | * Send early kill signal to tasks where a vma covers | 480 | * Send early kill signal to tasks where a vma covers |
481 | * the page but the corrupted page is not necessarily | 481 | * the page but the corrupted page is not necessarily |
482 | * mapped it in its pte. | 482 | * mapped it in its pte. |
483 | * Assume applications who requested early kill want | 483 | * Assume applications who requested early kill want |
484 | * to be informed of all such data corruptions. | 484 | * to be informed of all such data corruptions. |
485 | */ | 485 | */ |
486 | if (vma->vm_mm == t->mm) | 486 | if (vma->vm_mm == t->mm) |
487 | add_to_kill(t, page, vma, to_kill, tkc); | 487 | add_to_kill(t, page, vma, to_kill, tkc); |
488 | } | 488 | } |
489 | } | 489 | } |
490 | read_unlock(&tasklist_lock); | 490 | read_unlock(&tasklist_lock); |
491 | mutex_unlock(&mapping->i_mmap_mutex); | 491 | mutex_unlock(&mapping->i_mmap_mutex); |
492 | } | 492 | } |
493 | 493 | ||
494 | /* | 494 | /* |
495 | * Collect the processes who have the corrupted page mapped to kill. | 495 | * Collect the processes who have the corrupted page mapped to kill. |
496 | * This is done in two steps for locking reasons. | 496 | * This is done in two steps for locking reasons. |
497 | * First preallocate one tokill structure outside the spin locks, | 497 | * First preallocate one tokill structure outside the spin locks, |
498 | * so that we can kill at least one process reasonably reliable. | 498 | * so that we can kill at least one process reasonably reliable. |
499 | */ | 499 | */ |
500 | static void collect_procs(struct page *page, struct list_head *tokill, | 500 | static void collect_procs(struct page *page, struct list_head *tokill, |
501 | int force_early) | 501 | int force_early) |
502 | { | 502 | { |
503 | struct to_kill *tk; | 503 | struct to_kill *tk; |
504 | 504 | ||
505 | if (!page->mapping) | 505 | if (!page->mapping) |
506 | return; | 506 | return; |
507 | 507 | ||
508 | tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); | 508 | tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); |
509 | if (!tk) | 509 | if (!tk) |
510 | return; | 510 | return; |
511 | if (PageAnon(page)) | 511 | if (PageAnon(page)) |
512 | collect_procs_anon(page, tokill, &tk, force_early); | 512 | collect_procs_anon(page, tokill, &tk, force_early); |
513 | else | 513 | else |
514 | collect_procs_file(page, tokill, &tk, force_early); | 514 | collect_procs_file(page, tokill, &tk, force_early); |
515 | kfree(tk); | 515 | kfree(tk); |
516 | } | 516 | } |
517 | 517 | ||
518 | /* | 518 | /* |
519 | * Error handlers for various types of pages. | 519 | * Error handlers for various types of pages. |
520 | */ | 520 | */ |
521 | 521 | ||
522 | enum outcome { | 522 | enum outcome { |
523 | IGNORED, /* Error: cannot be handled */ | 523 | IGNORED, /* Error: cannot be handled */ |
524 | FAILED, /* Error: handling failed */ | 524 | FAILED, /* Error: handling failed */ |
525 | DELAYED, /* Will be handled later */ | 525 | DELAYED, /* Will be handled later */ |
526 | RECOVERED, /* Successfully recovered */ | 526 | RECOVERED, /* Successfully recovered */ |
527 | }; | 527 | }; |
528 | 528 | ||
529 | static const char *action_name[] = { | 529 | static const char *action_name[] = { |
530 | [IGNORED] = "Ignored", | 530 | [IGNORED] = "Ignored", |
531 | [FAILED] = "Failed", | 531 | [FAILED] = "Failed", |
532 | [DELAYED] = "Delayed", | 532 | [DELAYED] = "Delayed", |
533 | [RECOVERED] = "Recovered", | 533 | [RECOVERED] = "Recovered", |
534 | }; | 534 | }; |
535 | 535 | ||
536 | /* | 536 | /* |
537 | * XXX: It is possible that a page is isolated from LRU cache, | 537 | * XXX: It is possible that a page is isolated from LRU cache, |
538 | * and then kept in swap cache or failed to remove from page cache. | 538 | * and then kept in swap cache or failed to remove from page cache. |
539 | * The page count will stop it from being freed by unpoison. | 539 | * The page count will stop it from being freed by unpoison. |
540 | * Stress tests should be aware of this memory leak problem. | 540 | * Stress tests should be aware of this memory leak problem. |
541 | */ | 541 | */ |
542 | static int delete_from_lru_cache(struct page *p) | 542 | static int delete_from_lru_cache(struct page *p) |
543 | { | 543 | { |
544 | if (!isolate_lru_page(p)) { | 544 | if (!isolate_lru_page(p)) { |
545 | /* | 545 | /* |
546 | * Clear sensible page flags, so that the buddy system won't | 546 | * Clear sensible page flags, so that the buddy system won't |
547 | * complain when the page is unpoison-and-freed. | 547 | * complain when the page is unpoison-and-freed. |
548 | */ | 548 | */ |
549 | ClearPageActive(p); | 549 | ClearPageActive(p); |
550 | ClearPageUnevictable(p); | 550 | ClearPageUnevictable(p); |
551 | /* | 551 | /* |
552 | * drop the page count elevated by isolate_lru_page() | 552 | * drop the page count elevated by isolate_lru_page() |
553 | */ | 553 | */ |
554 | page_cache_release(p); | 554 | page_cache_release(p); |
555 | return 0; | 555 | return 0; |
556 | } | 556 | } |
557 | return -EIO; | 557 | return -EIO; |
558 | } | 558 | } |
559 | 559 | ||
560 | /* | 560 | /* |
561 | * Error hit kernel page. | 561 | * Error hit kernel page. |
562 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 562 | * Do nothing, try to be lucky and not touch this instead. For a few cases we |
563 | * could be more sophisticated. | 563 | * could be more sophisticated. |
564 | */ | 564 | */ |
565 | static int me_kernel(struct page *p, unsigned long pfn) | 565 | static int me_kernel(struct page *p, unsigned long pfn) |
566 | { | 566 | { |
567 | return IGNORED; | 567 | return IGNORED; |
568 | } | 568 | } |
569 | 569 | ||
570 | /* | 570 | /* |
571 | * Page in unknown state. Do nothing. | 571 | * Page in unknown state. Do nothing. |
572 | */ | 572 | */ |
573 | static int me_unknown(struct page *p, unsigned long pfn) | 573 | static int me_unknown(struct page *p, unsigned long pfn) |
574 | { | 574 | { |
575 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); | 575 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); |
576 | return FAILED; | 576 | return FAILED; |
577 | } | 577 | } |
578 | 578 | ||
579 | /* | 579 | /* |
580 | * Clean (or cleaned) page cache page. | 580 | * Clean (or cleaned) page cache page. |
581 | */ | 581 | */ |
582 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 582 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
583 | { | 583 | { |
584 | int err; | 584 | int err; |
585 | int ret = FAILED; | 585 | int ret = FAILED; |
586 | struct address_space *mapping; | 586 | struct address_space *mapping; |
587 | 587 | ||
588 | delete_from_lru_cache(p); | 588 | delete_from_lru_cache(p); |
589 | 589 | ||
590 | /* | 590 | /* |
591 | * For anonymous pages we're done the only reference left | 591 | * For anonymous pages we're done the only reference left |
592 | * should be the one m_f() holds. | 592 | * should be the one m_f() holds. |
593 | */ | 593 | */ |
594 | if (PageAnon(p)) | 594 | if (PageAnon(p)) |
595 | return RECOVERED; | 595 | return RECOVERED; |
596 | 596 | ||
597 | /* | 597 | /* |
598 | * Now truncate the page in the page cache. This is really | 598 | * Now truncate the page in the page cache. This is really |
599 | * more like a "temporary hole punch" | 599 | * more like a "temporary hole punch" |
600 | * Don't do this for block devices when someone else | 600 | * Don't do this for block devices when someone else |
601 | * has a reference, because it could be file system metadata | 601 | * has a reference, because it could be file system metadata |
602 | * and that's not safe to truncate. | 602 | * and that's not safe to truncate. |
603 | */ | 603 | */ |
604 | mapping = page_mapping(p); | 604 | mapping = page_mapping(p); |
605 | if (!mapping) { | 605 | if (!mapping) { |
606 | /* | 606 | /* |
607 | * Page has been teared down in the meanwhile | 607 | * Page has been teared down in the meanwhile |
608 | */ | 608 | */ |
609 | return FAILED; | 609 | return FAILED; |
610 | } | 610 | } |
611 | 611 | ||
612 | /* | 612 | /* |
613 | * Truncation is a bit tricky. Enable it per file system for now. | 613 | * Truncation is a bit tricky. Enable it per file system for now. |
614 | * | 614 | * |
615 | * Open: to take i_mutex or not for this? Right now we don't. | 615 | * Open: to take i_mutex or not for this? Right now we don't. |
616 | */ | 616 | */ |
617 | if (mapping->a_ops->error_remove_page) { | 617 | if (mapping->a_ops->error_remove_page) { |
618 | err = mapping->a_ops->error_remove_page(mapping, p); | 618 | err = mapping->a_ops->error_remove_page(mapping, p); |
619 | if (err != 0) { | 619 | if (err != 0) { |
620 | printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", | 620 | printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", |
621 | pfn, err); | 621 | pfn, err); |
622 | } else if (page_has_private(p) && | 622 | } else if (page_has_private(p) && |
623 | !try_to_release_page(p, GFP_NOIO)) { | 623 | !try_to_release_page(p, GFP_NOIO)) { |
624 | pr_info("MCE %#lx: failed to release buffers\n", pfn); | 624 | pr_info("MCE %#lx: failed to release buffers\n", pfn); |
625 | } else { | 625 | } else { |
626 | ret = RECOVERED; | 626 | ret = RECOVERED; |
627 | } | 627 | } |
628 | } else { | 628 | } else { |
629 | /* | 629 | /* |
630 | * If the file system doesn't support it just invalidate | 630 | * If the file system doesn't support it just invalidate |
631 | * This fails on dirty or anything with private pages | 631 | * This fails on dirty or anything with private pages |
632 | */ | 632 | */ |
633 | if (invalidate_inode_page(p)) | 633 | if (invalidate_inode_page(p)) |
634 | ret = RECOVERED; | 634 | ret = RECOVERED; |
635 | else | 635 | else |
636 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", | 636 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", |
637 | pfn); | 637 | pfn); |
638 | } | 638 | } |
639 | return ret; | 639 | return ret; |
640 | } | 640 | } |
641 | 641 | ||
642 | /* | 642 | /* |
643 | * Dirty pagecache page | 643 | * Dirty pagecache page |
644 | * Issues: when the error hit a hole page the error is not properly | 644 | * Issues: when the error hit a hole page the error is not properly |
645 | * propagated. | 645 | * propagated. |
646 | */ | 646 | */ |
647 | static int me_pagecache_dirty(struct page *p, unsigned long pfn) | 647 | static int me_pagecache_dirty(struct page *p, unsigned long pfn) |
648 | { | 648 | { |
649 | struct address_space *mapping = page_mapping(p); | 649 | struct address_space *mapping = page_mapping(p); |
650 | 650 | ||
651 | SetPageError(p); | 651 | SetPageError(p); |
652 | /* TBD: print more information about the file. */ | 652 | /* TBD: print more information about the file. */ |
653 | if (mapping) { | 653 | if (mapping) { |
654 | /* | 654 | /* |
655 | * IO error will be reported by write(), fsync(), etc. | 655 | * IO error will be reported by write(), fsync(), etc. |
656 | * who check the mapping. | 656 | * who check the mapping. |
657 | * This way the application knows that something went | 657 | * This way the application knows that something went |
658 | * wrong with its dirty file data. | 658 | * wrong with its dirty file data. |
659 | * | 659 | * |
660 | * There's one open issue: | 660 | * There's one open issue: |
661 | * | 661 | * |
662 | * The EIO will be only reported on the next IO | 662 | * The EIO will be only reported on the next IO |
663 | * operation and then cleared through the IO map. | 663 | * operation and then cleared through the IO map. |
664 | * Normally Linux has two mechanisms to pass IO error | 664 | * Normally Linux has two mechanisms to pass IO error |
665 | * first through the AS_EIO flag in the address space | 665 | * first through the AS_EIO flag in the address space |
666 | * and then through the PageError flag in the page. | 666 | * and then through the PageError flag in the page. |
667 | * Since we drop pages on memory failure handling the | 667 | * Since we drop pages on memory failure handling the |
668 | * only mechanism open to use is through AS_AIO. | 668 | * only mechanism open to use is through AS_AIO. |
669 | * | 669 | * |
670 | * This has the disadvantage that it gets cleared on | 670 | * This has the disadvantage that it gets cleared on |
671 | * the first operation that returns an error, while | 671 | * the first operation that returns an error, while |
672 | * the PageError bit is more sticky and only cleared | 672 | * the PageError bit is more sticky and only cleared |
673 | * when the page is reread or dropped. If an | 673 | * when the page is reread or dropped. If an |
674 | * application assumes it will always get error on | 674 | * application assumes it will always get error on |
675 | * fsync, but does other operations on the fd before | 675 | * fsync, but does other operations on the fd before |
676 | * and the page is dropped between then the error | 676 | * and the page is dropped between then the error |
677 | * will not be properly reported. | 677 | * will not be properly reported. |
678 | * | 678 | * |
679 | * This can already happen even without hwpoisoned | 679 | * This can already happen even without hwpoisoned |
680 | * pages: first on metadata IO errors (which only | 680 | * pages: first on metadata IO errors (which only |
681 | * report through AS_EIO) or when the page is dropped | 681 | * report through AS_EIO) or when the page is dropped |
682 | * at the wrong time. | 682 | * at the wrong time. |
683 | * | 683 | * |
684 | * So right now we assume that the application DTRT on | 684 | * So right now we assume that the application DTRT on |
685 | * the first EIO, but we're not worse than other parts | 685 | * the first EIO, but we're not worse than other parts |
686 | * of the kernel. | 686 | * of the kernel. |
687 | */ | 687 | */ |
688 | mapping_set_error(mapping, EIO); | 688 | mapping_set_error(mapping, EIO); |
689 | } | 689 | } |
690 | 690 | ||
691 | return me_pagecache_clean(p, pfn); | 691 | return me_pagecache_clean(p, pfn); |
692 | } | 692 | } |
693 | 693 | ||
694 | /* | 694 | /* |
695 | * Clean and dirty swap cache. | 695 | * Clean and dirty swap cache. |
696 | * | 696 | * |
697 | * Dirty swap cache page is tricky to handle. The page could live both in page | 697 | * Dirty swap cache page is tricky to handle. The page could live both in page |
698 | * cache and swap cache(ie. page is freshly swapped in). So it could be | 698 | * cache and swap cache(ie. page is freshly swapped in). So it could be |
699 | * referenced concurrently by 2 types of PTEs: | 699 | * referenced concurrently by 2 types of PTEs: |
700 | * normal PTEs and swap PTEs. We try to handle them consistently by calling | 700 | * normal PTEs and swap PTEs. We try to handle them consistently by calling |
701 | * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, | 701 | * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, |
702 | * and then | 702 | * and then |
703 | * - clear dirty bit to prevent IO | 703 | * - clear dirty bit to prevent IO |
704 | * - remove from LRU | 704 | * - remove from LRU |
705 | * - but keep in the swap cache, so that when we return to it on | 705 | * - but keep in the swap cache, so that when we return to it on |
706 | * a later page fault, we know the application is accessing | 706 | * a later page fault, we know the application is accessing |
707 | * corrupted data and shall be killed (we installed simple | 707 | * corrupted data and shall be killed (we installed simple |
708 | * interception code in do_swap_page to catch it). | 708 | * interception code in do_swap_page to catch it). |
709 | * | 709 | * |
710 | * Clean swap cache pages can be directly isolated. A later page fault will | 710 | * Clean swap cache pages can be directly isolated. A later page fault will |
711 | * bring in the known good data from disk. | 711 | * bring in the known good data from disk. |
712 | */ | 712 | */ |
713 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) | 713 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) |
714 | { | 714 | { |
715 | ClearPageDirty(p); | 715 | ClearPageDirty(p); |
716 | /* Trigger EIO in shmem: */ | 716 | /* Trigger EIO in shmem: */ |
717 | ClearPageUptodate(p); | 717 | ClearPageUptodate(p); |
718 | 718 | ||
719 | if (!delete_from_lru_cache(p)) | 719 | if (!delete_from_lru_cache(p)) |
720 | return DELAYED; | 720 | return DELAYED; |
721 | else | 721 | else |
722 | return FAILED; | 722 | return FAILED; |
723 | } | 723 | } |
724 | 724 | ||
725 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 725 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
726 | { | 726 | { |
727 | delete_from_swap_cache(p); | 727 | delete_from_swap_cache(p); |
728 | 728 | ||
729 | if (!delete_from_lru_cache(p)) | 729 | if (!delete_from_lru_cache(p)) |
730 | return RECOVERED; | 730 | return RECOVERED; |
731 | else | 731 | else |
732 | return FAILED; | 732 | return FAILED; |
733 | } | 733 | } |
734 | 734 | ||
735 | /* | 735 | /* |
736 | * Huge pages. Needs work. | 736 | * Huge pages. Needs work. |
737 | * Issues: | 737 | * Issues: |
738 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) | 738 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
739 | * To narrow down kill region to one page, we need to break up pmd. | 739 | * To narrow down kill region to one page, we need to break up pmd. |
740 | */ | 740 | */ |
741 | static int me_huge_page(struct page *p, unsigned long pfn) | 741 | static int me_huge_page(struct page *p, unsigned long pfn) |
742 | { | 742 | { |
743 | int res = 0; | 743 | int res = 0; |
744 | struct page *hpage = compound_head(p); | 744 | struct page *hpage = compound_head(p); |
745 | /* | 745 | /* |
746 | * We can safely recover from error on free or reserved (i.e. | 746 | * We can safely recover from error on free or reserved (i.e. |
747 | * not in-use) hugepage by dequeuing it from freelist. | 747 | * not in-use) hugepage by dequeuing it from freelist. |
748 | * To check whether a hugepage is in-use or not, we can't use | 748 | * To check whether a hugepage is in-use or not, we can't use |
749 | * page->lru because it can be used in other hugepage operations, | 749 | * page->lru because it can be used in other hugepage operations, |
750 | * such as __unmap_hugepage_range() and gather_surplus_pages(). | 750 | * such as __unmap_hugepage_range() and gather_surplus_pages(). |
751 | * So instead we use page_mapping() and PageAnon(). | 751 | * So instead we use page_mapping() and PageAnon(). |
752 | * We assume that this function is called with page lock held, | 752 | * We assume that this function is called with page lock held, |
753 | * so there is no race between isolation and mapping/unmapping. | 753 | * so there is no race between isolation and mapping/unmapping. |
754 | */ | 754 | */ |
755 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 755 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
756 | res = dequeue_hwpoisoned_huge_page(hpage); | 756 | res = dequeue_hwpoisoned_huge_page(hpage); |
757 | if (!res) | 757 | if (!res) |
758 | return RECOVERED; | 758 | return RECOVERED; |
759 | } | 759 | } |
760 | return DELAYED; | 760 | return DELAYED; |
761 | } | 761 | } |
762 | 762 | ||
763 | /* | 763 | /* |
764 | * Various page states we can handle. | 764 | * Various page states we can handle. |
765 | * | 765 | * |
766 | * A page state is defined by its current page->flags bits. | 766 | * A page state is defined by its current page->flags bits. |
767 | * The table matches them in order and calls the right handler. | 767 | * The table matches them in order and calls the right handler. |
768 | * | 768 | * |
769 | * This is quite tricky because we can access page at any time | 769 | * This is quite tricky because we can access page at any time |
770 | * in its live cycle, so all accesses have to be extremely careful. | 770 | * in its live cycle, so all accesses have to be extremely careful. |
771 | * | 771 | * |
772 | * This is not complete. More states could be added. | 772 | * This is not complete. More states could be added. |
773 | * For any missing state don't attempt recovery. | 773 | * For any missing state don't attempt recovery. |
774 | */ | 774 | */ |
775 | 775 | ||
776 | #define dirty (1UL << PG_dirty) | 776 | #define dirty (1UL << PG_dirty) |
777 | #define sc (1UL << PG_swapcache) | 777 | #define sc (1UL << PG_swapcache) |
778 | #define unevict (1UL << PG_unevictable) | 778 | #define unevict (1UL << PG_unevictable) |
779 | #define mlock (1UL << PG_mlocked) | 779 | #define mlock (1UL << PG_mlocked) |
780 | #define writeback (1UL << PG_writeback) | 780 | #define writeback (1UL << PG_writeback) |
781 | #define lru (1UL << PG_lru) | 781 | #define lru (1UL << PG_lru) |
782 | #define swapbacked (1UL << PG_swapbacked) | 782 | #define swapbacked (1UL << PG_swapbacked) |
783 | #define head (1UL << PG_head) | 783 | #define head (1UL << PG_head) |
784 | #define tail (1UL << PG_tail) | 784 | #define tail (1UL << PG_tail) |
785 | #define compound (1UL << PG_compound) | 785 | #define compound (1UL << PG_compound) |
786 | #define slab (1UL << PG_slab) | 786 | #define slab (1UL << PG_slab) |
787 | #define reserved (1UL << PG_reserved) | 787 | #define reserved (1UL << PG_reserved) |
788 | 788 | ||
789 | static struct page_state { | 789 | static struct page_state { |
790 | unsigned long mask; | 790 | unsigned long mask; |
791 | unsigned long res; | 791 | unsigned long res; |
792 | char *msg; | 792 | char *msg; |
793 | int (*action)(struct page *p, unsigned long pfn); | 793 | int (*action)(struct page *p, unsigned long pfn); |
794 | } error_states[] = { | 794 | } error_states[] = { |
795 | { reserved, reserved, "reserved kernel", me_kernel }, | 795 | { reserved, reserved, "reserved kernel", me_kernel }, |
796 | /* | 796 | /* |
797 | * free pages are specially detected outside this table: | 797 | * free pages are specially detected outside this table: |
798 | * PG_buddy pages only make a small fraction of all free pages. | 798 | * PG_buddy pages only make a small fraction of all free pages. |
799 | */ | 799 | */ |
800 | 800 | ||
801 | /* | 801 | /* |
802 | * Could in theory check if slab page is free or if we can drop | 802 | * Could in theory check if slab page is free or if we can drop |
803 | * currently unused objects without touching them. But just | 803 | * currently unused objects without touching them. But just |
804 | * treat it as standard kernel for now. | 804 | * treat it as standard kernel for now. |
805 | */ | 805 | */ |
806 | { slab, slab, "kernel slab", me_kernel }, | 806 | { slab, slab, "kernel slab", me_kernel }, |
807 | 807 | ||
808 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 808 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
809 | { head, head, "huge", me_huge_page }, | 809 | { head, head, "huge", me_huge_page }, |
810 | { tail, tail, "huge", me_huge_page }, | 810 | { tail, tail, "huge", me_huge_page }, |
811 | #else | 811 | #else |
812 | { compound, compound, "huge", me_huge_page }, | 812 | { compound, compound, "huge", me_huge_page }, |
813 | #endif | 813 | #endif |
814 | 814 | ||
815 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 815 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
816 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 816 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
817 | 817 | ||
818 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 818 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
819 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, | 819 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, |
820 | 820 | ||
821 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | 821 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, |
822 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, | 822 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, |
823 | 823 | ||
824 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 824 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
825 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 825 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
826 | 826 | ||
827 | /* | 827 | /* |
828 | * Catchall entry: must be at end. | 828 | * Catchall entry: must be at end. |
829 | */ | 829 | */ |
830 | { 0, 0, "unknown page state", me_unknown }, | 830 | { 0, 0, "unknown page state", me_unknown }, |
831 | }; | 831 | }; |
832 | 832 | ||
833 | #undef dirty | 833 | #undef dirty |
834 | #undef sc | 834 | #undef sc |
835 | #undef unevict | 835 | #undef unevict |
836 | #undef mlock | 836 | #undef mlock |
837 | #undef writeback | 837 | #undef writeback |
838 | #undef lru | 838 | #undef lru |
839 | #undef swapbacked | 839 | #undef swapbacked |
840 | #undef head | 840 | #undef head |
841 | #undef tail | 841 | #undef tail |
842 | #undef compound | 842 | #undef compound |
843 | #undef slab | 843 | #undef slab |
844 | #undef reserved | 844 | #undef reserved |
845 | 845 | ||
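The error_states[] table above is searched first-match: the handler walks it until (p->flags & ps->mask) == ps->res holds, so more specific entries (dirty swapcache) must precede more general ones (plain LRU), and the zero-mask catchall terminates every walk. A minimal userspace sketch of the same matching discipline, with made-up flag bits standing in for the real PG_* values:

    #include <stdio.h>

    /* hypothetical stand-ins for the real PG_* bits */
    #define DIRTY	(1UL << 0)
    #define SC	(1UL << 1)
    #define LRU	(1UL << 2)

    struct state { unsigned long mask, res; const char *msg; };

    static const struct state states[] = {
    	{ SC|DIRTY,  SC|DIRTY,  "dirty swapcache" },
    	{ SC|DIRTY,  SC,        "clean swapcache" },
    	{ LRU|DIRTY, LRU|DIRTY, "dirty LRU" },
    	{ LRU|DIRTY, LRU,       "clean LRU" },
    	{ 0, 0, "unknown page state" },	/* catchall: must be last */
    };

    int main(void)
    {
    	unsigned long flags = SC | LRU;	/* clean swapcache page on the LRU */
    	const struct state *s;

    	for (s = states; ; s++)
    		if ((flags & s->mask) == s->res)
    			break;
    	printf("%s\n", s->msg);	/* first match wins: "clean swapcache" */
    	return 0;
    }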
846 | /* | 846 | /* |
847 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | 847 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
848 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | 848 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
849 | */ | 849 | */ |
850 | static void action_result(unsigned long pfn, char *msg, int result) | 850 | static void action_result(unsigned long pfn, char *msg, int result) |
851 | { | 851 | { |
852 | pr_err("MCE %#lx: %s page recovery: %s\n", | 852 | pr_err("MCE %#lx: %s page recovery: %s\n", |
853 | pfn, msg, action_name[result]); | 853 | pfn, msg, action_name[result]); |
854 | } | 854 | } |
855 | 855 | ||
856 | static int page_action(struct page_state *ps, struct page *p, | 856 | static int page_action(struct page_state *ps, struct page *p, |
857 | unsigned long pfn) | 857 | unsigned long pfn) |
858 | { | 858 | { |
859 | int result; | 859 | int result; |
860 | int count; | 860 | int count; |
861 | 861 | ||
862 | result = ps->action(p, pfn); | 862 | result = ps->action(p, pfn); |
863 | action_result(pfn, ps->msg, result); | 863 | action_result(pfn, ps->msg, result); |
864 | 864 | ||
865 | count = page_count(p) - 1; | 865 | count = page_count(p) - 1; |
866 | if (ps->action == me_swapcache_dirty && result == DELAYED) | 866 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
867 | count--; | 867 | count--; |
868 | if (count != 0) { | 868 | if (count != 0) { |
869 | printk(KERN_ERR | 869 | printk(KERN_ERR |
870 | "MCE %#lx: %s page still referenced by %d users\n", | 870 | "MCE %#lx: %s page still referenced by %d users\n", |
871 | pfn, ps->msg, count); | 871 | pfn, ps->msg, count); |
872 | result = FAILED; | 872 | result = FAILED; |
873 | } | 873 | } |
874 | 874 | ||
875 | /* Could do more checks here if page looks ok */ | 875 | /* Could do more checks here if page looks ok */ |
876 | /* | 876 | /* |
877 | * Could adjust zone counters here to correct for the missing page. | 877 | * Could adjust zone counters here to correct for the missing page. |
878 | */ | 878 | */ |
879 | 879 | ||
880 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; | 880 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
881 | } | 881 | } |
882 | 882 | ||
883 | /* | 883 | /* |
884 | * Do all that is necessary to remove user space mappings. Unmap | 884 | * Do all that is necessary to remove user space mappings. Unmap |
885 | * the pages and send SIGBUS to the processes if the data was dirty. | 885 | * the pages and send SIGBUS to the processes if the data was dirty. |
886 | */ | 886 | */ |
887 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | 887 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
888 | int trapno, int flags, struct page **hpagep) | 888 | int trapno, int flags, struct page **hpagep) |
889 | { | 889 | { |
890 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 890 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
891 | struct address_space *mapping; | 891 | struct address_space *mapping; |
892 | LIST_HEAD(tokill); | 892 | LIST_HEAD(tokill); |
893 | int ret; | 893 | int ret; |
894 | int kill = 1, forcekill; | 894 | int kill = 1, forcekill; |
895 | struct page *hpage = *hpagep; | 895 | struct page *hpage = *hpagep; |
896 | struct page *ppage; | 896 | struct page *ppage; |
897 | 897 | ||
898 | if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) | 898 | /* |
899 | * Here we are interested only in user-mapped pages, so skip any | ||
900 | * other types of pages. | ||
901 | */ | ||
902 | if (PageReserved(p) || PageSlab(p)) | ||
903 | return SWAP_SUCCESS; | ||
904 | if (!(PageLRU(hpage) || PageHuge(p))) | ||
899 | return SWAP_SUCCESS; | 905 | return SWAP_SUCCESS; |
900 | 906 | ||
901 | /* | 907 | /* |
902 | * This check implies we don't kill processes if their pages | 908 | * This check implies we don't kill processes if their pages |
903 | * are in the swap cache early. Those are always late kills. | 909 | * are in the swap cache early. Those are always late kills. |
904 | */ | 910 | */ |
905 | if (!page_mapped(hpage)) | 911 | if (!page_mapped(hpage)) |
906 | return SWAP_SUCCESS; | 912 | return SWAP_SUCCESS; |
907 | 913 | ||
908 | if (PageKsm(p)) | 914 | if (PageKsm(p)) |
909 | return SWAP_FAIL; | 915 | return SWAP_FAIL; |
910 | 916 | ||
911 | if (PageSwapCache(p)) { | 917 | if (PageSwapCache(p)) { |
912 | printk(KERN_ERR | 918 | printk(KERN_ERR |
913 | "MCE %#lx: keeping poisoned page in swap cache\n", pfn); | 919 | "MCE %#lx: keeping poisoned page in swap cache\n", pfn); |
914 | ttu |= TTU_IGNORE_HWPOISON; | 920 | ttu |= TTU_IGNORE_HWPOISON; |
915 | } | 921 | } |
916 | 922 | ||
917 | /* | 923 | /* |
918 | * Propagate the dirty bit from PTEs to struct page first, because we | 924 | * Propagate the dirty bit from PTEs to struct page first, because we |
919 | * need this to decide if we should kill or just drop the page. | 925 | * need this to decide if we should kill or just drop the page. |
920 | * XXX: the dirty test could be racy: set_page_dirty() may not always | 926 | * XXX: the dirty test could be racy: set_page_dirty() may not always |
921 | * be called inside page lock (it's recommended but not enforced). | 927 | * be called inside page lock (it's recommended but not enforced). |
922 | */ | 928 | */ |
923 | mapping = page_mapping(hpage); | 929 | mapping = page_mapping(hpage); |
924 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && | 930 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && |
925 | mapping_cap_writeback_dirty(mapping)) { | 931 | mapping_cap_writeback_dirty(mapping)) { |
926 | if (page_mkclean(hpage)) { | 932 | if (page_mkclean(hpage)) { |
927 | SetPageDirty(hpage); | 933 | SetPageDirty(hpage); |
928 | } else { | 934 | } else { |
929 | kill = 0; | 935 | kill = 0; |
930 | ttu |= TTU_IGNORE_HWPOISON; | 936 | ttu |= TTU_IGNORE_HWPOISON; |
931 | printk(KERN_INFO | 937 | printk(KERN_INFO |
932 | "MCE %#lx: corrupted page was clean: dropped without side effects\n", | 938 | "MCE %#lx: corrupted page was clean: dropped without side effects\n", |
933 | pfn); | 939 | pfn); |
934 | } | 940 | } |
935 | } | 941 | } |
936 | 942 | ||
937 | /* | 943 | /* |
938 | * ppage: the poisoned page to act on. | 944 | * ppage: the poisoned page to act on. |
939 | * If p is a regular (4kB) page, | 945 | * If p is a regular (4kB) page, |
940 | * ppage == the real poisoned page; | 946 | * ppage == the real poisoned page; |
941 | * else p is hugetlb or THP and ppage == the head page. | 947 | * else p is hugetlb or THP and ppage == the head page. |
942 | */ | 948 | */ |
943 | ppage = hpage; | 949 | ppage = hpage; |
944 | 950 | ||
945 | if (PageTransHuge(hpage)) { | 951 | if (PageTransHuge(hpage)) { |
946 | /* | 952 | /* |
947 | * Verify that this isn't a hugetlbfs head page; the check for | 953 | * Verify that this isn't a hugetlbfs head page; the check for |
948 | * PageAnon is just to avoid tripping a split_huge_page | 954 | * PageAnon is just to avoid tripping a split_huge_page |
949 | * internal debug check, as split_huge_page refuses to deal with | 955 | * internal debug check, as split_huge_page refuses to deal with |
950 | * anything that isn't an anon page. PageAnon can't go away from | 956 | * anything that isn't an anon page. PageAnon can't go away from |
951 | * under us because we hold a refcount on the hpage; without a | 957 | * under us because we hold a refcount on the hpage; without a |
952 | * refcount on the hpage, split_huge_page can't be safely called | 958 | * refcount on the hpage, split_huge_page can't be safely called |
953 | * in the first place, as having a refcount on the tail isn't | 959 | * in the first place, as having a refcount on the tail isn't |
954 | * enough to be safe. | 960 | * enough to be safe. |
955 | */ | 961 | */ |
956 | if (!PageHuge(hpage) && PageAnon(hpage)) { | 962 | if (!PageHuge(hpage) && PageAnon(hpage)) { |
957 | if (unlikely(split_huge_page(hpage))) { | 963 | if (unlikely(split_huge_page(hpage))) { |
958 | /* | 964 | /* |
959 | * FIXME: if splitting the THP fails, it would be | 965 | * FIXME: if splitting the THP fails, it would be |
960 | * better to stop the following operation rather | 966 | * better to stop the following operation rather |
961 | * than cause a panic by unmapping. The system might | 967 | * than cause a panic by unmapping. The system might |
962 | * survive if the page is freed later. | 968 | * survive if the page is freed later. |
963 | */ | 969 | */ |
964 | printk(KERN_INFO | 970 | printk(KERN_INFO |
965 | "MCE %#lx: failed to split THP\n", pfn); | 971 | "MCE %#lx: failed to split THP\n", pfn); |
966 | 972 | ||
967 | BUG_ON(!PageHWPoison(p)); | 973 | BUG_ON(!PageHWPoison(p)); |
968 | return SWAP_FAIL; | 974 | return SWAP_FAIL; |
969 | } | 975 | } |
970 | /* | 976 | /* |
971 | * We pinned the head page for hwpoison handling, | 977 | * We pinned the head page for hwpoison handling, |
972 | * now we split the thp and we are interested in | 978 | * now we split the thp and we are interested in |
973 | * the hwpoisoned raw page, so move the refcount | 979 | * the hwpoisoned raw page, so move the refcount |
974 | * to it. Similarly, the page lock is shifted. | 980 | * to it. Similarly, the page lock is shifted. |
975 | */ | 981 | */ |
976 | if (hpage != p) { | 982 | if (hpage != p) { |
977 | if (!(flags & MF_COUNT_INCREASED)) { | 983 | if (!(flags & MF_COUNT_INCREASED)) { |
978 | put_page(hpage); | 984 | put_page(hpage); |
979 | get_page(p); | 985 | get_page(p); |
980 | } | 986 | } |
981 | lock_page(p); | 987 | lock_page(p); |
982 | unlock_page(hpage); | 988 | unlock_page(hpage); |
983 | *hpagep = p; | 989 | *hpagep = p; |
984 | } | 990 | } |
985 | /* THP is split, so ppage should be the real poisoned page. */ | 991 | /* THP is split, so ppage should be the real poisoned page. */ |
986 | ppage = p; | 992 | ppage = p; |
987 | } | 993 | } |
988 | } | 994 | } |
989 | 995 | ||
990 | /* | 996 | /* |
991 | * First collect all the processes that have the page | 997 | * First collect all the processes that have the page |
992 | * mapped in dirty form. This has to be done before try_to_unmap, | 998 | * mapped in dirty form. This has to be done before try_to_unmap, |
993 | * because ttu takes the rmap data structures down. | 999 | * because ttu takes the rmap data structures down. |
994 | * | 1000 | * |
995 | * Error handling: We ignore errors here because | 1001 | * Error handling: We ignore errors here because |
996 | * there's nothing that can be done. | 1002 | * there's nothing that can be done. |
997 | */ | 1003 | */ |
998 | if (kill) | 1004 | if (kill) |
999 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); | 1005 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); |
1000 | 1006 | ||
1001 | ret = try_to_unmap(ppage, ttu); | 1007 | ret = try_to_unmap(ppage, ttu); |
1002 | if (ret != SWAP_SUCCESS) | 1008 | if (ret != SWAP_SUCCESS) |
1003 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 1009 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
1004 | pfn, page_mapcount(ppage)); | 1010 | pfn, page_mapcount(ppage)); |
1005 | 1011 | ||
1006 | /* | 1012 | /* |
1007 | * Now that the dirty bit has been propagated to the | 1013 | * Now that the dirty bit has been propagated to the |
1008 | * struct page and all unmaps done we can decide if | 1014 | * struct page and all unmaps done we can decide if |
1009 | * killing is needed or not. Only kill when the page | 1015 | * killing is needed or not. Only kill when the page |
1010 | * was dirty or the process is not restartable, | 1016 | * was dirty or the process is not restartable, |
1011 | * otherwise the tokill list is merely | 1017 | * otherwise the tokill list is merely |
1012 | * freed. When there was a problem unmapping earlier | 1018 | * freed. When there was a problem unmapping earlier |
1013 | * use a more forceful, uncatchable kill to prevent | 1019 | * use a more forceful, uncatchable kill to prevent |
1014 | * any accesses to the poisoned memory. | 1020 | * any accesses to the poisoned memory. |
1015 | */ | 1021 | */ |
1016 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); | 1022 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); |
1017 | kill_procs(&tokill, forcekill, trapno, | 1023 | kill_procs(&tokill, forcekill, trapno, |
1018 | ret != SWAP_SUCCESS, p, pfn, flags); | 1024 | ret != SWAP_SUCCESS, p, pfn, flags); |
1019 | 1025 | ||
1020 | return ret; | 1026 | return ret; |
1021 | } | 1027 | } |
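The precheck at the top of hwpoison_user_mappings() is what this commit repairs: the old single test !PageLRU(p) rejected hugetlbfs pages (which never sit on the LRU) and raw THP subpages (PG_lru lives on the head page), so containment bailed out before any unmapping happened. The new form tests PageLRU on the head page and admits PageHuge explicitly. A small userspace mock of the two predicates, purely illustrative, showing which page classes each version lets through to the unmap path:

    #include <stdio.h>
    #include <stdbool.h>

    /* hypothetical page descriptor: just the predicates the precheck reads */
    struct mock_page {
    	bool reserved, slab;	/* PageReserved()/PageSlab() */
    	bool lru_raw, lru_head;	/* PageLRU() on the raw page / head page */
    	bool huge;		/* PageHuge() */
    };

    /* old precheck: if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) */
    static bool old_skips(const struct mock_page *p)
    {
    	return p->reserved || p->slab || !p->lru_raw;
    }

    /* new precheck: reserved/slab first, then LRU-on-head or hugetlb */
    static bool new_skips(const struct mock_page *p)
    {
    	if (p->reserved || p->slab)
    		return true;
    	return !(p->lru_head || p->huge);
    }

    int main(void)
    {
    	/* hugetlbfs page: never on the LRU */
    	struct mock_page hugetlb = { .huge = true };
    	/* THP subpage: PG_lru is set on the head page, not the raw page */
    	struct mock_page thp = { .lru_head = true };

    	printf("hugetlb:  old skips=%d, new skips=%d\n",
    	       old_skips(&hugetlb), new_skips(&hugetlb));	/* 1, 0 */
    	printf("thp page: old skips=%d, new skips=%d\n",
    	       old_skips(&thp), new_skips(&thp));		/* 1, 0 */
    	return 0;
    }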
1022 | 1028 | ||
1023 | static void set_page_hwpoison_huge_page(struct page *hpage) | 1029 | static void set_page_hwpoison_huge_page(struct page *hpage) |
1024 | { | 1030 | { |
1025 | int i; | 1031 | int i; |
1026 | int nr_pages = 1 << compound_order(hpage); | 1032 | int nr_pages = 1 << compound_order(hpage); |
1027 | for (i = 0; i < nr_pages; i++) | 1033 | for (i = 0; i < nr_pages; i++) |
1028 | SetPageHWPoison(hpage + i); | 1034 | SetPageHWPoison(hpage + i); |
1029 | } | 1035 | } |
1030 | 1036 | ||
1031 | static void clear_page_hwpoison_huge_page(struct page *hpage) | 1037 | static void clear_page_hwpoison_huge_page(struct page *hpage) |
1032 | { | 1038 | { |
1033 | int i; | 1039 | int i; |
1034 | int nr_pages = 1 << compound_order(hpage); | 1040 | int nr_pages = 1 << compound_order(hpage); |
1035 | for (i = 0; i < nr_pages; i++) | 1041 | for (i = 0; i < nr_pages; i++) |
1036 | ClearPageHWPoison(hpage + i); | 1042 | ClearPageHWPoison(hpage + i); |
1037 | } | 1043 | } |
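Both helpers above walk every subpage because containment is done in whole-hugepage units, so the cost scales with compound_order. As a worked example (assuming x86 with 4kB base pages, where a 2MB hugepage has order 9 and a 1GB hugepage order 18):

    #include <stdio.h>

    int main(void)
    {
    	/* nr_pages = 1 << compound_order(hpage), as in the helpers above */
    	printf("2MB hugepage: %d subpages marked\n", 1 << 9);   /* 512 */
    	printf("1GB hugepage: %d subpages marked\n", 1 << 18);  /* 262144 */
    	return 0;
    }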
1038 | 1044 | ||
1039 | /** | 1045 | /** |
1040 | * memory_failure - Handle memory failure of a page. | 1046 | * memory_failure - Handle memory failure of a page. |
1041 | * @pfn: Page Number of the corrupted page | 1047 | * @pfn: Page Number of the corrupted page |
1042 | * @trapno: Trap number reported in the signal to user space. | 1048 | * @trapno: Trap number reported in the signal to user space. |
1043 | * @flags: fine tune action taken | 1049 | * @flags: fine tune action taken |
1044 | * | 1050 | * |
1045 | * This function is called by the low level machine check code | 1051 | * This function is called by the low level machine check code |
1046 | * of an architecture when it detects hardware memory corruption | 1052 | * of an architecture when it detects hardware memory corruption |
1047 | * of a page. It tries its best to recover, which includes | 1053 | * of a page. It tries its best to recover, which includes |
1048 | * dropping pages, killing processes etc. | 1054 | * dropping pages, killing processes etc. |
1049 | * | 1055 | * |
1050 | * The function is primarily of use for corruptions that | 1056 | * The function is primarily of use for corruptions that |
1051 | * happen outside the current execution context (e.g. when | 1057 | * happen outside the current execution context (e.g. when |
1052 | * detected by a background scrubber) | 1058 | * detected by a background scrubber) |
1053 | * | 1059 | * |
1054 | * Must run in process context (e.g. a work queue) with interrupts | 1060 | * Must run in process context (e.g. a work queue) with interrupts |
1055 | * enabled and no spinlocks held. | 1061 | * enabled and no spinlocks held. |
1056 | */ | 1062 | */ |
1057 | int memory_failure(unsigned long pfn, int trapno, int flags) | 1063 | int memory_failure(unsigned long pfn, int trapno, int flags) |
1058 | { | 1064 | { |
1059 | struct page_state *ps; | 1065 | struct page_state *ps; |
1060 | struct page *p; | 1066 | struct page *p; |
1061 | struct page *hpage; | 1067 | struct page *hpage; |
1062 | int res; | 1068 | int res; |
1063 | unsigned int nr_pages; | 1069 | unsigned int nr_pages; |
1064 | unsigned long page_flags; | 1070 | unsigned long page_flags; |
1065 | 1071 | ||
1066 | if (!sysctl_memory_failure_recovery) | 1072 | if (!sysctl_memory_failure_recovery) |
1067 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 1073 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
1068 | 1074 | ||
1069 | if (!pfn_valid(pfn)) { | 1075 | if (!pfn_valid(pfn)) { |
1070 | printk(KERN_ERR | 1076 | printk(KERN_ERR |
1071 | "MCE %#lx: memory outside kernel control\n", | 1077 | "MCE %#lx: memory outside kernel control\n", |
1072 | pfn); | 1078 | pfn); |
1073 | return -ENXIO; | 1079 | return -ENXIO; |
1074 | } | 1080 | } |
1075 | 1081 | ||
1076 | p = pfn_to_page(pfn); | 1082 | p = pfn_to_page(pfn); |
1077 | hpage = compound_head(p); | 1083 | hpage = compound_head(p); |
1078 | if (TestSetPageHWPoison(p)) { | 1084 | if (TestSetPageHWPoison(p)) { |
1079 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); | 1085 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
1080 | return 0; | 1086 | return 0; |
1081 | } | 1087 | } |
1082 | 1088 | ||
1083 | /* | 1089 | /* |
1084 | * Currently errors on hugetlbfs pages are measured in hugepage units, | 1090 | * Currently errors on hugetlbfs pages are measured in hugepage units, |
1085 | * so nr_pages should be 1 << compound_order. OTOH when errors are on | 1091 | * so nr_pages should be 1 << compound_order. OTOH when errors are on |
1086 | * transparent hugepages, they are supposed to be split and error | 1092 | * transparent hugepages, they are supposed to be split and error |
1087 | * measurement is done in normal page units. So nr_pages should be one | 1093 | * measurement is done in normal page units. So nr_pages should be one |
1088 | * in this case. | 1094 | * in this case. |
1089 | */ | 1095 | */ |
1090 | if (PageHuge(p)) | 1096 | if (PageHuge(p)) |
1091 | nr_pages = 1 << compound_order(hpage); | 1097 | nr_pages = 1 << compound_order(hpage); |
1092 | else /* normal page or thp */ | 1098 | else /* normal page or thp */ |
1093 | nr_pages = 1; | 1099 | nr_pages = 1; |
1094 | atomic_long_add(nr_pages, &num_poisoned_pages); | 1100 | atomic_long_add(nr_pages, &num_poisoned_pages); |
1095 | 1101 | ||
1096 | /* | 1102 | /* |
1097 | * We need/can do nothing about count=0 pages. | 1103 | * We need/can do nothing about count=0 pages. |
1098 | * 1) it's a free page, and therefore in safe hands: | 1104 | * 1) it's a free page, and therefore in safe hands: |
1099 | * prep_new_page() will be the gate keeper. | 1105 | * prep_new_page() will be the gate keeper. |
1100 | * 2) it's a free hugepage, which is also safe: | 1106 | * 2) it's a free hugepage, which is also safe: |
1101 | * an affected hugepage will be dequeued from hugepage freelist, | 1107 | * an affected hugepage will be dequeued from hugepage freelist, |
1102 | * so there's no concern about reusing it ever after. | 1108 | * so there's no concern about reusing it ever after. |
1103 | * 3) it's part of a non-compound high order page. | 1109 | * 3) it's part of a non-compound high order page. |
1104 | * Implies some kernel user: cannot stop them from | 1110 | * Implies some kernel user: cannot stop them from |
1105 | * reading/writing the page; let's pray that the page has been | 1111 | * reading/writing the page; let's pray that the page has been |
1106 | * used and will be freed some time later. | 1112 | * used and will be freed some time later. |
1107 | * In fact it's dangerous to directly bump up the page count from 0, | 1113 | * In fact it's dangerous to directly bump up the page count from 0, |
1108 | * as that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 1114 | * as that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
1109 | */ | 1115 | */ |
1110 | if (!(flags & MF_COUNT_INCREASED) && | 1116 | if (!(flags & MF_COUNT_INCREASED) && |
1111 | !get_page_unless_zero(hpage)) { | 1117 | !get_page_unless_zero(hpage)) { |
1112 | if (is_free_buddy_page(p)) { | 1118 | if (is_free_buddy_page(p)) { |
1113 | action_result(pfn, "free buddy", DELAYED); | 1119 | action_result(pfn, "free buddy", DELAYED); |
1114 | return 0; | 1120 | return 0; |
1115 | } else if (PageHuge(hpage)) { | 1121 | } else if (PageHuge(hpage)) { |
1116 | /* | 1122 | /* |
1117 | * Check "filter hit" and "race with other subpage." | 1123 | * Check "filter hit" and "race with other subpage." |
1118 | */ | 1124 | */ |
1119 | lock_page(hpage); | 1125 | lock_page(hpage); |
1120 | if (PageHWPoison(hpage)) { | 1126 | if (PageHWPoison(hpage)) { |
1121 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1127 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1122 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1128 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
1123 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1129 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1124 | unlock_page(hpage); | 1130 | unlock_page(hpage); |
1125 | return 0; | 1131 | return 0; |
1126 | } | 1132 | } |
1127 | } | 1133 | } |
1128 | set_page_hwpoison_huge_page(hpage); | 1134 | set_page_hwpoison_huge_page(hpage); |
1129 | res = dequeue_hwpoisoned_huge_page(hpage); | 1135 | res = dequeue_hwpoisoned_huge_page(hpage); |
1130 | action_result(pfn, "free huge", | 1136 | action_result(pfn, "free huge", |
1131 | res ? IGNORED : DELAYED); | 1137 | res ? IGNORED : DELAYED); |
1132 | unlock_page(hpage); | 1138 | unlock_page(hpage); |
1133 | return res; | 1139 | return res; |
1134 | } else { | 1140 | } else { |
1135 | action_result(pfn, "high order kernel", IGNORED); | 1141 | action_result(pfn, "high order kernel", IGNORED); |
1136 | return -EBUSY; | 1142 | return -EBUSY; |
1137 | } | 1143 | } |
1138 | } | 1144 | } |
1139 | 1145 | ||
1140 | /* | 1146 | /* |
1141 | * We ignore non-LRU pages for good reasons. | 1147 | * We ignore non-LRU pages for good reasons. |
1142 | * - PG_locked is only well defined for LRU pages and a few others | 1148 | * - PG_locked is only well defined for LRU pages and a few others |
1143 | * - to avoid races with __set_page_locked() | 1149 | * - to avoid races with __set_page_locked() |
1144 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) | 1150 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) |
1145 | * The check (unnecessarily) ignores LRU pages being isolated and | 1151 | * The check (unnecessarily) ignores LRU pages being isolated and |
1146 | * walked by the page reclaim code, however that's not a big loss. | 1152 | * walked by the page reclaim code, however that's not a big loss. |
1147 | */ | 1153 | */ |
1148 | if (!PageHuge(p) && !PageTransTail(p)) { | 1154 | if (!PageHuge(p) && !PageTransTail(p)) { |
1149 | if (!PageLRU(p)) | 1155 | if (!PageLRU(p)) |
1150 | shake_page(p, 0); | 1156 | shake_page(p, 0); |
1151 | if (!PageLRU(p)) { | 1157 | if (!PageLRU(p)) { |
1152 | /* | 1158 | /* |
1153 | * shake_page() could have freed it. | 1159 | * shake_page() could have freed it. |
1154 | */ | 1160 | */ |
1155 | if (is_free_buddy_page(p)) { | 1161 | if (is_free_buddy_page(p)) { |
1156 | if (flags & MF_COUNT_INCREASED) | 1162 | if (flags & MF_COUNT_INCREASED) |
1157 | action_result(pfn, "free buddy", DELAYED); | 1163 | action_result(pfn, "free buddy", DELAYED); |
1158 | else | 1164 | else |
1159 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1165 | action_result(pfn, "free buddy, 2nd try", DELAYED); |
1160 | return 0; | 1166 | return 0; |
1161 | } | 1167 | } |
1162 | } | 1168 | } |
1163 | } | 1169 | } |
1164 | 1170 | ||
1165 | lock_page(hpage); | 1171 | lock_page(hpage); |
1166 | 1172 | ||
1167 | /* | 1173 | /* |
1168 | * We use page flags to determine what action should be taken, but | 1174 | * We use page flags to determine what action should be taken, but |
1169 | * the flags can be modified by the error containment action. One | 1175 | * the flags can be modified by the error containment action. One |
1170 | * example is an mlocked page, where PG_mlocked is cleared by | 1176 | * example is an mlocked page, where PG_mlocked is cleared by |
1171 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status | 1177 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status |
1172 | * correctly, we save a copy of the page flags at this time. | 1178 | * correctly, we save a copy of the page flags at this time. |
1173 | */ | 1179 | */ |
1174 | page_flags = p->flags; | 1180 | page_flags = p->flags; |
1175 | 1181 | ||
1176 | /* | 1182 | /* |
1177 | * unpoison always clears PG_hwpoison inside the page lock | 1183 | * unpoison always clears PG_hwpoison inside the page lock |
1178 | */ | 1184 | */ |
1179 | if (!PageHWPoison(p)) { | 1185 | if (!PageHWPoison(p)) { |
1180 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | 1186 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); |
1181 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1187 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1182 | put_page(hpage); | 1188 | put_page(hpage); |
1183 | res = 0; | 1189 | res = 0; |
1184 | goto out; | 1190 | goto out; |
1185 | } | 1191 | } |
1186 | if (hwpoison_filter(p)) { | 1192 | if (hwpoison_filter(p)) { |
1187 | if (TestClearPageHWPoison(p)) | 1193 | if (TestClearPageHWPoison(p)) |
1188 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1194 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1189 | unlock_page(hpage); | 1195 | unlock_page(hpage); |
1190 | put_page(hpage); | 1196 | put_page(hpage); |
1191 | return 0; | 1197 | return 0; |
1192 | } | 1198 | } |
1193 | 1199 | ||
1194 | if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) | 1200 | if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) |
1195 | goto identify_page_state; | 1201 | goto identify_page_state; |
1196 | 1202 | ||
1197 | /* | 1203 | /* |
1198 | * For an error on a tail page, we should set PG_hwpoison | 1204 | * For an error on a tail page, we should set PG_hwpoison |
1199 | * on the head page to show that the hugepage is hwpoisoned. | 1205 | * on the head page to show that the hugepage is hwpoisoned. |
1200 | */ | 1206 | */ |
1201 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1207 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
1202 | action_result(pfn, "hugepage already hardware poisoned", | 1208 | action_result(pfn, "hugepage already hardware poisoned", |
1203 | IGNORED); | 1209 | IGNORED); |
1204 | unlock_page(hpage); | 1210 | unlock_page(hpage); |
1205 | put_page(hpage); | 1211 | put_page(hpage); |
1206 | return 0; | 1212 | return 0; |
1207 | } | 1213 | } |
1208 | /* | 1214 | /* |
1209 | * Set PG_hwpoison on all pages in an error hugepage, | 1215 | * Set PG_hwpoison on all pages in an error hugepage, |
1210 | * because containment is done in hugepage unit for now. | 1216 | * because containment is done in hugepage unit for now. |
1211 | * Since we have done TestSetPageHWPoison() for the head page with | 1217 | * Since we have done TestSetPageHWPoison() for the head page with |
1212 | * page lock held, we can safely set PG_hwpoison bits on tail pages. | 1218 | * page lock held, we can safely set PG_hwpoison bits on tail pages. |
1213 | */ | 1219 | */ |
1214 | if (PageHuge(p)) | 1220 | if (PageHuge(p)) |
1215 | set_page_hwpoison_huge_page(hpage); | 1221 | set_page_hwpoison_huge_page(hpage); |
1216 | 1222 | ||
1217 | /* | 1223 | /* |
1218 | * It's very difficult to mess with pages currently under IO | 1224 | * It's very difficult to mess with pages currently under IO |
1219 | * and in many cases impossible, so we just avoid it here. | 1225 | * and in many cases impossible, so we just avoid it here. |
1220 | */ | 1226 | */ |
1221 | wait_on_page_writeback(p); | 1227 | wait_on_page_writeback(p); |
1222 | 1228 | ||
1223 | /* | 1229 | /* |
1224 | * Now take care of user space mappings. | 1230 | * Now take care of user space mappings. |
1225 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. | 1231 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
1226 | * | 1232 | * |
1227 | * When the raw error page is thp tail page, hpage points to the raw | 1233 | * When the raw error page is thp tail page, hpage points to the raw |
1228 | * page after thp split. | 1234 | * page after thp split. |
1229 | */ | 1235 | */ |
1230 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1236 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
1231 | != SWAP_SUCCESS) { | 1237 | != SWAP_SUCCESS) { |
1232 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | 1238 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); |
1233 | res = -EBUSY; | 1239 | res = -EBUSY; |
1234 | goto out; | 1240 | goto out; |
1235 | } | 1241 | } |
1236 | 1242 | ||
1237 | /* | 1243 | /* |
1238 | * Torn down by someone else? | 1244 | * Torn down by someone else? |
1239 | */ | 1245 | */ |
1240 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1246 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
1241 | action_result(pfn, "already truncated LRU", IGNORED); | 1247 | action_result(pfn, "already truncated LRU", IGNORED); |
1242 | res = -EBUSY; | 1248 | res = -EBUSY; |
1243 | goto out; | 1249 | goto out; |
1244 | } | 1250 | } |
1245 | 1251 | ||
1246 | identify_page_state: | 1252 | identify_page_state: |
1247 | res = -EBUSY; | 1253 | res = -EBUSY; |
1248 | /* | 1254 | /* |
1249 | * The first check uses the current page flags which may not have any | 1255 | * The first check uses the current page flags which may not have any |
1250 | * relevant information. The second check with the saved page flags is | 1256 | * relevant information. The second check with the saved page flags is |
1251 | * carried out only if the first check can't determine the page status. | 1257 | * carried out only if the first check can't determine the page status. |
1252 | */ | 1258 | */ |
1253 | for (ps = error_states;; ps++) | 1259 | for (ps = error_states;; ps++) |
1254 | if ((p->flags & ps->mask) == ps->res) | 1260 | if ((p->flags & ps->mask) == ps->res) |
1255 | break; | 1261 | break; |
1256 | 1262 | ||
1257 | page_flags |= (p->flags & (1UL << PG_dirty)); | 1263 | page_flags |= (p->flags & (1UL << PG_dirty)); |
1258 | 1264 | ||
1259 | if (!ps->mask) | 1265 | if (!ps->mask) |
1260 | for (ps = error_states;; ps++) | 1266 | for (ps = error_states;; ps++) |
1261 | if ((page_flags & ps->mask) == ps->res) | 1267 | if ((page_flags & ps->mask) == ps->res) |
1262 | break; | 1268 | break; |
1263 | res = page_action(ps, p, pfn); | 1269 | res = page_action(ps, p, pfn); |
1264 | out: | 1270 | out: |
1265 | unlock_page(hpage); | 1271 | unlock_page(hpage); |
1266 | return res; | 1272 | return res; |
1267 | } | 1273 | } |
1268 | EXPORT_SYMBOL_GPL(memory_failure); | 1274 | EXPORT_SYMBOL_GPL(memory_failure); |
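Besides the machine-check path, memory_failure() can be exercised from user space through the madvise(MADV_HWPOISON) injector (CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN required). A minimal, hedged reproducer sketch: the poisoned page is one of the caller's own anonymous pages, and the containment result lands in the kernel log:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_HWPOISON
    #define MADV_HWPOISON 100	/* from asm-generic/mman-common.h */
    #endif

    int main(void)
    {
    	long pagesz = sysconf(_SC_PAGESIZE);
    	/* map and touch one anonymous page so it is backed and on the LRU */
    	char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
    		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    	if (p == MAP_FAILED) {
    		perror("mmap");
    		return 1;
    	}
    	memset(p, 0x5a, pagesz);

    	/* the kernel runs memory_failure() on the backing pfn; a later
    	 * access to this page would normally raise SIGBUS */
    	if (madvise(p, pagesz, MADV_HWPOISON) != 0) {
    		perror("madvise(MADV_HWPOISON)");
    		return 1;
    	}
    	puts("page poisoned; see dmesg for the containment result");
    	return 0;
    }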
1269 | 1275 | ||
1270 | #define MEMORY_FAILURE_FIFO_ORDER 4 | 1276 | #define MEMORY_FAILURE_FIFO_ORDER 4 |
1271 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | 1277 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) |
1272 | 1278 | ||
1273 | struct memory_failure_entry { | 1279 | struct memory_failure_entry { |
1274 | unsigned long pfn; | 1280 | unsigned long pfn; |
1275 | int trapno; | 1281 | int trapno; |
1276 | int flags; | 1282 | int flags; |
1277 | }; | 1283 | }; |
1278 | 1284 | ||
1279 | struct memory_failure_cpu { | 1285 | struct memory_failure_cpu { |
1280 | DECLARE_KFIFO(fifo, struct memory_failure_entry, | 1286 | DECLARE_KFIFO(fifo, struct memory_failure_entry, |
1281 | MEMORY_FAILURE_FIFO_SIZE); | 1287 | MEMORY_FAILURE_FIFO_SIZE); |
1282 | spinlock_t lock; | 1288 | spinlock_t lock; |
1283 | struct work_struct work; | 1289 | struct work_struct work; |
1284 | }; | 1290 | }; |
1285 | 1291 | ||
1286 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); | 1292 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); |
1287 | 1293 | ||
1288 | /** | 1294 | /** |
1289 | * memory_failure_queue - Schedule handling memory failure of a page. | 1295 | * memory_failure_queue - Schedule handling memory failure of a page. |
1290 | * @pfn: Page Number of the corrupted page | 1296 | * @pfn: Page Number of the corrupted page |
1291 | * @trapno: Trap number reported in the signal to user space. | 1297 | * @trapno: Trap number reported in the signal to user space. |
1292 | * @flags: Flags for memory failure handling | 1298 | * @flags: Flags for memory failure handling |
1293 | * | 1299 | * |
1294 | * This function is called by the low level hardware error handler | 1300 | * This function is called by the low level hardware error handler |
1295 | * when it detects hardware memory corruption of a page. It schedules | 1301 | * when it detects hardware memory corruption of a page. It schedules |
1296 | * the recovering of error page, including dropping pages, killing | 1302 | * the recovering of error page, including dropping pages, killing |
1297 | * processes etc. | 1303 | * processes etc. |
1298 | * | 1304 | * |
1299 | * The function is primarily of use for corruptions that | 1305 | * The function is primarily of use for corruptions that |
1300 | * happen outside the current execution context (e.g. when | 1306 | * happen outside the current execution context (e.g. when |
1301 | * detected by a background scrubber) | 1307 | * detected by a background scrubber) |
1302 | * | 1308 | * |
1303 | * Can run in IRQ context. | 1309 | * Can run in IRQ context. |
1304 | */ | 1310 | */ |
1305 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) | 1311 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) |
1306 | { | 1312 | { |
1307 | struct memory_failure_cpu *mf_cpu; | 1313 | struct memory_failure_cpu *mf_cpu; |
1308 | unsigned long proc_flags; | 1314 | unsigned long proc_flags; |
1309 | struct memory_failure_entry entry = { | 1315 | struct memory_failure_entry entry = { |
1310 | .pfn = pfn, | 1316 | .pfn = pfn, |
1311 | .trapno = trapno, | 1317 | .trapno = trapno, |
1312 | .flags = flags, | 1318 | .flags = flags, |
1313 | }; | 1319 | }; |
1314 | 1320 | ||
1315 | mf_cpu = &get_cpu_var(memory_failure_cpu); | 1321 | mf_cpu = &get_cpu_var(memory_failure_cpu); |
1316 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1322 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1317 | if (kfifo_put(&mf_cpu->fifo, entry)) | 1323 | if (kfifo_put(&mf_cpu->fifo, entry)) |
1318 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | 1324 | schedule_work_on(smp_processor_id(), &mf_cpu->work); |
1319 | else | 1325 | else |
1320 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", | 1326 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", |
1321 | pfn); | 1327 | pfn); |
1322 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | 1328 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
1323 | put_cpu_var(memory_failure_cpu); | 1329 | put_cpu_var(memory_failure_cpu); |
1324 | } | 1330 | } |
1325 | EXPORT_SYMBOL_GPL(memory_failure_queue); | 1331 | EXPORT_SYMBOL_GPL(memory_failure_queue); |
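memory_failure_queue() exists so that detection code running in IRQ context (e.g. a machine-check or CMCI handler) can defer the heavyweight recovery into process context. A sketch of the producer side; example_mce_notify() and its paddr-to-pfn conversion are illustrative assumptions, not a real architecture hook:

    #include <linux/mm.h>

    /*
     * Hypothetical notification from a machine-check handler that has
     * already validated the physical address of the corrupted line.
     */
    static void example_mce_notify(u64 paddr, int trapno)
    {
    	unsigned long pfn = paddr >> PAGE_SHIFT;

    	/* safe in IRQ context: only the per-cpu kfifo lock is taken and
    	 * the real work runs later in memory_failure_work_func() */
    	memory_failure_queue(pfn, trapno, 0);
    }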
1326 | 1332 | ||
1327 | static void memory_failure_work_func(struct work_struct *work) | 1333 | static void memory_failure_work_func(struct work_struct *work) |
1328 | { | 1334 | { |
1329 | struct memory_failure_cpu *mf_cpu; | 1335 | struct memory_failure_cpu *mf_cpu; |
1330 | struct memory_failure_entry entry = { 0, }; | 1336 | struct memory_failure_entry entry = { 0, }; |
1331 | unsigned long proc_flags; | 1337 | unsigned long proc_flags; |
1332 | int gotten; | 1338 | int gotten; |
1333 | 1339 | ||
1334 | mf_cpu = this_cpu_ptr(&memory_failure_cpu); | 1340 | mf_cpu = this_cpu_ptr(&memory_failure_cpu); |
1335 | for (;;) { | 1341 | for (;;) { |
1336 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1342 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1337 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | 1343 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
1338 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | 1344 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
1339 | if (!gotten) | 1345 | if (!gotten) |
1340 | break; | 1346 | break; |
1341 | if (entry.flags & MF_SOFT_OFFLINE) | 1347 | if (entry.flags & MF_SOFT_OFFLINE) |
1342 | soft_offline_page(pfn_to_page(entry.pfn), entry.flags); | 1348 | soft_offline_page(pfn_to_page(entry.pfn), entry.flags); |
1343 | else | 1349 | else |
1344 | memory_failure(entry.pfn, entry.trapno, entry.flags); | 1350 | memory_failure(entry.pfn, entry.trapno, entry.flags); |
1345 | } | 1351 | } |
1346 | } | 1352 | } |
1347 | 1353 | ||
1348 | static int __init memory_failure_init(void) | 1354 | static int __init memory_failure_init(void) |
1349 | { | 1355 | { |
1350 | struct memory_failure_cpu *mf_cpu; | 1356 | struct memory_failure_cpu *mf_cpu; |
1351 | int cpu; | 1357 | int cpu; |
1352 | 1358 | ||
1353 | for_each_possible_cpu(cpu) { | 1359 | for_each_possible_cpu(cpu) { |
1354 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); | 1360 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); |
1355 | spin_lock_init(&mf_cpu->lock); | 1361 | spin_lock_init(&mf_cpu->lock); |
1356 | INIT_KFIFO(mf_cpu->fifo); | 1362 | INIT_KFIFO(mf_cpu->fifo); |
1357 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); | 1363 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); |
1358 | } | 1364 | } |
1359 | 1365 | ||
1360 | return 0; | 1366 | return 0; |
1361 | } | 1367 | } |
1362 | core_initcall(memory_failure_init); | 1368 | core_initcall(memory_failure_init); |
1363 | 1369 | ||
1364 | /** | 1370 | /** |
1365 | * unpoison_memory - Unpoison a previously poisoned page | 1371 | * unpoison_memory - Unpoison a previously poisoned page |
1366 | * @pfn: Page number of the to be unpoisoned page | 1372 | * @pfn: Page number of the to be unpoisoned page |
1367 | * | 1373 | * |
1368 | * Software-unpoison a page that has been poisoned by | 1374 | * Software-unpoison a page that has been poisoned by |
1369 | * memory_failure() earlier. | 1375 | * memory_failure() earlier. |
1370 | * | 1376 | * |
1371 | * This is only done at the software level, so it only works | 1377 | * This is only done at the software level, so it only works |
1372 | * for Linux-injected failures, not real hardware failures. | 1378 | * for Linux-injected failures, not real hardware failures. |
1373 | * | 1379 | * |
1374 | * Returns 0 for success, otherwise -errno. | 1380 | * Returns 0 for success, otherwise -errno. |
1375 | */ | 1381 | */ |
1376 | int unpoison_memory(unsigned long pfn) | 1382 | int unpoison_memory(unsigned long pfn) |
1377 | { | 1383 | { |
1378 | struct page *page; | 1384 | struct page *page; |
1379 | struct page *p; | 1385 | struct page *p; |
1380 | int freeit = 0; | 1386 | int freeit = 0; |
1381 | unsigned int nr_pages; | 1387 | unsigned int nr_pages; |
1382 | 1388 | ||
1383 | if (!pfn_valid(pfn)) | 1389 | if (!pfn_valid(pfn)) |
1384 | return -ENXIO; | 1390 | return -ENXIO; |
1385 | 1391 | ||
1386 | p = pfn_to_page(pfn); | 1392 | p = pfn_to_page(pfn); |
1387 | page = compound_head(p); | 1393 | page = compound_head(p); |
1388 | 1394 | ||
1389 | if (!PageHWPoison(p)) { | 1395 | if (!PageHWPoison(p)) { |
1390 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); | 1396 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); |
1391 | return 0; | 1397 | return 0; |
1392 | } | 1398 | } |
1393 | 1399 | ||
1394 | /* | 1400 | /* |
1395 | * unpoison_memory() can encounter thp only when the thp is being | 1401 | * unpoison_memory() can encounter thp only when the thp is being |
1396 | * worked on by memory_failure() and the page lock is not yet held. | 1402 | * worked on by memory_failure() and the page lock is not yet held. |
1397 | * In such a case, we yield to memory_failure() and make unpoison fail. | 1403 | * In such a case, we yield to memory_failure() and make unpoison fail. |
1398 | */ | 1404 | */ |
1399 | if (!PageHuge(page) && PageTransHuge(page)) { | 1405 | if (!PageHuge(page) && PageTransHuge(page)) { |
1400 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | 1406 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); |
1401 | return 0; | 1407 | return 0; |
1402 | } | 1408 | } |
1403 | 1409 | ||
1404 | nr_pages = 1 << compound_order(page); | 1410 | nr_pages = 1 << compound_order(page); |
1405 | 1411 | ||
1406 | if (!get_page_unless_zero(page)) { | 1412 | if (!get_page_unless_zero(page)) { |
1407 | /* | 1413 | /* |
1408 | * Since a HWPoisoned hugepage should have a non-zero refcount, | 1414 | * Since a HWPoisoned hugepage should have a non-zero refcount, |
1409 | * a race between memory failure and unpoison seems to have happened. | 1415 | * a race between memory failure and unpoison seems to have happened. |
1410 | * In such a case unpoison fails and memory failure runs | 1416 | * In such a case unpoison fails and memory failure runs |
1411 | * to the end. | 1417 | * to the end. |
1412 | */ | 1418 | */ |
1413 | if (PageHuge(page)) { | 1419 | if (PageHuge(page)) { |
1414 | pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | 1420 | pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); |
1415 | return 0; | 1421 | return 0; |
1416 | } | 1422 | } |
1417 | if (TestClearPageHWPoison(p)) | 1423 | if (TestClearPageHWPoison(p)) |
1418 | atomic_long_dec(&num_poisoned_pages); | 1424 | atomic_long_dec(&num_poisoned_pages); |
1419 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1425 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1420 | return 0; | 1426 | return 0; |
1421 | } | 1427 | } |
1422 | 1428 | ||
1423 | lock_page(page); | 1429 | lock_page(page); |
1424 | /* | 1430 | /* |
1425 | * This test is racy because PG_hwpoison is set outside of page lock. | 1431 | * This test is racy because PG_hwpoison is set outside of page lock. |
1426 | * That's acceptable because that won't trigger kernel panic. Instead, | 1432 | * That's acceptable because that won't trigger kernel panic. Instead, |
1427 | * the PG_hwpoison page will be caught and isolated on the entrance to | 1433 | * the PG_hwpoison page will be caught and isolated on the entrance to |
1428 | * the free buddy page pool. | 1434 | * the free buddy page pool. |
1429 | */ | 1435 | */ |
1430 | if (TestClearPageHWPoison(page)) { | 1436 | if (TestClearPageHWPoison(page)) { |
1431 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1437 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1432 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1438 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1433 | freeit = 1; | 1439 | freeit = 1; |
1434 | if (PageHuge(page)) | 1440 | if (PageHuge(page)) |
1435 | clear_page_hwpoison_huge_page(page); | 1441 | clear_page_hwpoison_huge_page(page); |
1436 | } | 1442 | } |
1437 | unlock_page(page); | 1443 | unlock_page(page); |
1438 | 1444 | ||
1439 | put_page(page); | 1445 | put_page(page); |
1440 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) | 1446 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) |
1441 | put_page(page); | 1447 | put_page(page); |
1442 | 1448 | ||
1443 | return 0; | 1449 | return 0; |
1444 | } | 1450 | } |
1445 | EXPORT_SYMBOL(unpoison_memory); | 1451 | EXPORT_SYMBOL(unpoison_memory); |
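From user space, unpoison_memory() is reachable through the hwpoison-inject debugfs module (CONFIG_HWPOISON_INJECT), which exposes unpoison-pfn next to its corrupt-pfn injector. A hedged sketch, assuming debugfs is mounted at /sys/kernel/debug and the module is loaded:

    #include <stdio.h>
    #include <stdlib.h>

    /* path assumed from the hwpoison-inject debugfs module */
    #define UNPOISON_PFN "/sys/kernel/debug/hwpoison/unpoison-pfn"

    static int unpoison_pfn(unsigned long pfn)
    {
    	FILE *f = fopen(UNPOISON_PFN, "w");

    	if (!f) {
    		perror(UNPOISON_PFN);
    		return -1;
    	}
    	fprintf(f, "%#lx\n", pfn);	/* the attribute parses a plain number */
    	return fclose(f);
    }

    int main(int argc, char **argv)
    {
    	if (argc != 2)
    		return 2;
    	return unpoison_pfn(strtoul(argv[1], NULL, 0)) ? 1 : 0;
    }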
1446 | 1452 | ||
1447 | static struct page *new_page(struct page *p, unsigned long private, int **x) | 1453 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
1448 | { | 1454 | { |
1449 | int nid = page_to_nid(p); | 1455 | int nid = page_to_nid(p); |
1450 | if (PageHuge(p)) | 1456 | if (PageHuge(p)) |
1451 | return alloc_huge_page_node(page_hstate(compound_head(p)), | 1457 | return alloc_huge_page_node(page_hstate(compound_head(p)), |
1452 | nid); | 1458 | nid); |
1453 | else | 1459 | else |
1454 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1460 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); |
1455 | } | 1461 | } |
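new_page() above is the per-page allocation callback handed to migrate_pages(): for every source page the migration core calls it to obtain a destination page, and the unsigned long private argument is whatever the migrate_pages() caller passed in (unused here). As an illustration of that contract only, a hypothetical variant that steers order-0 copies to a caller-chosen node carried in private:

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /*
     * Hypothetical allocation callback: 'private' carries a target node id
     * instead of being ignored; order-0 pages only in this sketch.
     */
    static struct page *new_page_on_node(struct page *p, unsigned long private,
    				     int **result)
    {
    	return alloc_pages_exact_node((int)private, GFP_HIGHUSER_MOVABLE, 0);
    }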
1456 | 1462 | ||
1457 | /* | 1463 | /* |
1458 | * Safely get reference count of an arbitrary page. | 1464 | * Safely get reference count of an arbitrary page. |
1459 | * Returns 0 for a free page, -EIO for a zero refcount page | 1465 | * Returns 0 for a free page, -EIO for a zero refcount page |
1460 | * that is not free, and 1 for any other page type. | 1466 | * that is not free, and 1 for any other page type. |
1461 | * For 1 the page is returned with increased page count, otherwise not. | 1467 | * For 1 the page is returned with increased page count, otherwise not. |
1462 | */ | 1468 | */ |
1463 | static int __get_any_page(struct page *p, unsigned long pfn, int flags) | 1469 | static int __get_any_page(struct page *p, unsigned long pfn, int flags) |
1464 | { | 1470 | { |
1465 | int ret; | 1471 | int ret; |
1466 | 1472 | ||
1467 | if (flags & MF_COUNT_INCREASED) | 1473 | if (flags & MF_COUNT_INCREASED) |
1468 | return 1; | 1474 | return 1; |
1469 | 1475 | ||
1470 | /* | 1476 | /* |
1471 | * When the target page is a free hugepage, just remove it | 1477 | * When the target page is a free hugepage, just remove it |
1472 | * from the free hugepage list. | 1478 | * from the free hugepage list. |
1473 | */ | 1479 | */ |
1474 | if (!get_page_unless_zero(compound_head(p))) { | 1480 | if (!get_page_unless_zero(compound_head(p))) { |
1475 | if (PageHuge(p)) { | 1481 | if (PageHuge(p)) { |
1476 | pr_info("%s: %#lx free huge page\n", __func__, pfn); | 1482 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1477 | ret = 0; | 1483 | ret = 0; |
1478 | } else if (is_free_buddy_page(p)) { | 1484 | } else if (is_free_buddy_page(p)) { |
1479 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); | 1485 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1480 | ret = 0; | 1486 | ret = 0; |
1481 | } else { | 1487 | } else { |
1482 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", | 1488 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
1483 | __func__, pfn, p->flags); | 1489 | __func__, pfn, p->flags); |
1484 | ret = -EIO; | 1490 | ret = -EIO; |
1485 | } | 1491 | } |
1486 | } else { | 1492 | } else { |
1487 | /* Not a free page */ | 1493 | /* Not a free page */ |
1488 | ret = 1; | 1494 | ret = 1; |
1489 | } | 1495 | } |
1490 | return ret; | 1496 | return ret; |
1491 | } | 1497 | } |
1492 | 1498 | ||
1493 | static int get_any_page(struct page *page, unsigned long pfn, int flags) | 1499 | static int get_any_page(struct page *page, unsigned long pfn, int flags) |
1494 | { | 1500 | { |
1495 | int ret = __get_any_page(page, pfn, flags); | 1501 | int ret = __get_any_page(page, pfn, flags); |
1496 | 1502 | ||
1497 | if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { | 1503 | if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { |
1498 | /* | 1504 | /* |
1499 | * Try to free it. | 1505 | * Try to free it. |
1500 | */ | 1506 | */ |
1501 | put_page(page); | 1507 | put_page(page); |
1502 | shake_page(page, 1); | 1508 | shake_page(page, 1); |
1503 | 1509 | ||
1504 | /* | 1510 | /* |
1505 | * Did it turn free? | 1511 | * Did it turn free? |
1506 | */ | 1512 | */ |
1507 | ret = __get_any_page(page, pfn, 0); | 1513 | ret = __get_any_page(page, pfn, 0); |
1508 | if (!PageLRU(page)) { | 1514 | if (!PageLRU(page)) { |
1509 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1515 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
1510 | pfn, page->flags); | 1516 | pfn, page->flags); |
1511 | return -EIO; | 1517 | return -EIO; |
1512 | } | 1518 | } |
1513 | } | 1519 | } |
1514 | return ret; | 1520 | return ret; |
1515 | } | 1521 | } |
1516 | 1522 | ||
1517 | static int soft_offline_huge_page(struct page *page, int flags) | 1523 | static int soft_offline_huge_page(struct page *page, int flags) |
1518 | { | 1524 | { |
1519 | int ret; | 1525 | int ret; |
1520 | unsigned long pfn = page_to_pfn(page); | 1526 | unsigned long pfn = page_to_pfn(page); |
1521 | struct page *hpage = compound_head(page); | 1527 | struct page *hpage = compound_head(page); |
1522 | LIST_HEAD(pagelist); | 1528 | LIST_HEAD(pagelist); |
1523 | 1529 | ||
1524 | /* | 1530 | /* |
1525 | * This double-check of PageHWPoison is to avoid the race with | 1531 | * This double-check of PageHWPoison is to avoid the race with |
1526 | * memory_failure(). See also comment in __soft_offline_page(). | 1532 | * memory_failure(). See also comment in __soft_offline_page(). |
1527 | */ | 1533 | */ |
1528 | lock_page(hpage); | 1534 | lock_page(hpage); |
1529 | if (PageHWPoison(hpage)) { | 1535 | if (PageHWPoison(hpage)) { |
1530 | unlock_page(hpage); | 1536 | unlock_page(hpage); |
1531 | put_page(hpage); | 1537 | put_page(hpage); |
1532 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); | 1538 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
1533 | return -EBUSY; | 1539 | return -EBUSY; |
1534 | } | 1540 | } |
1535 | unlock_page(hpage); | 1541 | unlock_page(hpage); |
1536 | 1542 | ||
1537 | /* Keep page count to indicate a given hugepage is isolated. */ | 1543 | /* Keep page count to indicate a given hugepage is isolated. */ |
1538 | list_move(&hpage->lru, &pagelist); | 1544 | list_move(&hpage->lru, &pagelist); |
1539 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1545 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1540 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1546 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1541 | if (ret) { | 1547 | if (ret) { |
1542 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1548 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1543 | pfn, ret, page->flags); | 1549 | pfn, ret, page->flags); |
1544 | /* | 1550 | /* |
1545 | * We know that soft_offline_huge_page() tries to migrate | 1551 | * We know that soft_offline_huge_page() tries to migrate |
1546 | * only one hugepage pointed to by hpage, so we need not | 1552 | * only one hugepage pointed to by hpage, so we need not |
1547 | * run through the pagelist here. | 1553 | * run through the pagelist here. |
1548 | */ | 1554 | */ |
1549 | putback_active_hugepage(hpage); | 1555 | putback_active_hugepage(hpage); |
1550 | if (ret > 0) | 1556 | if (ret > 0) |
1551 | ret = -EIO; | 1557 | ret = -EIO; |
1552 | } else { | 1558 | } else { |
1553 | /* an overcommitted hugetlb page will be freed to the buddy allocator */ | 1559 | /* an overcommitted hugetlb page will be freed to the buddy allocator */ |
1554 | if (PageHuge(page)) { | 1560 | if (PageHuge(page)) { |
1555 | set_page_hwpoison_huge_page(hpage); | 1561 | set_page_hwpoison_huge_page(hpage); |
1556 | dequeue_hwpoisoned_huge_page(hpage); | 1562 | dequeue_hwpoisoned_huge_page(hpage); |
1557 | atomic_long_add(1 << compound_order(hpage), | 1563 | atomic_long_add(1 << compound_order(hpage), |
1558 | &num_poisoned_pages); | 1564 | &num_poisoned_pages); |
1559 | } else { | 1565 | } else { |
1560 | SetPageHWPoison(page); | 1566 | SetPageHWPoison(page); |
1561 | atomic_long_inc(&num_poisoned_pages); | 1567 | atomic_long_inc(&num_poisoned_pages); |
1562 | } | 1568 | } |
1563 | } | 1569 | } |
1564 | return ret; | 1570 | return ret; |
1565 | } | 1571 | } |
1566 | 1572 | ||
1567 | static int __soft_offline_page(struct page *page, int flags) | 1573 | static int __soft_offline_page(struct page *page, int flags) |
1568 | { | 1574 | { |
1569 | int ret; | 1575 | int ret; |
1570 | unsigned long pfn = page_to_pfn(page); | 1576 | unsigned long pfn = page_to_pfn(page); |
1571 | 1577 | ||
1572 | /* | 1578 | /* |
1573 | * Check PageHWPoison again inside page lock because PageHWPoison | 1579 | * Check PageHWPoison again inside page lock because PageHWPoison |
1574 | * is set by memory_failure() outside page lock. Note that | 1580 | * is set by memory_failure() outside page lock. Note that |
1575 | * memory_failure() also double-checks PageHWPoison inside page lock, | 1581 | * memory_failure() also double-checks PageHWPoison inside page lock, |
1576 | * so there's no race between soft_offline_page() and memory_failure(). | 1582 | * so there's no race between soft_offline_page() and memory_failure(). |
1577 | */ | 1583 | */ |
1578 | lock_page(page); | 1584 | lock_page(page); |
1579 | wait_on_page_writeback(page); | 1585 | wait_on_page_writeback(page); |
1580 | if (PageHWPoison(page)) { | 1586 | if (PageHWPoison(page)) { |
1581 | unlock_page(page); | 1587 | unlock_page(page); |
1582 | put_page(page); | 1588 | put_page(page); |
1583 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1589 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1584 | return -EBUSY; | 1590 | return -EBUSY; |
1585 | } | 1591 | } |
1586 | /* | 1592 | /* |
1587 | * Try to invalidate first. This should work for | 1593 | * Try to invalidate first. This should work for |
1588 | * non-dirty, unmapped page cache pages. | 1594 | * non-dirty, unmapped page cache pages. |
1589 | */ | 1595 | */ |
1590 | ret = invalidate_inode_page(page); | 1596 | ret = invalidate_inode_page(page); |
1591 | unlock_page(page); | 1597 | unlock_page(page); |
1592 | /* | 1598 | /* |
1593 | * RED-PEN: it would be better to keep it isolated here, but we | 1599 | * RED-PEN: it would be better to keep it isolated here, but we |
1594 | * would need to fix the isolation locking first. | 1600 | * would need to fix the isolation locking first. |
1595 | */ | 1601 | */ |
1596 | if (ret == 1) { | 1602 | if (ret == 1) { |
1597 | put_page(page); | 1603 | put_page(page); |
1598 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1604 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1599 | SetPageHWPoison(page); | 1605 | SetPageHWPoison(page); |
1600 | atomic_long_inc(&num_poisoned_pages); | 1606 | atomic_long_inc(&num_poisoned_pages); |
1601 | return 0; | 1607 | return 0; |
1602 | } | 1608 | } |
1603 | 1609 | ||
1604 | /* | 1610 | /* |
1605 | * Simple invalidation didn't work. | 1611 | * Simple invalidation didn't work. |
1606 | * Try to migrate to a new page instead. migrate.c | 1612 | * Try to migrate to a new page instead. migrate.c |
1607 | * handles a large number of cases for us. | 1613 | * handles a large number of cases for us. |
1608 | */ | 1614 | */ |
1609 | ret = isolate_lru_page(page); | 1615 | ret = isolate_lru_page(page); |
1610 | /* | 1616 | /* |
1611 | * Drop the page reference that came from get_any_page(); | 1617 | * Drop the page reference that came from get_any_page(); |
1612 | * a successful isolate_lru_page() already took another one. | 1618 | * a successful isolate_lru_page() already took another one. |
1613 | */ | 1619 | */ |
1614 | put_page(page); | 1620 | put_page(page); |
1615 | if (!ret) { | 1621 | if (!ret) { |
1616 | LIST_HEAD(pagelist); | 1622 | LIST_HEAD(pagelist); |
1617 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1623 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1618 | page_is_file_cache(page)); | 1624 | page_is_file_cache(page)); |
1619 | list_add(&page->lru, &pagelist); | 1625 | list_add(&page->lru, &pagelist); |
1620 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1626 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1621 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1627 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
		if (ret) {
			if (!list_empty(&pagelist)) {
				list_del(&page->lru);
				dec_zone_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
				putback_lru_page(page);
			}

			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		} else {
			/*
			 * After page migration succeeds, the source page can
			 * be trapped in a pagevec and its actual freeing is
			 * delayed. Freeing code works differently based on
			 * PG_hwpoison, so there's a race. We need to make
			 * sure that the source page is freed back to buddy
			 * before setting PG_hwpoison.
			 */
			if (!is_free_buddy_page(page))
				lru_add_drain_all();
			if (!is_free_buddy_page(page))
				drain_all_pages();
			SetPageHWPoison(page);
			if (!is_free_buddy_page(page))
				pr_info("soft offline: %#lx: page leaked\n",
					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}
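For reference, the new_page callback handed to migrate_pages() above is this file's migration-target allocator, defined earlier in mm/memory-failure.c and not visible in this hunk. A minimal sketch of such a new_page_t callback is shown below; it assumes the node-local allocation strategy used here and may differ in detail from the file's actual helper:

/*
 * Sketch of a new_page_t migration-target callback (assumption: the
 * real new_page() in this file behaves like this). It allocates the
 * replacement page on the same node as the failing one.
 */
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);

	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
					    nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}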

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
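	/*
	 * Soft offline works on base pages, so a transparent huge page
	 * must be split up front; only anonymous THP can be split here,
	 * and a failed split is fatal for this request.
	 */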
	if (!PageHuge(page) && PageTransHuge(hpage)) {
		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
			pr_info("soft offline: %#lx: failed to split THP\n",
				pfn);
			return -EBUSY;
		}
	}

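	/* Keep memory hotplug from racing with us while we pin the page. */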
	get_online_mems();

	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free. This flag should be kept set until the source page
	 * is freed and PG_hwpoison on it is set.
	 */
	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
		set_migratetype_isolate(page, true);

	ret = get_any_page(page, pfn, flags);
	put_online_mems();
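	/*
	 * get_any_page() returned 1 if it pinned an in-use page, 0 if the
	 * page was free in the buddy allocator, and a negative errno on
	 * failure (which is passed through as the return value).
	 */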
	if (ret > 0) { /* for in-use pages */
		if (PageHuge(page))
			ret = soft_offline_huge_page(page, flags);
		else
			ret = __soft_offline_page(page, flags);
	} else if (ret == 0) { /* for free pages */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
	return ret;
}
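As a usage note, kernels built with CONFIG_MEMORY_FAILURE expose this entry point to userspace through madvise(MADV_SOFT_OFFLINE) and through the /sys/devices/system/memory/soft_offline_page sysfs file. A minimal userspace sketch follows; it assumes such a kernel, uses the generic ABI value as a fallback definition of MADV_SOFT_OFFLINE, and needs CAP_SYS_ADMIN to succeed:

/*
 * Userspace sketch: ask the kernel to soft offline the page backing
 * `buf`, which ends up in soft_offline_page() above. Assumes
 * CONFIG_MEMORY_FAILURE; requires CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* generic ABI value */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *buf;

	if (posix_memalign((void **)&buf, pagesize, pagesize))
		return 1;
	memset(buf, 0, pagesize);	/* make sure the page is resident */

	if (madvise(buf, pagesize, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("page soft offlined\n");
	return 0;
}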