Commit bbeb34062fbad287c949a945a516a0c15b179993

Authored by Huang Ying
Committed by Avi Kivity
1 parent 6c3f604117

KVM: Fix a race condition for usage of is_hwpoison_address()

is_hwpoison_address() accesses the page table, so the caller must hold
current->mm->mmap_sem in read mode. Fix its usage in kvm's hva_to_pfn()
accordingly.

Add a comment to is_hwpoison_address() to remind other callers of this requirement.

Reported-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
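
A minimal sketch of the usage pattern the commit message describes (illustrative only, not the actual hva_to_pfn() hunk from this diff; the wrapper name is hypothetical): any call to is_hwpoison_address() has to be bracketed by current->mm->mmap_sem held for read, because it walks the caller's page tables.

	/* Sketch only: addr_is_hwpoisoned() is a made-up wrapper, not part of the patch. */
	static int addr_is_hwpoisoned(unsigned long addr)
	{
		int hwpoison;

		down_read(&current->mm->mmap_sem);
		hwpoison = is_hwpoison_address(addr);	/* walks the page table */
		up_read(&current->mm->mmap_sem);

		return hwpoison;
	}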

Showing 2 changed files with 5 additions and 1 deletion.

Inline Diff

1 /*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non linear complexity with the number
23 * mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27 /*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33 #define DEBUG 1 /* remove me in 2.6.34 */
34 #include <linux/kernel.h>
35 #include <linux/mm.h>
36 #include <linux/page-flags.h>
37 #include <linux/kernel-page-flags.h>
38 #include <linux/sched.h>
39 #include <linux/ksm.h>
40 #include <linux/rmap.h>
41 #include <linux/pagemap.h>
42 #include <linux/swap.h>
43 #include <linux/backing-dev.h>
44 #include <linux/migrate.h>
45 #include <linux/page-isolation.h>
46 #include <linux/suspend.h>
47 #include <linux/slab.h>
48 #include <linux/swapops.h>
49 #include "internal.h"
50
51 int sysctl_memory_failure_early_kill __read_mostly = 0;
52
53 int sysctl_memory_failure_recovery __read_mostly = 1;
54
55 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
56
57 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
58
59 u32 hwpoison_filter_enable = 0;
60 u32 hwpoison_filter_dev_major = ~0U;
61 u32 hwpoison_filter_dev_minor = ~0U;
62 u64 hwpoison_filter_flags_mask;
63 u64 hwpoison_filter_flags_value;
64 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
65 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
66 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
67 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
68 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
69
70 static int hwpoison_filter_dev(struct page *p)
71 {
72 struct address_space *mapping;
73 dev_t dev;
74
75 if (hwpoison_filter_dev_major == ~0U &&
76 hwpoison_filter_dev_minor == ~0U)
77 return 0;
78
79 /*
80 * page_mapping() does not accept slab page
81 */
82 if (PageSlab(p))
83 return -EINVAL;
84
85 mapping = page_mapping(p);
86 if (mapping == NULL || mapping->host == NULL)
87 return -EINVAL;
88
89 dev = mapping->host->i_sb->s_dev;
90 if (hwpoison_filter_dev_major != ~0U &&
91 hwpoison_filter_dev_major != MAJOR(dev))
92 return -EINVAL;
93 if (hwpoison_filter_dev_minor != ~0U &&
94 hwpoison_filter_dev_minor != MINOR(dev))
95 return -EINVAL;
96
97 return 0;
98 }
99
100 static int hwpoison_filter_flags(struct page *p)
101 {
102 if (!hwpoison_filter_flags_mask)
103 return 0;
104
105 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
106 hwpoison_filter_flags_value)
107 return 0;
108 else
109 return -EINVAL;
110 }
111
112 /*
113 * This allows stress tests to limit test scope to a collection of tasks
114 * by putting them under some memcg. This prevents killing unrelated/important
115 * processes such as /sbin/init. Note that the target task may share clean
116 * pages with init (eg. libc text), which is harmless. If the target task
117 * share _dirty_ pages with another task B, the test scheme must make sure B
118 * is also included in the memcg. At last, due to race conditions this filter
119 * can only guarantee that the page either belongs to the memcg tasks, or is
120 * a freed page.
121 */
122 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
123 u64 hwpoison_filter_memcg;
124 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
125 static int hwpoison_filter_task(struct page *p)
126 {
127 struct mem_cgroup *mem;
128 struct cgroup_subsys_state *css;
129 unsigned long ino;
130
131 if (!hwpoison_filter_memcg)
132 return 0;
133
134 mem = try_get_mem_cgroup_from_page(p);
135 if (!mem)
136 return -EINVAL;
137
138 css = mem_cgroup_css(mem);
139 /* root_mem_cgroup has NULL dentries */
140 if (!css->cgroup->dentry)
141 return -EINVAL;
142
143 ino = css->cgroup->dentry->d_inode->i_ino;
144 css_put(css);
145
146 if (ino != hwpoison_filter_memcg)
147 return -EINVAL;
148
149 return 0;
150 }
151 #else
152 static int hwpoison_filter_task(struct page *p) { return 0; }
153 #endif
154
155 int hwpoison_filter(struct page *p)
156 {
157 if (!hwpoison_filter_enable)
158 return 0;
159
160 if (hwpoison_filter_dev(p))
161 return -EINVAL;
162
163 if (hwpoison_filter_flags(p))
164 return -EINVAL;
165
166 if (hwpoison_filter_task(p))
167 return -EINVAL;
168
169 return 0;
170 }
171 #else
172 int hwpoison_filter(struct page *p)
173 {
174 return 0;
175 }
176 #endif
177
178 EXPORT_SYMBOL_GPL(hwpoison_filter);
179
180 /*
181 * Send all the processes who have the page mapped an ``action optional''
182 * signal.
183 */
184 static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
185 unsigned long pfn)
186 {
187 struct siginfo si;
188 int ret;
189
190 printk(KERN_ERR
191 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
192 pfn, t->comm, t->pid);
193 si.si_signo = SIGBUS;
194 si.si_errno = 0;
195 si.si_code = BUS_MCEERR_AO;
196 si.si_addr = (void *)addr;
197 #ifdef __ARCH_SI_TRAPNO
198 si.si_trapno = trapno;
199 #endif
200 si.si_addr_lsb = PAGE_SHIFT;
201 /*
202 * Don't use force here, it's convenient if the signal
203 * can be temporarily blocked.
204 * This could cause a loop when the user sets SIGBUS
205 * to SIG_IGN, but hopefully noone will do that?
206 */
207 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
208 if (ret < 0)
209 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
210 t->comm, t->pid, ret);
211 return ret;
212 }
213
214 /*
215 * When a unknown page type is encountered drain as many buffers as possible
216 * in the hope to turn the page into a LRU or free page, which we can handle.
217 */
218 void shake_page(struct page *p, int access)
219 {
220 if (!PageSlab(p)) {
221 lru_add_drain_all();
222 if (PageLRU(p))
223 return;
224 drain_all_pages();
225 if (PageLRU(p) || is_free_buddy_page(p))
226 return;
227 }
228
229 /*
230 * Only all shrink_slab here (which would also
231 * shrink other caches) if access is not potentially fatal.
232 */
233 if (access) {
234 int nr;
235 do {
236 nr = shrink_slab(1000, GFP_KERNEL, 1000);
237 if (page_count(p) == 0)
238 break;
239 } while (nr > 10);
240 }
241 }
242 EXPORT_SYMBOL_GPL(shake_page);
243
244 /*
245 * Kill all processes that have a poisoned page mapped and then isolate
246 * the page.
247 *
248 * General strategy:
249 * Find all processes having the page mapped and kill them.
250 * But we keep a page reference around so that the page is not
251 * actually freed yet.
252 * Then stash the page away
253 *
254 * There's no convenient way to get back to mapped processes
255 * from the VMAs. So do a brute-force search over all
256 * running processes.
257 *
258 * Remember that machine checks are not common (or rather
259 * if they are common you have other problems), so this shouldn't
260 * be a performance issue.
261 *
262 * Also there are some races possible while we get from the
263 * error detection to actually handle it.
264 */
265
266 struct to_kill {
267 struct list_head nd;
268 struct task_struct *tsk;
269 unsigned long addr;
270 unsigned addr_valid:1;
271 };
272
273 /*
274 * Failure handling: if we can't find or can't kill a process there's
275 * not much we can do. We just print a message and ignore otherwise.
276 */
277
278 /*
279 * Schedule a process for later kill.
280 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
281 * TBD would GFP_NOIO be enough?
282 */
283 static void add_to_kill(struct task_struct *tsk, struct page *p,
284 struct vm_area_struct *vma,
285 struct list_head *to_kill,
286 struct to_kill **tkc)
287 {
288 struct to_kill *tk;
289
290 if (*tkc) {
291 tk = *tkc;
292 *tkc = NULL;
293 } else {
294 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
295 if (!tk) {
296 printk(KERN_ERR
297 "MCE: Out of memory while machine check handling\n");
298 return;
299 }
300 }
301 tk->addr = page_address_in_vma(p, vma);
302 tk->addr_valid = 1;
303
304 /*
305 * In theory we don't have to kill when the page was
306 * munmaped. But it could be also a mremap. Since that's
307 * likely very rare kill anyways just out of paranoia, but use
308 * a SIGKILL because the error is not contained anymore.
309 */
310 if (tk->addr == -EFAULT) {
311 pr_debug("MCE: Unable to find user space address %lx in %s\n",
312 page_to_pfn(p), tsk->comm);
313 tk->addr_valid = 0;
314 }
315 get_task_struct(tsk);
316 tk->tsk = tsk;
317 list_add_tail(&tk->nd, to_kill);
318 }
319
320 /*
321 * Kill the processes that have been collected earlier.
322 *
323 * Only do anything when DOIT is set, otherwise just free the list
324 * (this is used for clean pages which do not need killing)
325 * Also when FAIL is set do a force kill because something went
326 * wrong earlier.
327 */
328 static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
329 int fail, unsigned long pfn)
330 {
331 struct to_kill *tk, *next;
332
333 list_for_each_entry_safe (tk, next, to_kill, nd) {
334 if (doit) {
335 /*
336 * In case something went wrong with munmapping
337 * make sure the process doesn't catch the
338 * signal and then access the memory. Just kill it.
339 */
340 if (fail || tk->addr_valid == 0) {
341 printk(KERN_ERR
342 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
343 pfn, tk->tsk->comm, tk->tsk->pid);
344 force_sig(SIGKILL, tk->tsk);
345 }
346
347 /*
348 * In theory the process could have mapped
349 * something else on the address in-between. We could
350 * check for that, but we need to tell the
351 * process anyways.
352 */
353 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
354 pfn) < 0)
355 printk(KERN_ERR
356 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
357 pfn, tk->tsk->comm, tk->tsk->pid);
358 }
359 put_task_struct(tk->tsk);
360 kfree(tk);
361 }
362 }
363
364 static int task_early_kill(struct task_struct *tsk)
365 {
366 if (!tsk->mm)
367 return 0;
368 if (tsk->flags & PF_MCE_PROCESS)
369 return !!(tsk->flags & PF_MCE_EARLY);
370 return sysctl_memory_failure_early_kill;
371 }
372
373 /*
374 * Collect processes when the error hit an anonymous page.
375 */
376 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
377 struct to_kill **tkc)
378 {
379 struct vm_area_struct *vma;
380 struct task_struct *tsk;
381 struct anon_vma *av;
382
383 read_lock(&tasklist_lock);
384 av = page_lock_anon_vma(page);
385 if (av == NULL) /* Not actually mapped anymore */
386 goto out;
387 for_each_process (tsk) {
388 struct anon_vma_chain *vmac;
389
390 if (!task_early_kill(tsk))
391 continue;
392 list_for_each_entry(vmac, &av->head, same_anon_vma) {
393 vma = vmac->vma;
394 if (!page_mapped_in_vma(page, vma))
395 continue;
396 if (vma->vm_mm == tsk->mm)
397 add_to_kill(tsk, page, vma, to_kill, tkc);
398 }
399 }
400 page_unlock_anon_vma(av);
401 out:
402 read_unlock(&tasklist_lock);
403 }
404
405 /*
406 * Collect processes when the error hit a file mapped page.
407 */
408 static void collect_procs_file(struct page *page, struct list_head *to_kill,
409 struct to_kill **tkc)
410 {
411 struct vm_area_struct *vma;
412 struct task_struct *tsk;
413 struct prio_tree_iter iter;
414 struct address_space *mapping = page->mapping;
415
416 /*
417 * A note on the locking order between the two locks.
418 * We don't rely on this particular order.
419 * If you have some other code that needs a different order
420 * feel free to switch them around. Or add a reverse link
421 * from mm_struct to task_struct, then this could be all
422 * done without taking tasklist_lock and looping over all tasks.
423 */
424
425 read_lock(&tasklist_lock);
426 spin_lock(&mapping->i_mmap_lock);
427 for_each_process(tsk) {
428 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
429
430 if (!task_early_kill(tsk))
431 continue;
432
433 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
434 pgoff) {
435 /*
436 * Send early kill signal to tasks where a vma covers
437 * the page but the corrupted page is not necessarily
438 * mapped it in its pte.
439 * Assume applications who requested early kill want
440 * to be informed of all such data corruptions.
441 */
442 if (vma->vm_mm == tsk->mm)
443 add_to_kill(tsk, page, vma, to_kill, tkc);
444 }
445 }
446 spin_unlock(&mapping->i_mmap_lock);
447 read_unlock(&tasklist_lock);
448 }
449
450 /*
451 * Collect the processes who have the corrupted page mapped to kill.
452 * This is done in two steps for locking reasons.
453 * First preallocate one tokill structure outside the spin locks,
454 * so that we can kill at least one process reasonably reliable.
455 */
456 static void collect_procs(struct page *page, struct list_head *tokill)
457 {
458 struct to_kill *tk;
459
460 if (!page->mapping)
461 return;
462
463 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
464 if (!tk)
465 return;
466 if (PageAnon(page))
467 collect_procs_anon(page, tokill, &tk);
468 else
469 collect_procs_file(page, tokill, &tk);
470 kfree(tk);
471 }
472
473 /*
474 * Error handlers for various types of pages.
475 */
476
477 enum outcome {
478 IGNORED, /* Error: cannot be handled */
479 FAILED, /* Error: handling failed */
480 DELAYED, /* Will be handled later */
481 RECOVERED, /* Successfully recovered */
482 };
483
484 static const char *action_name[] = {
485 [IGNORED] = "Ignored",
486 [FAILED] = "Failed",
487 [DELAYED] = "Delayed",
488 [RECOVERED] = "Recovered",
489 };
490
491 /*
492 * XXX: It is possible that a page is isolated from LRU cache,
493 * and then kept in swap cache or failed to remove from page cache.
494 * The page count will stop it from being freed by unpoison.
495 * Stress tests should be aware of this memory leak problem.
496 */
497 static int delete_from_lru_cache(struct page *p)
498 {
499 if (!isolate_lru_page(p)) {
500 /*
501 * Clear sensible page flags, so that the buddy system won't
502 * complain when the page is unpoison-and-freed.
503 */
504 ClearPageActive(p);
505 ClearPageUnevictable(p);
506 /*
507 * drop the page count elevated by isolate_lru_page()
508 */
509 page_cache_release(p);
510 return 0;
511 }
512 return -EIO;
513 }
514
515 /*
516 * Error hit kernel page.
517 * Do nothing, try to be lucky and not touch this instead. For a few cases we
518 * could be more sophisticated.
519 */
520 static int me_kernel(struct page *p, unsigned long pfn)
521 {
522 return IGNORED;
523 }
524
525 /*
526 * Page in unknown state. Do nothing.
527 */
528 static int me_unknown(struct page *p, unsigned long pfn)
529 {
530 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
531 return FAILED;
532 }
533
534 /*
535 * Clean (or cleaned) page cache page.
536 */
537 static int me_pagecache_clean(struct page *p, unsigned long pfn)
538 {
539 int err;
540 int ret = FAILED;
541 struct address_space *mapping;
542
543 delete_from_lru_cache(p);
544
545 /*
546 * For anonymous pages we're done the only reference left
547 * should be the one m_f() holds.
548 */
549 if (PageAnon(p))
550 return RECOVERED;
551
552 /*
553 * Now truncate the page in the page cache. This is really
554 * more like a "temporary hole punch"
555 * Don't do this for block devices when someone else
556 * has a reference, because it could be file system metadata
557 * and that's not safe to truncate.
558 */
559 mapping = page_mapping(p);
560 if (!mapping) {
561 /*
562 * Page has been teared down in the meanwhile
563 */
564 return FAILED;
565 }
566
567 /*
568 * Truncation is a bit tricky. Enable it per file system for now.
569 *
570 * Open: to take i_mutex or not for this? Right now we don't.
571 */
572 if (mapping->a_ops->error_remove_page) {
573 err = mapping->a_ops->error_remove_page(mapping, p);
574 if (err != 0) {
575 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
576 pfn, err);
577 } else if (page_has_private(p) &&
578 !try_to_release_page(p, GFP_NOIO)) {
579 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
580 } else {
581 ret = RECOVERED;
582 }
583 } else {
584 /*
585 * If the file system doesn't support it just invalidate
586 * This fails on dirty or anything with private pages
587 */
588 if (invalidate_inode_page(p))
589 ret = RECOVERED;
590 else
591 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
592 pfn);
593 }
594 return ret;
595 }
596
597 /*
598 * Dirty cache page page
599 * Issues: when the error hit a hole page the error is not properly
600 * propagated.
601 */
602 static int me_pagecache_dirty(struct page *p, unsigned long pfn)
603 {
604 struct address_space *mapping = page_mapping(p);
605
606 SetPageError(p);
607 /* TBD: print more information about the file. */
608 if (mapping) {
609 /*
610 * IO error will be reported by write(), fsync(), etc.
611 * who check the mapping.
612 * This way the application knows that something went
613 * wrong with its dirty file data.
614 *
615 * There's one open issue:
616 *
617 * The EIO will be only reported on the next IO
618 * operation and then cleared through the IO map.
619 * Normally Linux has two mechanisms to pass IO error
620 * first through the AS_EIO flag in the address space
621 * and then through the PageError flag in the page.
622 * Since we drop pages on memory failure handling the
623 * only mechanism open to use is through AS_AIO.
624 *
625 * This has the disadvantage that it gets cleared on
626 * the first operation that returns an error, while
627 * the PageError bit is more sticky and only cleared
628 * when the page is reread or dropped. If an
629 * application assumes it will always get error on
630 * fsync, but does other operations on the fd before
631 * and the page is dropped inbetween then the error
632 * will not be properly reported.
633 *
634 * This can already happen even without hwpoisoned
635 * pages: first on metadata IO errors (which only
636 * report through AS_EIO) or when the page is dropped
637 * at the wrong time.
638 *
639 * So right now we assume that the application DTRT on
640 * the first EIO, but we're not worse than other parts
641 * of the kernel.
642 */
643 mapping_set_error(mapping, EIO);
644 }
645
646 return me_pagecache_clean(p, pfn);
647 }
648
649 /*
650 * Clean and dirty swap cache.
651 *
652 * Dirty swap cache page is tricky to handle. The page could live both in page
653 * cache and swap cache(ie. page is freshly swapped in). So it could be
654 * referenced concurrently by 2 types of PTEs:
655 * normal PTEs and swap PTEs. We try to handle them consistently by calling
656 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
657 * and then
658 * - clear dirty bit to prevent IO
659 * - remove from LRU
660 * - but keep in the swap cache, so that when we return to it on
661 * a later page fault, we know the application is accessing
662 * corrupted data and shall be killed (we installed simple
663 * interception code in do_swap_page to catch it).
664 *
665 * Clean swap cache pages can be directly isolated. A later page fault will
666 * bring in the known good data from disk.
667 */
668 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
669 {
670 ClearPageDirty(p);
671 /* Trigger EIO in shmem: */
672 ClearPageUptodate(p);
673
674 if (!delete_from_lru_cache(p))
675 return DELAYED;
676 else
677 return FAILED;
678 }
679
680 static int me_swapcache_clean(struct page *p, unsigned long pfn)
681 {
682 delete_from_swap_cache(p);
683
684 if (!delete_from_lru_cache(p))
685 return RECOVERED;
686 else
687 return FAILED;
688 }
689
690 /*
691 * Huge pages. Needs work.
692 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk
694 * all MMs and look for the mappings, but that would be non atomic and racy.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
696 * like just walking the current process and hoping it has it mapped (that
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */
701 static int me_huge_page(struct page *p, unsigned long pfn)
702 {
703 return FAILED;
704 }
705
706 /*
707 * Various page states we can handle.
708 *
709 * A page state is defined by its current page->flags bits.
710 * The table matches them in order and calls the right handler.
711 *
712 * This is quite tricky because we can access page at any time
713 * in its live cycle, so all accesses have to be extremly careful.
714 *
715 * This is not complete. More states could be added.
716 * For any missing state don't attempt recovery.
717 */
718
719 #define dirty (1UL << PG_dirty)
720 #define sc (1UL << PG_swapcache)
721 #define unevict (1UL << PG_unevictable)
722 #define mlock (1UL << PG_mlocked)
723 #define writeback (1UL << PG_writeback)
724 #define lru (1UL << PG_lru)
725 #define swapbacked (1UL << PG_swapbacked)
726 #define head (1UL << PG_head)
727 #define tail (1UL << PG_tail)
728 #define compound (1UL << PG_compound)
729 #define slab (1UL << PG_slab)
730 #define reserved (1UL << PG_reserved)
731
732 static struct page_state {
733 unsigned long mask;
734 unsigned long res;
735 char *msg;
736 int (*action)(struct page *p, unsigned long pfn);
737 } error_states[] = {
738 { reserved, reserved, "reserved kernel", me_kernel },
739 /*
740 * free pages are specially detected outside this table:
741 * PG_buddy pages only make a small fraction of all free pages.
742 */
743
744 /*
745 * Could in theory check if slab page is free or if we can drop
746 * currently unused objects without touching them. But just
747 * treat it as standard kernel for now.
748 */
749 { slab, slab, "kernel slab", me_kernel },
750
751 #ifdef CONFIG_PAGEFLAGS_EXTENDED
752 { head, head, "huge", me_huge_page },
753 { tail, tail, "huge", me_huge_page },
754 #else
755 { compound, compound, "huge", me_huge_page },
756 #endif
757
758 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
759 { sc|dirty, sc, "swapcache", me_swapcache_clean },
760
761 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
762 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
763
764 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
765 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
766
767 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
768 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
769
770 /*
771 * Catchall entry: must be at end.
772 */
773 { 0, 0, "unknown page state", me_unknown },
774 };
775
776 #undef dirty
777 #undef sc
778 #undef unevict
779 #undef mlock
780 #undef writeback
781 #undef lru
782 #undef swapbacked
783 #undef head
784 #undef tail
785 #undef compound
786 #undef slab
787 #undef reserved
788
789 static void action_result(unsigned long pfn, char *msg, int result)
790 {
791 struct page *page = pfn_to_page(pfn);
792
793 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
794 pfn,
795 PageDirty(page) ? "dirty " : "",
796 msg, action_name[result]);
797 }
798
799 static int page_action(struct page_state *ps, struct page *p,
800 unsigned long pfn)
801 {
802 int result;
803 int count;
804
805 result = ps->action(p, pfn);
806 action_result(pfn, ps->msg, result);
807
808 count = page_count(p) - 1;
809 if (ps->action == me_swapcache_dirty && result == DELAYED)
810 count--;
811 if (count != 0) {
812 printk(KERN_ERR
813 "MCE %#lx: %s page still referenced by %d users\n",
814 pfn, ps->msg, count);
815 result = FAILED;
816 }
817
818 /* Could do more checks here if page looks ok */
819 /*
820 * Could adjust zone counters here to correct for the missing page.
821 */
822
823 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
824 }
825
826 #define N_UNMAP_TRIES 5
827
828 /*
829 * Do all that is necessary to remove user space mappings. Unmap
830 * the pages and send SIGBUS to the processes if the data was dirty.
831 */
832 static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
833 int trapno)
834 {
835 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
836 struct address_space *mapping;
837 LIST_HEAD(tokill);
838 int ret;
839 int i;
840 int kill = 1;
841
842 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS;
844
845 /*
846 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills.
848 */
849 if (!page_mapped(p))
850 return SWAP_SUCCESS;
851
852 if (PageCompound(p) || PageKsm(p))
853 return SWAP_FAIL;
854
855 if (PageSwapCache(p)) {
856 printk(KERN_ERR
857 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
858 ttu |= TTU_IGNORE_HWPOISON;
859 }
860
861 /* 861 /*
862 * Propagate the dirty bit from PTEs to struct page first, because we 862 * Propagate the dirty bit from PTEs to struct page first, because we
863 * need this to decide if we should kill or just drop the page. 863 * need this to decide if we should kill or just drop the page.
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 864 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 865 * be called inside page lock (it's recommended but not enforced).
866 */ 866 */
867 mapping = page_mapping(p); 867 mapping = page_mapping(p);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
869 if (page_mkclean(p)) { 869 if (page_mkclean(p)) {
870 SetPageDirty(p); 870 SetPageDirty(p);
871 } else { 871 } else {
872 kill = 0; 872 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 873 ttu |= TTU_IGNORE_HWPOISON;
874 printk(KERN_INFO 874 printk(KERN_INFO
875 "MCE %#lx: corrupted page was clean: dropped without side effects\n", 875 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
876 pfn); 876 pfn);
877 } 877 }
878 } 878 }
879 879
880 /* 880 /*
881 * First collect all the processes that have the page 881 * First collect all the processes that have the page
882 * mapped in dirty form. This has to be done before try_to_unmap, 882 * mapped in dirty form. This has to be done before try_to_unmap,
883 * because ttu takes the rmap data structures down. 883 * because ttu takes the rmap data structures down.
884 * 884 *
885 * Error handling: We ignore errors here because 885 * Error handling: We ignore errors here because
886 * there's nothing that can be done. 886 * there's nothing that can be done.
887 */ 887 */
888 if (kill) 888 if (kill)
889 collect_procs(p, &tokill); 889 collect_procs(p, &tokill);
890 890
891 /* 891 /*
892 * try_to_unmap can fail temporarily due to races. 892 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 893 * Try a few times (RED-PEN better strategy?)
894 */ 894 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 895 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 896 ret = try_to_unmap(p, ttu);
897 if (ret == SWAP_SUCCESS) 897 if (ret == SWAP_SUCCESS)
898 break; 898 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
900 } 900 }
901 901
902 if (ret != SWAP_SUCCESS) 902 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 904 pfn, page_mapcount(p));
905 905
906 /* 906 /*
907 * Now that the dirty bit has been propagated to the 907 * Now that the dirty bit has been propagated to the
908 * struct page and all unmaps done we can decide if 908 * struct page and all unmaps done we can decide if
909 * killing is needed or not. Only kill when the page 909 * killing is needed or not. Only kill when the page
910 * was dirty, otherwise the tokill list is merely 910 * was dirty, otherwise the tokill list is merely
911 * freed. When there was a problem unmapping earlier 911 * freed. When there was a problem unmapping earlier
912 * use a more force-full uncatchable kill to prevent 912 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 913 * any accesses to the poisoned memory.
914 */ 914 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 915 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
916 ret != SWAP_SUCCESS, pfn); 916 ret != SWAP_SUCCESS, pfn);
917 917
918 return ret; 918 return ret;
919 } 919 }
920 920
921 int __memory_failure(unsigned long pfn, int trapno, int flags) 921 int __memory_failure(unsigned long pfn, int trapno, int flags)
922 { 922 {
923 struct page_state *ps; 923 struct page_state *ps;
924 struct page *p; 924 struct page *p;
925 int res; 925 int res;
926 926
927 if (!sysctl_memory_failure_recovery) 927 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 928 panic("Memory failure from trap %d on page %lx", trapno, pfn);
929 929
930 if (!pfn_valid(pfn)) { 930 if (!pfn_valid(pfn)) {
931 printk(KERN_ERR 931 printk(KERN_ERR
932 "MCE %#lx: memory outside kernel control\n", 932 "MCE %#lx: memory outside kernel control\n",
933 pfn); 933 pfn);
934 return -ENXIO; 934 return -ENXIO;
935 } 935 }
936 936
937 p = pfn_to_page(pfn); 937 p = pfn_to_page(pfn);
938 if (TestSetPageHWPoison(p)) { 938 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 940 return 0;
941 } 941 }
942 942
943 atomic_long_add(1, &mce_bad_pages); 943 atomic_long_add(1, &mce_bad_pages);
944 944
945 /* 945 /*
946 * We need/can do nothing about count=0 pages. 946 * We need/can do nothing about count=0 pages.
947 * 1) it's a free page, and therefore in safe hand: 947 * 1) it's a free page, and therefore in safe hand:
948 * prep_new_page() will be the gate keeper. 948 * prep_new_page() will be the gate keeper.
949 * 2) it's part of a non-compound high order page. 949 * 2) it's part of a non-compound high order page.
950 * Implies some kernel user: cannot stop them from 950 * Implies some kernel user: cannot stop them from
951 * R/W the page; let's pray that the page has been 951 * R/W the page; let's pray that the page has been
952 * used and will be freed some time later. 952 * used and will be freed some time later.
953 * In fact it's dangerous to directly bump up page count from 0, 953 * In fact it's dangerous to directly bump up page count from 0,
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 955 */
956 if (!(flags & MF_COUNT_INCREASED) && 956 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 957 !get_page_unless_zero(compound_head(p))) {
958 if (is_free_buddy_page(p)) { 958 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 959 action_result(pfn, "free buddy", DELAYED);
960 return 0; 960 return 0;
961 } else { 961 } else {
962 action_result(pfn, "high order kernel", IGNORED); 962 action_result(pfn, "high order kernel", IGNORED);
963 return -EBUSY; 963 return -EBUSY;
964 } 964 }
965 } 965 }
966 966
967 /* 967 /*
968 * We ignore non-LRU pages for good reasons. 968 * We ignore non-LRU pages for good reasons.
969 * - PG_locked is only well defined for LRU pages and a few others 969 * - PG_locked is only well defined for LRU pages and a few others
970 * - to avoid races with __set_page_locked() 970 * - to avoid races with __set_page_locked()
971 * - to avoid races with __SetPageSlab*() (and more non-atomic ops) 971 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
972 * The check (unnecessarily) ignores LRU pages being isolated and 972 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 973 * walked by the page reclaim code, however that's not a big loss.
974 */ 974 */
975 if (!PageLRU(p)) 975 if (!PageLRU(p))
976 shake_page(p, 0); 976 shake_page(p, 0);
977 if (!PageLRU(p)) { 977 if (!PageLRU(p)) {
978 /* 978 /*
979 * shake_page could have turned it free. 979 * shake_page could have turned it free.
980 */ 980 */
981 if (is_free_buddy_page(p)) { 981 if (is_free_buddy_page(p)) {
982 action_result(pfn, "free buddy, 2nd try", DELAYED); 982 action_result(pfn, "free buddy, 2nd try", DELAYED);
983 return 0; 983 return 0;
984 } 984 }
985 action_result(pfn, "non LRU", IGNORED); 985 action_result(pfn, "non LRU", IGNORED);
986 put_page(p); 986 put_page(p);
987 return -EBUSY; 987 return -EBUSY;
988 } 988 }
989 989
990 /* 990 /*
991 * Lock the page and wait for writeback to finish. 991 * Lock the page and wait for writeback to finish.
992 * It's very difficult to mess with pages currently under IO 992 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 993 * and in many cases impossible, so we just avoid it here.
994 */ 994 */
995 lock_page_nosync(p); 995 lock_page_nosync(p);
996 996
997 /* 997 /*
998 * unpoison always clears PG_hwpoison inside the page lock 998 * unpoison always clears PG_hwpoison inside the page lock
999 */ 999 */
1000 if (!PageHWPoison(p)) { 1000 if (!PageHWPoison(p)) {
1001 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 1001 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1002 res = 0; 1002 res = 0;
1003 goto out; 1003 goto out;
1004 } 1004 }
1005 if (hwpoison_filter(p)) { 1005 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1006 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1007 atomic_long_dec(&mce_bad_pages);
1008 unlock_page(p); 1008 unlock_page(p);
1009 put_page(p); 1009 put_page(p);
1010 return 0; 1010 return 0;
1011 } 1011 }
1012 1012
1013 wait_on_page_writeback(p); 1013 wait_on_page_writeback(p);
1014 1014
1015 /* 1015 /*
1016 * Now take care of user space mappings. 1016 * Now take care of user space mappings.
1017 * Abort on fail: __remove_from_page_cache() assumes unmapped page. 1017 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
1018 */ 1018 */
1019 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1019 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1020 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1020 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1021 res = -EBUSY; 1021 res = -EBUSY;
1022 goto out; 1022 goto out;
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * Torn down by someone else? 1026 * Torn down by someone else?
1027 */ 1027 */
1028 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1028 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1029 action_result(pfn, "already truncated LRU", IGNORED); 1029 action_result(pfn, "already truncated LRU", IGNORED);
1030 res = -EBUSY; 1030 res = -EBUSY;
1031 goto out; 1031 goto out;
1032 } 1032 }
1033 1033
1034 res = -EBUSY; 1034 res = -EBUSY;
1035 for (ps = error_states;; ps++) { 1035 for (ps = error_states;; ps++) {
1036 if ((p->flags & ps->mask) == ps->res) { 1036 if ((p->flags & ps->mask) == ps->res) {
1037 res = page_action(ps, p, pfn); 1037 res = page_action(ps, p, pfn);
1038 break; 1038 break;
1039 } 1039 }
1040 } 1040 }
1041 out: 1041 out:
1042 unlock_page(p); 1042 unlock_page(p);
1043 return res; 1043 return res;
1044 } 1044 }
1045 EXPORT_SYMBOL_GPL(__memory_failure); 1045 EXPORT_SYMBOL_GPL(__memory_failure);
1046 1046
1047 /** 1047 /**
1048 * memory_failure - Handle memory failure of a page. 1048 * memory_failure - Handle memory failure of a page.
1049 * @pfn: Page Number of the corrupted page 1049 * @pfn: Page Number of the corrupted page
1050 * @trapno: Trap number reported in the signal to user space. 1050 * @trapno: Trap number reported in the signal to user space.
1051 * 1051 *
1052 * This function is called by the low level machine check code 1052 * This function is called by the low level machine check code
1053 * of an architecture when it detects hardware memory corruption 1053 * of an architecture when it detects hardware memory corruption
1054 * of a page. It tries its best to recover, which includes 1054 * of a page. It tries its best to recover, which includes
1055 * dropping pages, killing processes etc. 1055 * dropping pages, killing processes etc.
1056 * 1056 *
1057 * The function is primarily of use for corruptions that 1057 * The function is primarily of use for corruptions that
1058 * happen outside the current execution context (e.g. when 1058 * happen outside the current execution context (e.g. when
1059 * detected by a background scrubber) 1059 * detected by a background scrubber)
1060 * 1060 *
1061 * Must run in process context (e.g. a work queue) with interrupts 1061 * Must run in process context (e.g. a work queue) with interrupts
1062 * enabled and no spinlocks held. 1062 * enabled and no spinlocks held.
1063 */ 1063 */
1064 void memory_failure(unsigned long pfn, int trapno) 1064 void memory_failure(unsigned long pfn, int trapno)
1065 { 1065 {
1066 __memory_failure(pfn, trapno, 0); 1066 __memory_failure(pfn, trapno, 0);
1067 } 1067 }
1068 1068
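A minimal sketch of honoring the context requirement above by deferring into a work queue. The mce_work structure and handler below are assumptions made for illustration; they are not part of this file.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct mce_work {
	struct work_struct work;
	unsigned long pfn;
	int trapno;
};

static void mce_work_fn(struct work_struct *work)
{
	struct mce_work *mw = container_of(work, struct mce_work, work);

	/* Process context, interrupts enabled, no spinlocks held. */
	memory_failure(mw->pfn, mw->trapno);
	kfree(mw);
}

A reporting path would fill in pfn and trapno, INIT_WORK() the item and schedule_work() it, rather than calling memory_failure() directly from interrupt context.
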
1069 /** 1069 /**
1070 * unpoison_memory - Unpoison a previously poisoned page 1070 * unpoison_memory - Unpoison a previously poisoned page
1071 * @pfn: Page number of the to be unpoisoned page 1071 * @pfn: Page number of the to be unpoisoned page
1072 * 1072 *
1073 * Software-unpoison a page that has been poisoned by 1073 * Software-unpoison a page that has been poisoned by
1074 * memory_failure() earlier. 1074 * memory_failure() earlier.
1075 * 1075 *
1076 * This is only done on the software-level, so it only works 1076 * This is only done on the software-level, so it only works
1077 * for linux injected failures, not real hardware failures 1077 * for linux injected failures, not real hardware failures
1078 * 1078 *
1079 * Returns 0 for success, otherwise -errno. 1079 * Returns 0 for success, otherwise -errno.
1080 */ 1080 */
1081 int unpoison_memory(unsigned long pfn) 1081 int unpoison_memory(unsigned long pfn)
1082 { 1082 {
1083 struct page *page; 1083 struct page *page;
1084 struct page *p; 1084 struct page *p;
1085 int freeit = 0; 1085 int freeit = 0;
1086 1086
1087 if (!pfn_valid(pfn)) 1087 if (!pfn_valid(pfn))
1088 return -ENXIO; 1088 return -ENXIO;
1089 1089
1090 p = pfn_to_page(pfn); 1090 p = pfn_to_page(pfn);
1091 page = compound_head(p); 1091 page = compound_head(p);
1092 1092
1093 if (!PageHWPoison(p)) { 1093 if (!PageHWPoison(p)) {
1094 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1094 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1095 return 0; 1095 return 0;
1096 } 1096 }
1097 1097
1098 if (!get_page_unless_zero(page)) { 1098 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1099 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1100 atomic_long_dec(&mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1102 return 0;
1103 } 1103 }
1104 1104
1105 lock_page_nosync(page); 1105 lock_page_nosync(page);
1106 /* 1106 /*
1107 * This test is racy because PG_hwpoison is set outside of page lock. 1107 * This test is racy because PG_hwpoison is set outside of page lock.
1108 * That's acceptable because that won't trigger kernel panic. Instead, 1108 * That's acceptable because that won't trigger kernel panic. Instead,
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1109 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1110 * the free buddy page pool.
1111 */ 1111 */
1112 if (TestClearPageHWPoison(p)) { 1112 if (TestClearPageHWPoison(p)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1114 atomic_long_dec(&mce_bad_pages);
1115 freeit = 1; 1115 freeit = 1;
1116 } 1116 }
1117 unlock_page(page); 1117 unlock_page(page);
1118 1118
1119 put_page(page); 1119 put_page(page);
1120 if (freeit) 1120 if (freeit)
1121 put_page(page); 1121 put_page(page);
1122 1122
1123 return 0; 1123 return 0;
1124 } 1124 }
1125 EXPORT_SYMBOL(unpoison_memory); 1125 EXPORT_SYMBOL(unpoison_memory);
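
Because unpoisoning only works for software-injected failures, here is a hedged sketch of an injection round trip; the helper below is hypothetical, and real injection normally goes through a dedicated injector rather than this file.

static int hwpoison_inject_roundtrip(unsigned long pfn)
{
	int err;

	/* Software-inject poison on the pfn ... */
	err = __memory_failure(pfn, 0, 0);
	if (err)
		return err;

	/* ... and reverse it again, which is what this API is for. */
	return unpoison_memory(pfn);
}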
1126 1126
1127 static struct page *new_page(struct page *p, unsigned long private, int **x) 1127 static struct page *new_page(struct page *p, unsigned long private, int **x)
1128 { 1128 {
1129 int nid = page_to_nid(p); 1129 int nid = page_to_nid(p);
1130 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1130 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1131 } 1131 }
1132 1132
1133 /* 1133 /*
1134 * Safely get reference count of an arbitrary page. 1134 * Safely get reference count of an arbitrary page.
1135 * Returns 0 for a free page, -EIO for a zero refcount page 1135 * Returns 0 for a free page, -EIO for a zero refcount page
1136 * that is not free, and 1 for any other page type. 1136 * that is not free, and 1 for any other page type.
1137 * For 1 the page is returned with increased page count, otherwise not. 1137 * For 1 the page is returned with increased page count, otherwise not.
1138 */ 1138 */
1139 static int get_any_page(struct page *p, unsigned long pfn, int flags) 1139 static int get_any_page(struct page *p, unsigned long pfn, int flags)
1140 { 1140 {
1141 int ret; 1141 int ret;
1142 1142
1143 if (flags & MF_COUNT_INCREASED) 1143 if (flags & MF_COUNT_INCREASED)
1144 return 1; 1144 return 1;
1145 1145
1146 /* 1146 /*
1147 * The lock_system_sleep prevents a race with memory hotplug, 1147 * The lock_system_sleep prevents a race with memory hotplug,
1148 * because the isolation assumes there's only a single user. 1148 * because the isolation assumes there's only a single user.
1149 * This is a big hammer; a better approach would be nicer. 1149 * This is a big hammer; a better approach would be nicer.
1150 */ 1150 */
1151 lock_system_sleep(); 1151 lock_system_sleep();
1152 1152
1153 /* 1153 /*
1154 * Isolate the page, so that it doesn't get reallocated if it 1154 * Isolate the page, so that it doesn't get reallocated if it
1155 * was free. 1155 * was free.
1156 */ 1156 */
1157 set_migratetype_isolate(p); 1157 set_migratetype_isolate(p);
1158 if (!get_page_unless_zero(compound_head(p))) { 1158 if (!get_page_unless_zero(compound_head(p))) {
1159 if (is_free_buddy_page(p)) { 1159 if (is_free_buddy_page(p)) {
1160 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1160 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1161 /* Set hwpoison bit while page is still isolated */ 1161 /* Set hwpoison bit while page is still isolated */
1162 SetPageHWPoison(p); 1162 SetPageHWPoison(p);
1163 ret = 0; 1163 ret = 0;
1164 } else { 1164 } else {
1165 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1165 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1166 pfn, p->flags); 1166 pfn, p->flags);
1167 ret = -EIO; 1167 ret = -EIO;
1168 } 1168 }
1169 } else { 1169 } else {
1170 /* Not a free page */ 1170 /* Not a free page */
1171 ret = 1; 1171 ret = 1;
1172 } 1172 }
1173 unset_migratetype_isolate(p); 1173 unset_migratetype_isolate(p);
1174 unlock_system_sleep(); 1174 unlock_system_sleep();
1175 return ret; 1175 return ret;
1176 } 1176 }
1177 1177
1178 /** 1178 /**
1179 * soft_offline_page - Soft offline a page. 1179 * soft_offline_page - Soft offline a page.
1180 * @page: page to offline 1180 * @page: page to offline
1181 * @flags: flags. Same as memory_failure(). 1181 * @flags: flags. Same as memory_failure().
1182 * 1182 *
1183 * Returns 0 on success, otherwise negated errno. 1183 * Returns 0 on success, otherwise negated errno.
1184 * 1184 *
1185 * Soft offline a page, by migration or invalidation, 1185 * Soft offline a page, by migration or invalidation,
1186 * without killing anything. This is for the case when 1186 * without killing anything. This is for the case when
1187 * a page is not corrupted yet (so it's still valid to access), 1187 * a page is not corrupted yet (so it's still valid to access),
1188 * but has had a number of corrected errors and is better taken 1188 * but has had a number of corrected errors and is better taken
1189 * out. 1189 * out.
1190 * 1190 *
1191 * The actual policy on when to do that is maintained by 1191 * The actual policy on when to do that is maintained by
1192 * user space. 1192 * user space.
1193 * 1193 *
1194 * This should never impact any application or cause data loss, 1194 * This should never impact any application or cause data loss,
1195 * however it might take some time. 1195 * however it might take some time.
1196 * 1196 *
1197 * This is not a 100% solution for all memory, but tries to be 1197 * This is not a 100% solution for all memory, but tries to be
1198 * ``good enough'' for the majority of memory. 1198 * ``good enough'' for the majority of memory.
1199 */ 1199 */
1200 int soft_offline_page(struct page *page, int flags) 1200 int soft_offline_page(struct page *page, int flags)
1201 { 1201 {
1202 int ret; 1202 int ret;
1203 unsigned long pfn = page_to_pfn(page); 1203 unsigned long pfn = page_to_pfn(page);
1204 1204
1205 ret = get_any_page(page, pfn, flags); 1205 ret = get_any_page(page, pfn, flags);
1206 if (ret < 0) 1206 if (ret < 0)
1207 return ret; 1207 return ret;
1208 if (ret == 0) 1208 if (ret == 0)
1209 goto done; 1209 goto done;
1210 1210
1211 /* 1211 /*
1212 * Page cache page we can handle? 1212 * Page cache page we can handle?
1213 */ 1213 */
1214 if (!PageLRU(page)) { 1214 if (!PageLRU(page)) {
1215 /* 1215 /*
1216 * Try to free it. 1216 * Try to free it.
1217 */ 1217 */
1218 put_page(page); 1218 put_page(page);
1219 shake_page(page, 1); 1219 shake_page(page, 1);
1220 1220
1221 /* 1221 /*
1222 * Did it turn free? 1222 * Did it turn free?
1223 */ 1223 */
1224 ret = get_any_page(page, pfn, 0); 1224 ret = get_any_page(page, pfn, 0);
1225 if (ret < 0) 1225 if (ret < 0)
1226 return ret; 1226 return ret;
1227 if (ret == 0) 1227 if (ret == 0)
1228 goto done; 1228 goto done;
1229 } 1229 }
1230 if (!PageLRU(page)) { 1230 if (!PageLRU(page)) {
1231 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1231 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1232 pfn, page->flags); 1232 pfn, page->flags);
1233 return -EIO; 1233 return -EIO;
1234 } 1234 }
1235 1235
1236 lock_page(page); 1236 lock_page(page);
1237 wait_on_page_writeback(page); 1237 wait_on_page_writeback(page);
1238 1238
1239 /* 1239 /*
1240 * Synchronized using the page lock with memory_failure() 1240 * Synchronized using the page lock with memory_failure()
1241 */ 1241 */
1242 if (PageHWPoison(page)) { 1242 if (PageHWPoison(page)) {
1243 unlock_page(page); 1243 unlock_page(page);
1244 put_page(page); 1244 put_page(page);
1245 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1245 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1246 return -EBUSY; 1246 return -EBUSY;
1247 } 1247 }
1248 1248
1249 /* 1249 /*
1250 * Try to invalidate first. This should work for 1250 * Try to invalidate first. This should work for
1251 * non dirty unmapped page cache pages. 1251 * non dirty unmapped page cache pages.
1252 */ 1252 */
1253 ret = invalidate_inode_page(page); 1253 ret = invalidate_inode_page(page);
1254 unlock_page(page); 1254 unlock_page(page);
1255 1255
1256 /* 1256 /*
1257 * Drop count because page migration doesn't like raised 1257 * Drop count because page migration doesn't like raised
1258 * counts. The page could get re-allocated, but if it becomes 1258 * counts. The page could get re-allocated, but if it becomes
1259 * LRU the isolation will just fail. 1259 * LRU the isolation will just fail.
1260 * RED-PEN would be better to keep it isolated here, but we 1260 * RED-PEN would be better to keep it isolated here, but we
1261 * would need to fix isolation locking first. 1261 * would need to fix isolation locking first.
1262 */ 1262 */
1263 put_page(page); 1263 put_page(page);
1264 if (ret == 1) { 1264 if (ret == 1) {
1265 ret = 0; 1265 ret = 0;
1266 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1266 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1267 goto done; 1267 goto done;
1268 } 1268 }
1269 1269
1270 /* 1270 /*
1271 * Simple invalidation didn't work. 1271 * Simple invalidation didn't work.
1272 * Try to migrate to a new page instead. migrate.c 1272 * Try to migrate to a new page instead. migrate.c
1273 * handles a large number of cases for us. 1273 * handles a large number of cases for us.
1274 */ 1274 */
1275 ret = isolate_lru_page(page); 1275 ret = isolate_lru_page(page);
1276 if (!ret) { 1276 if (!ret) {
1277 LIST_HEAD(pagelist); 1277 LIST_HEAD(pagelist);
1278 1278
1279 list_add(&page->lru, &pagelist); 1279 list_add(&page->lru, &pagelist);
1280 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1280 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1281 if (ret) { 1281 if (ret) {
1282 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1282 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1283 pfn, ret, page->flags); 1283 pfn, ret, page->flags);
1284 if (ret > 0) 1284 if (ret > 0)
1285 ret = -EIO; 1285 ret = -EIO;
1286 } 1286 }
1287 } else { 1287 } else {
1288 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1288 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1289 pfn, ret, page_count(page), page->flags); 1289 pfn, ret, page_count(page), page->flags);
1290 } 1290 }
1291 if (ret) 1291 if (ret)
1292 return ret; 1292 return ret;
1293 1293
1294 done: 1294 done:
1295 atomic_long_add(1, &mce_bad_pages); 1295 atomic_long_add(1, &mce_bad_pages);
1296 SetPageHWPoison(page); 1296 SetPageHWPoison(page);
1297 /* keep elevated page count for bad page */ 1297 /* keep elevated page count for bad page */
1298 return ret; 1298 return ret;
1299 } 1299 }
1300 1300
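A hedged sketch of the user-space-driven policy described in the soft_offline_page() kernel-doc above: some agent decides that a pfn has accumulated too many corrected errors and asks for it to be taken out. The wrapper below is hypothetical; the real entry points live elsewhere.

static int soft_offline_pfn(unsigned long pfn, int flags)
{
	if (!pfn_valid(pfn))
		return -ENXIO;

	return soft_offline_page(pfn_to_page(pfn), flags);
}
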
1301 /*
1302 * The caller must hold current->mm->mmap_sem in read mode.
1303 */
1301 int is_hwpoison_address(unsigned long addr) 1304 int is_hwpoison_address(unsigned long addr)
1302 { 1305 {
1303 pgd_t *pgdp; 1306 pgd_t *pgdp;
1304 pud_t pud, *pudp; 1307 pud_t pud, *pudp;
1305 pmd_t pmd, *pmdp; 1308 pmd_t pmd, *pmdp;
1306 pte_t pte, *ptep; 1309 pte_t pte, *ptep;
1307 swp_entry_t entry; 1310 swp_entry_t entry;
1308 1311
1309 pgdp = pgd_offset(current->mm, addr); 1312 pgdp = pgd_offset(current->mm, addr);
1310 if (!pgd_present(*pgdp)) 1313 if (!pgd_present(*pgdp))
1311 return 0; 1314 return 0;
1312 pudp = pud_offset(pgdp, addr); 1315 pudp = pud_offset(pgdp, addr);
1313 pud = *pudp; 1316 pud = *pudp;
1314 if (!pud_present(pud) || pud_large(pud)) 1317 if (!pud_present(pud) || pud_large(pud))
1315 return 0; 1318 return 0;
1316 pmdp = pmd_offset(pudp, addr); 1319 pmdp = pmd_offset(pudp, addr);
1317 pmd = *pmdp; 1320 pmd = *pmdp;
1318 if (!pmd_present(pmd) || pmd_large(pmd)) 1321 if (!pmd_present(pmd) || pmd_large(pmd))
1319 return 0; 1322 return 0;
1320 ptep = pte_offset_map(pmdp, addr); 1323 ptep = pte_offset_map(pmdp, addr);
1321 pte = *ptep; 1324 pte = *ptep;
1322 pte_unmap(ptep); 1325 pte_unmap(ptep);
1323 if (!is_swap_pte(pte)) 1326 if (!is_swap_pte(pte))
1324 return 0; 1327 return 0;
1325 entry = pte_to_swp_entry(pte); 1328 entry = pte_to_swp_entry(pte);
1326 return is_hwpoison_entry(entry); 1329 return is_hwpoison_entry(entry);
1327 } 1330 }
1328 EXPORT_SYMBOL_GPL(is_hwpoison_address); 1331 EXPORT_SYMBOL_GPL(is_hwpoison_address);
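
The newly added locking comment above means the page-table walk in is_hwpoison_address() is only safe while current->mm->mmap_sem is held for read. A minimal usage sketch, assuming a caller that does not already hold the semaphore (the wrapper name is made up for illustration):

static int is_hwpoison_address_locked(unsigned long addr)
{
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = is_hwpoison_address(addr);
	up_read(&current->mm->mmap_sem);

	return ret;
}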
1329 1332
1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * 13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory. 15 * the COPYING file in the top-level directory.
16 * 16 *
17 */ 17 */
18 18
19 #include "iodev.h" 19 #include "iodev.h"
20 20
21 #include <linux/kvm_host.h> 21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h> 22 #include <linux/kvm.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/errno.h> 24 #include <linux/errno.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/miscdevice.h> 27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h> 28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h> 29 #include <linux/reboot.h>
30 #include <linux/debugfs.h> 30 #include <linux/debugfs.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/file.h> 32 #include <linux/file.h>
33 #include <linux/sysdev.h> 33 #include <linux/sysdev.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <linux/cpumask.h> 36 #include <linux/cpumask.h>
37 #include <linux/smp.h> 37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h> 38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h> 39 #include <linux/profile.h>
40 #include <linux/kvm_para.h> 40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h> 41 #include <linux/pagemap.h>
42 #include <linux/mman.h> 42 #include <linux/mman.h>
43 #include <linux/swap.h> 43 #include <linux/swap.h>
44 #include <linux/bitops.h> 44 #include <linux/bitops.h>
45 #include <linux/spinlock.h> 45 #include <linux/spinlock.h>
46 #include <linux/compat.h> 46 #include <linux/compat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/hugetlb.h> 48 #include <linux/hugetlb.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 50
51 #include <asm/processor.h> 51 #include <asm/processor.h>
52 #include <asm/io.h> 52 #include <asm/io.h>
53 #include <asm/uaccess.h> 53 #include <asm/uaccess.h>
54 #include <asm/pgtable.h> 54 #include <asm/pgtable.h>
55 #include <asm-generic/bitops/le.h> 55 #include <asm-generic/bitops/le.h>
56 56
57 #include "coalesced_mmio.h" 57 #include "coalesced_mmio.h"
58 58
59 #define CREATE_TRACE_POINTS 59 #define CREATE_TRACE_POINTS
60 #include <trace/events/kvm.h> 60 #include <trace/events/kvm.h>
61 61
62 MODULE_AUTHOR("Qumranet"); 62 MODULE_AUTHOR("Qumranet");
63 MODULE_LICENSE("GPL"); 63 MODULE_LICENSE("GPL");
64 64
65 /* 65 /*
66 * Ordering of locks: 66 * Ordering of locks:
67 * 67 *
68 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 68 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
69 */ 69 */
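Illustrative only: when more than one of these locks is needed, take them in the documented order and release in reverse. The helper below is a hypothetical example, not an existing function.

static void lock_ordering_example(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->irq_lock);

	/* ... work that needs all three locks ... */

	mutex_unlock(&kvm->irq_lock);
	mutex_unlock(&kvm->slots_lock);
	mutex_unlock(&kvm->lock);
}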
70 70
71 DEFINE_SPINLOCK(kvm_lock); 71 DEFINE_SPINLOCK(kvm_lock);
72 LIST_HEAD(vm_list); 72 LIST_HEAD(vm_list);
73 73
74 static cpumask_var_t cpus_hardware_enabled; 74 static cpumask_var_t cpus_hardware_enabled;
75 static int kvm_usage_count = 0; 75 static int kvm_usage_count = 0;
76 static atomic_t hardware_enable_failed; 76 static atomic_t hardware_enable_failed;
77 77
78 struct kmem_cache *kvm_vcpu_cache; 78 struct kmem_cache *kvm_vcpu_cache;
79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
80 80
81 static __read_mostly struct preempt_ops kvm_preempt_ops; 81 static __read_mostly struct preempt_ops kvm_preempt_ops;
82 82
83 struct dentry *kvm_debugfs_dir; 83 struct dentry *kvm_debugfs_dir;
84 84
85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
86 unsigned long arg); 86 unsigned long arg);
87 static int hardware_enable_all(void); 87 static int hardware_enable_all(void);
88 static void hardware_disable_all(void); 88 static void hardware_disable_all(void);
89 89
90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91 91
92 static bool kvm_rebooting; 92 static bool kvm_rebooting;
93 93
94 static bool largepages_enabled = true; 94 static bool largepages_enabled = true;
95 95
96 struct page *hwpoison_page; 96 struct page *hwpoison_page;
97 pfn_t hwpoison_pfn; 97 pfn_t hwpoison_pfn;
98 98
99 inline int kvm_is_mmio_pfn(pfn_t pfn) 99 inline int kvm_is_mmio_pfn(pfn_t pfn)
100 { 100 {
101 if (pfn_valid(pfn)) { 101 if (pfn_valid(pfn)) {
102 struct page *page = compound_head(pfn_to_page(pfn)); 102 struct page *page = compound_head(pfn_to_page(pfn));
103 return PageReserved(page); 103 return PageReserved(page);
104 } 104 }
105 105
106 return true; 106 return true;
107 } 107 }
108 108
109 /* 109 /*
110 * Switches to the specified vcpu, until a matching vcpu_put() 110 * Switches to the specified vcpu, until a matching vcpu_put()
111 */ 111 */
112 void vcpu_load(struct kvm_vcpu *vcpu) 112 void vcpu_load(struct kvm_vcpu *vcpu)
113 { 113 {
114 int cpu; 114 int cpu;
115 115
116 mutex_lock(&vcpu->mutex); 116 mutex_lock(&vcpu->mutex);
117 cpu = get_cpu(); 117 cpu = get_cpu();
118 preempt_notifier_register(&vcpu->preempt_notifier); 118 preempt_notifier_register(&vcpu->preempt_notifier);
119 kvm_arch_vcpu_load(vcpu, cpu); 119 kvm_arch_vcpu_load(vcpu, cpu);
120 put_cpu(); 120 put_cpu();
121 } 121 }
122 122
123 void vcpu_put(struct kvm_vcpu *vcpu) 123 void vcpu_put(struct kvm_vcpu *vcpu)
124 { 124 {
125 preempt_disable(); 125 preempt_disable();
126 kvm_arch_vcpu_put(vcpu); 126 kvm_arch_vcpu_put(vcpu);
127 preempt_notifier_unregister(&vcpu->preempt_notifier); 127 preempt_notifier_unregister(&vcpu->preempt_notifier);
128 preempt_enable(); 128 preempt_enable();
129 mutex_unlock(&vcpu->mutex); 129 mutex_unlock(&vcpu->mutex);
130 } 130 }
131 131
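A minimal sketch of the load/put pairing described above; the function and its body are placeholders.

static void with_vcpu_loaded(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);	/* takes vcpu->mutex, loads arch state */
	/* ... operate on the vcpu here ... */
	vcpu_put(vcpu);		/* puts arch state, drops vcpu->mutex */
}
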
132 static void ack_flush(void *_completed) 132 static void ack_flush(void *_completed)
133 { 133 {
134 } 134 }
135 135
136 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 136 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
137 { 137 {
138 int i, cpu, me; 138 int i, cpu, me;
139 cpumask_var_t cpus; 139 cpumask_var_t cpus;
140 bool called = true; 140 bool called = true;
141 struct kvm_vcpu *vcpu; 141 struct kvm_vcpu *vcpu;
142 142
143 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 143 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
144 144
145 raw_spin_lock(&kvm->requests_lock); 145 raw_spin_lock(&kvm->requests_lock);
146 me = smp_processor_id(); 146 me = smp_processor_id();
147 kvm_for_each_vcpu(i, vcpu, kvm) { 147 kvm_for_each_vcpu(i, vcpu, kvm) {
148 if (kvm_make_check_request(req, vcpu)) 148 if (kvm_make_check_request(req, vcpu))
149 continue; 149 continue;
150 cpu = vcpu->cpu; 150 cpu = vcpu->cpu;
151 if (cpus != NULL && cpu != -1 && cpu != me) 151 if (cpus != NULL && cpu != -1 && cpu != me)
152 cpumask_set_cpu(cpu, cpus); 152 cpumask_set_cpu(cpu, cpus);
153 } 153 }
154 if (unlikely(cpus == NULL)) 154 if (unlikely(cpus == NULL))
155 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 155 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
156 else if (!cpumask_empty(cpus)) 156 else if (!cpumask_empty(cpus))
157 smp_call_function_many(cpus, ack_flush, NULL, 1); 157 smp_call_function_many(cpus, ack_flush, NULL, 1);
158 else 158 else
159 called = false; 159 called = false;
160 raw_spin_unlock(&kvm->requests_lock); 160 raw_spin_unlock(&kvm->requests_lock);
161 free_cpumask_var(cpus); 161 free_cpumask_var(cpus);
162 return called; 162 return called;
163 } 163 }
164 164
165 void kvm_flush_remote_tlbs(struct kvm *kvm) 165 void kvm_flush_remote_tlbs(struct kvm *kvm)
166 { 166 {
167 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 167 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
168 ++kvm->stat.remote_tlb_flush; 168 ++kvm->stat.remote_tlb_flush;
169 } 169 }
170 170
171 void kvm_reload_remote_mmus(struct kvm *kvm) 171 void kvm_reload_remote_mmus(struct kvm *kvm)
172 { 172 {
173 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 173 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
174 } 174 }
175 175
176 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 176 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
177 { 177 {
178 struct page *page; 178 struct page *page;
179 int r; 179 int r;
180 180
181 mutex_init(&vcpu->mutex); 181 mutex_init(&vcpu->mutex);
182 vcpu->cpu = -1; 182 vcpu->cpu = -1;
183 vcpu->kvm = kvm; 183 vcpu->kvm = kvm;
184 vcpu->vcpu_id = id; 184 vcpu->vcpu_id = id;
185 init_waitqueue_head(&vcpu->wq); 185 init_waitqueue_head(&vcpu->wq);
186 186
187 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 187 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
188 if (!page) { 188 if (!page) {
189 r = -ENOMEM; 189 r = -ENOMEM;
190 goto fail; 190 goto fail;
191 } 191 }
192 vcpu->run = page_address(page); 192 vcpu->run = page_address(page);
193 193
194 r = kvm_arch_vcpu_init(vcpu); 194 r = kvm_arch_vcpu_init(vcpu);
195 if (r < 0) 195 if (r < 0)
196 goto fail_free_run; 196 goto fail_free_run;
197 return 0; 197 return 0;
198 198
199 fail_free_run: 199 fail_free_run:
200 free_page((unsigned long)vcpu->run); 200 free_page((unsigned long)vcpu->run);
201 fail: 201 fail:
202 return r; 202 return r;
203 } 203 }
204 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 204 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
205 205
206 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 206 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
207 { 207 {
208 kvm_arch_vcpu_uninit(vcpu); 208 kvm_arch_vcpu_uninit(vcpu);
209 free_page((unsigned long)vcpu->run); 209 free_page((unsigned long)vcpu->run);
210 } 210 }
211 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 211 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
212 212
213 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 213 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
214 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 214 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
215 { 215 {
216 return container_of(mn, struct kvm, mmu_notifier); 216 return container_of(mn, struct kvm, mmu_notifier);
217 } 217 }
218 218
219 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 219 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
220 struct mm_struct *mm, 220 struct mm_struct *mm,
221 unsigned long address) 221 unsigned long address)
222 { 222 {
223 struct kvm *kvm = mmu_notifier_to_kvm(mn); 223 struct kvm *kvm = mmu_notifier_to_kvm(mn);
224 int need_tlb_flush, idx; 224 int need_tlb_flush, idx;
225 225
226 /* 226 /*
227 * When ->invalidate_page runs, the linux pte has been zapped 227 * When ->invalidate_page runs, the linux pte has been zapped
228 * already but the page is still allocated until 228 * already but the page is still allocated until
229 * ->invalidate_page returns. So if we increase the sequence 229 * ->invalidate_page returns. So if we increase the sequence
230 * here the kvm page fault will notice if the spte can't be 230 * here the kvm page fault will notice if the spte can't be
231 * established because the page is going to be freed. If 231 * established because the page is going to be freed. If
232 * instead the kvm page fault establishes the spte before 232 * instead the kvm page fault establishes the spte before
233 * ->invalidate_page runs, kvm_unmap_hva will release it 233 * ->invalidate_page runs, kvm_unmap_hva will release it
234 * before returning. 234 * before returning.
235 * 235 *
236 * The sequence increase only needs to be seen at spin_unlock 236 * The sequence increase only needs to be seen at spin_unlock
237 * time, and not at spin_lock time. 237 * time, and not at spin_lock time.
238 * 238 *
239 * Increasing the sequence after the spin_unlock would be 239 * Increasing the sequence after the spin_unlock would be
240 * unsafe because the kvm page fault could then establish the 240 * unsafe because the kvm page fault could then establish the
241 * pte after kvm_unmap_hva returned, without noticing the page 241 * pte after kvm_unmap_hva returned, without noticing the page
242 * is going to be freed. 242 * is going to be freed.
243 */ 243 */
244 idx = srcu_read_lock(&kvm->srcu); 244 idx = srcu_read_lock(&kvm->srcu);
245 spin_lock(&kvm->mmu_lock); 245 spin_lock(&kvm->mmu_lock);
246 kvm->mmu_notifier_seq++; 246 kvm->mmu_notifier_seq++;
247 need_tlb_flush = kvm_unmap_hva(kvm, address); 247 need_tlb_flush = kvm_unmap_hva(kvm, address);
248 spin_unlock(&kvm->mmu_lock); 248 spin_unlock(&kvm->mmu_lock);
249 srcu_read_unlock(&kvm->srcu, idx); 249 srcu_read_unlock(&kvm->srcu, idx);
250 250
251 /* we have to flush the tlb before the pages can be freed */ 251 /* we have to flush the tlb before the pages can be freed */
252 if (need_tlb_flush) 252 if (need_tlb_flush)
253 kvm_flush_remote_tlbs(kvm); 253 kvm_flush_remote_tlbs(kvm);
254 254
255 } 255 }
256 256
257 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 257 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
258 struct mm_struct *mm, 258 struct mm_struct *mm,
259 unsigned long address, 259 unsigned long address,
260 pte_t pte) 260 pte_t pte)
261 { 261 {
262 struct kvm *kvm = mmu_notifier_to_kvm(mn); 262 struct kvm *kvm = mmu_notifier_to_kvm(mn);
263 int idx; 263 int idx;
264 264
265 idx = srcu_read_lock(&kvm->srcu); 265 idx = srcu_read_lock(&kvm->srcu);
266 spin_lock(&kvm->mmu_lock); 266 spin_lock(&kvm->mmu_lock);
267 kvm->mmu_notifier_seq++; 267 kvm->mmu_notifier_seq++;
268 kvm_set_spte_hva(kvm, address, pte); 268 kvm_set_spte_hva(kvm, address, pte);
269 spin_unlock(&kvm->mmu_lock); 269 spin_unlock(&kvm->mmu_lock);
270 srcu_read_unlock(&kvm->srcu, idx); 270 srcu_read_unlock(&kvm->srcu, idx);
271 } 271 }
272 272
273 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 273 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
274 struct mm_struct *mm, 274 struct mm_struct *mm,
275 unsigned long start, 275 unsigned long start,
276 unsigned long end) 276 unsigned long end)
277 { 277 {
278 struct kvm *kvm = mmu_notifier_to_kvm(mn); 278 struct kvm *kvm = mmu_notifier_to_kvm(mn);
279 int need_tlb_flush = 0, idx; 279 int need_tlb_flush = 0, idx;
280 280
281 idx = srcu_read_lock(&kvm->srcu); 281 idx = srcu_read_lock(&kvm->srcu);
282 spin_lock(&kvm->mmu_lock); 282 spin_lock(&kvm->mmu_lock);
283 /* 283 /*
284 * The count increase must become visible at unlock time as no 284 * The count increase must become visible at unlock time as no
285 * spte can be established without taking the mmu_lock and 285 * spte can be established without taking the mmu_lock and
286 * count is also read inside the mmu_lock critical section. 286 * count is also read inside the mmu_lock critical section.
287 */ 287 */
288 kvm->mmu_notifier_count++; 288 kvm->mmu_notifier_count++;
289 for (; start < end; start += PAGE_SIZE) 289 for (; start < end; start += PAGE_SIZE)
290 need_tlb_flush |= kvm_unmap_hva(kvm, start); 290 need_tlb_flush |= kvm_unmap_hva(kvm, start);
291 spin_unlock(&kvm->mmu_lock); 291 spin_unlock(&kvm->mmu_lock);
292 srcu_read_unlock(&kvm->srcu, idx); 292 srcu_read_unlock(&kvm->srcu, idx);
293 293
294 /* we have to flush the tlb before the pages can be freed */ 294 /* we have to flush the tlb before the pages can be freed */
295 if (need_tlb_flush) 295 if (need_tlb_flush)
296 kvm_flush_remote_tlbs(kvm); 296 kvm_flush_remote_tlbs(kvm);
297 } 297 }
298 298
299 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 299 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
300 struct mm_struct *mm, 300 struct mm_struct *mm,
301 unsigned long start, 301 unsigned long start,
302 unsigned long end) 302 unsigned long end)
303 { 303 {
304 struct kvm *kvm = mmu_notifier_to_kvm(mn); 304 struct kvm *kvm = mmu_notifier_to_kvm(mn);
305 305
306 spin_lock(&kvm->mmu_lock); 306 spin_lock(&kvm->mmu_lock);
307 /* 307 /*
308 * This sequence increase will notify the kvm page fault that 308 * This sequence increase will notify the kvm page fault that
309 * the page that is going to be mapped in the spte could have 309 * the page that is going to be mapped in the spte could have
310 * been freed. 310 * been freed.
311 */ 311 */
312 kvm->mmu_notifier_seq++; 312 kvm->mmu_notifier_seq++;
313 /* 313 /*
314 * The above sequence increase must be visible before the 314 * The above sequence increase must be visible before the
315 * below count decrease but both values are read by the kvm 315 * below count decrease but both values are read by the kvm
316 * page fault under mmu_lock spinlock so we don't need to add 316 * page fault under mmu_lock spinlock so we don't need to add
317 * an smp_wmb() here in between the two. 317 * an smp_wmb() here in between the two.
318 */ 318 */
319 kvm->mmu_notifier_count--; 319 kvm->mmu_notifier_count--;
320 spin_unlock(&kvm->mmu_lock); 320 spin_unlock(&kvm->mmu_lock);
321 321
322 BUG_ON(kvm->mmu_notifier_count < 0); 322 BUG_ON(kvm->mmu_notifier_count < 0);
323 } 323 }
324 324
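The seq/count pair maintained by the notifiers above is what the kvm page fault consults under mmu_lock before installing an spte. A hedged sketch of that check follows (names and shape simplified; not the exact in-tree helper):

/*
 * @seq must have been sampled from kvm->mmu_notifier_seq before the
 * pfn lookup; call this under kvm->mmu_lock just before installing
 * the spte and retry the fault if it returns nonzero.
 */
static int mmu_notifier_retry_sketch(struct kvm *kvm, unsigned long seq)
{
	if (unlikely(kvm->mmu_notifier_count))
		return 1;	/* an invalidate_range is in flight */
	if (kvm->mmu_notifier_seq != seq)
		return 1;	/* a page in the range may have been freed */
	return 0;
}
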
325 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 325 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
326 struct mm_struct *mm, 326 struct mm_struct *mm,
327 unsigned long address) 327 unsigned long address)
328 { 328 {
329 struct kvm *kvm = mmu_notifier_to_kvm(mn); 329 struct kvm *kvm = mmu_notifier_to_kvm(mn);
330 int young, idx; 330 int young, idx;
331 331
332 idx = srcu_read_lock(&kvm->srcu); 332 idx = srcu_read_lock(&kvm->srcu);
333 spin_lock(&kvm->mmu_lock); 333 spin_lock(&kvm->mmu_lock);
334 young = kvm_age_hva(kvm, address); 334 young = kvm_age_hva(kvm, address);
335 spin_unlock(&kvm->mmu_lock); 335 spin_unlock(&kvm->mmu_lock);
336 srcu_read_unlock(&kvm->srcu, idx); 336 srcu_read_unlock(&kvm->srcu, idx);
337 337
338 if (young) 338 if (young)
339 kvm_flush_remote_tlbs(kvm); 339 kvm_flush_remote_tlbs(kvm);
340 340
341 return young; 341 return young;
342 } 342 }
343 343
344 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 344 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
345 struct mm_struct *mm) 345 struct mm_struct *mm)
346 { 346 {
347 struct kvm *kvm = mmu_notifier_to_kvm(mn); 347 struct kvm *kvm = mmu_notifier_to_kvm(mn);
348 int idx; 348 int idx;
349 349
350 idx = srcu_read_lock(&kvm->srcu); 350 idx = srcu_read_lock(&kvm->srcu);
351 kvm_arch_flush_shadow(kvm); 351 kvm_arch_flush_shadow(kvm);
352 srcu_read_unlock(&kvm->srcu, idx); 352 srcu_read_unlock(&kvm->srcu, idx);
353 } 353 }
354 354
355 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 355 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
356 .invalidate_page = kvm_mmu_notifier_invalidate_page, 356 .invalidate_page = kvm_mmu_notifier_invalidate_page,
357 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 357 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
358 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 358 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
359 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 359 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
360 .change_pte = kvm_mmu_notifier_change_pte, 360 .change_pte = kvm_mmu_notifier_change_pte,
361 .release = kvm_mmu_notifier_release, 361 .release = kvm_mmu_notifier_release,
362 }; 362 };
363 363
364 static int kvm_init_mmu_notifier(struct kvm *kvm) 364 static int kvm_init_mmu_notifier(struct kvm *kvm)
365 { 365 {
366 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 366 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
367 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 367 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
368 } 368 }
369 369
370 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 370 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
371 371
372 static int kvm_init_mmu_notifier(struct kvm *kvm) 372 static int kvm_init_mmu_notifier(struct kvm *kvm)
373 { 373 {
374 return 0; 374 return 0;
375 } 375 }
376 376
377 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 377 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
378 378
379 static struct kvm *kvm_create_vm(void) 379 static struct kvm *kvm_create_vm(void)
380 { 380 {
381 int r = 0, i; 381 int r = 0, i;
382 struct kvm *kvm = kvm_arch_create_vm(); 382 struct kvm *kvm = kvm_arch_create_vm();
383 383
384 if (IS_ERR(kvm)) 384 if (IS_ERR(kvm))
385 goto out; 385 goto out;
386 386
387 r = hardware_enable_all(); 387 r = hardware_enable_all();
388 if (r) 388 if (r)
389 goto out_err_nodisable; 389 goto out_err_nodisable;
390 390
391 #ifdef CONFIG_HAVE_KVM_IRQCHIP 391 #ifdef CONFIG_HAVE_KVM_IRQCHIP
392 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 392 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
393 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 393 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
394 #endif 394 #endif
395 395
396 r = -ENOMEM; 396 r = -ENOMEM;
397 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 397 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
398 if (!kvm->memslots) 398 if (!kvm->memslots)
399 goto out_err; 399 goto out_err;
400 if (init_srcu_struct(&kvm->srcu)) 400 if (init_srcu_struct(&kvm->srcu))
401 goto out_err; 401 goto out_err;
402 for (i = 0; i < KVM_NR_BUSES; i++) { 402 for (i = 0; i < KVM_NR_BUSES; i++) {
403 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 403 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
404 GFP_KERNEL); 404 GFP_KERNEL);
405 if (!kvm->buses[i]) { 405 if (!kvm->buses[i]) {
406 cleanup_srcu_struct(&kvm->srcu); 406 cleanup_srcu_struct(&kvm->srcu);
407 goto out_err; 407 goto out_err;
408 } 408 }
409 } 409 }
410 410
411 r = kvm_init_mmu_notifier(kvm); 411 r = kvm_init_mmu_notifier(kvm);
412 if (r) { 412 if (r) {
413 cleanup_srcu_struct(&kvm->srcu); 413 cleanup_srcu_struct(&kvm->srcu);
414 goto out_err; 414 goto out_err;
415 } 415 }
416 416
417 kvm->mm = current->mm; 417 kvm->mm = current->mm;
418 atomic_inc(&kvm->mm->mm_count); 418 atomic_inc(&kvm->mm->mm_count);
419 spin_lock_init(&kvm->mmu_lock); 419 spin_lock_init(&kvm->mmu_lock);
420 raw_spin_lock_init(&kvm->requests_lock); 420 raw_spin_lock_init(&kvm->requests_lock);
421 kvm_eventfd_init(kvm); 421 kvm_eventfd_init(kvm);
422 mutex_init(&kvm->lock); 422 mutex_init(&kvm->lock);
423 mutex_init(&kvm->irq_lock); 423 mutex_init(&kvm->irq_lock);
424 mutex_init(&kvm->slots_lock); 424 mutex_init(&kvm->slots_lock);
425 atomic_set(&kvm->users_count, 1); 425 atomic_set(&kvm->users_count, 1);
426 spin_lock(&kvm_lock); 426 spin_lock(&kvm_lock);
427 list_add(&kvm->vm_list, &vm_list); 427 list_add(&kvm->vm_list, &vm_list);
428 spin_unlock(&kvm_lock); 428 spin_unlock(&kvm_lock);
429 out: 429 out:
430 return kvm; 430 return kvm;
431 431
432 out_err: 432 out_err:
433 hardware_disable_all(); 433 hardware_disable_all();
434 out_err_nodisable: 434 out_err_nodisable:
435 for (i = 0; i < KVM_NR_BUSES; i++) 435 for (i = 0; i < KVM_NR_BUSES; i++)
436 kfree(kvm->buses[i]); 436 kfree(kvm->buses[i]);
437 kfree(kvm->memslots); 437 kfree(kvm->memslots);
438 kfree(kvm); 438 kfree(kvm);
439 return ERR_PTR(r); 439 return ERR_PTR(r);
440 } 440 }
441 441
442 /* 442 /*
443 * Free any memory in @free but not in @dont. 443 * Free any memory in @free but not in @dont.
444 */ 444 */
445 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 445 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
446 struct kvm_memory_slot *dont) 446 struct kvm_memory_slot *dont)
447 { 447 {
448 int i; 448 int i;
449 449
450 if (!dont || free->rmap != dont->rmap) 450 if (!dont || free->rmap != dont->rmap)
451 vfree(free->rmap); 451 vfree(free->rmap);
452 452
453 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 453 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
454 vfree(free->dirty_bitmap); 454 vfree(free->dirty_bitmap);
455 455
456 456
457 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 457 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
458 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 458 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
459 vfree(free->lpage_info[i]); 459 vfree(free->lpage_info[i]);
460 free->lpage_info[i] = NULL; 460 free->lpage_info[i] = NULL;
461 } 461 }
462 } 462 }
463 463
464 free->npages = 0; 464 free->npages = 0;
465 free->dirty_bitmap = NULL; 465 free->dirty_bitmap = NULL;
466 free->rmap = NULL; 466 free->rmap = NULL;
467 } 467 }
468 468
469 void kvm_free_physmem(struct kvm *kvm) 469 void kvm_free_physmem(struct kvm *kvm)
470 { 470 {
471 int i; 471 int i;
472 struct kvm_memslots *slots = kvm->memslots; 472 struct kvm_memslots *slots = kvm->memslots;
473 473
474 for (i = 0; i < slots->nmemslots; ++i) 474 for (i = 0; i < slots->nmemslots; ++i)
475 kvm_free_physmem_slot(&slots->memslots[i], NULL); 475 kvm_free_physmem_slot(&slots->memslots[i], NULL);
476 476
477 kfree(kvm->memslots); 477 kfree(kvm->memslots);
478 } 478 }
479 479
480 static void kvm_destroy_vm(struct kvm *kvm) 480 static void kvm_destroy_vm(struct kvm *kvm)
481 { 481 {
482 int i; 482 int i;
483 struct mm_struct *mm = kvm->mm; 483 struct mm_struct *mm = kvm->mm;
484 484
485 kvm_arch_sync_events(kvm); 485 kvm_arch_sync_events(kvm);
486 spin_lock(&kvm_lock); 486 spin_lock(&kvm_lock);
487 list_del(&kvm->vm_list); 487 list_del(&kvm->vm_list);
488 spin_unlock(&kvm_lock); 488 spin_unlock(&kvm_lock);
489 kvm_free_irq_routing(kvm); 489 kvm_free_irq_routing(kvm);
490 for (i = 0; i < KVM_NR_BUSES; i++) 490 for (i = 0; i < KVM_NR_BUSES; i++)
491 kvm_io_bus_destroy(kvm->buses[i]); 491 kvm_io_bus_destroy(kvm->buses[i]);
492 kvm_coalesced_mmio_free(kvm); 492 kvm_coalesced_mmio_free(kvm);
493 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 493 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
494 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 494 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
495 #else 495 #else
496 kvm_arch_flush_shadow(kvm); 496 kvm_arch_flush_shadow(kvm);
497 #endif 497 #endif
498 kvm_arch_destroy_vm(kvm); 498 kvm_arch_destroy_vm(kvm);
499 hardware_disable_all(); 499 hardware_disable_all();
500 mmdrop(mm); 500 mmdrop(mm);
501 } 501 }
502 502
503 void kvm_get_kvm(struct kvm *kvm) 503 void kvm_get_kvm(struct kvm *kvm)
504 { 504 {
505 atomic_inc(&kvm->users_count); 505 atomic_inc(&kvm->users_count);
506 } 506 }
507 EXPORT_SYMBOL_GPL(kvm_get_kvm); 507 EXPORT_SYMBOL_GPL(kvm_get_kvm);
508 508
509 void kvm_put_kvm(struct kvm *kvm) 509 void kvm_put_kvm(struct kvm *kvm)
510 { 510 {
511 if (atomic_dec_and_test(&kvm->users_count)) 511 if (atomic_dec_and_test(&kvm->users_count))
512 kvm_destroy_vm(kvm); 512 kvm_destroy_vm(kvm);
513 } 513 }
514 EXPORT_SYMBOL_GPL(kvm_put_kvm); 514 EXPORT_SYMBOL_GPL(kvm_put_kvm);
515 515
516 516
517 static int kvm_vm_release(struct inode *inode, struct file *filp) 517 static int kvm_vm_release(struct inode *inode, struct file *filp)
518 { 518 {
519 struct kvm *kvm = filp->private_data; 519 struct kvm *kvm = filp->private_data;
520 520
521 kvm_irqfd_release(kvm); 521 kvm_irqfd_release(kvm);
522 522
523 kvm_put_kvm(kvm); 523 kvm_put_kvm(kvm);
524 return 0; 524 return 0;
525 } 525 }
526 526
527 /* 527 /*
528 * Allocate some memory and give it an address in the guest physical address 528 * Allocate some memory and give it an address in the guest physical address
529 * space. 529 * space.
530 * 530 *
531 * Discontiguous memory is allowed, mostly for framebuffers. 531 * Discontiguous memory is allowed, mostly for framebuffers.
532 * 532 *
533 * Must be called holding mmap_sem for write. 533 * Must be called holding mmap_sem for write.
534 */ 534 */
535 int __kvm_set_memory_region(struct kvm *kvm, 535 int __kvm_set_memory_region(struct kvm *kvm,
536 struct kvm_userspace_memory_region *mem, 536 struct kvm_userspace_memory_region *mem,
537 int user_alloc) 537 int user_alloc)
538 { 538 {
539 int r, flush_shadow = 0; 539 int r, flush_shadow = 0;
540 gfn_t base_gfn; 540 gfn_t base_gfn;
541 unsigned long npages; 541 unsigned long npages;
542 unsigned long i; 542 unsigned long i;
543 struct kvm_memory_slot *memslot; 543 struct kvm_memory_slot *memslot;
544 struct kvm_memory_slot old, new; 544 struct kvm_memory_slot old, new;
545 struct kvm_memslots *slots, *old_memslots; 545 struct kvm_memslots *slots, *old_memslots;
546 546
547 r = -EINVAL; 547 r = -EINVAL;
548 /* General sanity checks */ 548 /* General sanity checks */
549 if (mem->memory_size & (PAGE_SIZE - 1)) 549 if (mem->memory_size & (PAGE_SIZE - 1))
550 goto out; 550 goto out;
551 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 551 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
552 goto out; 552 goto out;
553 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 553 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
554 goto out; 554 goto out;
555 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 555 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
556 goto out; 556 goto out;
557 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 557 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
558 goto out; 558 goto out;
559 559
560 memslot = &kvm->memslots->memslots[mem->slot]; 560 memslot = &kvm->memslots->memslots[mem->slot];
561 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 561 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
562 npages = mem->memory_size >> PAGE_SHIFT; 562 npages = mem->memory_size >> PAGE_SHIFT;
563 563
564 r = -EINVAL; 564 r = -EINVAL;
565 if (npages > KVM_MEM_MAX_NR_PAGES) 565 if (npages > KVM_MEM_MAX_NR_PAGES)
566 goto out; 566 goto out;
567 567
568 if (!npages) 568 if (!npages)
569 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 569 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
570 570
571 new = old = *memslot; 571 new = old = *memslot;
572 572
573 new.id = mem->slot; 573 new.id = mem->slot;
574 new.base_gfn = base_gfn; 574 new.base_gfn = base_gfn;
575 new.npages = npages; 575 new.npages = npages;
576 new.flags = mem->flags; 576 new.flags = mem->flags;
577 577
578 /* Disallow changing a memory slot's size. */ 578 /* Disallow changing a memory slot's size. */
579 r = -EINVAL; 579 r = -EINVAL;
580 if (npages && old.npages && npages != old.npages) 580 if (npages && old.npages && npages != old.npages)
581 goto out_free; 581 goto out_free;
582 582
583 /* Check for overlaps */ 583 /* Check for overlaps */
584 r = -EEXIST; 584 r = -EEXIST;
585 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 585 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
586 struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; 586 struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
587 587
588 if (s == memslot || !s->npages) 588 if (s == memslot || !s->npages)
589 continue; 589 continue;
590 if (!((base_gfn + npages <= s->base_gfn) || 590 if (!((base_gfn + npages <= s->base_gfn) ||
591 (base_gfn >= s->base_gfn + s->npages))) 591 (base_gfn >= s->base_gfn + s->npages)))
592 goto out_free; 592 goto out_free;
593 } 593 }
594 594
595 /* Free page dirty bitmap if unneeded */ 595 /* Free page dirty bitmap if unneeded */
596 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 596 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
597 new.dirty_bitmap = NULL; 597 new.dirty_bitmap = NULL;
598 598
599 r = -ENOMEM; 599 r = -ENOMEM;
600 600
601 /* Allocate if a slot is being created */ 601 /* Allocate if a slot is being created */
602 #ifndef CONFIG_S390 602 #ifndef CONFIG_S390
603 if (npages && !new.rmap) { 603 if (npages && !new.rmap) {
604 new.rmap = vmalloc(npages * sizeof(*new.rmap)); 604 new.rmap = vmalloc(npages * sizeof(*new.rmap));
605 605
606 if (!new.rmap) 606 if (!new.rmap)
607 goto out_free; 607 goto out_free;
608 608
609 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 609 memset(new.rmap, 0, npages * sizeof(*new.rmap));
610 610
611 new.user_alloc = user_alloc; 611 new.user_alloc = user_alloc;
612 new.userspace_addr = mem->userspace_addr; 612 new.userspace_addr = mem->userspace_addr;
613 } 613 }
614 if (!npages) 614 if (!npages)
615 goto skip_lpage; 615 goto skip_lpage;
616 616
617 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 617 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
618 unsigned long ugfn; 618 unsigned long ugfn;
619 unsigned long j; 619 unsigned long j;
620 int lpages; 620 int lpages;
621 int level = i + 2; 621 int level = i + 2;
622 622
623 /* Avoid unused variable warning if no large pages */ 623 /* Avoid unused variable warning if no large pages */
624 (void)level; 624 (void)level;
625 625
626 if (new.lpage_info[i]) 626 if (new.lpage_info[i])
627 continue; 627 continue;
628 628
629 lpages = 1 + (base_gfn + npages - 1) / 629 lpages = 1 + (base_gfn + npages - 1) /
630 KVM_PAGES_PER_HPAGE(level); 630 KVM_PAGES_PER_HPAGE(level);
631 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); 631 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
632 632
633 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 633 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
634 634
635 if (!new.lpage_info[i]) 635 if (!new.lpage_info[i])
636 goto out_free; 636 goto out_free;
637 637
638 memset(new.lpage_info[i], 0, 638 memset(new.lpage_info[i], 0,
639 lpages * sizeof(*new.lpage_info[i])); 639 lpages * sizeof(*new.lpage_info[i]));
640 640
641 if (base_gfn % KVM_PAGES_PER_HPAGE(level)) 641 if (base_gfn % KVM_PAGES_PER_HPAGE(level))
642 new.lpage_info[i][0].write_count = 1; 642 new.lpage_info[i][0].write_count = 1;
643 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) 643 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
644 new.lpage_info[i][lpages - 1].write_count = 1; 644 new.lpage_info[i][lpages - 1].write_count = 1;
645 ugfn = new.userspace_addr >> PAGE_SHIFT; 645 ugfn = new.userspace_addr >> PAGE_SHIFT;
646 /* 646 /*
647 * If the gfn and userspace address are not aligned wrt each 647 * If the gfn and userspace address are not aligned wrt each
648 * other, or if explicitly asked to, disable large page 648 * other, or if explicitly asked to, disable large page
649 * support for this slot 649 * support for this slot
650 */ 650 */
651 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 651 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
652 !largepages_enabled) 652 !largepages_enabled)
653 for (j = 0; j < lpages; ++j) 653 for (j = 0; j < lpages; ++j)
654 new.lpage_info[i][j].write_count = 1; 654 new.lpage_info[i][j].write_count = 1;
655 } 655 }
656 656
657 skip_lpage: 657 skip_lpage:
658 658
659 /* Allocate page dirty bitmap if needed */ 659 /* Allocate page dirty bitmap if needed */
660 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 660 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
661 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); 661 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
662 662
663 new.dirty_bitmap = vmalloc(dirty_bytes); 663 new.dirty_bitmap = vmalloc(dirty_bytes);
664 if (!new.dirty_bitmap) 664 if (!new.dirty_bitmap)
665 goto out_free; 665 goto out_free;
666 memset(new.dirty_bitmap, 0, dirty_bytes); 666 memset(new.dirty_bitmap, 0, dirty_bytes);
667 /* destroy any largepage mappings for dirty tracking */ 667 /* destroy any largepage mappings for dirty tracking */
668 if (old.npages) 668 if (old.npages)
669 flush_shadow = 1; 669 flush_shadow = 1;
670 } 670 }
671 #else /* not defined CONFIG_S390 */ 671 #else /* not defined CONFIG_S390 */
672 new.user_alloc = user_alloc; 672 new.user_alloc = user_alloc;
673 if (user_alloc) 673 if (user_alloc)
674 new.userspace_addr = mem->userspace_addr; 674 new.userspace_addr = mem->userspace_addr;
675 #endif /* not defined CONFIG_S390 */ 675 #endif /* not defined CONFIG_S390 */
676 676
677 if (!npages) { 677 if (!npages) {
678 r = -ENOMEM; 678 r = -ENOMEM;
679 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 679 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
680 if (!slots) 680 if (!slots)
681 goto out_free; 681 goto out_free;
682 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 682 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
683 if (mem->slot >= slots->nmemslots) 683 if (mem->slot >= slots->nmemslots)
684 slots->nmemslots = mem->slot + 1; 684 slots->nmemslots = mem->slot + 1;
685 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 685 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
686 686
687 old_memslots = kvm->memslots; 687 old_memslots = kvm->memslots;
688 rcu_assign_pointer(kvm->memslots, slots); 688 rcu_assign_pointer(kvm->memslots, slots);
689 synchronize_srcu_expedited(&kvm->srcu); 689 synchronize_srcu_expedited(&kvm->srcu);
690 /* From this point no new shadow pages pointing to a deleted 690 /* From this point no new shadow pages pointing to a deleted
691 * memslot will be created. 691 * memslot will be created.
692 * 692 *
693 * validation of sp->gfn happens in: 693 * validation of sp->gfn happens in:
694 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 694 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
695 * - kvm_is_visible_gfn (mmu_check_roots) 695 * - kvm_is_visible_gfn (mmu_check_roots)
696 */ 696 */
697 kvm_arch_flush_shadow(kvm); 697 kvm_arch_flush_shadow(kvm);
698 kfree(old_memslots); 698 kfree(old_memslots);
699 } 699 }
700 700
701 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 701 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
702 if (r) 702 if (r)
703 goto out_free; 703 goto out_free;
704 704
705 #ifdef CONFIG_DMAR 705 #ifdef CONFIG_DMAR
706 /* map the pages in iommu page table */ 706 /* map the pages in iommu page table */
707 if (npages) { 707 if (npages) {
708 r = kvm_iommu_map_pages(kvm, &new); 708 r = kvm_iommu_map_pages(kvm, &new);
709 if (r) 709 if (r)
710 goto out_free; 710 goto out_free;
711 } 711 }
712 #endif 712 #endif
713 713
714 r = -ENOMEM; 714 r = -ENOMEM;
715 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 715 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
716 if (!slots) 716 if (!slots)
717 goto out_free; 717 goto out_free;
718 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 718 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
719 if (mem->slot >= slots->nmemslots) 719 if (mem->slot >= slots->nmemslots)
720 slots->nmemslots = mem->slot + 1; 720 slots->nmemslots = mem->slot + 1;
721 721
722 /* actual memory is freed via old in kvm_free_physmem_slot below */ 722 /* actual memory is freed via old in kvm_free_physmem_slot below */
723 if (!npages) { 723 if (!npages) {
724 new.rmap = NULL; 724 new.rmap = NULL;
725 new.dirty_bitmap = NULL; 725 new.dirty_bitmap = NULL;
726 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) 726 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
727 new.lpage_info[i] = NULL; 727 new.lpage_info[i] = NULL;
728 } 728 }
729 729
730 slots->memslots[mem->slot] = new; 730 slots->memslots[mem->slot] = new;
731 old_memslots = kvm->memslots; 731 old_memslots = kvm->memslots;
732 rcu_assign_pointer(kvm->memslots, slots); 732 rcu_assign_pointer(kvm->memslots, slots);
733 synchronize_srcu_expedited(&kvm->srcu); 733 synchronize_srcu_expedited(&kvm->srcu);
734 734
735 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 735 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
736 736
737 kvm_free_physmem_slot(&old, &new); 737 kvm_free_physmem_slot(&old, &new);
738 kfree(old_memslots); 738 kfree(old_memslots);
739 739
740 if (flush_shadow) 740 if (flush_shadow)
741 kvm_arch_flush_shadow(kvm); 741 kvm_arch_flush_shadow(kvm);
742 742
743 return 0; 743 return 0;
744 744
745 out_free: 745 out_free:
746 kvm_free_physmem_slot(&new, &old); 746 kvm_free_physmem_slot(&new, &old);
747 out: 747 out:
748 return r; 748 return r;
749 749
750 } 750 }
751 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 751 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
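For reference, the memslot update in __kvm_set_memory_region() above follows a read-copy-update publish pattern: build a new kvm_memslots copy under slots_lock, publish it with rcu_assign_pointer(), let in-flight readers drain with synchronize_srcu_expedited(&kvm->srcu), and only then free the old copy. Below is a minimal sketch of the same pattern using plain RCU, kept separate from the diff; struct my_config, active_cfg and update_config() are illustrative names, not KVM code.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>

struct my_config {
	int value;
};

static struct my_config *active_cfg;	/* readers: rcu_read_lock() + rcu_dereference() */
static DEFINE_MUTEX(cfg_lock);		/* serializes writers, like kvm->slots_lock */

static int update_config(int value)
{
	struct my_config *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->value = value;

	mutex_lock(&cfg_lock);
	old = active_cfg;			/* safe: writers are serialized */
	rcu_assign_pointer(active_cfg, new);	/* publish the new copy */
	mutex_unlock(&cfg_lock);

	synchronize_rcu();			/* wait for pre-existing readers */
	kfree(old);				/* no reader can still see the old copy */
	return 0;
}

A reader would bracket its access with rcu_read_lock()/rcu_read_unlock() and fetch the pointer via rcu_dereference(); kvm->srcu plays that role for the memslot array above.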
752 752
753 int kvm_set_memory_region(struct kvm *kvm, 753 int kvm_set_memory_region(struct kvm *kvm,
754 struct kvm_userspace_memory_region *mem, 754 struct kvm_userspace_memory_region *mem,
755 int user_alloc) 755 int user_alloc)
756 { 756 {
757 int r; 757 int r;
758 758
759 mutex_lock(&kvm->slots_lock); 759 mutex_lock(&kvm->slots_lock);
760 r = __kvm_set_memory_region(kvm, mem, user_alloc); 760 r = __kvm_set_memory_region(kvm, mem, user_alloc);
761 mutex_unlock(&kvm->slots_lock); 761 mutex_unlock(&kvm->slots_lock);
762 return r; 762 return r;
763 } 763 }
764 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 764 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
765 765
766 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 766 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
767 struct 767 struct
768 kvm_userspace_memory_region *mem, 768 kvm_userspace_memory_region *mem,
769 int user_alloc) 769 int user_alloc)
770 { 770 {
771 if (mem->slot >= KVM_MEMORY_SLOTS) 771 if (mem->slot >= KVM_MEMORY_SLOTS)
772 return -EINVAL; 772 return -EINVAL;
773 return kvm_set_memory_region(kvm, mem, user_alloc); 773 return kvm_set_memory_region(kvm, mem, user_alloc);
774 } 774 }
775 775
776 int kvm_get_dirty_log(struct kvm *kvm, 776 int kvm_get_dirty_log(struct kvm *kvm,
777 struct kvm_dirty_log *log, int *is_dirty) 777 struct kvm_dirty_log *log, int *is_dirty)
778 { 778 {
779 struct kvm_memory_slot *memslot; 779 struct kvm_memory_slot *memslot;
780 int r, i; 780 int r, i;
781 unsigned long n; 781 unsigned long n;
782 unsigned long any = 0; 782 unsigned long any = 0;
783 783
784 r = -EINVAL; 784 r = -EINVAL;
785 if (log->slot >= KVM_MEMORY_SLOTS) 785 if (log->slot >= KVM_MEMORY_SLOTS)
786 goto out; 786 goto out;
787 787
788 memslot = &kvm->memslots->memslots[log->slot]; 788 memslot = &kvm->memslots->memslots[log->slot];
789 r = -ENOENT; 789 r = -ENOENT;
790 if (!memslot->dirty_bitmap) 790 if (!memslot->dirty_bitmap)
791 goto out; 791 goto out;
792 792
793 n = kvm_dirty_bitmap_bytes(memslot); 793 n = kvm_dirty_bitmap_bytes(memslot);
794 794
795 for (i = 0; !any && i < n/sizeof(long); ++i) 795 for (i = 0; !any && i < n/sizeof(long); ++i)
796 any = memslot->dirty_bitmap[i]; 796 any = memslot->dirty_bitmap[i];
797 797
798 r = -EFAULT; 798 r = -EFAULT;
799 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 799 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
800 goto out; 800 goto out;
801 801
802 if (any) 802 if (any)
803 *is_dirty = 1; 803 *is_dirty = 1;
804 804
805 r = 0; 805 r = 0;
806 out: 806 out:
807 return r; 807 return r;
808 } 808 }
809 809
810 void kvm_disable_largepages(void) 810 void kvm_disable_largepages(void)
811 { 811 {
812 largepages_enabled = false; 812 largepages_enabled = false;
813 } 813 }
814 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 814 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
815 815
816 int is_error_page(struct page *page) 816 int is_error_page(struct page *page)
817 { 817 {
818 return page == bad_page || page == hwpoison_page; 818 return page == bad_page || page == hwpoison_page;
819 } 819 }
820 EXPORT_SYMBOL_GPL(is_error_page); 820 EXPORT_SYMBOL_GPL(is_error_page);
821 821
822 int is_error_pfn(pfn_t pfn) 822 int is_error_pfn(pfn_t pfn)
823 { 823 {
824 return pfn == bad_pfn || pfn == hwpoison_pfn; 824 return pfn == bad_pfn || pfn == hwpoison_pfn;
825 } 825 }
826 EXPORT_SYMBOL_GPL(is_error_pfn); 826 EXPORT_SYMBOL_GPL(is_error_pfn);
827 827
828 int is_hwpoison_pfn(pfn_t pfn) 828 int is_hwpoison_pfn(pfn_t pfn)
829 { 829 {
830 return pfn == hwpoison_pfn; 830 return pfn == hwpoison_pfn;
831 } 831 }
832 EXPORT_SYMBOL_GPL(is_hwpoison_pfn); 832 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
833 833
834 static inline unsigned long bad_hva(void) 834 static inline unsigned long bad_hva(void)
835 { 835 {
836 return PAGE_OFFSET; 836 return PAGE_OFFSET;
837 } 837 }
838 838
839 int kvm_is_error_hva(unsigned long addr) 839 int kvm_is_error_hva(unsigned long addr)
840 { 840 {
841 return addr == bad_hva(); 841 return addr == bad_hva();
842 } 842 }
843 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 843 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
844 844
845 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 845 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
846 { 846 {
847 int i; 847 int i;
848 struct kvm_memslots *slots = kvm_memslots(kvm); 848 struct kvm_memslots *slots = kvm_memslots(kvm);
849 849
850 for (i = 0; i < slots->nmemslots; ++i) { 850 for (i = 0; i < slots->nmemslots; ++i) {
851 struct kvm_memory_slot *memslot = &slots->memslots[i]; 851 struct kvm_memory_slot *memslot = &slots->memslots[i];
852 852
853 if (gfn >= memslot->base_gfn 853 if (gfn >= memslot->base_gfn
854 && gfn < memslot->base_gfn + memslot->npages) 854 && gfn < memslot->base_gfn + memslot->npages)
855 return memslot; 855 return memslot;
856 } 856 }
857 return NULL; 857 return NULL;
858 } 858 }
859 EXPORT_SYMBOL_GPL(gfn_to_memslot); 859 EXPORT_SYMBOL_GPL(gfn_to_memslot);
860 860
861 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 861 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
862 { 862 {
863 int i; 863 int i;
864 struct kvm_memslots *slots = kvm_memslots(kvm); 864 struct kvm_memslots *slots = kvm_memslots(kvm);
865 865
866 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 866 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
867 struct kvm_memory_slot *memslot = &slots->memslots[i]; 867 struct kvm_memory_slot *memslot = &slots->memslots[i];
868 868
869 if (memslot->flags & KVM_MEMSLOT_INVALID) 869 if (memslot->flags & KVM_MEMSLOT_INVALID)
870 continue; 870 continue;
871 871
872 if (gfn >= memslot->base_gfn 872 if (gfn >= memslot->base_gfn
873 && gfn < memslot->base_gfn + memslot->npages) 873 && gfn < memslot->base_gfn + memslot->npages)
874 return 1; 874 return 1;
875 } 875 }
876 return 0; 876 return 0;
877 } 877 }
878 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 878 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
879 879
880 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 880 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
881 { 881 {
882 struct vm_area_struct *vma; 882 struct vm_area_struct *vma;
883 unsigned long addr, size; 883 unsigned long addr, size;
884 884
885 size = PAGE_SIZE; 885 size = PAGE_SIZE;
886 886
887 addr = gfn_to_hva(kvm, gfn); 887 addr = gfn_to_hva(kvm, gfn);
888 if (kvm_is_error_hva(addr)) 888 if (kvm_is_error_hva(addr))
889 return PAGE_SIZE; 889 return PAGE_SIZE;
890 890
891 down_read(&current->mm->mmap_sem); 891 down_read(&current->mm->mmap_sem);
892 vma = find_vma(current->mm, addr); 892 vma = find_vma(current->mm, addr);
893 if (!vma) 893 if (!vma)
894 goto out; 894 goto out;
895 895
896 size = vma_kernel_pagesize(vma); 896 size = vma_kernel_pagesize(vma);
897 897
898 out: 898 out:
899 up_read(&current->mm->mmap_sem); 899 up_read(&current->mm->mmap_sem);
900 900
901 return size; 901 return size;
902 } 902 }
903 903
904 int memslot_id(struct kvm *kvm, gfn_t gfn) 904 int memslot_id(struct kvm *kvm, gfn_t gfn)
905 { 905 {
906 int i; 906 int i;
907 struct kvm_memslots *slots = kvm_memslots(kvm); 907 struct kvm_memslots *slots = kvm_memslots(kvm);
908 struct kvm_memory_slot *memslot = NULL; 908 struct kvm_memory_slot *memslot = NULL;
909 909
910 for (i = 0; i < slots->nmemslots; ++i) { 910 for (i = 0; i < slots->nmemslots; ++i) {
911 memslot = &slots->memslots[i]; 911 memslot = &slots->memslots[i];
912 912
913 if (gfn >= memslot->base_gfn 913 if (gfn >= memslot->base_gfn
914 && gfn < memslot->base_gfn + memslot->npages) 914 && gfn < memslot->base_gfn + memslot->npages)
915 break; 915 break;
916 } 916 }
917 917
918 return memslot - slots->memslots; 918 return memslot - slots->memslots;
919 } 919 }
920 920
921 static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 921 static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
922 { 922 {
923 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; 923 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
924 } 924 }
925 925
926 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 926 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
927 { 927 {
928 struct kvm_memory_slot *slot; 928 struct kvm_memory_slot *slot;
929 929
930 slot = gfn_to_memslot(kvm, gfn); 930 slot = gfn_to_memslot(kvm, gfn);
931 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 931 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
932 return bad_hva(); 932 return bad_hva();
933 return gfn_to_hva_memslot(slot, gfn); 933 return gfn_to_hva_memslot(slot, gfn);
934 } 934 }
935 EXPORT_SYMBOL_GPL(gfn_to_hva); 935 EXPORT_SYMBOL_GPL(gfn_to_hva);
936 936
937 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) 937 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
938 { 938 {
939 struct page *page[1]; 939 struct page *page[1];
940 int npages; 940 int npages;
941 pfn_t pfn; 941 pfn_t pfn;
942 942
943 might_sleep(); 943 might_sleep();
944 944
945 npages = get_user_pages_fast(addr, 1, 1, page); 945 npages = get_user_pages_fast(addr, 1, 1, page);
946 946
947 if (unlikely(npages != 1)) { 947 if (unlikely(npages != 1)) {
948 struct vm_area_struct *vma; 948 struct vm_area_struct *vma;
949 949
950 down_read(&current->mm->mmap_sem);
950 if (is_hwpoison_address(addr)) { 951 if (is_hwpoison_address(addr)) {
952 up_read(&current->mm->mmap_sem);
951 get_page(hwpoison_page); 953 get_page(hwpoison_page);
952 return page_to_pfn(hwpoison_page); 954 return page_to_pfn(hwpoison_page);
953 } 955 }
954 956
955 down_read(&current->mm->mmap_sem);
956 vma = find_vma(current->mm, addr); 957 vma = find_vma(current->mm, addr);
957 958
958 if (vma == NULL || addr < vma->vm_start || 959 if (vma == NULL || addr < vma->vm_start ||
959 !(vma->vm_flags & VM_PFNMAP)) { 960 !(vma->vm_flags & VM_PFNMAP)) {
960 up_read(&current->mm->mmap_sem); 961 up_read(&current->mm->mmap_sem);
961 get_page(bad_page); 962 get_page(bad_page);
962 return page_to_pfn(bad_page); 963 return page_to_pfn(bad_page);
963 } 964 }
964 965
965 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 966 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
966 up_read(&current->mm->mmap_sem); 967 up_read(&current->mm->mmap_sem);
967 BUG_ON(!kvm_is_mmio_pfn(pfn)); 968 BUG_ON(!kvm_is_mmio_pfn(pfn));
968 } else 969 } else
969 pfn = page_to_pfn(page[0]); 970 pfn = page_to_pfn(page[0]);
970 971
971 return pfn; 972 return pfn;
972 } 973 }
973 974
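The hunk above is the substance of this commit: is_hwpoison_address() walks the caller's page tables, so hva_to_pfn() now takes current->mm->mmap_sem for read before calling it and drops it on the hwpoison path, while keeping it held for the find_vma() fallback that follows. A minimal sketch of that locking rule in isolation, assuming is_hwpoison_address() is visible through <linux/mm.h> as it was at the time; check_hwpoison_locked() is a hypothetical helper for illustration, not a kernel API.

#include <linux/mm.h>
#include <linux/sched.h>

static int check_hwpoison_locked(unsigned long addr)
{
	int poisoned;

	down_read(&current->mm->mmap_sem);	/* the page-table walk below requires this */
	poisoned = is_hwpoison_address(addr);
	up_read(&current->mm->mmap_sem);

	return poisoned;
}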
974 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 975 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
975 { 976 {
976 unsigned long addr; 977 unsigned long addr;
977 978
978 addr = gfn_to_hva(kvm, gfn); 979 addr = gfn_to_hva(kvm, gfn);
979 if (kvm_is_error_hva(addr)) { 980 if (kvm_is_error_hva(addr)) {
980 get_page(bad_page); 981 get_page(bad_page);
981 return page_to_pfn(bad_page); 982 return page_to_pfn(bad_page);
982 } 983 }
983 984
984 return hva_to_pfn(kvm, addr); 985 return hva_to_pfn(kvm, addr);
985 } 986 }
986 EXPORT_SYMBOL_GPL(gfn_to_pfn); 987 EXPORT_SYMBOL_GPL(gfn_to_pfn);
987 988
988 pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 989 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
989 struct kvm_memory_slot *slot, gfn_t gfn) 990 struct kvm_memory_slot *slot, gfn_t gfn)
990 { 991 {
991 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 992 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
992 return hva_to_pfn(kvm, addr); 993 return hva_to_pfn(kvm, addr);
993 } 994 }
994 995
995 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 996 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
996 { 997 {
997 pfn_t pfn; 998 pfn_t pfn;
998 999
999 pfn = gfn_to_pfn(kvm, gfn); 1000 pfn = gfn_to_pfn(kvm, gfn);
1000 if (!kvm_is_mmio_pfn(pfn)) 1001 if (!kvm_is_mmio_pfn(pfn))
1001 return pfn_to_page(pfn); 1002 return pfn_to_page(pfn);
1002 1003
1003 WARN_ON(kvm_is_mmio_pfn(pfn)); 1004 WARN_ON(kvm_is_mmio_pfn(pfn));
1004 1005
1005 get_page(bad_page); 1006 get_page(bad_page);
1006 return bad_page; 1007 return bad_page;
1007 } 1008 }
1008 1009
1009 EXPORT_SYMBOL_GPL(gfn_to_page); 1010 EXPORT_SYMBOL_GPL(gfn_to_page);
1010 1011
1011 void kvm_release_page_clean(struct page *page) 1012 void kvm_release_page_clean(struct page *page)
1012 { 1013 {
1013 kvm_release_pfn_clean(page_to_pfn(page)); 1014 kvm_release_pfn_clean(page_to_pfn(page));
1014 } 1015 }
1015 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1016 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1016 1017
1017 void kvm_release_pfn_clean(pfn_t pfn) 1018 void kvm_release_pfn_clean(pfn_t pfn)
1018 { 1019 {
1019 if (!kvm_is_mmio_pfn(pfn)) 1020 if (!kvm_is_mmio_pfn(pfn))
1020 put_page(pfn_to_page(pfn)); 1021 put_page(pfn_to_page(pfn));
1021 } 1022 }
1022 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1023 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1023 1024
1024 void kvm_release_page_dirty(struct page *page) 1025 void kvm_release_page_dirty(struct page *page)
1025 { 1026 {
1026 kvm_release_pfn_dirty(page_to_pfn(page)); 1027 kvm_release_pfn_dirty(page_to_pfn(page));
1027 } 1028 }
1028 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1029 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1029 1030
1030 void kvm_release_pfn_dirty(pfn_t pfn) 1031 void kvm_release_pfn_dirty(pfn_t pfn)
1031 { 1032 {
1032 kvm_set_pfn_dirty(pfn); 1033 kvm_set_pfn_dirty(pfn);
1033 kvm_release_pfn_clean(pfn); 1034 kvm_release_pfn_clean(pfn);
1034 } 1035 }
1035 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1036 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1036 1037
1037 void kvm_set_page_dirty(struct page *page) 1038 void kvm_set_page_dirty(struct page *page)
1038 { 1039 {
1039 kvm_set_pfn_dirty(page_to_pfn(page)); 1040 kvm_set_pfn_dirty(page_to_pfn(page));
1040 } 1041 }
1041 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1042 EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1042 1043
1043 void kvm_set_pfn_dirty(pfn_t pfn) 1044 void kvm_set_pfn_dirty(pfn_t pfn)
1044 { 1045 {
1045 if (!kvm_is_mmio_pfn(pfn)) { 1046 if (!kvm_is_mmio_pfn(pfn)) {
1046 struct page *page = pfn_to_page(pfn); 1047 struct page *page = pfn_to_page(pfn);
1047 if (!PageReserved(page)) 1048 if (!PageReserved(page))
1048 SetPageDirty(page); 1049 SetPageDirty(page);
1049 } 1050 }
1050 } 1051 }
1051 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1052 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1052 1053
1053 void kvm_set_pfn_accessed(pfn_t pfn) 1054 void kvm_set_pfn_accessed(pfn_t pfn)
1054 { 1055 {
1055 if (!kvm_is_mmio_pfn(pfn)) 1056 if (!kvm_is_mmio_pfn(pfn))
1056 mark_page_accessed(pfn_to_page(pfn)); 1057 mark_page_accessed(pfn_to_page(pfn));
1057 } 1058 }
1058 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1059 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1059 1060
1060 void kvm_get_pfn(pfn_t pfn) 1061 void kvm_get_pfn(pfn_t pfn)
1061 { 1062 {
1062 if (!kvm_is_mmio_pfn(pfn)) 1063 if (!kvm_is_mmio_pfn(pfn))
1063 get_page(pfn_to_page(pfn)); 1064 get_page(pfn_to_page(pfn));
1064 } 1065 }
1065 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1066 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1066 1067
1067 static int next_segment(unsigned long len, int offset) 1068 static int next_segment(unsigned long len, int offset)
1068 { 1069 {
1069 if (len > PAGE_SIZE - offset) 1070 if (len > PAGE_SIZE - offset)
1070 return PAGE_SIZE - offset; 1071 return PAGE_SIZE - offset;
1071 else 1072 else
1072 return len; 1073 return len;
1073 } 1074 }
1074 1075
1075 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1076 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1076 int len) 1077 int len)
1077 { 1078 {
1078 int r; 1079 int r;
1079 unsigned long addr; 1080 unsigned long addr;
1080 1081
1081 addr = gfn_to_hva(kvm, gfn); 1082 addr = gfn_to_hva(kvm, gfn);
1082 if (kvm_is_error_hva(addr)) 1083 if (kvm_is_error_hva(addr))
1083 return -EFAULT; 1084 return -EFAULT;
1084 r = copy_from_user(data, (void __user *)addr + offset, len); 1085 r = copy_from_user(data, (void __user *)addr + offset, len);
1085 if (r) 1086 if (r)
1086 return -EFAULT; 1087 return -EFAULT;
1087 return 0; 1088 return 0;
1088 } 1089 }
1089 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1090 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1090 1091
1091 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1092 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1092 { 1093 {
1093 gfn_t gfn = gpa >> PAGE_SHIFT; 1094 gfn_t gfn = gpa >> PAGE_SHIFT;
1094 int seg; 1095 int seg;
1095 int offset = offset_in_page(gpa); 1096 int offset = offset_in_page(gpa);
1096 int ret; 1097 int ret;
1097 1098
1098 while ((seg = next_segment(len, offset)) != 0) { 1099 while ((seg = next_segment(len, offset)) != 0) {
1099 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1100 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1100 if (ret < 0) 1101 if (ret < 0)
1101 return ret; 1102 return ret;
1102 offset = 0; 1103 offset = 0;
1103 len -= seg; 1104 len -= seg;
1104 data += seg; 1105 data += seg;
1105 ++gfn; 1106 ++gfn;
1106 } 1107 }
1107 return 0; 1108 return 0;
1108 } 1109 }
1109 EXPORT_SYMBOL_GPL(kvm_read_guest); 1110 EXPORT_SYMBOL_GPL(kvm_read_guest);
1110 1111
1111 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1112 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1112 unsigned long len) 1113 unsigned long len)
1113 { 1114 {
1114 int r; 1115 int r;
1115 unsigned long addr; 1116 unsigned long addr;
1116 gfn_t gfn = gpa >> PAGE_SHIFT; 1117 gfn_t gfn = gpa >> PAGE_SHIFT;
1117 int offset = offset_in_page(gpa); 1118 int offset = offset_in_page(gpa);
1118 1119
1119 addr = gfn_to_hva(kvm, gfn); 1120 addr = gfn_to_hva(kvm, gfn);
1120 if (kvm_is_error_hva(addr)) 1121 if (kvm_is_error_hva(addr))
1121 return -EFAULT; 1122 return -EFAULT;
1122 pagefault_disable(); 1123 pagefault_disable();
1123 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1124 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1124 pagefault_enable(); 1125 pagefault_enable();
1125 if (r) 1126 if (r)
1126 return -EFAULT; 1127 return -EFAULT;
1127 return 0; 1128 return 0;
1128 } 1129 }
1129 EXPORT_SYMBOL(kvm_read_guest_atomic); 1130 EXPORT_SYMBOL(kvm_read_guest_atomic);
1130 1131
1131 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1132 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1132 int offset, int len) 1133 int offset, int len)
1133 { 1134 {
1134 int r; 1135 int r;
1135 unsigned long addr; 1136 unsigned long addr;
1136 1137
1137 addr = gfn_to_hva(kvm, gfn); 1138 addr = gfn_to_hva(kvm, gfn);
1138 if (kvm_is_error_hva(addr)) 1139 if (kvm_is_error_hva(addr))
1139 return -EFAULT; 1140 return -EFAULT;
1140 r = copy_to_user((void __user *)addr + offset, data, len); 1141 r = copy_to_user((void __user *)addr + offset, data, len);
1141 if (r) 1142 if (r)
1142 return -EFAULT; 1143 return -EFAULT;
1143 mark_page_dirty(kvm, gfn); 1144 mark_page_dirty(kvm, gfn);
1144 return 0; 1145 return 0;
1145 } 1146 }
1146 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1147 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1147 1148
1148 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1149 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1149 unsigned long len) 1150 unsigned long len)
1150 { 1151 {
1151 gfn_t gfn = gpa >> PAGE_SHIFT; 1152 gfn_t gfn = gpa >> PAGE_SHIFT;
1152 int seg; 1153 int seg;
1153 int offset = offset_in_page(gpa); 1154 int offset = offset_in_page(gpa);
1154 int ret; 1155 int ret;
1155 1156
1156 while ((seg = next_segment(len, offset)) != 0) { 1157 while ((seg = next_segment(len, offset)) != 0) {
1157 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1158 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1158 if (ret < 0) 1159 if (ret < 0)
1159 return ret; 1160 return ret;
1160 offset = 0; 1161 offset = 0;
1161 len -= seg; 1162 len -= seg;
1162 data += seg; 1163 data += seg;
1163 ++gfn; 1164 ++gfn;
1164 } 1165 }
1165 return 0; 1166 return 0;
1166 } 1167 }
1167 1168
1168 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1169 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1169 { 1170 {
1170 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1171 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
1171 } 1172 }
1172 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1173 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1173 1174
1174 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1175 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1175 { 1176 {
1176 gfn_t gfn = gpa >> PAGE_SHIFT; 1177 gfn_t gfn = gpa >> PAGE_SHIFT;
1177 int seg; 1178 int seg;
1178 int offset = offset_in_page(gpa); 1179 int offset = offset_in_page(gpa);
1179 int ret; 1180 int ret;
1180 1181
1181 while ((seg = next_segment(len, offset)) != 0) { 1182 while ((seg = next_segment(len, offset)) != 0) {
1182 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1183 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1183 if (ret < 0) 1184 if (ret < 0)
1184 return ret; 1185 return ret;
1185 offset = 0; 1186 offset = 0;
1186 len -= seg; 1187 len -= seg;
1187 ++gfn; 1188 ++gfn;
1188 } 1189 }
1189 return 0; 1190 return 0;
1190 } 1191 }
1191 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1192 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1192 1193
1193 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1194 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1194 { 1195 {
1195 struct kvm_memory_slot *memslot; 1196 struct kvm_memory_slot *memslot;
1196 1197
1197 memslot = gfn_to_memslot(kvm, gfn); 1198 memslot = gfn_to_memslot(kvm, gfn);
1198 if (memslot && memslot->dirty_bitmap) { 1199 if (memslot && memslot->dirty_bitmap) {
1199 unsigned long rel_gfn = gfn - memslot->base_gfn; 1200 unsigned long rel_gfn = gfn - memslot->base_gfn;
1200 1201
1201 generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); 1202 generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
1202 } 1203 }
1203 } 1204 }
1204 1205
1205 /* 1206 /*
1206 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1207 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1207 */ 1208 */
1208 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1209 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1209 { 1210 {
1210 DEFINE_WAIT(wait); 1211 DEFINE_WAIT(wait);
1211 1212
1212 for (;;) { 1213 for (;;) {
1213 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1214 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1214 1215
1215 if (kvm_arch_vcpu_runnable(vcpu)) { 1216 if (kvm_arch_vcpu_runnable(vcpu)) {
1216 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1217 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1217 break; 1218 break;
1218 } 1219 }
1219 if (kvm_cpu_has_pending_timer(vcpu)) 1220 if (kvm_cpu_has_pending_timer(vcpu))
1220 break; 1221 break;
1221 if (signal_pending(current)) 1222 if (signal_pending(current))
1222 break; 1223 break;
1223 1224
1224 schedule(); 1225 schedule();
1225 } 1226 }
1226 1227
1227 finish_wait(&vcpu->wq, &wait); 1228 finish_wait(&vcpu->wq, &wait);
1228 } 1229 }
1229 1230
1230 void kvm_resched(struct kvm_vcpu *vcpu) 1231 void kvm_resched(struct kvm_vcpu *vcpu)
1231 { 1232 {
1232 if (!need_resched()) 1233 if (!need_resched())
1233 return; 1234 return;
1234 cond_resched(); 1235 cond_resched();
1235 } 1236 }
1236 EXPORT_SYMBOL_GPL(kvm_resched); 1237 EXPORT_SYMBOL_GPL(kvm_resched);
1237 1238
1238 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) 1239 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
1239 { 1240 {
1240 ktime_t expires; 1241 ktime_t expires;
1241 DEFINE_WAIT(wait); 1242 DEFINE_WAIT(wait);
1242 1243
1243 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1244 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1244 1245
1245 /* Sleep for 100 us, and hope lock-holder got scheduled */ 1246 /* Sleep for 100 us, and hope lock-holder got scheduled */
1246 expires = ktime_add_ns(ktime_get(), 100000UL); 1247 expires = ktime_add_ns(ktime_get(), 100000UL);
1247 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); 1248 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1248 1249
1249 finish_wait(&vcpu->wq, &wait); 1250 finish_wait(&vcpu->wq, &wait);
1250 } 1251 }
1251 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1252 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1252 1253
1253 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1254 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1254 { 1255 {
1255 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1256 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1256 struct page *page; 1257 struct page *page;
1257 1258
1258 if (vmf->pgoff == 0) 1259 if (vmf->pgoff == 0)
1259 page = virt_to_page(vcpu->run); 1260 page = virt_to_page(vcpu->run);
1260 #ifdef CONFIG_X86 1261 #ifdef CONFIG_X86
1261 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1262 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1262 page = virt_to_page(vcpu->arch.pio_data); 1263 page = virt_to_page(vcpu->arch.pio_data);
1263 #endif 1264 #endif
1264 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1265 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1265 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1266 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1266 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1267 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1267 #endif 1268 #endif
1268 else 1269 else
1269 return VM_FAULT_SIGBUS; 1270 return VM_FAULT_SIGBUS;
1270 get_page(page); 1271 get_page(page);
1271 vmf->page = page; 1272 vmf->page = page;
1272 return 0; 1273 return 0;
1273 } 1274 }
1274 1275
1275 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1276 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1276 .fault = kvm_vcpu_fault, 1277 .fault = kvm_vcpu_fault,
1277 }; 1278 };
1278 1279
1279 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1280 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1280 { 1281 {
1281 vma->vm_ops = &kvm_vcpu_vm_ops; 1282 vma->vm_ops = &kvm_vcpu_vm_ops;
1282 return 0; 1283 return 0;
1283 } 1284 }
1284 1285
1285 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1286 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1286 { 1287 {
1287 struct kvm_vcpu *vcpu = filp->private_data; 1288 struct kvm_vcpu *vcpu = filp->private_data;
1288 1289
1289 kvm_put_kvm(vcpu->kvm); 1290 kvm_put_kvm(vcpu->kvm);
1290 return 0; 1291 return 0;
1291 } 1292 }
1292 1293
1293 static struct file_operations kvm_vcpu_fops = { 1294 static struct file_operations kvm_vcpu_fops = {
1294 .release = kvm_vcpu_release, 1295 .release = kvm_vcpu_release,
1295 .unlocked_ioctl = kvm_vcpu_ioctl, 1296 .unlocked_ioctl = kvm_vcpu_ioctl,
1296 .compat_ioctl = kvm_vcpu_ioctl, 1297 .compat_ioctl = kvm_vcpu_ioctl,
1297 .mmap = kvm_vcpu_mmap, 1298 .mmap = kvm_vcpu_mmap,
1298 }; 1299 };
1299 1300
1300 /* 1301 /*
1301 * Allocates an inode for the vcpu. 1302 * Allocates an inode for the vcpu.
1302 */ 1303 */
1303 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1304 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1304 { 1305 {
1305 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1306 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1306 } 1307 }
1307 1308
1308 /* 1309 /*
1309 * Creates some virtual cpus. Good luck creating more than one. 1310 * Creates some virtual cpus. Good luck creating more than one.
1310 */ 1311 */
1311 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1312 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1312 { 1313 {
1313 int r; 1314 int r;
1314 struct kvm_vcpu *vcpu, *v; 1315 struct kvm_vcpu *vcpu, *v;
1315 1316
1316 vcpu = kvm_arch_vcpu_create(kvm, id); 1317 vcpu = kvm_arch_vcpu_create(kvm, id);
1317 if (IS_ERR(vcpu)) 1318 if (IS_ERR(vcpu))
1318 return PTR_ERR(vcpu); 1319 return PTR_ERR(vcpu);
1319 1320
1320 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1321 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1321 1322
1322 r = kvm_arch_vcpu_setup(vcpu); 1323 r = kvm_arch_vcpu_setup(vcpu);
1323 if (r) 1324 if (r)
1324 return r; 1325 return r;
1325 1326
1326 mutex_lock(&kvm->lock); 1327 mutex_lock(&kvm->lock);
1327 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1328 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1328 r = -EINVAL; 1329 r = -EINVAL;
1329 goto vcpu_destroy; 1330 goto vcpu_destroy;
1330 } 1331 }
1331 1332
1332 kvm_for_each_vcpu(r, v, kvm) 1333 kvm_for_each_vcpu(r, v, kvm)
1333 if (v->vcpu_id == id) { 1334 if (v->vcpu_id == id) {
1334 r = -EEXIST; 1335 r = -EEXIST;
1335 goto vcpu_destroy; 1336 goto vcpu_destroy;
1336 } 1337 }
1337 1338
1338 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1339 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1339 1340
1340 /* Now it's all set up, let userspace reach it */ 1341 /* Now it's all set up, let userspace reach it */
1341 kvm_get_kvm(kvm); 1342 kvm_get_kvm(kvm);
1342 r = create_vcpu_fd(vcpu); 1343 r = create_vcpu_fd(vcpu);
1343 if (r < 0) { 1344 if (r < 0) {
1344 kvm_put_kvm(kvm); 1345 kvm_put_kvm(kvm);
1345 goto vcpu_destroy; 1346 goto vcpu_destroy;
1346 } 1347 }
1347 1348
1348 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1349 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1349 smp_wmb(); 1350 smp_wmb();
1350 atomic_inc(&kvm->online_vcpus); 1351 atomic_inc(&kvm->online_vcpus);
1351 1352
1352 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1353 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1353 if (kvm->bsp_vcpu_id == id) 1354 if (kvm->bsp_vcpu_id == id)
1354 kvm->bsp_vcpu = vcpu; 1355 kvm->bsp_vcpu = vcpu;
1355 #endif 1356 #endif
1356 mutex_unlock(&kvm->lock); 1357 mutex_unlock(&kvm->lock);
1357 return r; 1358 return r;
1358 1359
1359 vcpu_destroy: 1360 vcpu_destroy:
1360 mutex_unlock(&kvm->lock); 1361 mutex_unlock(&kvm->lock);
1361 kvm_arch_vcpu_destroy(vcpu); 1362 kvm_arch_vcpu_destroy(vcpu);
1362 return r; 1363 return r;
1363 } 1364 }
1364 1365
1365 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1366 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1366 { 1367 {
1367 if (sigset) { 1368 if (sigset) {
1368 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1369 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1369 vcpu->sigset_active = 1; 1370 vcpu->sigset_active = 1;
1370 vcpu->sigset = *sigset; 1371 vcpu->sigset = *sigset;
1371 } else 1372 } else
1372 vcpu->sigset_active = 0; 1373 vcpu->sigset_active = 0;
1373 return 0; 1374 return 0;
1374 } 1375 }
1375 1376
1376 static long kvm_vcpu_ioctl(struct file *filp, 1377 static long kvm_vcpu_ioctl(struct file *filp,
1377 unsigned int ioctl, unsigned long arg) 1378 unsigned int ioctl, unsigned long arg)
1378 { 1379 {
1379 struct kvm_vcpu *vcpu = filp->private_data; 1380 struct kvm_vcpu *vcpu = filp->private_data;
1380 void __user *argp = (void __user *)arg; 1381 void __user *argp = (void __user *)arg;
1381 int r; 1382 int r;
1382 struct kvm_fpu *fpu = NULL; 1383 struct kvm_fpu *fpu = NULL;
1383 struct kvm_sregs *kvm_sregs = NULL; 1384 struct kvm_sregs *kvm_sregs = NULL;
1384 1385
1385 if (vcpu->kvm->mm != current->mm) 1386 if (vcpu->kvm->mm != current->mm)
1386 return -EIO; 1387 return -EIO;
1387 1388
1388 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1389 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1389 /* 1390 /*
1390 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1391 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1391 * so vcpu_load() would break it. 1392 * so vcpu_load() would break it.
1392 */ 1393 */
1393 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1394 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1394 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1395 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1395 #endif 1396 #endif
1396 1397
1397 1398
1398 vcpu_load(vcpu); 1399 vcpu_load(vcpu);
1399 switch (ioctl) { 1400 switch (ioctl) {
1400 case KVM_RUN: 1401 case KVM_RUN:
1401 r = -EINVAL; 1402 r = -EINVAL;
1402 if (arg) 1403 if (arg)
1403 goto out; 1404 goto out;
1404 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1405 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1405 break; 1406 break;
1406 case KVM_GET_REGS: { 1407 case KVM_GET_REGS: {
1407 struct kvm_regs *kvm_regs; 1408 struct kvm_regs *kvm_regs;
1408 1409
1409 r = -ENOMEM; 1410 r = -ENOMEM;
1410 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1411 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1411 if (!kvm_regs) 1412 if (!kvm_regs)
1412 goto out; 1413 goto out;
1413 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1414 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1414 if (r) 1415 if (r)
1415 goto out_free1; 1416 goto out_free1;
1416 r = -EFAULT; 1417 r = -EFAULT;
1417 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1418 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1418 goto out_free1; 1419 goto out_free1;
1419 r = 0; 1420 r = 0;
1420 out_free1: 1421 out_free1:
1421 kfree(kvm_regs); 1422 kfree(kvm_regs);
1422 break; 1423 break;
1423 } 1424 }
1424 case KVM_SET_REGS: { 1425 case KVM_SET_REGS: {
1425 struct kvm_regs *kvm_regs; 1426 struct kvm_regs *kvm_regs;
1426 1427
1427 r = -ENOMEM; 1428 r = -ENOMEM;
1428 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1429 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1429 if (!kvm_regs) 1430 if (!kvm_regs)
1430 goto out; 1431 goto out;
1431 r = -EFAULT; 1432 r = -EFAULT;
1432 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1433 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1433 goto out_free2; 1434 goto out_free2;
1434 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1435 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1435 if (r) 1436 if (r)
1436 goto out_free2; 1437 goto out_free2;
1437 r = 0; 1438 r = 0;
1438 out_free2: 1439 out_free2:
1439 kfree(kvm_regs); 1440 kfree(kvm_regs);
1440 break; 1441 break;
1441 } 1442 }
1442 case KVM_GET_SREGS: { 1443 case KVM_GET_SREGS: {
1443 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1444 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1444 r = -ENOMEM; 1445 r = -ENOMEM;
1445 if (!kvm_sregs) 1446 if (!kvm_sregs)
1446 goto out; 1447 goto out;
1447 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1448 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1448 if (r) 1449 if (r)
1449 goto out; 1450 goto out;
1450 r = -EFAULT; 1451 r = -EFAULT;
1451 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1452 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1452 goto out; 1453 goto out;
1453 r = 0; 1454 r = 0;
1454 break; 1455 break;
1455 } 1456 }
1456 case KVM_SET_SREGS: { 1457 case KVM_SET_SREGS: {
1457 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1458 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1458 r = -ENOMEM; 1459 r = -ENOMEM;
1459 if (!kvm_sregs) 1460 if (!kvm_sregs)
1460 goto out; 1461 goto out;
1461 r = -EFAULT; 1462 r = -EFAULT;
1462 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1463 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1463 goto out; 1464 goto out;
1464 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1465 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1465 if (r) 1466 if (r)
1466 goto out; 1467 goto out;
1467 r = 0; 1468 r = 0;
1468 break; 1469 break;
1469 } 1470 }
1470 case KVM_GET_MP_STATE: { 1471 case KVM_GET_MP_STATE: {
1471 struct kvm_mp_state mp_state; 1472 struct kvm_mp_state mp_state;
1472 1473
1473 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1474 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1474 if (r) 1475 if (r)
1475 goto out; 1476 goto out;
1476 r = -EFAULT; 1477 r = -EFAULT;
1477 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1478 if (copy_to_user(argp, &mp_state, sizeof mp_state))
1478 goto out; 1479 goto out;
1479 r = 0; 1480 r = 0;
1480 break; 1481 break;
1481 } 1482 }
1482 case KVM_SET_MP_STATE: { 1483 case KVM_SET_MP_STATE: {
1483 struct kvm_mp_state mp_state; 1484 struct kvm_mp_state mp_state;
1484 1485
1485 r = -EFAULT; 1486 r = -EFAULT;
1486 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1487 if (copy_from_user(&mp_state, argp, sizeof mp_state))
1487 goto out; 1488 goto out;
1488 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1489 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1489 if (r) 1490 if (r)
1490 goto out; 1491 goto out;
1491 r = 0; 1492 r = 0;
1492 break; 1493 break;
1493 } 1494 }
1494 case KVM_TRANSLATE: { 1495 case KVM_TRANSLATE: {
1495 struct kvm_translation tr; 1496 struct kvm_translation tr;
1496 1497
1497 r = -EFAULT; 1498 r = -EFAULT;
1498 if (copy_from_user(&tr, argp, sizeof tr)) 1499 if (copy_from_user(&tr, argp, sizeof tr))
1499 goto out; 1500 goto out;
1500 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1501 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1501 if (r) 1502 if (r)
1502 goto out; 1503 goto out;
1503 r = -EFAULT; 1504 r = -EFAULT;
1504 if (copy_to_user(argp, &tr, sizeof tr)) 1505 if (copy_to_user(argp, &tr, sizeof tr))
1505 goto out; 1506 goto out;
1506 r = 0; 1507 r = 0;
1507 break; 1508 break;
1508 } 1509 }
1509 case KVM_SET_GUEST_DEBUG: { 1510 case KVM_SET_GUEST_DEBUG: {
1510 struct kvm_guest_debug dbg; 1511 struct kvm_guest_debug dbg;
1511 1512
1512 r = -EFAULT; 1513 r = -EFAULT;
1513 if (copy_from_user(&dbg, argp, sizeof dbg)) 1514 if (copy_from_user(&dbg, argp, sizeof dbg))
1514 goto out; 1515 goto out;
1515 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1516 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1516 if (r) 1517 if (r)
1517 goto out; 1518 goto out;
1518 r = 0; 1519 r = 0;
1519 break; 1520 break;
1520 } 1521 }
1521 case KVM_SET_SIGNAL_MASK: { 1522 case KVM_SET_SIGNAL_MASK: {
1522 struct kvm_signal_mask __user *sigmask_arg = argp; 1523 struct kvm_signal_mask __user *sigmask_arg = argp;
1523 struct kvm_signal_mask kvm_sigmask; 1524 struct kvm_signal_mask kvm_sigmask;
1524 sigset_t sigset, *p; 1525 sigset_t sigset, *p;
1525 1526
1526 p = NULL; 1527 p = NULL;
1527 if (argp) { 1528 if (argp) {
1528 r = -EFAULT; 1529 r = -EFAULT;
1529 if (copy_from_user(&kvm_sigmask, argp, 1530 if (copy_from_user(&kvm_sigmask, argp,
1530 sizeof kvm_sigmask)) 1531 sizeof kvm_sigmask))
1531 goto out; 1532 goto out;
1532 r = -EINVAL; 1533 r = -EINVAL;
1533 if (kvm_sigmask.len != sizeof sigset) 1534 if (kvm_sigmask.len != sizeof sigset)
1534 goto out; 1535 goto out;
1535 r = -EFAULT; 1536 r = -EFAULT;
1536 if (copy_from_user(&sigset, sigmask_arg->sigset, 1537 if (copy_from_user(&sigset, sigmask_arg->sigset,
1537 sizeof sigset)) 1538 sizeof sigset))
1538 goto out; 1539 goto out;
1539 p = &sigset; 1540 p = &sigset;
1540 } 1541 }
1541 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 1542 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1542 break; 1543 break;
1543 } 1544 }
1544 case KVM_GET_FPU: { 1545 case KVM_GET_FPU: {
1545 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1546 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1546 r = -ENOMEM; 1547 r = -ENOMEM;
1547 if (!fpu) 1548 if (!fpu)
1548 goto out; 1549 goto out;
1549 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1550 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1550 if (r) 1551 if (r)
1551 goto out; 1552 goto out;
1552 r = -EFAULT; 1553 r = -EFAULT;
1553 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1554 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1554 goto out; 1555 goto out;
1555 r = 0; 1556 r = 0;
1556 break; 1557 break;
1557 } 1558 }
1558 case KVM_SET_FPU: { 1559 case KVM_SET_FPU: {
1559 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1560 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1560 r = -ENOMEM; 1561 r = -ENOMEM;
1561 if (!fpu) 1562 if (!fpu)
1562 goto out; 1563 goto out;
1563 r = -EFAULT; 1564 r = -EFAULT;
1564 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1565 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
1565 goto out; 1566 goto out;
1566 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1567 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1567 if (r) 1568 if (r)
1568 goto out; 1569 goto out;
1569 r = 0; 1570 r = 0;
1570 break; 1571 break;
1571 } 1572 }
1572 default: 1573 default:
1573 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1574 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1574 } 1575 }
1575 out: 1576 out:
1576 vcpu_put(vcpu); 1577 vcpu_put(vcpu);
1577 kfree(fpu); 1578 kfree(fpu);
1578 kfree(kvm_sregs); 1579 kfree(kvm_sregs);
1579 return r; 1580 return r;
1580 } 1581 }
1581 1582
1582 static long kvm_vm_ioctl(struct file *filp, 1583 static long kvm_vm_ioctl(struct file *filp,
1583 unsigned int ioctl, unsigned long arg) 1584 unsigned int ioctl, unsigned long arg)
1584 { 1585 {
1585 struct kvm *kvm = filp->private_data; 1586 struct kvm *kvm = filp->private_data;
1586 void __user *argp = (void __user *)arg; 1587 void __user *argp = (void __user *)arg;
1587 int r; 1588 int r;
1588 1589
1589 if (kvm->mm != current->mm) 1590 if (kvm->mm != current->mm)
1590 return -EIO; 1591 return -EIO;
1591 switch (ioctl) { 1592 switch (ioctl) {
1592 case KVM_CREATE_VCPU: 1593 case KVM_CREATE_VCPU:
1593 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1594 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1594 if (r < 0) 1595 if (r < 0)
1595 goto out; 1596 goto out;
1596 break; 1597 break;
1597 case KVM_SET_USER_MEMORY_REGION: { 1598 case KVM_SET_USER_MEMORY_REGION: {
1598 struct kvm_userspace_memory_region kvm_userspace_mem; 1599 struct kvm_userspace_memory_region kvm_userspace_mem;
1599 1600
1600 r = -EFAULT; 1601 r = -EFAULT;
1601 if (copy_from_user(&kvm_userspace_mem, argp, 1602 if (copy_from_user(&kvm_userspace_mem, argp,
1602 sizeof kvm_userspace_mem)) 1603 sizeof kvm_userspace_mem))
1603 goto out; 1604 goto out;
1604 1605
1605 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1606 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1606 if (r) 1607 if (r)
1607 goto out; 1608 goto out;
1608 break; 1609 break;
1609 } 1610 }
1610 case KVM_GET_DIRTY_LOG: { 1611 case KVM_GET_DIRTY_LOG: {
1611 struct kvm_dirty_log log; 1612 struct kvm_dirty_log log;
1612 1613
1613 r = -EFAULT; 1614 r = -EFAULT;
1614 if (copy_from_user(&log, argp, sizeof log)) 1615 if (copy_from_user(&log, argp, sizeof log))
1615 goto out; 1616 goto out;
1616 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1617 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1617 if (r) 1618 if (r)
1618 goto out; 1619 goto out;
1619 break; 1620 break;
1620 } 1621 }
1621 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1622 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1622 case KVM_REGISTER_COALESCED_MMIO: { 1623 case KVM_REGISTER_COALESCED_MMIO: {
1623 struct kvm_coalesced_mmio_zone zone; 1624 struct kvm_coalesced_mmio_zone zone;
1624 r = -EFAULT; 1625 r = -EFAULT;
1625 if (copy_from_user(&zone, argp, sizeof zone)) 1626 if (copy_from_user(&zone, argp, sizeof zone))
1626 goto out; 1627 goto out;
1627 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 1628 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1628 if (r) 1629 if (r)
1629 goto out; 1630 goto out;
1630 r = 0; 1631 r = 0;
1631 break; 1632 break;
1632 } 1633 }
1633 case KVM_UNREGISTER_COALESCED_MMIO: { 1634 case KVM_UNREGISTER_COALESCED_MMIO: {
1634 struct kvm_coalesced_mmio_zone zone; 1635 struct kvm_coalesced_mmio_zone zone;
1635 r = -EFAULT; 1636 r = -EFAULT;
1636 if (copy_from_user(&zone, argp, sizeof zone)) 1637 if (copy_from_user(&zone, argp, sizeof zone))
1637 goto out; 1638 goto out;
1638 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1639 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1639 if (r) 1640 if (r)
1640 goto out; 1641 goto out;
1641 r = 0; 1642 r = 0;
1642 break; 1643 break;
1643 } 1644 }
1644 #endif 1645 #endif
1645 case KVM_IRQFD: { 1646 case KVM_IRQFD: {
1646 struct kvm_irqfd data; 1647 struct kvm_irqfd data;
1647 1648
1648 r = -EFAULT; 1649 r = -EFAULT;
1649 if (copy_from_user(&data, argp, sizeof data)) 1650 if (copy_from_user(&data, argp, sizeof data))
1650 goto out; 1651 goto out;
1651 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); 1652 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1652 break; 1653 break;
1653 } 1654 }
1654 case KVM_IOEVENTFD: { 1655 case KVM_IOEVENTFD: {
1655 struct kvm_ioeventfd data; 1656 struct kvm_ioeventfd data;
1656 1657
1657 r = -EFAULT; 1658 r = -EFAULT;
1658 if (copy_from_user(&data, argp, sizeof data)) 1659 if (copy_from_user(&data, argp, sizeof data))
1659 goto out; 1660 goto out;
1660 r = kvm_ioeventfd(kvm, &data); 1661 r = kvm_ioeventfd(kvm, &data);
1661 break; 1662 break;
1662 } 1663 }
1663 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1664 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1664 case KVM_SET_BOOT_CPU_ID: 1665 case KVM_SET_BOOT_CPU_ID:
1665 r = 0; 1666 r = 0;
1666 mutex_lock(&kvm->lock); 1667 mutex_lock(&kvm->lock);
1667 if (atomic_read(&kvm->online_vcpus) != 0) 1668 if (atomic_read(&kvm->online_vcpus) != 0)
1668 r = -EBUSY; 1669 r = -EBUSY;
1669 else 1670 else
1670 kvm->bsp_vcpu_id = arg; 1671 kvm->bsp_vcpu_id = arg;
1671 mutex_unlock(&kvm->lock); 1672 mutex_unlock(&kvm->lock);
1672 break; 1673 break;
1673 #endif 1674 #endif
1674 default: 1675 default:
1675 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1676 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1676 if (r == -ENOTTY) 1677 if (r == -ENOTTY)
1677 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 1678 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
1678 } 1679 }
1679 out: 1680 out:
1680 return r; 1681 return r;
1681 } 1682 }
1682 1683
1683 #ifdef CONFIG_COMPAT 1684 #ifdef CONFIG_COMPAT
1684 struct compat_kvm_dirty_log { 1685 struct compat_kvm_dirty_log {
1685 __u32 slot; 1686 __u32 slot;
1686 __u32 padding1; 1687 __u32 padding1;
1687 union { 1688 union {
1688 compat_uptr_t dirty_bitmap; /* one bit per page */ 1689 compat_uptr_t dirty_bitmap; /* one bit per page */
1689 __u64 padding2; 1690 __u64 padding2;
1690 }; 1691 };
1691 }; 1692 };
1692 1693
1693 static long kvm_vm_compat_ioctl(struct file *filp, 1694 static long kvm_vm_compat_ioctl(struct file *filp,
1694 unsigned int ioctl, unsigned long arg) 1695 unsigned int ioctl, unsigned long arg)
1695 { 1696 {
1696 struct kvm *kvm = filp->private_data; 1697 struct kvm *kvm = filp->private_data;
1697 int r; 1698 int r;
1698 1699
1699 if (kvm->mm != current->mm) 1700 if (kvm->mm != current->mm)
1700 return -EIO; 1701 return -EIO;
1701 switch (ioctl) { 1702 switch (ioctl) {
1702 case KVM_GET_DIRTY_LOG: { 1703 case KVM_GET_DIRTY_LOG: {
1703 struct compat_kvm_dirty_log compat_log; 1704 struct compat_kvm_dirty_log compat_log;
1704 struct kvm_dirty_log log; 1705 struct kvm_dirty_log log;
1705 1706
1706 r = -EFAULT; 1707 r = -EFAULT;
1707 if (copy_from_user(&compat_log, (void __user *)arg, 1708 if (copy_from_user(&compat_log, (void __user *)arg,
1708 sizeof(compat_log))) 1709 sizeof(compat_log)))
1709 goto out; 1710 goto out;
1710 log.slot = compat_log.slot; 1711 log.slot = compat_log.slot;
1711 log.padding1 = compat_log.padding1; 1712 log.padding1 = compat_log.padding1;
1712 log.padding2 = compat_log.padding2; 1713 log.padding2 = compat_log.padding2;
1713 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 1714 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
1714 1715
1715 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1716 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1716 if (r) 1717 if (r)
1717 goto out; 1718 goto out;
1718 break; 1719 break;
1719 } 1720 }
1720 default: 1721 default:
1721 r = kvm_vm_ioctl(filp, ioctl, arg); 1722 r = kvm_vm_ioctl(filp, ioctl, arg);
1722 } 1723 }
1723 1724
1724 out: 1725 out:
1725 return r; 1726 return r;
1726 } 1727 }
1727 #endif 1728 #endif
1728 1729
1729 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1730 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1730 { 1731 {
1731 struct page *page[1]; 1732 struct page *page[1];
1732 unsigned long addr; 1733 unsigned long addr;
1733 int npages; 1734 int npages;
1734 gfn_t gfn = vmf->pgoff; 1735 gfn_t gfn = vmf->pgoff;
1735 struct kvm *kvm = vma->vm_file->private_data; 1736 struct kvm *kvm = vma->vm_file->private_data;
1736 1737
1737 addr = gfn_to_hva(kvm, gfn); 1738 addr = gfn_to_hva(kvm, gfn);
1738 if (kvm_is_error_hva(addr)) 1739 if (kvm_is_error_hva(addr))
1739 return VM_FAULT_SIGBUS; 1740 return VM_FAULT_SIGBUS;
1740 1741
1741 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 1742 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
1742 NULL); 1743 NULL);
1743 if (unlikely(npages != 1)) 1744 if (unlikely(npages != 1))
1744 return VM_FAULT_SIGBUS; 1745 return VM_FAULT_SIGBUS;
1745 1746
1746 vmf->page = page[0]; 1747 vmf->page = page[0];
1747 return 0; 1748 return 0;
1748 } 1749 }
1749 1750
1750 static const struct vm_operations_struct kvm_vm_vm_ops = { 1751 static const struct vm_operations_struct kvm_vm_vm_ops = {
1751 .fault = kvm_vm_fault, 1752 .fault = kvm_vm_fault,
1752 }; 1753 };
1753 1754
1754 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 1755 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1755 { 1756 {
1756 vma->vm_ops = &kvm_vm_vm_ops; 1757 vma->vm_ops = &kvm_vm_vm_ops;
1757 return 0; 1758 return 0;
1758 } 1759 }
1759 1760
1760 static struct file_operations kvm_vm_fops = { 1761 static struct file_operations kvm_vm_fops = {
1761 .release = kvm_vm_release, 1762 .release = kvm_vm_release,
1762 .unlocked_ioctl = kvm_vm_ioctl, 1763 .unlocked_ioctl = kvm_vm_ioctl,
1763 #ifdef CONFIG_COMPAT 1764 #ifdef CONFIG_COMPAT
1764 .compat_ioctl = kvm_vm_compat_ioctl, 1765 .compat_ioctl = kvm_vm_compat_ioctl,
1765 #endif 1766 #endif
1766 .mmap = kvm_vm_mmap, 1767 .mmap = kvm_vm_mmap,
1767 }; 1768 };
1768 1769
1769 static int kvm_dev_ioctl_create_vm(void) 1770 static int kvm_dev_ioctl_create_vm(void)
1770 { 1771 {
1771 int fd, r; 1772 int fd, r;
1772 struct kvm *kvm; 1773 struct kvm *kvm;
1773 1774
1774 kvm = kvm_create_vm(); 1775 kvm = kvm_create_vm();
1775 if (IS_ERR(kvm)) 1776 if (IS_ERR(kvm))
1776 return PTR_ERR(kvm); 1777 return PTR_ERR(kvm);
1777 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1778 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1778 r = kvm_coalesced_mmio_init(kvm); 1779 r = kvm_coalesced_mmio_init(kvm);
1779 if (r < 0) { 1780 if (r < 0) {
1780 kvm_put_kvm(kvm); 1781 kvm_put_kvm(kvm);
1781 return r; 1782 return r;
1782 } 1783 }
1783 #endif 1784 #endif
1784 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 1785 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
1785 if (fd < 0) 1786 if (fd < 0)
1786 kvm_put_kvm(kvm); 1787 kvm_put_kvm(kvm);
1787 1788
1788 return fd; 1789 return fd;
1789 } 1790 }
1790 1791
1791 static long kvm_dev_ioctl_check_extension_generic(long arg) 1792 static long kvm_dev_ioctl_check_extension_generic(long arg)
1792 { 1793 {
1793 switch (arg) { 1794 switch (arg) {
1794 case KVM_CAP_USER_MEMORY: 1795 case KVM_CAP_USER_MEMORY:
1795 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 1796 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
1796 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 1797 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
1797 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1798 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1798 case KVM_CAP_SET_BOOT_CPU_ID: 1799 case KVM_CAP_SET_BOOT_CPU_ID:
1799 #endif 1800 #endif
1800 case KVM_CAP_INTERNAL_ERROR_DATA: 1801 case KVM_CAP_INTERNAL_ERROR_DATA:
1801 return 1; 1802 return 1;
1802 #ifdef CONFIG_HAVE_KVM_IRQCHIP 1803 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1803 case KVM_CAP_IRQ_ROUTING: 1804 case KVM_CAP_IRQ_ROUTING:
1804 return KVM_MAX_IRQ_ROUTES; 1805 return KVM_MAX_IRQ_ROUTES;
1805 #endif 1806 #endif
1806 default: 1807 default:
1807 break; 1808 break;
1808 } 1809 }
1809 return kvm_dev_ioctl_check_extension(arg); 1810 return kvm_dev_ioctl_check_extension(arg);
1810 } 1811 }
1811 1812
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

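/*
 * Enable virtualization extensions on the current CPU.  Any failure is
 * recorded in hardware_enable_failed so that hardware_enable_all() can
 * back out.
 */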
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

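/*
 * kvm_usage_count is a usage count protected by kvm_lock: hardware
 * virtualization is enabled on all CPUs when the count goes from zero to
 * one and disabled again when it drops back to zero.
 */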
static void hardware_disable_all_nolock(void)
{
	BUG_ON(!kvm_usage_count);

	kvm_usage_count--;
	if (!kvm_usage_count)
		on_each_cpu(hardware_disable, NULL, 1);
}

static void hardware_disable_all(void)
{
	spin_lock(&kvm_lock);
	hardware_disable_all_nolock();
	spin_unlock(&kvm_lock);
}

static int hardware_enable_all(void)
{
	int r = 0;

	spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	spin_unlock(&kvm_lock);

	return r;
}

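/*
 * CPU hotplug callback: while KVM is in use (kvm_usage_count is non-zero),
 * keep CPUs that come online in the same hardware state as the rest of
 * the system and disable virtualization on CPUs that are dying.
 */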
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

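/*
 * Reached from the exception fixup path when a virtualization instruction
 * faults after hardware virtualization has been turned off for a reboot;
 * spin until the machine resets instead of crashing.
 */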
asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * Intel TXT also requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

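/*
 * kvm_io_bus is a fixed-size array of in-kernel MMIO/PIO devices.  Readers
 * dereference the bus under kvm->srcu; writers replace the whole bus under
 * slots_lock (see kvm_io_bus_register_dev() below).
 */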
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

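/*
 * Bus updates use copy-and-publish: build a new bus from the old one,
 * publish it with rcu_assign_pointer() and wait for SRCU readers to drain
 * before freeing the old copy.
 */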
/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			    struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS-1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
	new_bus->devs[new_bus->dev_count++] = dev;
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	bus = kvm->buses[bus_idx];
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));

	r = -ENOENT;
	for (i = 0; i < new_bus->dev_count; i++)
		if (new_bus->devs[i] == dev) {
			r = 0;
			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
			break;
		}

	if (r) {
		kfree(new_bus);
		return r;
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

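/*
 * debugfs statistics: each debugfs_entries item names the offset of a u32
 * counter inside struct kvm or struct kvm_vcpu, and a read sums that
 * counter over every VM (and vcpu) on vm_list.
 */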
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM] = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

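/*
 * System suspend/resume hooks: disable virtualization on suspend and
 * re-enable it on resume, but only while KVM is in use.
 */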
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	if (kvm_usage_count)
		hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	if (kvm_usage_count)
		hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

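/*
 * Preempt notifiers: unload and reload per-vcpu architecture state when
 * the vcpu thread is scheduled out and back in.
 */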
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

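/*
 * Module initialization: set up the architecture code, the bad_page and
 * hwpoison_page placeholders, hardware and hotplug/reboot/sysdev hooks,
 * the vcpu slab cache and finally the /dev/kvm misc device.  Each error
 * label unwinds exactly the steps that succeeded before it.
 */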
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
		  struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

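/*
 * Module teardown: undo kvm_init() in reverse order and make sure
 * virtualization is disabled on every CPU before the module goes away.
 */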
void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);