Commit edba23e51578f7cb6781461568489fc1825db4ac

Authored by Gleb Natapov
Committed by Avi Kivity
1 parent fa7bff8f8a

KVM: Return EFAULT from kvm ioctl when guest accesses bad area

Currently, if a guest accesses an address that belongs to a memory slot but is
not backed by a page, or the page is read-only, KVM treats the access like an
MMIO access. Remove that capability. It was never part of the interface and
should not be relied upon.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
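
With this patch, a guest access that falls inside a memory slot but cannot be
backed (the page is missing or read-only) no longer surfaces in userspace as an
MMIO exit; the KVM_RUN ioctl itself fails with EFAULT. Below is a minimal,
hypothetical userspace-side sketch of what a VMM now has to handle; the
run_vcpu() helper and its vcpu_fd/run arguments are illustrative and not part
of this patch:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Run one vcpu. After this patch, a guest access to an unbacked (or
 * read-only) area of a memory slot makes KVM_RUN fail with EFAULT
 * instead of being reported as a KVM_EXIT_MMIO exit. */
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                        if (errno == EINTR)
                                continue;
                        if (errno == EFAULT) {
                                fprintf(stderr, "guest touched a bad memory slot area\n");
                                return -1;
                        }
                        perror("KVM_RUN");
                        return -1;
                }
                if (run->exit_reason == KVM_EXIT_MMIO) {
                        /* Accesses that hit no memory slot at all still exit here. */
                        /* ... emulate the device access and re-enter ... */
                        continue;
                }
                /* ... handle the remaining exit reasons ... */
                return 0;
        }
}

Real MMIO (an access outside every memory slot) is still reported through the
normal exit path; only the bogus in-slot case is turned into an error.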

Showing 3 changed files with 28 additions and 5 deletions

/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affilates.
 *
 * Authors:
 *   Yaniv Kamay <yaniv@qumranet.com>
 *   Avi Kivity <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
bool tdp_enabled = false;

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif

static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x) \
        if (!(x)) { \
                printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
                       __FILE__, __LINE__, #x); \
        }
#endif

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
                (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
                (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
                (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_LVL_OFFSET_MASK(level) \
        (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                * PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                                * PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
        (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                                * PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                            * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
                        | PT64_NX_MASK)

#define RMAP_EXT 4

#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

struct kvm_rmap_desc {
        u64 *sptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
};

struct kvm_shadow_walk_iterator {
        u64 addr;
        hpa_t shadow_addr;
        int level;
        u64 *sptep;
        unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker) \
        for (shadow_walk_init(&(_walker), _vcpu, _addr); \
             shadow_walk_okay(&(_walker)); \
             shadow_walk_next(&(_walker)))

typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;

static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;

static inline u64 rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
        shadow_trap_nonpresent_pte = trap_pte;
        shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

void kvm_mmu_set_base_ptes(u64 base_pte)
{
        shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static bool is_write_protection(struct kvm_vcpu *vcpu)
{
        return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}

static int is_cpuid_PSE36(void)
{
        return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
        return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
        return pte != shadow_trap_nonpresent_pte
                && pte != shadow_notrap_nonpresent_pte;
}

static int is_large_pte(u64 pte)
{
        return pte & PT_PAGE_SIZE_MASK;
}

static int is_writable_pte(unsigned long pte)
{
        return pte & PT_WRITABLE_MASK;
}

static int is_dirty_gpte(unsigned long pte)
{
        return pte & PT_DIRTY_MASK;
}

static int is_rmap_spte(u64 pte)
{
        return is_shadow_present_pte(pte);
}

static int is_last_spte(u64 pte, int level)
{
        if (level == PT_PAGE_TABLE_LEVEL)
                return 1;
        if (is_large_pte(pte))
                return 1;
        return 0;
}

static pfn_t spte_to_pfn(u64 pte)
{
        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
        int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

        return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
        set_64bit((unsigned long *)sptep, spte);
#else
        set_64bit((unsigned long long *)sptep, spte);
#endif
}

static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
        return xchg(sptep, new_spte);
#else
        u64 old_spte;

        do {
                old_spte = *sptep;
        } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);

        return old_spte;
#endif
}

static void update_spte(u64 *sptep, u64 new_spte)
{
        u64 old_spte;

        if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
                __set_spte(sptep, new_spte);
        } else {
                old_spte = __xchg_spte(sptep, new_spte);
                if (old_spte & shadow_accessed_mask)
                        mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
        }
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                  struct kmem_cache *base_cache, int min)
{
        void *obj;

        if (cache->nobjs >= min)
                return 0;
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
                if (!obj)
                        return -ENOMEM;
                cache->objects[cache->nobjs++] = obj;
        }
        return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
                                  struct kmem_cache *cache)
{
        while (mc->nobjs)
                kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
                                       int min)
{
        struct page *page;

        if (cache->nobjs >= min)
                return 0;
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                page = alloc_page(GFP_KERNEL);
                if (!page)
                        return -ENOMEM;
                cache->objects[cache->nobjs++] = page_address(page);
        }
        return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
        while (mc->nobjs)
                free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
        int r;

        r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
                                   pte_chain_cache, 4);
        if (r)
                goto out;
        r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
                                   rmap_desc_cache, 4);
        if (r)
                goto out;
        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
        if (r)
                goto out;
        r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                   mmu_page_header_cache, 4);
out:
        return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
        mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
        mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
                                    size_t size)
{
        void *p;

        BUG_ON(!mc->nobjs);
        p = mc->objects[--mc->nobjs];
        return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
                                      sizeof(struct kvm_pte_chain));
}

static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
        kmem_cache_free(pte_chain_cache, pc);
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
        return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
                                      sizeof(struct kvm_rmap_desc));
}

static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
        kmem_cache_free(rmap_desc_cache, rd);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
        if (!sp->role.direct)
                return sp->gfns[index];

        return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
        if (sp->role.direct)
                BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
        else
                sp->gfns[index] = gfn;
}

/*
 * Return the pointer to the largepage write count for a given
 * gfn, handling slots that are not large page aligned.
 */
static int *slot_largepage_idx(gfn_t gfn,
                               struct kvm_memory_slot *slot,
                               int level)
{
        unsigned long idx;

        idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
              (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
        return &slot->lpage_info[level - 2][idx].write_count;
}

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot;
        int *write_count;
        int i;

        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                write_count = slot_largepage_idx(gfn, slot, i);
                *write_count += 1;
        }
}

static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot;
        int *write_count;
        int i;

        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                write_count = slot_largepage_idx(gfn, slot, i);
                *write_count -= 1;
                WARN_ON(*write_count < 0);
        }
}

static int has_wrprotected_page(struct kvm *kvm,
                                gfn_t gfn,
                                int level)
{
        struct kvm_memory_slot *slot;
        int *largepage_idx;

        slot = gfn_to_memslot(kvm, gfn);
        if (slot) {
                largepage_idx = slot_largepage_idx(gfn, slot, level);
                return *largepage_idx;
        }

        return 1;
}

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
        unsigned long page_size;
        int i, ret = 0;

        page_size = kvm_host_page_size(kvm, gfn);

        for (i = PT_PAGE_TABLE_LEVEL;
             i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
                        ret = i;
                else
                        break;
        }

        return ret;
}

static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
        struct kvm_memory_slot *slot;
        int host_level, level, max_level;

        slot = gfn_to_memslot(vcpu->kvm, large_gfn);
        if (slot && slot->dirty_bitmap)
                return PT_PAGE_TABLE_LEVEL;

        host_level = host_mapping_level(vcpu->kvm, large_gfn);

        if (host_level == PT_PAGE_TABLE_LEVEL)
                return host_level;

        max_level = kvm_x86_ops->get_lpage_level() < host_level ?
                kvm_x86_ops->get_lpage_level() : host_level;

        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
                if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
                        break;

        return level - 1;
}

/*
 * Take gfn and return the reverse mapping to it.
 */

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
        struct kvm_memory_slot *slot;
        unsigned long idx;

        slot = gfn_to_memslot(kvm, gfn);
        if (likely(level == PT_PAGE_TABLE_LEVEL))
                return &slot->rmap[gfn - slot->base_gfn];

        idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
                (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));

        return &slot->lpage_info[level - 2][idx].rmap_pde;
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
 */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
        struct kvm_mmu_page *sp;
        struct kvm_rmap_desc *desc;
        unsigned long *rmapp;
        int i, count = 0;

        if (!is_rmap_spte(*spte))
                return count;
        sp = page_header(__pa(spte));
        kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
        if (!*rmapp) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                *rmapp = (unsigned long)spte;
        } else if (!(*rmapp & 1)) {
                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
                desc = mmu_alloc_rmap_desc(vcpu);
                desc->sptes[0] = (u64 *)*rmapp;
                desc->sptes[1] = spte;
                *rmapp = (unsigned long)desc | 1;
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
                while (desc->sptes[RMAP_EXT-1] && desc->more) {
                        desc = desc->more;
                        count += RMAP_EXT;
                }
                if (desc->sptes[RMAP_EXT-1]) {
                        desc->more = mmu_alloc_rmap_desc(vcpu);
                        desc = desc->more;
                }
                for (i = 0; desc->sptes[i]; ++i)
                        ;
                desc->sptes[i] = spte;
        }
        return count;
}

static void rmap_desc_remove_entry(unsigned long *rmapp,
                                   struct kvm_rmap_desc *desc,
                                   int i,
                                   struct kvm_rmap_desc *prev_desc)
{
        int j;

        for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
                ;
        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                *rmapp = (unsigned long)desc->sptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        *rmapp = (unsigned long)desc->more | 1;
        mmu_free_rmap_desc(desc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        struct kvm_mmu_page *sp;
        gfn_t gfn;
        unsigned long *rmapp;
        int i;

        sp = page_header(__pa(spte));
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
        rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
        if (!*rmapp) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
        } else if (!(*rmapp & 1)) {
                rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
                if ((u64 *)*rmapp != spte) {
                        printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
                               spte, *spte);
                        BUG();
                }
                *rmapp = 0;
        } else {
                rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
                                if (desc->sptes[i] == spte) {
                                        rmap_desc_remove_entry(rmapp,
                                                               desc, i,
                                                               prev_desc);
                                        return;
                                }
                        prev_desc = desc;
                        desc = desc->more;
                }
                pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
                BUG();
        }
}

static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
        pfn_t pfn;
        u64 old_spte;

        old_spte = __xchg_spte(sptep, new_spte);
        if (!is_rmap_spte(old_spte))
                return;
        pfn = spte_to_pfn(old_spte);
        if (old_spte & shadow_accessed_mask)
                kvm_set_pfn_accessed(pfn);
        if (is_writable_pte(old_spte))
                kvm_set_pfn_dirty(pfn);
        rmap_remove(kvm, sptep);
}

static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
        struct kvm_rmap_desc *desc;
        u64 *prev_spte;
        int i;

        if (!*rmapp)
                return NULL;
        else if (!(*rmapp & 1)) {
                if (!spte)
                        return (u64 *)*rmapp;
                return NULL;
        }
        desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
        prev_spte = NULL;
        while (desc) {
                for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
                        if (prev_spte == spte)
                                return desc->sptes[i];
                        prev_spte = desc->sptes[i];
                }
                desc = desc->more;
        }
        return NULL;
}

static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
        unsigned long *rmapp;
        u64 *spte;
        int i, write_protected = 0;

        rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);

        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                BUG_ON(!spte);
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                if (is_writable_pte(*spte)) {
                        update_spte(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
        }
        if (write_protected) {
                pfn_t pfn;

                spte = rmap_next(kvm, rmapp, NULL);
                pfn = spte_to_pfn(*spte);
                kvm_set_pfn_dirty(pfn);
        }

        /* check for huge page mappings */
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                rmapp = gfn_to_rmap(kvm, gfn, i);
                spte = rmap_next(kvm, rmapp, NULL);
                while (spte) {
                        BUG_ON(!spte);
                        BUG_ON(!(*spte & PT_PRESENT_MASK));
                        BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                        if (is_writable_pte(*spte)) {
                                drop_spte(kvm, spte,
                                          shadow_trap_nonpresent_pte);
                                --kvm->stat.lpages;
                                spte = NULL;
                                write_protected = 1;
                        }
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }

        return write_protected;
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                           unsigned long data)
{
        u64 *spte;
        int need_tlb_flush = 0;

        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
                drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
}

static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                             unsigned long data)
{
        int need_flush = 0;
        u64 *spte, new_spte, old_spte;
        pte_t *ptep = (pte_t *)data;
        pfn_t new_pfn;

        WARN_ON(pte_huge(*ptep));
        new_pfn = pte_pfn(*ptep);
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                BUG_ON(!is_shadow_present_pte(*spte));
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                need_flush = 1;
                if (pte_write(*ptep)) {
                        drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
                        spte = rmap_next(kvm, rmapp, NULL);
                } else {
                        new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
                        new_spte |= (u64)new_pfn << PAGE_SHIFT;

                        new_spte &= ~PT_WRITABLE_MASK;
                        new_spte &= ~SPTE_HOST_WRITEABLE;
                        new_spte &= ~shadow_accessed_mask;
                        if (is_writable_pte(*spte))
                                kvm_set_pfn_dirty(spte_to_pfn(*spte));
                        old_spte = __xchg_spte(spte, new_spte);
                        if (is_shadow_present_pte(old_spte)
                            && (old_spte & shadow_accessed_mask))
                                mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }
        if (need_flush)
                kvm_flush_remote_tlbs(kvm);

        return 0;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                          unsigned long data,
                          int (*handler)(struct kvm *kvm, unsigned long *rmapp,
                                         unsigned long data))
{
        int i, j;
        int ret;
        int retval = 0;
        struct kvm_memslots *slots;

        slots = kvm_memslots(kvm);

        for (i = 0; i < slots->nmemslots; i++) {
                struct kvm_memory_slot *memslot = &slots->memslots[i];
                unsigned long start = memslot->userspace_addr;
                unsigned long end;

                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;

                        ret = handler(kvm, &memslot->rmap[gfn_offset], data);

                        for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
                                int idx = gfn_offset;
                                idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
                                ret |= handler(kvm,
                                        &memslot->lpage_info[j][idx].rmap_pde,
                                        data);
                        }
                        trace_kvm_age_page(hva, memslot, ret);
                        retval |= ret;
                }
        }

        return retval;
}
866 866
867 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 867 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
868 { 868 {
869 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 869 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
870 } 870 }
871 871
872 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 872 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
873 { 873 {
874 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 874 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
875 } 875 }
876 876
877 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 877 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
878 unsigned long data) 878 unsigned long data)
879 { 879 {
880 u64 *spte; 880 u64 *spte;
881 int young = 0; 881 int young = 0;
882 882
883 /* 883 /*
884 * Emulate the accessed bit for EPT, by checking if this page has 884 * Emulate the accessed bit for EPT, by checking if this page has
885 * an EPT mapping, and clearing it if it does. On the next access, 885 * an EPT mapping, and clearing it if it does. On the next access,
886 * a new EPT mapping will be established. 886 * a new EPT mapping will be established.
887 * This has some overhead, but not as much as the cost of swapping 887 * This has some overhead, but not as much as the cost of swapping
888 * out actively used pages or breaking up actively used hugepages. 888 * out actively used pages or breaking up actively used hugepages.
889 */ 889 */
890 if (!shadow_accessed_mask) 890 if (!shadow_accessed_mask)
891 return kvm_unmap_rmapp(kvm, rmapp, data); 891 return kvm_unmap_rmapp(kvm, rmapp, data);
892 892
893 spte = rmap_next(kvm, rmapp, NULL); 893 spte = rmap_next(kvm, rmapp, NULL);
894 while (spte) { 894 while (spte) {
895 int _young; 895 int _young;
896 u64 _spte = *spte; 896 u64 _spte = *spte;
897 BUG_ON(!(_spte & PT_PRESENT_MASK)); 897 BUG_ON(!(_spte & PT_PRESENT_MASK));
898 _young = _spte & PT_ACCESSED_MASK; 898 _young = _spte & PT_ACCESSED_MASK;
899 if (_young) { 899 if (_young) {
900 young = 1; 900 young = 1;
901 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 901 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
902 } 902 }
903 spte = rmap_next(kvm, rmapp, spte); 903 spte = rmap_next(kvm, rmapp, spte);
904 } 904 }
905 return young; 905 return young;
906 } 906 }
907 907
908 #define RMAP_RECYCLE_THRESHOLD 1000 908 #define RMAP_RECYCLE_THRESHOLD 1000
909 909
910 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 910 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
911 { 911 {
912 unsigned long *rmapp; 912 unsigned long *rmapp;
913 struct kvm_mmu_page *sp; 913 struct kvm_mmu_page *sp;
914 914
915 sp = page_header(__pa(spte)); 915 sp = page_header(__pa(spte));
916 916
917 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 917 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
918 918
919 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 919 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
920 kvm_flush_remote_tlbs(vcpu->kvm); 920 kvm_flush_remote_tlbs(vcpu->kvm);
921 } 921 }
922 922
923 int kvm_age_hva(struct kvm *kvm, unsigned long hva) 923 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
924 { 924 {
925 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 925 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
926 } 926 }
927 927
928 #ifdef MMU_DEBUG 928 #ifdef MMU_DEBUG
929 static int is_empty_shadow_page(u64 *spt) 929 static int is_empty_shadow_page(u64 *spt)
930 { 930 {
931 u64 *pos; 931 u64 *pos;
932 u64 *end; 932 u64 *end;
933 933
934 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 934 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
935 if (is_shadow_present_pte(*pos)) { 935 if (is_shadow_present_pte(*pos)) {
936 printk(KERN_ERR "%s: %p %llx\n", __func__, 936 printk(KERN_ERR "%s: %p %llx\n", __func__,
937 pos, *pos); 937 pos, *pos);
938 return 0; 938 return 0;
939 } 939 }
940 return 1; 940 return 1;
941 } 941 }
942 #endif 942 #endif
943 943
944 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 944 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
945 { 945 {
946 ASSERT(is_empty_shadow_page(sp->spt)); 946 ASSERT(is_empty_shadow_page(sp->spt));
947 hlist_del(&sp->hash_link); 947 hlist_del(&sp->hash_link);
948 list_del(&sp->link); 948 list_del(&sp->link);
949 __free_page(virt_to_page(sp->spt)); 949 __free_page(virt_to_page(sp->spt));
950 if (!sp->role.direct) 950 if (!sp->role.direct)
951 __free_page(virt_to_page(sp->gfns)); 951 __free_page(virt_to_page(sp->gfns));
952 kmem_cache_free(mmu_page_header_cache, sp); 952 kmem_cache_free(mmu_page_header_cache, sp);
953 ++kvm->arch.n_free_mmu_pages; 953 ++kvm->arch.n_free_mmu_pages;
954 } 954 }
955 955
956 static unsigned kvm_page_table_hashfn(gfn_t gfn) 956 static unsigned kvm_page_table_hashfn(gfn_t gfn)
957 { 957 {
958 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 958 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
959 } 959 }
960 960
961 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 961 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
962 u64 *parent_pte, int direct) 962 u64 *parent_pte, int direct)
963 { 963 {
964 struct kvm_mmu_page *sp; 964 struct kvm_mmu_page *sp;
965 965
966 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 966 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
967 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 967 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
968 if (!direct) 968 if (!direct)
969 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 969 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
970 PAGE_SIZE); 970 PAGE_SIZE);
971 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 971 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
972 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 972 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
973 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 973 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
974 sp->multimapped = 0; 974 sp->multimapped = 0;
975 sp->parent_pte = parent_pte; 975 sp->parent_pte = parent_pte;
976 --vcpu->kvm->arch.n_free_mmu_pages; 976 --vcpu->kvm->arch.n_free_mmu_pages;
977 return sp; 977 return sp;
978 } 978 }
979 979
980 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 980 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
981 struct kvm_mmu_page *sp, u64 *parent_pte) 981 struct kvm_mmu_page *sp, u64 *parent_pte)
982 { 982 {
983 struct kvm_pte_chain *pte_chain; 983 struct kvm_pte_chain *pte_chain;
984 struct hlist_node *node; 984 struct hlist_node *node;
985 int i; 985 int i;
986 986
987 if (!parent_pte) 987 if (!parent_pte)
988 return; 988 return;
989 if (!sp->multimapped) { 989 if (!sp->multimapped) {
990 u64 *old = sp->parent_pte; 990 u64 *old = sp->parent_pte;
991 991
992 if (!old) { 992 if (!old) {
993 sp->parent_pte = parent_pte; 993 sp->parent_pte = parent_pte;
994 return; 994 return;
995 } 995 }
996 sp->multimapped = 1; 996 sp->multimapped = 1;
997 pte_chain = mmu_alloc_pte_chain(vcpu); 997 pte_chain = mmu_alloc_pte_chain(vcpu);
998 INIT_HLIST_HEAD(&sp->parent_ptes); 998 INIT_HLIST_HEAD(&sp->parent_ptes);
999 hlist_add_head(&pte_chain->link, &sp->parent_ptes); 999 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1000 pte_chain->parent_ptes[0] = old; 1000 pte_chain->parent_ptes[0] = old;
1001 } 1001 }
1002 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { 1002 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1003 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) 1003 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1004 continue; 1004 continue;
1005 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) 1005 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1006 if (!pte_chain->parent_ptes[i]) { 1006 if (!pte_chain->parent_ptes[i]) {
1007 pte_chain->parent_ptes[i] = parent_pte; 1007 pte_chain->parent_ptes[i] = parent_pte;
1008 return; 1008 return;
1009 } 1009 }
1010 } 1010 }
1011 pte_chain = mmu_alloc_pte_chain(vcpu); 1011 pte_chain = mmu_alloc_pte_chain(vcpu);
1012 BUG_ON(!pte_chain); 1012 BUG_ON(!pte_chain);
1013 hlist_add_head(&pte_chain->link, &sp->parent_ptes); 1013 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1014 pte_chain->parent_ptes[0] = parent_pte; 1014 pte_chain->parent_ptes[0] = parent_pte;
1015 } 1015 }
1016 1016
1017 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1017 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1018 u64 *parent_pte) 1018 u64 *parent_pte)
1019 { 1019 {
1020 struct kvm_pte_chain *pte_chain; 1020 struct kvm_pte_chain *pte_chain;
1021 struct hlist_node *node; 1021 struct hlist_node *node;
1022 int i; 1022 int i;
1023 1023
1024 if (!sp->multimapped) { 1024 if (!sp->multimapped) {
1025 BUG_ON(sp->parent_pte != parent_pte); 1025 BUG_ON(sp->parent_pte != parent_pte);
1026 sp->parent_pte = NULL; 1026 sp->parent_pte = NULL;
1027 return; 1027 return;
1028 } 1028 }
1029 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1029 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1030 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1030 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1031 if (!pte_chain->parent_ptes[i]) 1031 if (!pte_chain->parent_ptes[i])
1032 break; 1032 break;
1033 if (pte_chain->parent_ptes[i] != parent_pte) 1033 if (pte_chain->parent_ptes[i] != parent_pte)
1034 continue; 1034 continue;
1035 while (i + 1 < NR_PTE_CHAIN_ENTRIES 1035 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1036 && pte_chain->parent_ptes[i + 1]) { 1036 && pte_chain->parent_ptes[i + 1]) {
1037 pte_chain->parent_ptes[i] 1037 pte_chain->parent_ptes[i]
1038 = pte_chain->parent_ptes[i + 1]; 1038 = pte_chain->parent_ptes[i + 1];
1039 ++i; 1039 ++i;
1040 } 1040 }
1041 pte_chain->parent_ptes[i] = NULL; 1041 pte_chain->parent_ptes[i] = NULL;
1042 if (i == 0) { 1042 if (i == 0) {
1043 hlist_del(&pte_chain->link); 1043 hlist_del(&pte_chain->link);
1044 mmu_free_pte_chain(pte_chain); 1044 mmu_free_pte_chain(pte_chain);
1045 if (hlist_empty(&sp->parent_ptes)) { 1045 if (hlist_empty(&sp->parent_ptes)) {
1046 sp->multimapped = 0; 1046 sp->multimapped = 0;
1047 sp->parent_pte = NULL; 1047 sp->parent_pte = NULL;
1048 } 1048 }
1049 } 1049 }
1050 return; 1050 return;
1051 } 1051 }
1052 BUG(); 1052 BUG();
1053 } 1053 }
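
The two functions above keep the reverse mapping from a shadow page to the parent SPTEs that point at it: a single parent is stored directly in sp->parent_pte, and the moment a second parent appears the page flips to multimapped, with parents kept in an hlist of kvm_pte_chain blocks. A minimal stand-alone user-space sketch of that upgrade follows; the struct layout and the CHAIN_ENTRIES value are illustrative, not the kernel's.

/*
 * User-space model (not kernel code) of the parent-pointer upgrade done by
 * mmu_page_add_parent_pte(): one parent is stored inline, a second parent
 * switches the page to a chained array of parent slots.
 */
#include <stdio.h>
#include <stdlib.h>

#define CHAIN_ENTRIES 4          /* stands in for NR_PTE_CHAIN_ENTRIES */

struct chain { unsigned long *slots[CHAIN_ENTRIES]; struct chain *next; };
struct shadow_page { int multimapped; unsigned long *parent; struct chain *chains; };

static void add_parent(struct shadow_page *sp, unsigned long *pte)
{
	struct chain *c;
	int i;

	if (!sp->multimapped) {
		if (!sp->parent) {           /* first parent: store inline        */
			sp->parent = pte;
			return;
		}
		c = calloc(1, sizeof(*c));   /* second parent: switch to a chain  */
		c->slots[0] = sp->parent;
		c->next = sp->chains;
		sp->chains = c;
		sp->multimapped = 1;
	}
	for (c = sp->chains; c; c = c->next)
		for (i = 0; i < CHAIN_ENTRIES; i++)
			if (!c->slots[i]) {
				c->slots[i] = pte;
				return;
			}
	c = calloc(1, sizeof(*c));           /* every chain full: prepend one     */
	c->slots[0] = pte;
	c->next = sp->chains;
	sp->chains = c;
}

int main(void)
{
	static unsigned long ptes[6];
	struct shadow_page sp = { 0 };

	for (int i = 0; i < 6; i++)
		add_parent(&sp, &ptes[i]);
	printf("multimapped=%d\n", sp.multimapped);   /* prints 1 */
	return 0;
}
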
1054 1054
1055 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1055 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1056 { 1056 {
1057 struct kvm_pte_chain *pte_chain; 1057 struct kvm_pte_chain *pte_chain;
1058 struct hlist_node *node; 1058 struct hlist_node *node;
1059 struct kvm_mmu_page *parent_sp; 1059 struct kvm_mmu_page *parent_sp;
1060 int i; 1060 int i;
1061 1061
1062 if (!sp->multimapped && sp->parent_pte) { 1062 if (!sp->multimapped && sp->parent_pte) {
1063 parent_sp = page_header(__pa(sp->parent_pte)); 1063 parent_sp = page_header(__pa(sp->parent_pte));
1064 fn(parent_sp, sp->parent_pte); 1064 fn(parent_sp, sp->parent_pte);
1065 return; 1065 return;
1066 } 1066 }
1067 1067
1068 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1068 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1069 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1069 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1070 u64 *spte = pte_chain->parent_ptes[i]; 1070 u64 *spte = pte_chain->parent_ptes[i];
1071 1071
1072 if (!spte) 1072 if (!spte)
1073 break; 1073 break;
1074 parent_sp = page_header(__pa(spte)); 1074 parent_sp = page_header(__pa(spte));
1075 fn(parent_sp, spte); 1075 fn(parent_sp, spte);
1076 } 1076 }
1077 } 1077 }
1078 1078
1079 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1079 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1080 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1080 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1081 { 1081 {
1082 mmu_parent_walk(sp, mark_unsync); 1082 mmu_parent_walk(sp, mark_unsync);
1083 } 1083 }
1084 1084
1085 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1085 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1086 { 1086 {
1087 unsigned int index; 1087 unsigned int index;
1088 1088
1089 index = spte - sp->spt; 1089 index = spte - sp->spt;
1090 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1090 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1091 return; 1091 return;
1092 if (sp->unsync_children++) 1092 if (sp->unsync_children++)
1093 return; 1093 return;
1094 kvm_mmu_mark_parents_unsync(sp); 1094 kvm_mmu_mark_parents_unsync(sp);
1095 } 1095 }
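
mark_unsync() records which child slot went out of sync in the parent's 512-bit unsync_child_bitmap and only continues the upward walk for the first such child. A small user-space model of why the test-and-set guard matters is below; names and sizes are illustrative only.

/*
 * Illustration (not kernel code): marking the same child slot twice must
 * not inflate unsync_children or re-walk the parents.
 */
#include <stdio.h>
#include <stdbool.h>

#define ENTRIES 512

struct shadow_page {
	unsigned long bitmap[ENTRIES / (8 * sizeof(unsigned long))];
	unsigned int unsync_children;
};

static bool test_and_set(unsigned long *map, unsigned int idx)
{
	unsigned long bit = 1UL << (idx % (8 * sizeof(unsigned long)));
	unsigned long *word = &map[idx / (8 * sizeof(unsigned long))];
	bool was_set = *word & bit;

	*word |= bit;
	return was_set;
}

static void mark(struct shadow_page *sp, unsigned int idx)
{
	if (test_and_set(sp->bitmap, idx))
		return;                      /* already recorded: nothing to do    */
	if (sp->unsync_children++)
		return;                      /* parents were already notified      */
	/* first unsync child: this is the point where the kernel walks parents */
}

int main(void)
{
	struct shadow_page sp = { { 0 }, 0 };

	mark(&sp, 7);
	mark(&sp, 7);                        /* duplicate mark is a no-op */
	mark(&sp, 42);
	printf("unsync_children = %u\n", sp.unsync_children);   /* 2 */
	return 0;
}
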
1096 1096
1097 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1097 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1098 struct kvm_mmu_page *sp) 1098 struct kvm_mmu_page *sp)
1099 { 1099 {
1100 int i; 1100 int i;
1101 1101
1102 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 1102 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1103 sp->spt[i] = shadow_trap_nonpresent_pte; 1103 sp->spt[i] = shadow_trap_nonpresent_pte;
1104 } 1104 }
1105 1105
1106 static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1106 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1107 struct kvm_mmu_page *sp, bool clear_unsync) 1107 struct kvm_mmu_page *sp, bool clear_unsync)
1108 { 1108 {
1109 return 1; 1109 return 1;
1110 } 1110 }
1111 1111
1112 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 1112 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1113 { 1113 {
1114 } 1114 }
1115 1115
1116 #define KVM_PAGE_ARRAY_NR 16 1116 #define KVM_PAGE_ARRAY_NR 16
1117 1117
1118 struct kvm_mmu_pages { 1118 struct kvm_mmu_pages {
1119 struct mmu_page_and_offset { 1119 struct mmu_page_and_offset {
1120 struct kvm_mmu_page *sp; 1120 struct kvm_mmu_page *sp;
1121 unsigned int idx; 1121 unsigned int idx;
1122 } page[KVM_PAGE_ARRAY_NR]; 1122 } page[KVM_PAGE_ARRAY_NR];
1123 unsigned int nr; 1123 unsigned int nr;
1124 }; 1124 };
1125 1125
1126 #define for_each_unsync_children(bitmap, idx) \ 1126 #define for_each_unsync_children(bitmap, idx) \
1127 for (idx = find_first_bit(bitmap, 512); \ 1127 for (idx = find_first_bit(bitmap, 512); \
1128 idx < 512; \ 1128 idx < 512; \
1129 idx = find_next_bit(bitmap, 512, idx+1)) 1129 idx = find_next_bit(bitmap, 512, idx+1))
1130 1130
1131 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 1131 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1132 int idx) 1132 int idx)
1133 { 1133 {
1134 int i; 1134 int i;
1135 1135
1136 if (sp->unsync) 1136 if (sp->unsync)
1137 for (i=0; i < pvec->nr; i++) 1137 for (i=0; i < pvec->nr; i++)
1138 if (pvec->page[i].sp == sp) 1138 if (pvec->page[i].sp == sp)
1139 return 0; 1139 return 0;
1140 1140
1141 pvec->page[pvec->nr].sp = sp; 1141 pvec->page[pvec->nr].sp = sp;
1142 pvec->page[pvec->nr].idx = idx; 1142 pvec->page[pvec->nr].idx = idx;
1143 pvec->nr++; 1143 pvec->nr++;
1144 return (pvec->nr == KVM_PAGE_ARRAY_NR); 1144 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1145 } 1145 }
1146 1146
1147 static int __mmu_unsync_walk(struct kvm_mmu_page *sp, 1147 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1148 struct kvm_mmu_pages *pvec) 1148 struct kvm_mmu_pages *pvec)
1149 { 1149 {
1150 int i, ret, nr_unsync_leaf = 0; 1150 int i, ret, nr_unsync_leaf = 0;
1151 1151
1152 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1152 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1153 struct kvm_mmu_page *child; 1153 struct kvm_mmu_page *child;
1154 u64 ent = sp->spt[i]; 1154 u64 ent = sp->spt[i];
1155 1155
1156 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) 1156 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1157 goto clear_child_bitmap; 1157 goto clear_child_bitmap;
1158 1158
1159 child = page_header(ent & PT64_BASE_ADDR_MASK); 1159 child = page_header(ent & PT64_BASE_ADDR_MASK);
1160 1160
1161 if (child->unsync_children) { 1161 if (child->unsync_children) {
1162 if (mmu_pages_add(pvec, child, i)) 1162 if (mmu_pages_add(pvec, child, i))
1163 return -ENOSPC; 1163 return -ENOSPC;
1164 1164
1165 ret = __mmu_unsync_walk(child, pvec); 1165 ret = __mmu_unsync_walk(child, pvec);
1166 if (!ret) 1166 if (!ret)
1167 goto clear_child_bitmap; 1167 goto clear_child_bitmap;
1168 else if (ret > 0) 1168 else if (ret > 0)
1169 nr_unsync_leaf += ret; 1169 nr_unsync_leaf += ret;
1170 else 1170 else
1171 return ret; 1171 return ret;
1172 } else if (child->unsync) { 1172 } else if (child->unsync) {
1173 nr_unsync_leaf++; 1173 nr_unsync_leaf++;
1174 if (mmu_pages_add(pvec, child, i)) 1174 if (mmu_pages_add(pvec, child, i))
1175 return -ENOSPC; 1175 return -ENOSPC;
1176 } else 1176 } else
1177 goto clear_child_bitmap; 1177 goto clear_child_bitmap;
1178 1178
1179 continue; 1179 continue;
1180 1180
1181 clear_child_bitmap: 1181 clear_child_bitmap:
1182 __clear_bit(i, sp->unsync_child_bitmap); 1182 __clear_bit(i, sp->unsync_child_bitmap);
1183 sp->unsync_children--; 1183 sp->unsync_children--;
1184 WARN_ON((int)sp->unsync_children < 0); 1184 WARN_ON((int)sp->unsync_children < 0);
1185 } 1185 }
1186 1186
1187 1187
1188 return nr_unsync_leaf; 1188 return nr_unsync_leaf;
1189 } 1189 }
1190 1190
1191 static int mmu_unsync_walk(struct kvm_mmu_page *sp, 1191 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1192 struct kvm_mmu_pages *pvec) 1192 struct kvm_mmu_pages *pvec)
1193 { 1193 {
1194 if (!sp->unsync_children) 1194 if (!sp->unsync_children)
1195 return 0; 1195 return 0;
1196 1196
1197 mmu_pages_add(pvec, sp, 0); 1197 mmu_pages_add(pvec, sp, 0);
1198 return __mmu_unsync_walk(sp, pvec); 1198 return __mmu_unsync_walk(sp, pvec);
1199 } 1199 }
1200 1200
1201 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1201 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1202 { 1202 {
1203 WARN_ON(!sp->unsync); 1203 WARN_ON(!sp->unsync);
1204 trace_kvm_mmu_sync_page(sp); 1204 trace_kvm_mmu_sync_page(sp);
1205 sp->unsync = 0; 1205 sp->unsync = 0;
1206 --kvm->stat.mmu_unsync; 1206 --kvm->stat.mmu_unsync;
1207 } 1207 }
1208 1208
1209 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1209 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1210 struct list_head *invalid_list); 1210 struct list_head *invalid_list);
1211 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1211 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1212 struct list_head *invalid_list); 1212 struct list_head *invalid_list);
1213 1213
1214 #define for_each_gfn_sp(kvm, sp, gfn, pos) \ 1214 #define for_each_gfn_sp(kvm, sp, gfn, pos) \
1215 hlist_for_each_entry(sp, pos, \ 1215 hlist_for_each_entry(sp, pos, \
1216 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1216 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1217 if ((sp)->gfn != (gfn)) {} else 1217 if ((sp)->gfn != (gfn)) {} else
1218 1218
1219 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ 1219 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1220 hlist_for_each_entry(sp, pos, \ 1220 hlist_for_each_entry(sp, pos, \
1221 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1221 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1222 if ((sp)->gfn != (gfn) || (sp)->role.direct || \ 1222 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1223 (sp)->role.invalid) {} else 1223 (sp)->role.invalid) {} else
1224 1224
1225 /* @sp->gfn should be write-protected at the call site */ 1225 /* @sp->gfn should be write-protected at the call site */
1226 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1226 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1227 struct list_head *invalid_list, bool clear_unsync) 1227 struct list_head *invalid_list, bool clear_unsync)
1228 { 1228 {
1229 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1229 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1230 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1230 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1231 return 1; 1231 return 1;
1232 } 1232 }
1233 1233
1234 if (clear_unsync) 1234 if (clear_unsync)
1235 kvm_unlink_unsync_page(vcpu->kvm, sp); 1235 kvm_unlink_unsync_page(vcpu->kvm, sp);
1236 1236
1237 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1237 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1238 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1238 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1239 return 1; 1239 return 1;
1240 } 1240 }
1241 1241
1242 kvm_mmu_flush_tlb(vcpu); 1242 kvm_mmu_flush_tlb(vcpu);
1243 return 0; 1243 return 0;
1244 } 1244 }
1245 1245
1246 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, 1246 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1247 struct kvm_mmu_page *sp) 1247 struct kvm_mmu_page *sp)
1248 { 1248 {
1249 LIST_HEAD(invalid_list); 1249 LIST_HEAD(invalid_list);
1250 int ret; 1250 int ret;
1251 1251
1252 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); 1252 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1253 if (ret) 1253 if (ret)
1254 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1254 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1255 1255
1256 return ret; 1256 return ret;
1257 } 1257 }
1258 1258
1259 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1259 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1260 struct list_head *invalid_list) 1260 struct list_head *invalid_list)
1261 { 1261 {
1262 return __kvm_sync_page(vcpu, sp, invalid_list, true); 1262 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1263 } 1263 }
1264 1264
1265 /* @gfn should be write-protected at the call site */ 1265 /* @gfn should be write-protected at the call site */
1266 static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1266 static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1267 { 1267 {
1268 struct kvm_mmu_page *s; 1268 struct kvm_mmu_page *s;
1269 struct hlist_node *node; 1269 struct hlist_node *node;
1270 LIST_HEAD(invalid_list); 1270 LIST_HEAD(invalid_list);
1271 bool flush = false; 1271 bool flush = false;
1272 1272
1273 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1273 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1274 if (!s->unsync) 1274 if (!s->unsync)
1275 continue; 1275 continue;
1276 1276
1277 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1277 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1278 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1278 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1279 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1279 (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1280 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1280 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1281 continue; 1281 continue;
1282 } 1282 }
1283 kvm_unlink_unsync_page(vcpu->kvm, s); 1283 kvm_unlink_unsync_page(vcpu->kvm, s);
1284 flush = true; 1284 flush = true;
1285 } 1285 }
1286 1286
1287 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1287 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1288 if (flush) 1288 if (flush)
1289 kvm_mmu_flush_tlb(vcpu); 1289 kvm_mmu_flush_tlb(vcpu);
1290 } 1290 }
1291 1291
1292 struct mmu_page_path { 1292 struct mmu_page_path {
1293 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1293 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1294 unsigned int idx[PT64_ROOT_LEVEL-1]; 1294 unsigned int idx[PT64_ROOT_LEVEL-1];
1295 }; 1295 };
1296 1296
1297 #define for_each_sp(pvec, sp, parents, i) \ 1297 #define for_each_sp(pvec, sp, parents, i) \
1298 for (i = mmu_pages_next(&pvec, &parents, -1), \ 1298 for (i = mmu_pages_next(&pvec, &parents, -1), \
1299 sp = pvec.page[i].sp; \ 1299 sp = pvec.page[i].sp; \
1300 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ 1300 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1301 i = mmu_pages_next(&pvec, &parents, i)) 1301 i = mmu_pages_next(&pvec, &parents, i))
1302 1302
1303 static int mmu_pages_next(struct kvm_mmu_pages *pvec, 1303 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1304 struct mmu_page_path *parents, 1304 struct mmu_page_path *parents,
1305 int i) 1305 int i)
1306 { 1306 {
1307 int n; 1307 int n;
1308 1308
1309 for (n = i+1; n < pvec->nr; n++) { 1309 for (n = i+1; n < pvec->nr; n++) {
1310 struct kvm_mmu_page *sp = pvec->page[n].sp; 1310 struct kvm_mmu_page *sp = pvec->page[n].sp;
1311 1311
1312 if (sp->role.level == PT_PAGE_TABLE_LEVEL) { 1312 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1313 parents->idx[0] = pvec->page[n].idx; 1313 parents->idx[0] = pvec->page[n].idx;
1314 return n; 1314 return n;
1315 } 1315 }
1316 1316
1317 parents->parent[sp->role.level-2] = sp; 1317 parents->parent[sp->role.level-2] = sp;
1318 parents->idx[sp->role.level-1] = pvec->page[n].idx; 1318 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1319 } 1319 }
1320 1320
1321 return n; 1321 return n;
1322 } 1322 }
1323 1323
1324 static void mmu_pages_clear_parents(struct mmu_page_path *parents) 1324 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1325 { 1325 {
1326 struct kvm_mmu_page *sp; 1326 struct kvm_mmu_page *sp;
1327 unsigned int level = 0; 1327 unsigned int level = 0;
1328 1328
1329 do { 1329 do {
1330 unsigned int idx = parents->idx[level]; 1330 unsigned int idx = parents->idx[level];
1331 1331
1332 sp = parents->parent[level]; 1332 sp = parents->parent[level];
1333 if (!sp) 1333 if (!sp)
1334 return; 1334 return;
1335 1335
1336 --sp->unsync_children; 1336 --sp->unsync_children;
1337 WARN_ON((int)sp->unsync_children < 0); 1337 WARN_ON((int)sp->unsync_children < 0);
1338 __clear_bit(idx, sp->unsync_child_bitmap); 1338 __clear_bit(idx, sp->unsync_child_bitmap);
1339 level++; 1339 level++;
1340 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); 1340 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1341 } 1341 }
1342 1342
1343 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, 1343 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1344 struct mmu_page_path *parents, 1344 struct mmu_page_path *parents,
1345 struct kvm_mmu_pages *pvec) 1345 struct kvm_mmu_pages *pvec)
1346 { 1346 {
1347 parents->parent[parent->role.level-1] = NULL; 1347 parents->parent[parent->role.level-1] = NULL;
1348 pvec->nr = 0; 1348 pvec->nr = 0;
1349 } 1349 }
1350 1350
1351 static void mmu_sync_children(struct kvm_vcpu *vcpu, 1351 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1352 struct kvm_mmu_page *parent) 1352 struct kvm_mmu_page *parent)
1353 { 1353 {
1354 int i; 1354 int i;
1355 struct kvm_mmu_page *sp; 1355 struct kvm_mmu_page *sp;
1356 struct mmu_page_path parents; 1356 struct mmu_page_path parents;
1357 struct kvm_mmu_pages pages; 1357 struct kvm_mmu_pages pages;
1358 LIST_HEAD(invalid_list); 1358 LIST_HEAD(invalid_list);
1359 1359
1360 kvm_mmu_pages_init(parent, &parents, &pages); 1360 kvm_mmu_pages_init(parent, &parents, &pages);
1361 while (mmu_unsync_walk(parent, &pages)) { 1361 while (mmu_unsync_walk(parent, &pages)) {
1362 int protected = 0; 1362 int protected = 0;
1363 1363
1364 for_each_sp(pages, sp, parents, i) 1364 for_each_sp(pages, sp, parents, i)
1365 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1365 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1366 1366
1367 if (protected) 1367 if (protected)
1368 kvm_flush_remote_tlbs(vcpu->kvm); 1368 kvm_flush_remote_tlbs(vcpu->kvm);
1369 1369
1370 for_each_sp(pages, sp, parents, i) { 1370 for_each_sp(pages, sp, parents, i) {
1371 kvm_sync_page(vcpu, sp, &invalid_list); 1371 kvm_sync_page(vcpu, sp, &invalid_list);
1372 mmu_pages_clear_parents(&parents); 1372 mmu_pages_clear_parents(&parents);
1373 } 1373 }
1374 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1374 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1375 cond_resched_lock(&vcpu->kvm->mmu_lock); 1375 cond_resched_lock(&vcpu->kvm->mmu_lock);
1376 kvm_mmu_pages_init(parent, &parents, &pages); 1376 kvm_mmu_pages_init(parent, &parents, &pages);
1377 } 1377 }
1378 } 1378 }
1379 1379
1380 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1380 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1381 gfn_t gfn, 1381 gfn_t gfn,
1382 gva_t gaddr, 1382 gva_t gaddr,
1383 unsigned level, 1383 unsigned level,
1384 int direct, 1384 int direct,
1385 unsigned access, 1385 unsigned access,
1386 u64 *parent_pte) 1386 u64 *parent_pte)
1387 { 1387 {
1388 union kvm_mmu_page_role role; 1388 union kvm_mmu_page_role role;
1389 unsigned quadrant; 1389 unsigned quadrant;
1390 struct kvm_mmu_page *sp; 1390 struct kvm_mmu_page *sp;
1391 struct hlist_node *node; 1391 struct hlist_node *node;
1392 bool need_sync = false; 1392 bool need_sync = false;
1393 1393
1394 role = vcpu->arch.mmu.base_role; 1394 role = vcpu->arch.mmu.base_role;
1395 role.level = level; 1395 role.level = level;
1396 role.direct = direct; 1396 role.direct = direct;
1397 if (role.direct) 1397 if (role.direct)
1398 role.cr4_pae = 0; 1398 role.cr4_pae = 0;
1399 role.access = access; 1399 role.access = access;
1400 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1400 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1401 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1401 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1402 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1402 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1403 role.quadrant = quadrant; 1403 role.quadrant = quadrant;
1404 } 1404 }
1405 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { 1405 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1406 if (!need_sync && sp->unsync) 1406 if (!need_sync && sp->unsync)
1407 need_sync = true; 1407 need_sync = true;
1408 1408
1409 if (sp->role.word != role.word) 1409 if (sp->role.word != role.word)
1410 continue; 1410 continue;
1411 1411
1412 if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) 1412 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1413 break; 1413 break;
1414 1414
1415 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1415 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1416 if (sp->unsync_children) { 1416 if (sp->unsync_children) {
1417 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 1417 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1418 kvm_mmu_mark_parents_unsync(sp); 1418 kvm_mmu_mark_parents_unsync(sp);
1419 } else if (sp->unsync) 1419 } else if (sp->unsync)
1420 kvm_mmu_mark_parents_unsync(sp); 1420 kvm_mmu_mark_parents_unsync(sp);
1421 1421
1422 trace_kvm_mmu_get_page(sp, false); 1422 trace_kvm_mmu_get_page(sp, false);
1423 return sp; 1423 return sp;
1424 } 1424 }
1425 ++vcpu->kvm->stat.mmu_cache_miss; 1425 ++vcpu->kvm->stat.mmu_cache_miss;
1426 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); 1426 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1427 if (!sp) 1427 if (!sp)
1428 return sp; 1428 return sp;
1429 sp->gfn = gfn; 1429 sp->gfn = gfn;
1430 sp->role = role; 1430 sp->role = role;
1431 hlist_add_head(&sp->hash_link, 1431 hlist_add_head(&sp->hash_link,
1432 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 1432 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1433 if (!direct) { 1433 if (!direct) {
1434 if (rmap_write_protect(vcpu->kvm, gfn)) 1434 if (rmap_write_protect(vcpu->kvm, gfn))
1435 kvm_flush_remote_tlbs(vcpu->kvm); 1435 kvm_flush_remote_tlbs(vcpu->kvm);
1436 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 1436 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1437 kvm_sync_pages(vcpu, gfn); 1437 kvm_sync_pages(vcpu, gfn);
1438 1438
1439 account_shadowed(vcpu->kvm, gfn); 1439 account_shadowed(vcpu->kvm, gfn);
1440 } 1440 }
1441 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1441 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1442 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1442 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1443 else 1443 else
1444 nonpaging_prefetch_page(vcpu, sp); 1444 nonpaging_prefetch_page(vcpu, sp);
1445 trace_kvm_mmu_get_page(sp, true); 1445 trace_kvm_mmu_get_page(sp, true);
1446 return sp; 1446 return sp;
1447 } 1447 }
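
For 32-bit non-PAE guests the 1024-entry guest tables are wider than the 512-entry shadow tables, so the quadrant field of the role records which slice of the guest table a given shadow page covers. A stand-alone sketch of that computation, assuming the usual PAGE_SHIFT/PT64_PT_BITS/PT32_PT_BITS values:

/*
 * Worked example of the quadrant computation in kvm_mmu_get_page().
 * Constants mirror the common x86 values; this is an illustration only.
 */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PT64_PT_BITS 9     /* 512 entries per 64-bit shadow table */
#define PT32_PT_BITS 10    /* 1024 entries per 32-bit guest table */

static unsigned quadrant(unsigned long gaddr, int level)
{
	unsigned q = gaddr >> (PAGE_SHIFT + PT64_PT_BITS * level);

	q &= (1u << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
	return q;
}

int main(void)
{
	/* level 1: which half of the 1024-entry guest page table,
	 * level 2: which quarter of the guest page directory.      */
	printf("gva 0x00300000 level 1 -> quadrant %u\n", quadrant(0x00300000UL, 1));
	printf("gva 0xc0000000 level 2 -> quadrant %u\n", quadrant(0xc0000000UL, 2));
	return 0;
}
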
1448 1448
1449 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 1449 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1450 struct kvm_vcpu *vcpu, u64 addr) 1450 struct kvm_vcpu *vcpu, u64 addr)
1451 { 1451 {
1452 iterator->addr = addr; 1452 iterator->addr = addr;
1453 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1453 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1454 iterator->level = vcpu->arch.mmu.shadow_root_level; 1454 iterator->level = vcpu->arch.mmu.shadow_root_level;
1455 if (iterator->level == PT32E_ROOT_LEVEL) { 1455 if (iterator->level == PT32E_ROOT_LEVEL) {
1456 iterator->shadow_addr 1456 iterator->shadow_addr
1457 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1457 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1458 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; 1458 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1459 --iterator->level; 1459 --iterator->level;
1460 if (!iterator->shadow_addr) 1460 if (!iterator->shadow_addr)
1461 iterator->level = 0; 1461 iterator->level = 0;
1462 } 1462 }
1463 } 1463 }
1464 1464
1465 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 1465 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1466 { 1466 {
1467 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1467 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1468 return false; 1468 return false;
1469 1469
1470 if (iterator->level == PT_PAGE_TABLE_LEVEL) 1470 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1471 if (is_large_pte(*iterator->sptep)) 1471 if (is_large_pte(*iterator->sptep))
1472 return false; 1472 return false;
1473 1473
1474 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1474 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1475 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1475 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1476 return true; 1476 return true;
1477 } 1477 }
1478 1478
1479 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 1479 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1480 { 1480 {
1481 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; 1481 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1482 --iterator->level; 1482 --iterator->level;
1483 } 1483 }
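
The iterator above peels one 9-bit index off the address per level, walking down from the root; with a PAE root the top level is replaced by picking one of the four pae_root entries via (addr >> 30) & 3. A small illustrative user-space sketch (constants assumed, not taken from the kernel headers):

/*
 * Sketch of how a guest address is sliced into per-level shadow table
 * indices, and of the PAE root selection.  Illustration only.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PT64_LEVEL_BITS 9

static unsigned pt_index(unsigned long long addr, int level)
{
	return (addr >> (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)) & 0x1ff;
}

int main(void)
{
	unsigned long long addr = 0x00007f12345f6000ULL;
	unsigned long pae_addr = 0xc0400000UL;

	for (int level = 4; level >= 1; level--)
		printf("level %d index %u\n", level, pt_index(addr, level));

	/* PAE roots: for a 32-bit guest address the top level is (addr >> 30) & 3 */
	printf("pae root slot %lu\n", (pae_addr >> 30) & 3);
	return 0;
}
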
1484 1484
1485 static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1485 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1486 struct kvm_mmu_page *sp) 1486 struct kvm_mmu_page *sp)
1487 { 1487 {
1488 unsigned i; 1488 unsigned i;
1489 u64 *pt; 1489 u64 *pt;
1490 u64 ent; 1490 u64 ent;
1491 1491
1492 pt = sp->spt; 1492 pt = sp->spt;
1493 1493
1494 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1494 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1495 ent = pt[i]; 1495 ent = pt[i];
1496 1496
1497 if (is_shadow_present_pte(ent)) { 1497 if (is_shadow_present_pte(ent)) {
1498 if (!is_last_spte(ent, sp->role.level)) { 1498 if (!is_last_spte(ent, sp->role.level)) {
1499 ent &= PT64_BASE_ADDR_MASK; 1499 ent &= PT64_BASE_ADDR_MASK;
1500 mmu_page_remove_parent_pte(page_header(ent), 1500 mmu_page_remove_parent_pte(page_header(ent),
1501 &pt[i]); 1501 &pt[i]);
1502 } else { 1502 } else {
1503 if (is_large_pte(ent)) 1503 if (is_large_pte(ent))
1504 --kvm->stat.lpages; 1504 --kvm->stat.lpages;
1505 drop_spte(kvm, &pt[i], 1505 drop_spte(kvm, &pt[i],
1506 shadow_trap_nonpresent_pte); 1506 shadow_trap_nonpresent_pte);
1507 } 1507 }
1508 } 1508 }
1509 pt[i] = shadow_trap_nonpresent_pte; 1509 pt[i] = shadow_trap_nonpresent_pte;
1510 } 1510 }
1511 } 1511 }
1512 1512
1513 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1513 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1514 { 1514 {
1515 mmu_page_remove_parent_pte(sp, parent_pte); 1515 mmu_page_remove_parent_pte(sp, parent_pte);
1516 } 1516 }
1517 1517
1518 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) 1518 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1519 { 1519 {
1520 int i; 1520 int i;
1521 struct kvm_vcpu *vcpu; 1521 struct kvm_vcpu *vcpu;
1522 1522
1523 kvm_for_each_vcpu(i, vcpu, kvm) 1523 kvm_for_each_vcpu(i, vcpu, kvm)
1524 vcpu->arch.last_pte_updated = NULL; 1524 vcpu->arch.last_pte_updated = NULL;
1525 } 1525 }
1526 1526
1527 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1527 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1528 { 1528 {
1529 u64 *parent_pte; 1529 u64 *parent_pte;
1530 1530
1531 while (sp->multimapped || sp->parent_pte) { 1531 while (sp->multimapped || sp->parent_pte) {
1532 if (!sp->multimapped) 1532 if (!sp->multimapped)
1533 parent_pte = sp->parent_pte; 1533 parent_pte = sp->parent_pte;
1534 else { 1534 else {
1535 struct kvm_pte_chain *chain; 1535 struct kvm_pte_chain *chain;
1536 1536
1537 chain = container_of(sp->parent_ptes.first, 1537 chain = container_of(sp->parent_ptes.first,
1538 struct kvm_pte_chain, link); 1538 struct kvm_pte_chain, link);
1539 parent_pte = chain->parent_ptes[0]; 1539 parent_pte = chain->parent_ptes[0];
1540 } 1540 }
1541 BUG_ON(!parent_pte); 1541 BUG_ON(!parent_pte);
1542 kvm_mmu_put_page(sp, parent_pte); 1542 kvm_mmu_put_page(sp, parent_pte);
1543 __set_spte(parent_pte, shadow_trap_nonpresent_pte); 1543 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1544 } 1544 }
1545 } 1545 }
1546 1546
1547 static int mmu_zap_unsync_children(struct kvm *kvm, 1547 static int mmu_zap_unsync_children(struct kvm *kvm,
1548 struct kvm_mmu_page *parent, 1548 struct kvm_mmu_page *parent,
1549 struct list_head *invalid_list) 1549 struct list_head *invalid_list)
1550 { 1550 {
1551 int i, zapped = 0; 1551 int i, zapped = 0;
1552 struct mmu_page_path parents; 1552 struct mmu_page_path parents;
1553 struct kvm_mmu_pages pages; 1553 struct kvm_mmu_pages pages;
1554 1554
1555 if (parent->role.level == PT_PAGE_TABLE_LEVEL) 1555 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1556 return 0; 1556 return 0;
1557 1557
1558 kvm_mmu_pages_init(parent, &parents, &pages); 1558 kvm_mmu_pages_init(parent, &parents, &pages);
1559 while (mmu_unsync_walk(parent, &pages)) { 1559 while (mmu_unsync_walk(parent, &pages)) {
1560 struct kvm_mmu_page *sp; 1560 struct kvm_mmu_page *sp;
1561 1561
1562 for_each_sp(pages, sp, parents, i) { 1562 for_each_sp(pages, sp, parents, i) {
1563 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 1563 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1564 mmu_pages_clear_parents(&parents); 1564 mmu_pages_clear_parents(&parents);
1565 zapped++; 1565 zapped++;
1566 } 1566 }
1567 kvm_mmu_pages_init(parent, &parents, &pages); 1567 kvm_mmu_pages_init(parent, &parents, &pages);
1568 } 1568 }
1569 1569
1570 return zapped; 1570 return zapped;
1571 } 1571 }
1572 1572
1573 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1573 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1574 struct list_head *invalid_list) 1574 struct list_head *invalid_list)
1575 { 1575 {
1576 int ret; 1576 int ret;
1577 1577
1578 trace_kvm_mmu_prepare_zap_page(sp); 1578 trace_kvm_mmu_prepare_zap_page(sp);
1579 ++kvm->stat.mmu_shadow_zapped; 1579 ++kvm->stat.mmu_shadow_zapped;
1580 ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 1580 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1581 kvm_mmu_page_unlink_children(kvm, sp); 1581 kvm_mmu_page_unlink_children(kvm, sp);
1582 kvm_mmu_unlink_parents(kvm, sp); 1582 kvm_mmu_unlink_parents(kvm, sp);
1583 if (!sp->role.invalid && !sp->role.direct) 1583 if (!sp->role.invalid && !sp->role.direct)
1584 unaccount_shadowed(kvm, sp->gfn); 1584 unaccount_shadowed(kvm, sp->gfn);
1585 if (sp->unsync) 1585 if (sp->unsync)
1586 kvm_unlink_unsync_page(kvm, sp); 1586 kvm_unlink_unsync_page(kvm, sp);
1587 if (!sp->root_count) { 1587 if (!sp->root_count) {
1588 /* Count self */ 1588 /* Count self */
1589 ret++; 1589 ret++;
1590 list_move(&sp->link, invalid_list); 1590 list_move(&sp->link, invalid_list);
1591 } else { 1591 } else {
1592 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1592 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1593 kvm_reload_remote_mmus(kvm); 1593 kvm_reload_remote_mmus(kvm);
1594 } 1594 }
1595 1595
1596 sp->role.invalid = 1; 1596 sp->role.invalid = 1;
1597 kvm_mmu_reset_last_pte_updated(kvm); 1597 kvm_mmu_reset_last_pte_updated(kvm);
1598 return ret; 1598 return ret;
1599 } 1599 }
1600 1600
1601 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1601 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1602 struct list_head *invalid_list) 1602 struct list_head *invalid_list)
1603 { 1603 {
1604 struct kvm_mmu_page *sp; 1604 struct kvm_mmu_page *sp;
1605 1605
1606 if (list_empty(invalid_list)) 1606 if (list_empty(invalid_list))
1607 return; 1607 return;
1608 1608
1609 kvm_flush_remote_tlbs(kvm); 1609 kvm_flush_remote_tlbs(kvm);
1610 1610
1611 do { 1611 do {
1612 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1612 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1613 WARN_ON(!sp->role.invalid || sp->root_count); 1613 WARN_ON(!sp->role.invalid || sp->root_count);
1614 kvm_mmu_free_page(kvm, sp); 1614 kvm_mmu_free_page(kvm, sp);
1615 } while (!list_empty(invalid_list)); 1615 } while (!list_empty(invalid_list));
1616 1616
1617 } 1617 }
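
Zapping is split into a prepare phase, which unlinks pages onto a caller-provided invalid_list while mmu_lock is held, and a commit phase, which issues one remote TLB flush for the whole batch before freeing the pages. A toy user-space sketch of that collect-then-commit pattern (the list handling is a stand-in for the kernel's list_head API):

/*
 * Illustration only: park items on a private list first, then flush once
 * and free the whole batch.
 */
#include <stdio.h>
#include <stdlib.h>

struct zapped { int id; struct zapped *next; };

static void prepare_zap(struct zapped **invalid_list, int id)
{
	struct zapped *z = malloc(sizeof(*z));

	z->id = id;
	z->next = *invalid_list;            /* unlink from "active", park here */
	*invalid_list = z;
}

static void commit_zap(struct zapped **invalid_list)
{
	if (!*invalid_list)
		return;
	printf("flush remote TLBs once\n"); /* one flush covers the whole batch */
	while (*invalid_list) {
		struct zapped *z = *invalid_list;

		*invalid_list = z->next;
		printf("free page %d\n", z->id);
		free(z);
	}
}

int main(void)
{
	struct zapped *invalid_list = NULL;

	for (int i = 0; i < 3; i++)
		prepare_zap(&invalid_list, i);
	commit_zap(&invalid_list);
	return 0;
}
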
1618 1618
1619 /* 1619 /*
1620 * Changing the number of mmu pages allocated to the vm 1620 * Changing the number of mmu pages allocated to the vm
1621 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock 1621 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
1622 */ 1622 */
1623 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1623 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1624 { 1624 {
1625 int used_pages; 1625 int used_pages;
1626 LIST_HEAD(invalid_list); 1626 LIST_HEAD(invalid_list);
1627 1627
1628 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1628 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1629 used_pages = max(0, used_pages); 1629 used_pages = max(0, used_pages);
1630 1630
1631 /* 1631 /*
1632 * If we set the number of mmu pages to be smaller than the 1632 * If we set the number of mmu pages to be smaller than the
1633 * number of active pages, we must free some mmu pages before we 1633 * number of active pages, we must free some mmu pages before we
1634 * change the value 1634 * change the value
1635 */ 1635 */
1636 1636
1637 if (used_pages > kvm_nr_mmu_pages) { 1637 if (used_pages > kvm_nr_mmu_pages) {
1638 while (used_pages > kvm_nr_mmu_pages && 1638 while (used_pages > kvm_nr_mmu_pages &&
1639 !list_empty(&kvm->arch.active_mmu_pages)) { 1639 !list_empty(&kvm->arch.active_mmu_pages)) {
1640 struct kvm_mmu_page *page; 1640 struct kvm_mmu_page *page;
1641 1641
1642 page = container_of(kvm->arch.active_mmu_pages.prev, 1642 page = container_of(kvm->arch.active_mmu_pages.prev,
1643 struct kvm_mmu_page, link); 1643 struct kvm_mmu_page, link);
1644 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1644 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1645 &invalid_list); 1645 &invalid_list);
1646 } 1646 }
1647 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1647 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1648 kvm_nr_mmu_pages = used_pages; 1648 kvm_nr_mmu_pages = used_pages;
1649 kvm->arch.n_free_mmu_pages = 0; 1649 kvm->arch.n_free_mmu_pages = 0;
1650 } 1650 }
1651 else 1651 else
1652 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages 1652 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1653 - kvm->arch.n_alloc_mmu_pages; 1653 - kvm->arch.n_alloc_mmu_pages;
1654 1654
1655 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1655 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1656 } 1656 }
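
The resize logic treats n_alloc_mmu_pages minus n_free_mmu_pages as the number of pages currently in use and only evicts when the new limit falls below that. A quick numeric illustration in plain user-space C (it assumes eviction can always reach the target, which the kernel does not guarantee):

#include <stdio.h>

int main(void)
{
	int n_alloc = 40, n_free = 10;       /* 30 pages in use            */
	int used = n_alloc - n_free;
	int target = 25;                     /* new kvm_nr_mmu_pages       */

	if (used > target) {
		printf("evict %d pages, free count becomes 0\n", used - target);
		n_free = 0;
	} else {
		n_free += target - n_alloc;
	}
	n_alloc = target;
	printf("n_alloc=%d n_free=%d\n", n_alloc, n_free);
	return 0;
}
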
1657 1657
1658 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1658 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1659 { 1659 {
1660 struct kvm_mmu_page *sp; 1660 struct kvm_mmu_page *sp;
1661 struct hlist_node *node; 1661 struct hlist_node *node;
1662 LIST_HEAD(invalid_list); 1662 LIST_HEAD(invalid_list);
1663 int r; 1663 int r;
1664 1664
1665 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1665 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1666 r = 0; 1666 r = 0;
1667 1667
1668 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1668 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1669 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1669 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1670 sp->role.word); 1670 sp->role.word);
1671 r = 1; 1671 r = 1;
1672 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1672 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1673 } 1673 }
1674 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1674 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1675 return r; 1675 return r;
1676 } 1676 }
1677 1677
1678 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1678 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1679 { 1679 {
1680 struct kvm_mmu_page *sp; 1680 struct kvm_mmu_page *sp;
1681 struct hlist_node *node; 1681 struct hlist_node *node;
1682 LIST_HEAD(invalid_list); 1682 LIST_HEAD(invalid_list);
1683 1683
1684 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1684 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1685 pgprintk("%s: zap %lx %x\n", 1685 pgprintk("%s: zap %lx %x\n",
1686 __func__, gfn, sp->role.word); 1686 __func__, gfn, sp->role.word);
1687 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1687 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1688 } 1688 }
1689 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1689 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1690 } 1690 }
1691 1691
1692 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1692 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1693 { 1693 {
1694 int slot = memslot_id(kvm, gfn); 1694 int slot = memslot_id(kvm, gfn);
1695 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1695 struct kvm_mmu_page *sp = page_header(__pa(pte));
1696 1696
1697 __set_bit(slot, sp->slot_bitmap); 1697 __set_bit(slot, sp->slot_bitmap);
1698 } 1698 }
1699 1699
1700 static void mmu_convert_notrap(struct kvm_mmu_page *sp) 1700 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1701 { 1701 {
1702 int i; 1702 int i;
1703 u64 *pt = sp->spt; 1703 u64 *pt = sp->spt;
1704 1704
1705 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) 1705 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1706 return; 1706 return;
1707 1707
1708 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1708 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1709 if (pt[i] == shadow_notrap_nonpresent_pte) 1709 if (pt[i] == shadow_notrap_nonpresent_pte)
1710 __set_spte(&pt[i], shadow_trap_nonpresent_pte); 1710 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1711 } 1711 }
1712 } 1712 }
1713 1713
1714 /* 1714 /*
1715 * The function is based on mtrr_type_lookup() in 1715 * The function is based on mtrr_type_lookup() in
1716 * arch/x86/kernel/cpu/mtrr/generic.c 1716 * arch/x86/kernel/cpu/mtrr/generic.c
1717 */ 1717 */
1718 static int get_mtrr_type(struct mtrr_state_type *mtrr_state, 1718 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1719 u64 start, u64 end) 1719 u64 start, u64 end)
1720 { 1720 {
1721 int i; 1721 int i;
1722 u64 base, mask; 1722 u64 base, mask;
1723 u8 prev_match, curr_match; 1723 u8 prev_match, curr_match;
1724 int num_var_ranges = KVM_NR_VAR_MTRR; 1724 int num_var_ranges = KVM_NR_VAR_MTRR;
1725 1725
1726 if (!mtrr_state->enabled) 1726 if (!mtrr_state->enabled)
1727 return 0xFF; 1727 return 0xFF;
1728 1728
1729 /* Make end inclusive, instead of exclusive */ 1729 /* Make end inclusive, instead of exclusive */
1730 end--; 1730 end--;
1731 1731
1732 /* Look in fixed ranges. Just return the type as per start */ 1732 /* Look in fixed ranges. Just return the type as per start */
1733 if (mtrr_state->have_fixed && (start < 0x100000)) { 1733 if (mtrr_state->have_fixed && (start < 0x100000)) {
1734 int idx; 1734 int idx;
1735 1735
1736 if (start < 0x80000) { 1736 if (start < 0x80000) {
1737 idx = 0; 1737 idx = 0;
1738 idx += (start >> 16); 1738 idx += (start >> 16);
1739 return mtrr_state->fixed_ranges[idx]; 1739 return mtrr_state->fixed_ranges[idx];
1740 } else if (start < 0xC0000) { 1740 } else if (start < 0xC0000) {
1741 idx = 1 * 8; 1741 idx = 1 * 8;
1742 idx += ((start - 0x80000) >> 14); 1742 idx += ((start - 0x80000) >> 14);
1743 return mtrr_state->fixed_ranges[idx]; 1743 return mtrr_state->fixed_ranges[idx];
1744 } else if (start < 0x1000000) { 1744 } else if (start < 0x1000000) {
1745 idx = 3 * 8; 1745 idx = 3 * 8;
1746 idx += ((start - 0xC0000) >> 12); 1746 idx += ((start - 0xC0000) >> 12);
1747 return mtrr_state->fixed_ranges[idx]; 1747 return mtrr_state->fixed_ranges[idx];
1748 } 1748 }
1749 } 1749 }
1750 1750
1751 /* 1751 /*
1752 * Look in variable ranges 1752 * Look in variable ranges
1753 * Look for multiple ranges matching this address and pick the type 1753 * Look for multiple ranges matching this address and pick the type
1754 * as per MTRR precedence 1754 * as per MTRR precedence
1755 */ 1755 */
1756 if (!(mtrr_state->enabled & 2)) 1756 if (!(mtrr_state->enabled & 2))
1757 return mtrr_state->def_type; 1757 return mtrr_state->def_type;
1758 1758
1759 prev_match = 0xFF; 1759 prev_match = 0xFF;
1760 for (i = 0; i < num_var_ranges; ++i) { 1760 for (i = 0; i < num_var_ranges; ++i) {
1761 unsigned short start_state, end_state; 1761 unsigned short start_state, end_state;
1762 1762
1763 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) 1763 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1764 continue; 1764 continue;
1765 1765
1766 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + 1766 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1767 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); 1767 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1768 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + 1768 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1769 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); 1769 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1770 1770
1771 start_state = ((start & mask) == (base & mask)); 1771 start_state = ((start & mask) == (base & mask));
1772 end_state = ((end & mask) == (base & mask)); 1772 end_state = ((end & mask) == (base & mask));
1773 if (start_state != end_state) 1773 if (start_state != end_state)
1774 return 0xFE; 1774 return 0xFE;
1775 1775
1776 if ((start & mask) != (base & mask)) 1776 if ((start & mask) != (base & mask))
1777 continue; 1777 continue;
1778 1778
1779 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; 1779 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1780 if (prev_match == 0xFF) { 1780 if (prev_match == 0xFF) {
1781 prev_match = curr_match; 1781 prev_match = curr_match;
1782 continue; 1782 continue;
1783 } 1783 }
1784 1784
1785 if (prev_match == MTRR_TYPE_UNCACHABLE || 1785 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1786 curr_match == MTRR_TYPE_UNCACHABLE) 1786 curr_match == MTRR_TYPE_UNCACHABLE)
1787 return MTRR_TYPE_UNCACHABLE; 1787 return MTRR_TYPE_UNCACHABLE;
1788 1788
1789 if ((prev_match == MTRR_TYPE_WRBACK && 1789 if ((prev_match == MTRR_TYPE_WRBACK &&
1790 curr_match == MTRR_TYPE_WRTHROUGH) || 1790 curr_match == MTRR_TYPE_WRTHROUGH) ||
1791 (prev_match == MTRR_TYPE_WRTHROUGH && 1791 (prev_match == MTRR_TYPE_WRTHROUGH &&
1792 curr_match == MTRR_TYPE_WRBACK)) { 1792 curr_match == MTRR_TYPE_WRBACK)) {
1793 prev_match = MTRR_TYPE_WRTHROUGH; 1793 prev_match = MTRR_TYPE_WRTHROUGH;
1794 curr_match = MTRR_TYPE_WRTHROUGH; 1794 curr_match = MTRR_TYPE_WRTHROUGH;
1795 } 1795 }
1796 1796
1797 if (prev_match != curr_match) 1797 if (prev_match != curr_match)
1798 return MTRR_TYPE_UNCACHABLE; 1798 return MTRR_TYPE_UNCACHABLE;
1799 } 1799 }
1800 1800
1801 if (prev_match != 0xFF) 1801 if (prev_match != 0xFF)
1802 return prev_match; 1802 return prev_match;
1803 1803
1804 return mtrr_state->def_type; 1804 return mtrr_state->def_type;
1805 } 1805 }
1806 1806
1807 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) 1807 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1808 { 1808 {
1809 u8 mtrr; 1809 u8 mtrr;
1810 1810
1811 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, 1811 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1812 (gfn << PAGE_SHIFT) + PAGE_SIZE); 1812 (gfn << PAGE_SHIFT) + PAGE_SIZE);
1813 if (mtrr == 0xfe || mtrr == 0xff) 1813 if (mtrr == 0xfe || mtrr == 0xff)
1814 mtrr = MTRR_TYPE_WRBACK; 1814 mtrr = MTRR_TYPE_WRBACK;
1815 return mtrr; 1815 return mtrr;
1816 } 1816 }
1817 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1817 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
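
The fixed-range MTRR lookup above indexes 8 registers of 64K granularity below 0x80000, 16 of 16K granularity up to 0xC0000, and 64 of 4K granularity up to 1MB. A worked example of that indexing as a stand-alone program (it mirrors the arithmetic only, not the kernel's MTRR state structures):

#include <stdio.h>

static int fixed_mtrr_index(unsigned long start)
{
	if (start < 0x80000)
		return start >> 16;                        /* 64K granules */
	if (start < 0xC0000)
		return 1 * 8 + ((start - 0x80000) >> 14);  /* 16K granules */
	return 3 * 8 + ((start - 0xC0000) >> 12);          /*  4K granules */
}

int main(void)
{
	printf("0x00000 -> idx %d\n", fixed_mtrr_index(0x00000));  /* 0  */
	printf("0xA0000 -> idx %d\n", fixed_mtrr_index(0xA0000));  /* 16 */
	printf("0xF0000 -> idx %d\n", fixed_mtrr_index(0xF0000));  /* 72 */
	return 0;
}
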
1818 1818
1819 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1819 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1820 { 1820 {
1821 trace_kvm_mmu_unsync_page(sp); 1821 trace_kvm_mmu_unsync_page(sp);
1822 ++vcpu->kvm->stat.mmu_unsync; 1822 ++vcpu->kvm->stat.mmu_unsync;
1823 sp->unsync = 1; 1823 sp->unsync = 1;
1824 1824
1825 kvm_mmu_mark_parents_unsync(sp); 1825 kvm_mmu_mark_parents_unsync(sp);
1826 mmu_convert_notrap(sp); 1826 mmu_convert_notrap(sp);
1827 } 1827 }
1828 1828
1829 static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1829 static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1830 { 1830 {
1831 struct kvm_mmu_page *s; 1831 struct kvm_mmu_page *s;
1832 struct hlist_node *node; 1832 struct hlist_node *node;
1833 1833
1834 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1834 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1835 if (s->unsync) 1835 if (s->unsync)
1836 continue; 1836 continue;
1837 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1837 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1838 __kvm_unsync_page(vcpu, s); 1838 __kvm_unsync_page(vcpu, s);
1839 } 1839 }
1840 } 1840 }
1841 1841
1842 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1842 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1843 bool can_unsync) 1843 bool can_unsync)
1844 { 1844 {
1845 struct kvm_mmu_page *s; 1845 struct kvm_mmu_page *s;
1846 struct hlist_node *node; 1846 struct hlist_node *node;
1847 bool need_unsync = false; 1847 bool need_unsync = false;
1848 1848
1849 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1849 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1850 if (!can_unsync) 1850 if (!can_unsync)
1851 return 1; 1851 return 1;
1852 1852
1853 if (s->role.level != PT_PAGE_TABLE_LEVEL) 1853 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1854 return 1; 1854 return 1;
1855 1855
1856 if (!need_unsync && !s->unsync) { 1856 if (!need_unsync && !s->unsync) {
1857 if (!oos_shadow) 1857 if (!oos_shadow)
1858 return 1; 1858 return 1;
1859 need_unsync = true; 1859 need_unsync = true;
1860 } 1860 }
1861 } 1861 }
1862 if (need_unsync) 1862 if (need_unsync)
1863 kvm_unsync_pages(vcpu, gfn); 1863 kvm_unsync_pages(vcpu, gfn);
1864 return 0; 1864 return 0;
1865 } 1865 }
1866 1866
1867 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1867 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1868 unsigned pte_access, int user_fault, 1868 unsigned pte_access, int user_fault,
1869 int write_fault, int dirty, int level, 1869 int write_fault, int dirty, int level,
1870 gfn_t gfn, pfn_t pfn, bool speculative, 1870 gfn_t gfn, pfn_t pfn, bool speculative,
1871 bool can_unsync, bool reset_host_protection) 1871 bool can_unsync, bool reset_host_protection)
1872 { 1872 {
1873 u64 spte; 1873 u64 spte;
1874 int ret = 0; 1874 int ret = 0;
1875 1875
1876 /* 1876 /*
1877 * We don't set the accessed bit, since we sometimes want to see 1877 * We don't set the accessed bit, since we sometimes want to see
1878 * whether the guest actually used the pte (in order to detect 1878 * whether the guest actually used the pte (in order to detect
1879 * demand paging). 1879 * demand paging).
1880 */ 1880 */
1881 spte = shadow_base_present_pte | shadow_dirty_mask; 1881 spte = shadow_base_present_pte | shadow_dirty_mask;
1882 if (!speculative) 1882 if (!speculative)
1883 spte |= shadow_accessed_mask; 1883 spte |= shadow_accessed_mask;
1884 if (!dirty) 1884 if (!dirty)
1885 pte_access &= ~ACC_WRITE_MASK; 1885 pte_access &= ~ACC_WRITE_MASK;
1886 if (pte_access & ACC_EXEC_MASK) 1886 if (pte_access & ACC_EXEC_MASK)
1887 spte |= shadow_x_mask; 1887 spte |= shadow_x_mask;
1888 else 1888 else
1889 spte |= shadow_nx_mask; 1889 spte |= shadow_nx_mask;
1890 if (pte_access & ACC_USER_MASK) 1890 if (pte_access & ACC_USER_MASK)
1891 spte |= shadow_user_mask; 1891 spte |= shadow_user_mask;
1892 if (level > PT_PAGE_TABLE_LEVEL) 1892 if (level > PT_PAGE_TABLE_LEVEL)
1893 spte |= PT_PAGE_SIZE_MASK; 1893 spte |= PT_PAGE_SIZE_MASK;
1894 if (tdp_enabled) 1894 if (tdp_enabled)
1895 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1895 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1896 kvm_is_mmio_pfn(pfn)); 1896 kvm_is_mmio_pfn(pfn));
1897 1897
1898 if (reset_host_protection) 1898 if (reset_host_protection)
1899 spte |= SPTE_HOST_WRITEABLE; 1899 spte |= SPTE_HOST_WRITEABLE;
1900 1900
1901 spte |= (u64)pfn << PAGE_SHIFT; 1901 spte |= (u64)pfn << PAGE_SHIFT;
1902 1902
1903 if ((pte_access & ACC_WRITE_MASK) 1903 if ((pte_access & ACC_WRITE_MASK)
1904 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1904 || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1905 && !user_fault)) { 1905 && !user_fault)) {
1906 1906
1907 if (level > PT_PAGE_TABLE_LEVEL && 1907 if (level > PT_PAGE_TABLE_LEVEL &&
1908 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1908 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1909 ret = 1; 1909 ret = 1;
1910 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1910 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1911 goto done; 1911 goto done;
1912 } 1912 }
1913 1913
1914 spte |= PT_WRITABLE_MASK; 1914 spte |= PT_WRITABLE_MASK;
1915 1915
1916 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 1916 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1917 spte &= ~PT_USER_MASK; 1917 spte &= ~PT_USER_MASK;
1918 1918
1919 /* 1919 /*
1920 * Optimization: for pte sync, if spte was writable the hash 1920 * Optimization: for pte sync, if spte was writable the hash
1921 * lookup is unnecessary (and expensive). Write protection 1921 * lookup is unnecessary (and expensive). Write protection
1922 * is responsibility of mmu_get_page / kvm_sync_page. 1922 * is responsibility of mmu_get_page / kvm_sync_page.
1923 * Same reasoning can be applied to dirty page accounting. 1923 * Same reasoning can be applied to dirty page accounting.
1924 */ 1924 */
1925 if (!can_unsync && is_writable_pte(*sptep)) 1925 if (!can_unsync && is_writable_pte(*sptep))
1926 goto set_pte; 1926 goto set_pte;
1927 1927
1928 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1928 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1929 pgprintk("%s: found shadow page for %lx, marking ro\n", 1929 pgprintk("%s: found shadow page for %lx, marking ro\n",
1930 __func__, gfn); 1930 __func__, gfn);
1931 ret = 1; 1931 ret = 1;
1932 pte_access &= ~ACC_WRITE_MASK; 1932 pte_access &= ~ACC_WRITE_MASK;
1933 if (is_writable_pte(spte)) 1933 if (is_writable_pte(spte))
1934 spte &= ~PT_WRITABLE_MASK; 1934 spte &= ~PT_WRITABLE_MASK;
1935 } 1935 }
1936 } 1936 }
1937 1937
1938 if (pte_access & ACC_WRITE_MASK) 1938 if (pte_access & ACC_WRITE_MASK)
1939 mark_page_dirty(vcpu->kvm, gfn); 1939 mark_page_dirty(vcpu->kvm, gfn);
1940 1940
1941 set_pte: 1941 set_pte:
1942 update_spte(sptep, spte); 1942 update_spte(sptep, spte);
1943 done: 1943 done:
1944 return ret; 1944 return ret;
1945 } 1945 }
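
set_spte() builds the shadow PTE by OR-ing permission and attribute bits into a base template and then adding the host pfn. The sketch below shows the shape of that assembly with made-up mask values; the real masks differ between EPT and legacy shadow paging and are not reproduced here.

#include <stdio.h>
#include <stdint.h>

#define SPTE_PRESENT  (1ULL << 0)   /* illustrative masks, not the kernel's */
#define SPTE_WRITABLE (1ULL << 1)
#define SPTE_USER     (1ULL << 2)
#define SPTE_NX       (1ULL << 63)
#define PAGE_SHIFT    12

static uint64_t make_spte(uint64_t pfn, int exec, int user, int writable)
{
	uint64_t spte = SPTE_PRESENT;          /* base "present" template     */

	if (!exec)
		spte |= SPTE_NX;               /* no-execute when not executable */
	if (user)
		spte |= SPTE_USER;
	if (writable)
		spte |= SPTE_WRITABLE;
	return spte | (pfn << PAGE_SHIFT);     /* finally point at the page   */
}

int main(void)
{
	printf("spte = %#llx\n",
	       (unsigned long long)make_spte(0x1234, 1, 1, 0));
	return 0;
}
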
1946 1946
1947 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1947 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1948 unsigned pt_access, unsigned pte_access, 1948 unsigned pt_access, unsigned pte_access,
1949 int user_fault, int write_fault, int dirty, 1949 int user_fault, int write_fault, int dirty,
1950 int *ptwrite, int level, gfn_t gfn, 1950 int *ptwrite, int level, gfn_t gfn,
1951 pfn_t pfn, bool speculative, 1951 pfn_t pfn, bool speculative,
1952 bool reset_host_protection) 1952 bool reset_host_protection)
1953 { 1953 {
1954 int was_rmapped = 0; 1954 int was_rmapped = 0;
1955 int was_writable = is_writable_pte(*sptep); 1955 int was_writable = is_writable_pte(*sptep);
1956 int rmap_count; 1956 int rmap_count;
1957 1957
1958 pgprintk("%s: spte %llx access %x write_fault %d" 1958 pgprintk("%s: spte %llx access %x write_fault %d"
1959 " user_fault %d gfn %lx\n", 1959 " user_fault %d gfn %lx\n",
1960 __func__, *sptep, pt_access, 1960 __func__, *sptep, pt_access,
1961 write_fault, user_fault, gfn); 1961 write_fault, user_fault, gfn);
1962 1962
1963 if (is_rmap_spte(*sptep)) { 1963 if (is_rmap_spte(*sptep)) {
1964 /* 1964 /*
1965 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1965 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1966 * the parent of the now unreachable PTE. 1966 * the parent of the now unreachable PTE.
1967 */ 1967 */
1968 if (level > PT_PAGE_TABLE_LEVEL && 1968 if (level > PT_PAGE_TABLE_LEVEL &&
1969 !is_large_pte(*sptep)) { 1969 !is_large_pte(*sptep)) {
1970 struct kvm_mmu_page *child; 1970 struct kvm_mmu_page *child;
1971 u64 pte = *sptep; 1971 u64 pte = *sptep;
1972 1972
1973 child = page_header(pte & PT64_BASE_ADDR_MASK); 1973 child = page_header(pte & PT64_BASE_ADDR_MASK);
1974 mmu_page_remove_parent_pte(child, sptep); 1974 mmu_page_remove_parent_pte(child, sptep);
1975 __set_spte(sptep, shadow_trap_nonpresent_pte); 1975 __set_spte(sptep, shadow_trap_nonpresent_pte);
1976 kvm_flush_remote_tlbs(vcpu->kvm); 1976 kvm_flush_remote_tlbs(vcpu->kvm);
1977 } else if (pfn != spte_to_pfn(*sptep)) { 1977 } else if (pfn != spte_to_pfn(*sptep)) {
1978 pgprintk("hfn old %lx new %lx\n", 1978 pgprintk("hfn old %lx new %lx\n",
1979 spte_to_pfn(*sptep), pfn); 1979 spte_to_pfn(*sptep), pfn);
1980 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1980 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1981 kvm_flush_remote_tlbs(vcpu->kvm); 1981 kvm_flush_remote_tlbs(vcpu->kvm);
1982 } else 1982 } else
1983 was_rmapped = 1; 1983 was_rmapped = 1;
1984 } 1984 }
1985 1985
1986 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 1986 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1987 dirty, level, gfn, pfn, speculative, true, 1987 dirty, level, gfn, pfn, speculative, true,
1988 reset_host_protection)) { 1988 reset_host_protection)) {
1989 if (write_fault) 1989 if (write_fault)
1990 *ptwrite = 1; 1990 *ptwrite = 1;
1991 kvm_mmu_flush_tlb(vcpu); 1991 kvm_mmu_flush_tlb(vcpu);
1992 } 1992 }
1993 1993
1994 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 1994 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1995 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 1995 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1996 is_large_pte(*sptep)? "2MB" : "4kB", 1996 is_large_pte(*sptep)? "2MB" : "4kB",
1997 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 1997 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1998 *sptep, sptep); 1998 *sptep, sptep);
1999 if (!was_rmapped && is_large_pte(*sptep)) 1999 if (!was_rmapped && is_large_pte(*sptep))
2000 ++vcpu->kvm->stat.lpages; 2000 ++vcpu->kvm->stat.lpages;
2001 2001
2002 page_header_update_slot(vcpu->kvm, sptep, gfn); 2002 page_header_update_slot(vcpu->kvm, sptep, gfn);
2003 if (!was_rmapped) { 2003 if (!was_rmapped) {
2004 rmap_count = rmap_add(vcpu, sptep, gfn); 2004 rmap_count = rmap_add(vcpu, sptep, gfn);
2005 kvm_release_pfn_clean(pfn); 2005 kvm_release_pfn_clean(pfn);
2006 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2006 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2007 rmap_recycle(vcpu, sptep, gfn); 2007 rmap_recycle(vcpu, sptep, gfn);
2008 } else { 2008 } else {
2009 if (was_writable) 2009 if (was_writable)
2010 kvm_release_pfn_dirty(pfn); 2010 kvm_release_pfn_dirty(pfn);
2011 else 2011 else
2012 kvm_release_pfn_clean(pfn); 2012 kvm_release_pfn_clean(pfn);
2013 } 2013 }
2014 if (speculative) { 2014 if (speculative) {
2015 vcpu->arch.last_pte_updated = sptep; 2015 vcpu->arch.last_pte_updated = sptep;
2016 vcpu->arch.last_pte_gfn = gfn; 2016 vcpu->arch.last_pte_gfn = gfn;
2017 } 2017 }
2018 } 2018 }
2019 2019
2020 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2020 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2021 { 2021 {
2022 } 2022 }
2023 2023
2024 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2024 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2025 int level, gfn_t gfn, pfn_t pfn) 2025 int level, gfn_t gfn, pfn_t pfn)
2026 { 2026 {
2027 struct kvm_shadow_walk_iterator iterator; 2027 struct kvm_shadow_walk_iterator iterator;
2028 struct kvm_mmu_page *sp; 2028 struct kvm_mmu_page *sp;
2029 int pt_write = 0; 2029 int pt_write = 0;
2030 gfn_t pseudo_gfn; 2030 gfn_t pseudo_gfn;
2031 2031
2032 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2032 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2033 if (iterator.level == level) { 2033 if (iterator.level == level) {
2034 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2034 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2035 0, write, 1, &pt_write, 2035 0, write, 1, &pt_write,
2036 level, gfn, pfn, false, true); 2036 level, gfn, pfn, false, true);
2037 ++vcpu->stat.pf_fixed; 2037 ++vcpu->stat.pf_fixed;
2038 break; 2038 break;
2039 } 2039 }
2040 2040
2041 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2041 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2042 u64 base_addr = iterator.addr; 2042 u64 base_addr = iterator.addr;
2043 2043
2044 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2044 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2045 pseudo_gfn = base_addr >> PAGE_SHIFT; 2045 pseudo_gfn = base_addr >> PAGE_SHIFT;
2046 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2046 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2047 iterator.level - 1, 2047 iterator.level - 1,
2048 1, ACC_ALL, iterator.sptep); 2048 1, ACC_ALL, iterator.sptep);
2049 if (!sp) { 2049 if (!sp) {
2050 pgprintk("nonpaging_map: ENOMEM\n"); 2050 pgprintk("nonpaging_map: ENOMEM\n");
2051 kvm_release_pfn_clean(pfn); 2051 kvm_release_pfn_clean(pfn);
2052 return -ENOMEM; 2052 return -ENOMEM;
2053 } 2053 }
2054 2054
2055 __set_spte(iterator.sptep, 2055 __set_spte(iterator.sptep,
2056 __pa(sp->spt) 2056 __pa(sp->spt)
2057 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2057 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2058 | shadow_user_mask | shadow_x_mask); 2058 | shadow_user_mask | shadow_x_mask);
2059 } 2059 }
2060 } 2060 }
2061 return pt_write; 2061 return pt_write;
2062 } 2062 }
2063 2063
2064 static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2064 static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2065 { 2065 {
2066 char buf[1]; 2066 char buf[1];
2067 void __user *hva; 2067 void __user *hva;
2068 int r; 2068 int r;
2069 2069
2070 /* Touch the page, so the kernel sends SIGBUS */ 2070 /* Touch the page, so the kernel sends SIGBUS */
2071 hva = (void __user *)gfn_to_hva(kvm, gfn); 2071 hva = (void __user *)gfn_to_hva(kvm, gfn);
2072 r = copy_from_user(buf, hva, 1); 2072 r = copy_from_user(buf, hva, 1);
2073 } 2073 }
2074 2074
2075 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2075 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2076 { 2076 {
2077 kvm_release_pfn_clean(pfn); 2077 kvm_release_pfn_clean(pfn);
2078 if (is_hwpoison_pfn(pfn)) { 2078 if (is_hwpoison_pfn(pfn)) {
2079 kvm_send_hwpoison_signal(kvm, gfn); 2079 kvm_send_hwpoison_signal(kvm, gfn);
2080 return 0; 2080 return 0;
2081 } 2081 } else if (is_fault_pfn(pfn))
2082 return -EFAULT;
2083
2082 return 1; 2084 return 1;
2083 } 2085 }
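The new is_fault_pfn() branch above is the heart of this patch: kvm_handle_bad_page() now returns -EFAULT for a slot-backed address with no usable page instead of letting the access fall through to the MMIO path, and that value travels unchanged through nonpaging_map()/tdp_page_fault() and kvm_mmu_page_fault() below and out of the KVM_RUN ioctl. A minimal userspace sketch of how a VMM might observe it (hypothetical illustration, not part of this patch; only the KVM_RUN ioctl and the EFAULT errno come from the change itself):

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int run_vcpu(int vcpu_fd)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
			if (errno == EINTR)
				continue;	/* interrupted by a signal, retry */
			if (errno == EFAULT) {
				/* guest touched a slot area with no usable backing page */
				fprintf(stderr, "KVM_RUN: bad guest memory access\n");
				return -1;
			}
			perror("KVM_RUN");
			return -1;
		}
		/* no error: inspect the run structure's exit_reason here ... */
		return 0;
	}
}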
2084 2086
2085 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2087 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2086 { 2088 {
2087 int r; 2089 int r;
2088 int level; 2090 int level;
2089 pfn_t pfn; 2091 pfn_t pfn;
2090 unsigned long mmu_seq; 2092 unsigned long mmu_seq;
2091 2093
2092 level = mapping_level(vcpu, gfn); 2094 level = mapping_level(vcpu, gfn);
2093 2095
2094 /* 2096 /*
2095 * This path builds a PAE pagetable - so we can map 2mb pages at 2097 * This path builds a PAE pagetable - so we can map 2mb pages at
2096 * maximum. Therefore check if the level is larger than that. 2098 * maximum. Therefore check if the level is larger than that.
2097 */ 2099 */
2098 if (level > PT_DIRECTORY_LEVEL) 2100 if (level > PT_DIRECTORY_LEVEL)
2099 level = PT_DIRECTORY_LEVEL; 2101 level = PT_DIRECTORY_LEVEL;
2100 2102
2101 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2103 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2102 2104
2103 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2105 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2104 smp_rmb(); 2106 smp_rmb();
2105 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2107 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2106 2108
2107 /* mmio */ 2109 /* mmio */
2108 if (is_error_pfn(pfn)) 2110 if (is_error_pfn(pfn))
2109 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2111 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2110 2112
2111 spin_lock(&vcpu->kvm->mmu_lock); 2113 spin_lock(&vcpu->kvm->mmu_lock);
2112 if (mmu_notifier_retry(vcpu, mmu_seq)) 2114 if (mmu_notifier_retry(vcpu, mmu_seq))
2113 goto out_unlock; 2115 goto out_unlock;
2114 kvm_mmu_free_some_pages(vcpu); 2116 kvm_mmu_free_some_pages(vcpu);
2115 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2117 r = __direct_map(vcpu, v, write, level, gfn, pfn);
2116 spin_unlock(&vcpu->kvm->mmu_lock); 2118 spin_unlock(&vcpu->kvm->mmu_lock);
2117 2119
2118 2120
2119 return r; 2121 return r;
2120 2122
2121 out_unlock: 2123 out_unlock:
2122 spin_unlock(&vcpu->kvm->mmu_lock); 2124 spin_unlock(&vcpu->kvm->mmu_lock);
2123 kvm_release_pfn_clean(pfn); 2125 kvm_release_pfn_clean(pfn);
2124 return 0; 2126 return 0;
2125 } 2127 }
2126 2128
2127 2129
2128 static void mmu_free_roots(struct kvm_vcpu *vcpu) 2130 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2129 { 2131 {
2130 int i; 2132 int i;
2131 struct kvm_mmu_page *sp; 2133 struct kvm_mmu_page *sp;
2132 LIST_HEAD(invalid_list); 2134 LIST_HEAD(invalid_list);
2133 2135
2134 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2136 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2135 return; 2137 return;
2136 spin_lock(&vcpu->kvm->mmu_lock); 2138 spin_lock(&vcpu->kvm->mmu_lock);
2137 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2139 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2138 hpa_t root = vcpu->arch.mmu.root_hpa; 2140 hpa_t root = vcpu->arch.mmu.root_hpa;
2139 2141
2140 sp = page_header(root); 2142 sp = page_header(root);
2141 --sp->root_count; 2143 --sp->root_count;
2142 if (!sp->root_count && sp->role.invalid) { 2144 if (!sp->root_count && sp->role.invalid) {
2143 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 2145 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2144 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2146 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2145 } 2147 }
2146 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2148 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2147 spin_unlock(&vcpu->kvm->mmu_lock); 2149 spin_unlock(&vcpu->kvm->mmu_lock);
2148 return; 2150 return;
2149 } 2151 }
2150 for (i = 0; i < 4; ++i) { 2152 for (i = 0; i < 4; ++i) {
2151 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2153 hpa_t root = vcpu->arch.mmu.pae_root[i];
2152 2154
2153 if (root) { 2155 if (root) {
2154 root &= PT64_BASE_ADDR_MASK; 2156 root &= PT64_BASE_ADDR_MASK;
2155 sp = page_header(root); 2157 sp = page_header(root);
2156 --sp->root_count; 2158 --sp->root_count;
2157 if (!sp->root_count && sp->role.invalid) 2159 if (!sp->root_count && sp->role.invalid)
2158 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2160 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2159 &invalid_list); 2161 &invalid_list);
2160 } 2162 }
2161 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2163 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2162 } 2164 }
2163 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2165 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2164 spin_unlock(&vcpu->kvm->mmu_lock); 2166 spin_unlock(&vcpu->kvm->mmu_lock);
2165 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2167 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2166 } 2168 }
2167 2169
2168 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 2170 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2169 { 2171 {
2170 int ret = 0; 2172 int ret = 0;
2171 2173
2172 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2174 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2173 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2175 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2174 ret = 1; 2176 ret = 1;
2175 } 2177 }
2176 2178
2177 return ret; 2179 return ret;
2178 } 2180 }
2179 2181
2180 static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2182 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2181 { 2183 {
2182 int i; 2184 int i;
2183 gfn_t root_gfn; 2185 gfn_t root_gfn;
2184 struct kvm_mmu_page *sp; 2186 struct kvm_mmu_page *sp;
2185 int direct = 0; 2187 int direct = 0;
2186 u64 pdptr; 2188 u64 pdptr;
2187 2189
2188 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 2190 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2189 2191
2190 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2192 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2191 hpa_t root = vcpu->arch.mmu.root_hpa; 2193 hpa_t root = vcpu->arch.mmu.root_hpa;
2192 2194
2193 ASSERT(!VALID_PAGE(root)); 2195 ASSERT(!VALID_PAGE(root));
2194 if (mmu_check_root(vcpu, root_gfn)) 2196 if (mmu_check_root(vcpu, root_gfn))
2195 return 1; 2197 return 1;
2196 if (tdp_enabled) { 2198 if (tdp_enabled) {
2197 direct = 1; 2199 direct = 1;
2198 root_gfn = 0; 2200 root_gfn = 0;
2199 } 2201 }
2200 spin_lock(&vcpu->kvm->mmu_lock); 2202 spin_lock(&vcpu->kvm->mmu_lock);
2201 kvm_mmu_free_some_pages(vcpu); 2203 kvm_mmu_free_some_pages(vcpu);
2202 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2204 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2203 PT64_ROOT_LEVEL, direct, 2205 PT64_ROOT_LEVEL, direct,
2204 ACC_ALL, NULL); 2206 ACC_ALL, NULL);
2205 root = __pa(sp->spt); 2207 root = __pa(sp->spt);
2206 ++sp->root_count; 2208 ++sp->root_count;
2207 spin_unlock(&vcpu->kvm->mmu_lock); 2209 spin_unlock(&vcpu->kvm->mmu_lock);
2208 vcpu->arch.mmu.root_hpa = root; 2210 vcpu->arch.mmu.root_hpa = root;
2209 return 0; 2211 return 0;
2210 } 2212 }
2211 direct = !is_paging(vcpu); 2213 direct = !is_paging(vcpu);
2212 for (i = 0; i < 4; ++i) { 2214 for (i = 0; i < 4; ++i) {
2213 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2215 hpa_t root = vcpu->arch.mmu.pae_root[i];
2214 2216
2215 ASSERT(!VALID_PAGE(root)); 2217 ASSERT(!VALID_PAGE(root));
2216 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2218 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2217 pdptr = kvm_pdptr_read(vcpu, i); 2219 pdptr = kvm_pdptr_read(vcpu, i);
2218 if (!is_present_gpte(pdptr)) { 2220 if (!is_present_gpte(pdptr)) {
2219 vcpu->arch.mmu.pae_root[i] = 0; 2221 vcpu->arch.mmu.pae_root[i] = 0;
2220 continue; 2222 continue;
2221 } 2223 }
2222 root_gfn = pdptr >> PAGE_SHIFT; 2224 root_gfn = pdptr >> PAGE_SHIFT;
2223 } else if (vcpu->arch.mmu.root_level == 0) 2225 } else if (vcpu->arch.mmu.root_level == 0)
2224 root_gfn = 0; 2226 root_gfn = 0;
2225 if (mmu_check_root(vcpu, root_gfn)) 2227 if (mmu_check_root(vcpu, root_gfn))
2226 return 1; 2228 return 1;
2227 if (tdp_enabled) { 2229 if (tdp_enabled) {
2228 direct = 1; 2230 direct = 1;
2229 root_gfn = i << 30; 2231 root_gfn = i << 30;
2230 } 2232 }
2231 spin_lock(&vcpu->kvm->mmu_lock); 2233 spin_lock(&vcpu->kvm->mmu_lock);
2232 kvm_mmu_free_some_pages(vcpu); 2234 kvm_mmu_free_some_pages(vcpu);
2233 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2235 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2234 PT32_ROOT_LEVEL, direct, 2236 PT32_ROOT_LEVEL, direct,
2235 ACC_ALL, NULL); 2237 ACC_ALL, NULL);
2236 root = __pa(sp->spt); 2238 root = __pa(sp->spt);
2237 ++sp->root_count; 2239 ++sp->root_count;
2238 spin_unlock(&vcpu->kvm->mmu_lock); 2240 spin_unlock(&vcpu->kvm->mmu_lock);
2239 2241
2240 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2242 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2241 } 2243 }
2242 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2244 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2243 return 0; 2245 return 0;
2244 } 2246 }
2245 2247
2246 static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2248 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2247 { 2249 {
2248 int i; 2250 int i;
2249 struct kvm_mmu_page *sp; 2251 struct kvm_mmu_page *sp;
2250 2252
2251 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2253 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2252 return; 2254 return;
2253 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2255 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2254 hpa_t root = vcpu->arch.mmu.root_hpa; 2256 hpa_t root = vcpu->arch.mmu.root_hpa;
2255 sp = page_header(root); 2257 sp = page_header(root);
2256 mmu_sync_children(vcpu, sp); 2258 mmu_sync_children(vcpu, sp);
2257 return; 2259 return;
2258 } 2260 }
2259 for (i = 0; i < 4; ++i) { 2261 for (i = 0; i < 4; ++i) {
2260 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2262 hpa_t root = vcpu->arch.mmu.pae_root[i];
2261 2263
2262 if (root && VALID_PAGE(root)) { 2264 if (root && VALID_PAGE(root)) {
2263 root &= PT64_BASE_ADDR_MASK; 2265 root &= PT64_BASE_ADDR_MASK;
2264 sp = page_header(root); 2266 sp = page_header(root);
2265 mmu_sync_children(vcpu, sp); 2267 mmu_sync_children(vcpu, sp);
2266 } 2268 }
2267 } 2269 }
2268 } 2270 }
2269 2271
2270 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2272 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2271 { 2273 {
2272 spin_lock(&vcpu->kvm->mmu_lock); 2274 spin_lock(&vcpu->kvm->mmu_lock);
2273 mmu_sync_roots(vcpu); 2275 mmu_sync_roots(vcpu);
2274 spin_unlock(&vcpu->kvm->mmu_lock); 2276 spin_unlock(&vcpu->kvm->mmu_lock);
2275 } 2277 }
2276 2278
2277 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2279 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2278 u32 access, u32 *error) 2280 u32 access, u32 *error)
2279 { 2281 {
2280 if (error) 2282 if (error)
2281 *error = 0; 2283 *error = 0;
2282 return vaddr; 2284 return vaddr;
2283 } 2285 }
2284 2286
2285 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2287 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2286 u32 error_code) 2288 u32 error_code)
2287 { 2289 {
2288 gfn_t gfn; 2290 gfn_t gfn;
2289 int r; 2291 int r;
2290 2292
2291 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2293 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2292 r = mmu_topup_memory_caches(vcpu); 2294 r = mmu_topup_memory_caches(vcpu);
2293 if (r) 2295 if (r)
2294 return r; 2296 return r;
2295 2297
2296 ASSERT(vcpu); 2298 ASSERT(vcpu);
2297 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2299 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2298 2300
2299 gfn = gva >> PAGE_SHIFT; 2301 gfn = gva >> PAGE_SHIFT;
2300 2302
2301 return nonpaging_map(vcpu, gva & PAGE_MASK, 2303 return nonpaging_map(vcpu, gva & PAGE_MASK,
2302 error_code & PFERR_WRITE_MASK, gfn); 2304 error_code & PFERR_WRITE_MASK, gfn);
2303 } 2305 }
2304 2306
2305 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2307 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2306 u32 error_code) 2308 u32 error_code)
2307 { 2309 {
2308 pfn_t pfn; 2310 pfn_t pfn;
2309 int r; 2311 int r;
2310 int level; 2312 int level;
2311 gfn_t gfn = gpa >> PAGE_SHIFT; 2313 gfn_t gfn = gpa >> PAGE_SHIFT;
2312 unsigned long mmu_seq; 2314 unsigned long mmu_seq;
2313 2315
2314 ASSERT(vcpu); 2316 ASSERT(vcpu);
2315 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2317 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2316 2318
2317 r = mmu_topup_memory_caches(vcpu); 2319 r = mmu_topup_memory_caches(vcpu);
2318 if (r) 2320 if (r)
2319 return r; 2321 return r;
2320 2322
2321 level = mapping_level(vcpu, gfn); 2323 level = mapping_level(vcpu, gfn);
2322 2324
2323 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2325 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2324 2326
2325 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2327 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2326 smp_rmb(); 2328 smp_rmb();
2327 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2329 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2328 if (is_error_pfn(pfn)) 2330 if (is_error_pfn(pfn))
2329 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2331 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2330 spin_lock(&vcpu->kvm->mmu_lock); 2332 spin_lock(&vcpu->kvm->mmu_lock);
2331 if (mmu_notifier_retry(vcpu, mmu_seq)) 2333 if (mmu_notifier_retry(vcpu, mmu_seq))
2332 goto out_unlock; 2334 goto out_unlock;
2333 kvm_mmu_free_some_pages(vcpu); 2335 kvm_mmu_free_some_pages(vcpu);
2334 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2336 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2335 level, gfn, pfn); 2337 level, gfn, pfn);
2336 spin_unlock(&vcpu->kvm->mmu_lock); 2338 spin_unlock(&vcpu->kvm->mmu_lock);
2337 2339
2338 return r; 2340 return r;
2339 2341
2340 out_unlock: 2342 out_unlock:
2341 spin_unlock(&vcpu->kvm->mmu_lock); 2343 spin_unlock(&vcpu->kvm->mmu_lock);
2342 kvm_release_pfn_clean(pfn); 2344 kvm_release_pfn_clean(pfn);
2343 return 0; 2345 return 0;
2344 } 2346 }
2345 2347
2346 static void nonpaging_free(struct kvm_vcpu *vcpu) 2348 static void nonpaging_free(struct kvm_vcpu *vcpu)
2347 { 2349 {
2348 mmu_free_roots(vcpu); 2350 mmu_free_roots(vcpu);
2349 } 2351 }
2350 2352
2351 static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2353 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2352 { 2354 {
2353 struct kvm_mmu *context = &vcpu->arch.mmu; 2355 struct kvm_mmu *context = &vcpu->arch.mmu;
2354 2356
2355 context->new_cr3 = nonpaging_new_cr3; 2357 context->new_cr3 = nonpaging_new_cr3;
2356 context->page_fault = nonpaging_page_fault; 2358 context->page_fault = nonpaging_page_fault;
2357 context->gva_to_gpa = nonpaging_gva_to_gpa; 2359 context->gva_to_gpa = nonpaging_gva_to_gpa;
2358 context->free = nonpaging_free; 2360 context->free = nonpaging_free;
2359 context->prefetch_page = nonpaging_prefetch_page; 2361 context->prefetch_page = nonpaging_prefetch_page;
2360 context->sync_page = nonpaging_sync_page; 2362 context->sync_page = nonpaging_sync_page;
2361 context->invlpg = nonpaging_invlpg; 2363 context->invlpg = nonpaging_invlpg;
2362 context->root_level = 0; 2364 context->root_level = 0;
2363 context->shadow_root_level = PT32E_ROOT_LEVEL; 2365 context->shadow_root_level = PT32E_ROOT_LEVEL;
2364 context->root_hpa = INVALID_PAGE; 2366 context->root_hpa = INVALID_PAGE;
2365 return 0; 2367 return 0;
2366 } 2368 }
2367 2369
2368 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2370 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2369 { 2371 {
2370 ++vcpu->stat.tlb_flush; 2372 ++vcpu->stat.tlb_flush;
2371 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2373 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2372 } 2374 }
2373 2375
2374 static void paging_new_cr3(struct kvm_vcpu *vcpu) 2376 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2375 { 2377 {
2376 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2378 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2377 mmu_free_roots(vcpu); 2379 mmu_free_roots(vcpu);
2378 } 2380 }
2379 2381
2380 static void inject_page_fault(struct kvm_vcpu *vcpu, 2382 static void inject_page_fault(struct kvm_vcpu *vcpu,
2381 u64 addr, 2383 u64 addr,
2382 u32 err_code) 2384 u32 err_code)
2383 { 2385 {
2384 kvm_inject_page_fault(vcpu, addr, err_code); 2386 kvm_inject_page_fault(vcpu, addr, err_code);
2385 } 2387 }
2386 2388
2387 static void paging_free(struct kvm_vcpu *vcpu) 2389 static void paging_free(struct kvm_vcpu *vcpu)
2388 { 2390 {
2389 nonpaging_free(vcpu); 2391 nonpaging_free(vcpu);
2390 } 2392 }
2391 2393
2392 static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2394 static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2393 { 2395 {
2394 int bit7; 2396 int bit7;
2395 2397
2396 bit7 = (gpte >> 7) & 1; 2398 bit7 = (gpte >> 7) & 1;
2397 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2399 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2398 } 2400 }
2399 2401
2400 #define PTTYPE 64 2402 #define PTTYPE 64
2401 #include "paging_tmpl.h" 2403 #include "paging_tmpl.h"
2402 #undef PTTYPE 2404 #undef PTTYPE
2403 2405
2404 #define PTTYPE 32 2406 #define PTTYPE 32
2405 #include "paging_tmpl.h" 2407 #include "paging_tmpl.h"
2406 #undef PTTYPE 2408 #undef PTTYPE
2407 2409
2408 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2410 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2409 { 2411 {
2410 struct kvm_mmu *context = &vcpu->arch.mmu; 2412 struct kvm_mmu *context = &vcpu->arch.mmu;
2411 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2413 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2412 u64 exb_bit_rsvd = 0; 2414 u64 exb_bit_rsvd = 0;
2413 2415
2414 if (!is_nx(vcpu)) 2416 if (!is_nx(vcpu))
2415 exb_bit_rsvd = rsvd_bits(63, 63); 2417 exb_bit_rsvd = rsvd_bits(63, 63);
2416 switch (level) { 2418 switch (level) {
2417 case PT32_ROOT_LEVEL: 2419 case PT32_ROOT_LEVEL:
2418 /* no rsvd bits for 2 level 4K page table entries */ 2420 /* no rsvd bits for 2 level 4K page table entries */
2419 context->rsvd_bits_mask[0][1] = 0; 2421 context->rsvd_bits_mask[0][1] = 0;
2420 context->rsvd_bits_mask[0][0] = 0; 2422 context->rsvd_bits_mask[0][0] = 0;
2421 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2423 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2422 2424
2423 if (!is_pse(vcpu)) { 2425 if (!is_pse(vcpu)) {
2424 context->rsvd_bits_mask[1][1] = 0; 2426 context->rsvd_bits_mask[1][1] = 0;
2425 break; 2427 break;
2426 } 2428 }
2427 2429
2428 if (is_cpuid_PSE36()) 2430 if (is_cpuid_PSE36())
2429 /* 36bits PSE 4MB page */ 2431 /* 36bits PSE 4MB page */
2430 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2432 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2431 else 2433 else
2432 /* 32 bits PSE 4MB page */ 2434 /* 32 bits PSE 4MB page */
2433 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2435 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2434 break; 2436 break;
2435 case PT32E_ROOT_LEVEL: 2437 case PT32E_ROOT_LEVEL:
2436 context->rsvd_bits_mask[0][2] = 2438 context->rsvd_bits_mask[0][2] =
2437 rsvd_bits(maxphyaddr, 63) | 2439 rsvd_bits(maxphyaddr, 63) |
2438 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ 2440 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2439 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 2441 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2440 rsvd_bits(maxphyaddr, 62); /* PDE */ 2442 rsvd_bits(maxphyaddr, 62); /* PDE */
2441 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2443 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2442 rsvd_bits(maxphyaddr, 62); /* PTE */ 2444 rsvd_bits(maxphyaddr, 62); /* PTE */
2443 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2445 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2444 rsvd_bits(maxphyaddr, 62) | 2446 rsvd_bits(maxphyaddr, 62) |
2445 rsvd_bits(13, 20); /* large page */ 2447 rsvd_bits(13, 20); /* large page */
2446 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2448 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2447 break; 2449 break;
2448 case PT64_ROOT_LEVEL: 2450 case PT64_ROOT_LEVEL:
2449 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2451 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2450 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 2452 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2451 context->rsvd_bits_mask[0][2] = exb_bit_rsvd | 2453 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2452 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 2454 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2453 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 2455 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2454 rsvd_bits(maxphyaddr, 51); 2456 rsvd_bits(maxphyaddr, 51);
2455 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2457 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2456 rsvd_bits(maxphyaddr, 51); 2458 rsvd_bits(maxphyaddr, 51);
2457 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 2459 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2458 context->rsvd_bits_mask[1][2] = exb_bit_rsvd | 2460 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2459 rsvd_bits(maxphyaddr, 51) | 2461 rsvd_bits(maxphyaddr, 51) |
2460 rsvd_bits(13, 29); 2462 rsvd_bits(13, 29);
2461 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2463 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2462 rsvd_bits(maxphyaddr, 51) | 2464 rsvd_bits(maxphyaddr, 51) |
2463 rsvd_bits(13, 20); /* large page */ 2465 rsvd_bits(13, 20); /* large page */
2464 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2466 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2465 break; 2467 break;
2466 } 2468 }
2467 } 2469 }
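The masks built above are what is_rsvd_bits_set() checks guest PTEs against. To make them concrete, a small stand-alone sketch (illustration only: it assumes rsvd_bits(s, e) yields a mask with bits s through e set, as its uses above imply, and picks a 40-bit MAXPHYADDR purely as an example):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's rsvd_bits() helper: bits s..e inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	int maxphyaddr = 40;		/* hypothetical MAXPHYADDR, for illustration */
	uint64_t exb_bit_rsvd = 0;	/* guest has NX, so bit 63 is not reserved */

	/* 4-level PTE: physical-address bits above MAXPHYADDR are reserved. */
	uint64_t pte_rsvd = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51);

	/* 2MB PDE additionally reserves bits 13..20. */
	uint64_t lpage_rsvd = pte_rsvd | rsvd_bits(13, 20);

	printf("PTE reserved mask: %#018llx\n", (unsigned long long)pte_rsvd);
	printf("2MB reserved mask: %#018llx\n", (unsigned long long)lpage_rsvd);
	return 0;	/* prints 0x000fff0000000000 and 0x000fff00001fe000 */
}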
2468 2470
2469 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2471 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2470 { 2472 {
2471 struct kvm_mmu *context = &vcpu->arch.mmu; 2473 struct kvm_mmu *context = &vcpu->arch.mmu;
2472 2474
2473 ASSERT(is_pae(vcpu)); 2475 ASSERT(is_pae(vcpu));
2474 context->new_cr3 = paging_new_cr3; 2476 context->new_cr3 = paging_new_cr3;
2475 context->page_fault = paging64_page_fault; 2477 context->page_fault = paging64_page_fault;
2476 context->gva_to_gpa = paging64_gva_to_gpa; 2478 context->gva_to_gpa = paging64_gva_to_gpa;
2477 context->prefetch_page = paging64_prefetch_page; 2479 context->prefetch_page = paging64_prefetch_page;
2478 context->sync_page = paging64_sync_page; 2480 context->sync_page = paging64_sync_page;
2479 context->invlpg = paging64_invlpg; 2481 context->invlpg = paging64_invlpg;
2480 context->free = paging_free; 2482 context->free = paging_free;
2481 context->root_level = level; 2483 context->root_level = level;
2482 context->shadow_root_level = level; 2484 context->shadow_root_level = level;
2483 context->root_hpa = INVALID_PAGE; 2485 context->root_hpa = INVALID_PAGE;
2484 return 0; 2486 return 0;
2485 } 2487 }
2486 2488
2487 static int paging64_init_context(struct kvm_vcpu *vcpu) 2489 static int paging64_init_context(struct kvm_vcpu *vcpu)
2488 { 2490 {
2489 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2491 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2490 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2492 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2491 } 2493 }
2492 2494
2493 static int paging32_init_context(struct kvm_vcpu *vcpu) 2495 static int paging32_init_context(struct kvm_vcpu *vcpu)
2494 { 2496 {
2495 struct kvm_mmu *context = &vcpu->arch.mmu; 2497 struct kvm_mmu *context = &vcpu->arch.mmu;
2496 2498
2497 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2499 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2498 context->new_cr3 = paging_new_cr3; 2500 context->new_cr3 = paging_new_cr3;
2499 context->page_fault = paging32_page_fault; 2501 context->page_fault = paging32_page_fault;
2500 context->gva_to_gpa = paging32_gva_to_gpa; 2502 context->gva_to_gpa = paging32_gva_to_gpa;
2501 context->free = paging_free; 2503 context->free = paging_free;
2502 context->prefetch_page = paging32_prefetch_page; 2504 context->prefetch_page = paging32_prefetch_page;
2503 context->sync_page = paging32_sync_page; 2505 context->sync_page = paging32_sync_page;
2504 context->invlpg = paging32_invlpg; 2506 context->invlpg = paging32_invlpg;
2505 context->root_level = PT32_ROOT_LEVEL; 2507 context->root_level = PT32_ROOT_LEVEL;
2506 context->shadow_root_level = PT32E_ROOT_LEVEL; 2508 context->shadow_root_level = PT32E_ROOT_LEVEL;
2507 context->root_hpa = INVALID_PAGE; 2509 context->root_hpa = INVALID_PAGE;
2508 return 0; 2510 return 0;
2509 } 2511 }
2510 2512
2511 static int paging32E_init_context(struct kvm_vcpu *vcpu) 2513 static int paging32E_init_context(struct kvm_vcpu *vcpu)
2512 { 2514 {
2513 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2515 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2514 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2516 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2515 } 2517 }
2516 2518
2517 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2519 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2518 { 2520 {
2519 struct kvm_mmu *context = &vcpu->arch.mmu; 2521 struct kvm_mmu *context = &vcpu->arch.mmu;
2520 2522
2521 context->new_cr3 = nonpaging_new_cr3; 2523 context->new_cr3 = nonpaging_new_cr3;
2522 context->page_fault = tdp_page_fault; 2524 context->page_fault = tdp_page_fault;
2523 context->free = nonpaging_free; 2525 context->free = nonpaging_free;
2524 context->prefetch_page = nonpaging_prefetch_page; 2526 context->prefetch_page = nonpaging_prefetch_page;
2525 context->sync_page = nonpaging_sync_page; 2527 context->sync_page = nonpaging_sync_page;
2526 context->invlpg = nonpaging_invlpg; 2528 context->invlpg = nonpaging_invlpg;
2527 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2529 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2528 context->root_hpa = INVALID_PAGE; 2530 context->root_hpa = INVALID_PAGE;
2529 2531
2530 if (!is_paging(vcpu)) { 2532 if (!is_paging(vcpu)) {
2531 context->gva_to_gpa = nonpaging_gva_to_gpa; 2533 context->gva_to_gpa = nonpaging_gva_to_gpa;
2532 context->root_level = 0; 2534 context->root_level = 0;
2533 } else if (is_long_mode(vcpu)) { 2535 } else if (is_long_mode(vcpu)) {
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2536 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2535 context->gva_to_gpa = paging64_gva_to_gpa; 2537 context->gva_to_gpa = paging64_gva_to_gpa;
2536 context->root_level = PT64_ROOT_LEVEL; 2538 context->root_level = PT64_ROOT_LEVEL;
2537 } else if (is_pae(vcpu)) { 2539 } else if (is_pae(vcpu)) {
2538 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2540 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2539 context->gva_to_gpa = paging64_gva_to_gpa; 2541 context->gva_to_gpa = paging64_gva_to_gpa;
2540 context->root_level = PT32E_ROOT_LEVEL; 2542 context->root_level = PT32E_ROOT_LEVEL;
2541 } else { 2543 } else {
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2544 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->gva_to_gpa = paging32_gva_to_gpa; 2545 context->gva_to_gpa = paging32_gva_to_gpa;
2544 context->root_level = PT32_ROOT_LEVEL; 2546 context->root_level = PT32_ROOT_LEVEL;
2545 } 2547 }
2546 2548
2547 return 0; 2549 return 0;
2548 } 2550 }
2549 2551
2550 static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2552 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2551 { 2553 {
2552 int r; 2554 int r;
2553 2555
2554 ASSERT(vcpu); 2556 ASSERT(vcpu);
2555 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2557 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2556 2558
2557 if (!is_paging(vcpu)) 2559 if (!is_paging(vcpu))
2558 r = nonpaging_init_context(vcpu); 2560 r = nonpaging_init_context(vcpu);
2559 else if (is_long_mode(vcpu)) 2561 else if (is_long_mode(vcpu))
2560 r = paging64_init_context(vcpu); 2562 r = paging64_init_context(vcpu);
2561 else if (is_pae(vcpu)) 2563 else if (is_pae(vcpu))
2562 r = paging32E_init_context(vcpu); 2564 r = paging32E_init_context(vcpu);
2563 else 2565 else
2564 r = paging32_init_context(vcpu); 2566 r = paging32_init_context(vcpu);
2565 2567
2566 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2568 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2567 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2569 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2568 2570
2569 return r; 2571 return r;
2570 } 2572 }
2571 2573
2572 static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2574 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2573 { 2575 {
2574 vcpu->arch.update_pte.pfn = bad_pfn; 2576 vcpu->arch.update_pte.pfn = bad_pfn;
2575 2577
2576 if (tdp_enabled) 2578 if (tdp_enabled)
2577 return init_kvm_tdp_mmu(vcpu); 2579 return init_kvm_tdp_mmu(vcpu);
2578 else 2580 else
2579 return init_kvm_softmmu(vcpu); 2581 return init_kvm_softmmu(vcpu);
2580 } 2582 }
2581 2583
2582 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2584 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2583 { 2585 {
2584 ASSERT(vcpu); 2586 ASSERT(vcpu);
2585 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2587 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2586 /* mmu.free() should set root_hpa = INVALID_PAGE */ 2588 /* mmu.free() should set root_hpa = INVALID_PAGE */
2587 vcpu->arch.mmu.free(vcpu); 2589 vcpu->arch.mmu.free(vcpu);
2588 } 2590 }
2589 2591
2590 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2592 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
2591 { 2593 {
2592 destroy_kvm_mmu(vcpu); 2594 destroy_kvm_mmu(vcpu);
2593 return init_kvm_mmu(vcpu); 2595 return init_kvm_mmu(vcpu);
2594 } 2596 }
2595 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 2597 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
2596 2598
2597 int kvm_mmu_load(struct kvm_vcpu *vcpu) 2599 int kvm_mmu_load(struct kvm_vcpu *vcpu)
2598 { 2600 {
2599 int r; 2601 int r;
2600 2602
2601 r = mmu_topup_memory_caches(vcpu); 2603 r = mmu_topup_memory_caches(vcpu);
2602 if (r) 2604 if (r)
2603 goto out; 2605 goto out;
2604 r = mmu_alloc_roots(vcpu); 2606 r = mmu_alloc_roots(vcpu);
2605 spin_lock(&vcpu->kvm->mmu_lock); 2607 spin_lock(&vcpu->kvm->mmu_lock);
2606 mmu_sync_roots(vcpu); 2608 mmu_sync_roots(vcpu);
2607 spin_unlock(&vcpu->kvm->mmu_lock); 2609 spin_unlock(&vcpu->kvm->mmu_lock);
2608 if (r) 2610 if (r)
2609 goto out; 2611 goto out;
2610 /* set_cr3() should ensure TLB has been flushed */ 2612 /* set_cr3() should ensure TLB has been flushed */
2611 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2613 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2612 out: 2614 out:
2613 return r; 2615 return r;
2614 } 2616 }
2615 EXPORT_SYMBOL_GPL(kvm_mmu_load); 2617 EXPORT_SYMBOL_GPL(kvm_mmu_load);
2616 2618
2617 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 2619 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2618 { 2620 {
2619 mmu_free_roots(vcpu); 2621 mmu_free_roots(vcpu);
2620 } 2622 }
2621 2623
2622 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2624 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2623 struct kvm_mmu_page *sp, 2625 struct kvm_mmu_page *sp,
2624 u64 *spte) 2626 u64 *spte)
2625 { 2627 {
2626 u64 pte; 2628 u64 pte;
2627 struct kvm_mmu_page *child; 2629 struct kvm_mmu_page *child;
2628 2630
2629 pte = *spte; 2631 pte = *spte;
2630 if (is_shadow_present_pte(pte)) { 2632 if (is_shadow_present_pte(pte)) {
2631 if (is_last_spte(pte, sp->role.level)) 2633 if (is_last_spte(pte, sp->role.level))
2632 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); 2634 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2633 else { 2635 else {
2634 child = page_header(pte & PT64_BASE_ADDR_MASK); 2636 child = page_header(pte & PT64_BASE_ADDR_MASK);
2635 mmu_page_remove_parent_pte(child, spte); 2637 mmu_page_remove_parent_pte(child, spte);
2636 } 2638 }
2637 } 2639 }
2638 __set_spte(spte, shadow_trap_nonpresent_pte); 2640 __set_spte(spte, shadow_trap_nonpresent_pte);
2639 if (is_large_pte(pte)) 2641 if (is_large_pte(pte))
2640 --vcpu->kvm->stat.lpages; 2642 --vcpu->kvm->stat.lpages;
2641 } 2643 }
2642 2644
2643 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 2645 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2644 struct kvm_mmu_page *sp, 2646 struct kvm_mmu_page *sp,
2645 u64 *spte, 2647 u64 *spte,
2646 const void *new) 2648 const void *new)
2647 { 2649 {
2648 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 2650 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2649 ++vcpu->kvm->stat.mmu_pde_zapped; 2651 ++vcpu->kvm->stat.mmu_pde_zapped;
2650 return; 2652 return;
2651 } 2653 }
2652 2654
2653 ++vcpu->kvm->stat.mmu_pte_updated; 2655 ++vcpu->kvm->stat.mmu_pte_updated;
2654 if (!sp->role.cr4_pae) 2656 if (!sp->role.cr4_pae)
2655 paging32_update_pte(vcpu, sp, spte, new); 2657 paging32_update_pte(vcpu, sp, spte, new);
2656 else 2658 else
2657 paging64_update_pte(vcpu, sp, spte, new); 2659 paging64_update_pte(vcpu, sp, spte, new);
2658 } 2660 }
2659 2661
2660 static bool need_remote_flush(u64 old, u64 new) 2662 static bool need_remote_flush(u64 old, u64 new)
2661 { 2663 {
2662 if (!is_shadow_present_pte(old)) 2664 if (!is_shadow_present_pte(old))
2663 return false; 2665 return false;
2664 if (!is_shadow_present_pte(new)) 2666 if (!is_shadow_present_pte(new))
2665 return true; 2667 return true;
2666 if ((old ^ new) & PT64_BASE_ADDR_MASK) 2668 if ((old ^ new) & PT64_BASE_ADDR_MASK)
2667 return true; 2669 return true;
2668 old ^= PT64_NX_MASK; 2670 old ^= PT64_NX_MASK;
2669 new ^= PT64_NX_MASK; 2671 new ^= PT64_NX_MASK;
2670 return (old & ~new & PT64_PERM_MASK) != 0; 2672 return (old & ~new & PT64_PERM_MASK) != 0;
2671 } 2673 }
2672 2674
2673 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, 2675 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2674 bool remote_flush, bool local_flush) 2676 bool remote_flush, bool local_flush)
2675 { 2677 {
2676 if (zap_page) 2678 if (zap_page)
2677 return; 2679 return;
2678 2680
2679 if (remote_flush) 2681 if (remote_flush)
2680 kvm_flush_remote_tlbs(vcpu->kvm); 2682 kvm_flush_remote_tlbs(vcpu->kvm);
2681 else if (local_flush) 2683 else if (local_flush)
2682 kvm_mmu_flush_tlb(vcpu); 2684 kvm_mmu_flush_tlb(vcpu);
2683 } 2685 }
2684 2686
2685 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) 2687 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2686 { 2688 {
2687 u64 *spte = vcpu->arch.last_pte_updated; 2689 u64 *spte = vcpu->arch.last_pte_updated;
2688 2690
2689 return !!(spte && (*spte & shadow_accessed_mask)); 2691 return !!(spte && (*spte & shadow_accessed_mask));
2690 } 2692 }
2691 2693
2692 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2694 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2693 u64 gpte) 2695 u64 gpte)
2694 { 2696 {
2695 gfn_t gfn; 2697 gfn_t gfn;
2696 pfn_t pfn; 2698 pfn_t pfn;
2697 2699
2698 if (!is_present_gpte(gpte)) 2700 if (!is_present_gpte(gpte))
2699 return; 2701 return;
2700 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2702 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2701 2703
2702 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2704 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2703 smp_rmb(); 2705 smp_rmb();
2704 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2706 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2705 2707
2706 if (is_error_pfn(pfn)) { 2708 if (is_error_pfn(pfn)) {
2707 kvm_release_pfn_clean(pfn); 2709 kvm_release_pfn_clean(pfn);
2708 return; 2710 return;
2709 } 2711 }
2710 vcpu->arch.update_pte.gfn = gfn; 2712 vcpu->arch.update_pte.gfn = gfn;
2711 vcpu->arch.update_pte.pfn = pfn; 2713 vcpu->arch.update_pte.pfn = pfn;
2712 } 2714 }
2713 2715
2714 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2716 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2715 { 2717 {
2716 u64 *spte = vcpu->arch.last_pte_updated; 2718 u64 *spte = vcpu->arch.last_pte_updated;
2717 2719
2718 if (spte 2720 if (spte
2719 && vcpu->arch.last_pte_gfn == gfn 2721 && vcpu->arch.last_pte_gfn == gfn
2720 && shadow_accessed_mask 2722 && shadow_accessed_mask
2721 && !(*spte & shadow_accessed_mask) 2723 && !(*spte & shadow_accessed_mask)
2722 && is_shadow_present_pte(*spte)) 2724 && is_shadow_present_pte(*spte))
2723 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 2725 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2724 } 2726 }
2725 2727
2726 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2728 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2727 const u8 *new, int bytes, 2729 const u8 *new, int bytes,
2728 bool guest_initiated) 2730 bool guest_initiated)
2729 { 2731 {
2730 gfn_t gfn = gpa >> PAGE_SHIFT; 2732 gfn_t gfn = gpa >> PAGE_SHIFT;
2731 struct kvm_mmu_page *sp; 2733 struct kvm_mmu_page *sp;
2732 struct hlist_node *node; 2734 struct hlist_node *node;
2733 LIST_HEAD(invalid_list); 2735 LIST_HEAD(invalid_list);
2734 u64 entry, gentry; 2736 u64 entry, gentry;
2735 u64 *spte; 2737 u64 *spte;
2736 unsigned offset = offset_in_page(gpa); 2738 unsigned offset = offset_in_page(gpa);
2737 unsigned pte_size; 2739 unsigned pte_size;
2738 unsigned page_offset; 2740 unsigned page_offset;
2739 unsigned misaligned; 2741 unsigned misaligned;
2740 unsigned quadrant; 2742 unsigned quadrant;
2741 int level; 2743 int level;
2742 int flooded = 0; 2744 int flooded = 0;
2743 int npte; 2745 int npte;
2744 int r; 2746 int r;
2745 int invlpg_counter; 2747 int invlpg_counter;
2746 bool remote_flush, local_flush, zap_page; 2748 bool remote_flush, local_flush, zap_page;
2747 2749
2748 zap_page = remote_flush = local_flush = false; 2750 zap_page = remote_flush = local_flush = false;
2749 2751
2750 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2752 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2751 2753
2752 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); 2754 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2753 2755
2754 /* 2756 /*
2755 * Assume that the pte write is on a page table of the same type 2757 * Assume that the pte write is on a page table of the same type
2756 * as the current vcpu paging mode. This is nearly always true 2758 * as the current vcpu paging mode. This is nearly always true
2757 * (might be false while changing modes). Note it is verified later 2759 * (might be false while changing modes). Note it is verified later
2758 * by update_pte(). 2760 * by update_pte().
2759 */ 2761 */
2760 if ((is_pae(vcpu) && bytes == 4) || !new) { 2762 if ((is_pae(vcpu) && bytes == 4) || !new) {
2761 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 2763 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2762 if (is_pae(vcpu)) { 2764 if (is_pae(vcpu)) {
2763 gpa &= ~(gpa_t)7; 2765 gpa &= ~(gpa_t)7;
2764 bytes = 8; 2766 bytes = 8;
2765 } 2767 }
2766 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); 2768 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2767 if (r) 2769 if (r)
2768 gentry = 0; 2770 gentry = 0;
2769 new = (const u8 *)&gentry; 2771 new = (const u8 *)&gentry;
2770 } 2772 }
2771 2773
2772 switch (bytes) { 2774 switch (bytes) {
2773 case 4: 2775 case 4:
2774 gentry = *(const u32 *)new; 2776 gentry = *(const u32 *)new;
2775 break; 2777 break;
2776 case 8: 2778 case 8:
2777 gentry = *(const u64 *)new; 2779 gentry = *(const u64 *)new;
2778 break; 2780 break;
2779 default: 2781 default:
2780 gentry = 0; 2782 gentry = 0;
2781 break; 2783 break;
2782 } 2784 }
2783 2785
2784 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 2786 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2785 spin_lock(&vcpu->kvm->mmu_lock); 2787 spin_lock(&vcpu->kvm->mmu_lock);
2786 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 2788 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2787 gentry = 0; 2789 gentry = 0;
2788 kvm_mmu_access_page(vcpu, gfn); 2790 kvm_mmu_access_page(vcpu, gfn);
2789 kvm_mmu_free_some_pages(vcpu); 2791 kvm_mmu_free_some_pages(vcpu);
2790 ++vcpu->kvm->stat.mmu_pte_write; 2792 ++vcpu->kvm->stat.mmu_pte_write;
2791 kvm_mmu_audit(vcpu, "pre pte write"); 2793 kvm_mmu_audit(vcpu, "pre pte write");
2792 if (guest_initiated) { 2794 if (guest_initiated) {
2793 if (gfn == vcpu->arch.last_pt_write_gfn 2795 if (gfn == vcpu->arch.last_pt_write_gfn
2794 && !last_updated_pte_accessed(vcpu)) { 2796 && !last_updated_pte_accessed(vcpu)) {
2795 ++vcpu->arch.last_pt_write_count; 2797 ++vcpu->arch.last_pt_write_count;
2796 if (vcpu->arch.last_pt_write_count >= 3) 2798 if (vcpu->arch.last_pt_write_count >= 3)
2797 flooded = 1; 2799 flooded = 1;
2798 } else { 2800 } else {
2799 vcpu->arch.last_pt_write_gfn = gfn; 2801 vcpu->arch.last_pt_write_gfn = gfn;
2800 vcpu->arch.last_pt_write_count = 1; 2802 vcpu->arch.last_pt_write_count = 1;
2801 vcpu->arch.last_pte_updated = NULL; 2803 vcpu->arch.last_pte_updated = NULL;
2802 } 2804 }
2803 } 2805 }
2804 2806
2805 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 2807 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2806 pte_size = sp->role.cr4_pae ? 8 : 4; 2808 pte_size = sp->role.cr4_pae ? 8 : 4;
2807 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2809 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2808 misaligned |= bytes < 4; 2810 misaligned |= bytes < 4;
2809 if (misaligned || flooded) { 2811 if (misaligned || flooded) {
2810 /* 2812 /*
2811 * Misaligned accesses are too much trouble to fix 2813 * Misaligned accesses are too much trouble to fix
2812 * up; also, they usually indicate a page is not used 2814 * up; also, they usually indicate a page is not used
2813 * as a page table. 2815 * as a page table.
2814 * 2816 *
2815 * If we're seeing too many writes to a page, 2817 * If we're seeing too many writes to a page,
2816 * it may no longer be a page table, or we may be 2818 * it may no longer be a page table, or we may be
2817 * forking, in which case it is better to unmap the 2819 * forking, in which case it is better to unmap the
2818 * page. 2820 * page.
2819 */ 2821 */
2820 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2822 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2821 gpa, bytes, sp->role.word); 2823 gpa, bytes, sp->role.word);
2822 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2824 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2823 &invalid_list); 2825 &invalid_list);
2824 ++vcpu->kvm->stat.mmu_flooded; 2826 ++vcpu->kvm->stat.mmu_flooded;
2825 continue; 2827 continue;
2826 } 2828 }
2827 page_offset = offset; 2829 page_offset = offset;
2828 level = sp->role.level; 2830 level = sp->role.level;
2829 npte = 1; 2831 npte = 1;
2830 if (!sp->role.cr4_pae) { 2832 if (!sp->role.cr4_pae) {
2831 page_offset <<= 1; /* 32->64 */ 2833 page_offset <<= 1; /* 32->64 */
2832 /* 2834 /*
2833 * A 32-bit pde maps 4MB while the shadow pdes map 2835 * A 32-bit pde maps 4MB while the shadow pdes map
2834 * only 2MB. So we need to double the offset again 2836 * only 2MB. So we need to double the offset again
2835 * and zap two pdes instead of one. 2837 * and zap two pdes instead of one.
2836 */ 2838 */
2837 if (level == PT32_ROOT_LEVEL) { 2839 if (level == PT32_ROOT_LEVEL) {
2838 page_offset &= ~7; /* kill rounding error */ 2840 page_offset &= ~7; /* kill rounding error */
2839 page_offset <<= 1; 2841 page_offset <<= 1;
2840 npte = 2; 2842 npte = 2;
2841 } 2843 }
2842 quadrant = page_offset >> PAGE_SHIFT; 2844 quadrant = page_offset >> PAGE_SHIFT;
2843 page_offset &= ~PAGE_MASK; 2845 page_offset &= ~PAGE_MASK;
2844 if (quadrant != sp->role.quadrant) 2846 if (quadrant != sp->role.quadrant)
2845 continue; 2847 continue;
2846 } 2848 }
2847 local_flush = true; 2849 local_flush = true;
2848 spte = &sp->spt[page_offset / sizeof(*spte)]; 2850 spte = &sp->spt[page_offset / sizeof(*spte)];
2849 while (npte--) { 2851 while (npte--) {
2850 entry = *spte; 2852 entry = *spte;
2851 mmu_pte_write_zap_pte(vcpu, sp, spte); 2853 mmu_pte_write_zap_pte(vcpu, sp, spte);
2852 if (gentry) 2854 if (gentry)
2853 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2855 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2854 if (!remote_flush && need_remote_flush(entry, *spte)) 2856 if (!remote_flush && need_remote_flush(entry, *spte))
2855 remote_flush = true; 2857 remote_flush = true;
2856 ++spte; 2858 ++spte;
2857 } 2859 }
2858 } 2860 }
2859 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 2861 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2860 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2862 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2861 kvm_mmu_audit(vcpu, "post pte write"); 2863 kvm_mmu_audit(vcpu, "post pte write");
2862 spin_unlock(&vcpu->kvm->mmu_lock); 2864 spin_unlock(&vcpu->kvm->mmu_lock);
2863 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2865 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2864 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 2866 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2865 vcpu->arch.update_pte.pfn = bad_pfn; 2867 vcpu->arch.update_pte.pfn = bad_pfn;
2866 } 2868 }
2867 } 2869 }
2868 2870
2869 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2871 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2870 { 2872 {
2871 gpa_t gpa; 2873 gpa_t gpa;
2872 int r; 2874 int r;
2873 2875
2874 if (tdp_enabled) 2876 if (tdp_enabled)
2875 return 0; 2877 return 0;
2876 2878
2877 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2879 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2878 2880
2879 spin_lock(&vcpu->kvm->mmu_lock); 2881 spin_lock(&vcpu->kvm->mmu_lock);
2880 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2882 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2881 spin_unlock(&vcpu->kvm->mmu_lock); 2883 spin_unlock(&vcpu->kvm->mmu_lock);
2882 return r; 2884 return r;
2883 } 2885 }
2884 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 2886 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2885 2887
2886 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2888 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2887 { 2889 {
2888 int free_pages; 2890 int free_pages;
2889 LIST_HEAD(invalid_list); 2891 LIST_HEAD(invalid_list);
2890 2892
2891 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 2893 free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2892 while (free_pages < KVM_REFILL_PAGES && 2894 while (free_pages < KVM_REFILL_PAGES &&
2893 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2895 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2894 struct kvm_mmu_page *sp; 2896 struct kvm_mmu_page *sp;
2895 2897
2896 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2898 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2897 struct kvm_mmu_page, link); 2899 struct kvm_mmu_page, link);
2898 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2900 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2899 &invalid_list); 2901 &invalid_list);
2900 ++vcpu->kvm->stat.mmu_recycled; 2902 ++vcpu->kvm->stat.mmu_recycled;
2901 } 2903 }
2902 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2904 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2903 } 2905 }
2904 2906
2905 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2907 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2906 { 2908 {
2907 int r; 2909 int r;
2908 enum emulation_result er; 2910 enum emulation_result er;
2909 2911
2910 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 2912 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2911 if (r < 0) 2913 if (r < 0)
2912 goto out; 2914 goto out;
2913 2915
2914 if (!r) { 2916 if (!r) {
2915 r = 1; 2917 r = 1;
2916 goto out; 2918 goto out;
2917 } 2919 }
2918 2920
2919 r = mmu_topup_memory_caches(vcpu); 2921 r = mmu_topup_memory_caches(vcpu);
2920 if (r) 2922 if (r)
2921 goto out; 2923 goto out;
2922 2924
2923 er = emulate_instruction(vcpu, cr2, error_code, 0); 2925 er = emulate_instruction(vcpu, cr2, error_code, 0);
2924 2926
2925 switch (er) { 2927 switch (er) {
2926 case EMULATE_DONE: 2928 case EMULATE_DONE:
2927 return 1; 2929 return 1;
2928 case EMULATE_DO_MMIO: 2930 case EMULATE_DO_MMIO:
2929 ++vcpu->stat.mmio_exits; 2931 ++vcpu->stat.mmio_exits;
2930 /* fall through */ 2932 /* fall through */
2931 case EMULATE_FAIL: 2933 case EMULATE_FAIL:
2932 return 0; 2934 return 0;
2933 default: 2935 default:
2934 BUG(); 2936 BUG();
2935 } 2937 }
2936 out: 2938 out:
2937 return r; 2939 return r;
2938 } 2940 }
2939 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2941 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
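kvm_mmu_page_fault() above preserves the three-way return convention this patch relies on: a negative value (now including the -EFAULT from kvm_handle_bad_page()) is an error that propagates out to userspace, 0 means the exit still has to be completed by the VMM (e.g. MMIO emulation), and 1 means the fault was fixed and the guest can be resumed. A simplified, hypothetical caller sketch; the real consumers are the vendor exit handlers and the run loop in x86.c, which are not part of this diff:

static int handle_guest_page_fault(struct kvm_vcpu *vcpu, gva_t cr2,
				   u32 error_code)
{
	int r = kvm_mmu_page_fault(vcpu, cr2, error_code);

	if (r < 0)	/* e.g. the new -EFAULT: propagate out of KVM_RUN */
		return r;
	if (r == 0)	/* an MMIO/userspace exit was set up */
		return 0;
	return 1;	/* fault handled, re-enter the guest */
}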
2940 2942
2941 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 2943 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2942 { 2944 {
2943 vcpu->arch.mmu.invlpg(vcpu, gva); 2945 vcpu->arch.mmu.invlpg(vcpu, gva);
2944 kvm_mmu_flush_tlb(vcpu); 2946 kvm_mmu_flush_tlb(vcpu);
2945 ++vcpu->stat.invlpg; 2947 ++vcpu->stat.invlpg;
2946 } 2948 }
2947 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 2949 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2948 2950
2949 void kvm_enable_tdp(void) 2951 void kvm_enable_tdp(void)
2950 { 2952 {
2951 tdp_enabled = true; 2953 tdp_enabled = true;
2952 } 2954 }
2953 EXPORT_SYMBOL_GPL(kvm_enable_tdp); 2955 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
2954 2956
2955 void kvm_disable_tdp(void) 2957 void kvm_disable_tdp(void)
2956 { 2958 {
2957 tdp_enabled = false; 2959 tdp_enabled = false;
2958 } 2960 }
2959 EXPORT_SYMBOL_GPL(kvm_disable_tdp); 2961 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2960 2962
2961 static void free_mmu_pages(struct kvm_vcpu *vcpu) 2963 static void free_mmu_pages(struct kvm_vcpu *vcpu)
2962 { 2964 {
2963 free_page((unsigned long)vcpu->arch.mmu.pae_root); 2965 free_page((unsigned long)vcpu->arch.mmu.pae_root);
2964 } 2966 }
2965 2967
2966 static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 2968 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2967 { 2969 {
2968 struct page *page; 2970 struct page *page;
2969 int i; 2971 int i;
2970 2972
2971 ASSERT(vcpu); 2973 ASSERT(vcpu);
2972 2974
2973 /* 2975 /*
2974 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 2976 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2975 * Therefore we need to allocate shadow page tables in the first 2977 * Therefore we need to allocate shadow page tables in the first
2976 * 4GB of memory, which happens to fit the DMA32 zone. 2978 * 4GB of memory, which happens to fit the DMA32 zone.
2977 */ 2979 */
2978 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2980 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2979 if (!page) 2981 if (!page)
2980 return -ENOMEM; 2982 return -ENOMEM;
2981 2983
2982 vcpu->arch.mmu.pae_root = page_address(page); 2984 vcpu->arch.mmu.pae_root = page_address(page);
2983 for (i = 0; i < 4; ++i) 2985 for (i = 0; i < 4; ++i)
2984 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2986 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2985 2987
2986 return 0; 2988 return 0;
2987 } 2989 }
2988 2990
2989 int kvm_mmu_create(struct kvm_vcpu *vcpu) 2991 int kvm_mmu_create(struct kvm_vcpu *vcpu)
2990 { 2992 {
2991 ASSERT(vcpu); 2993 ASSERT(vcpu);
2992 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2994 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2993 2995
2994 return alloc_mmu_pages(vcpu); 2996 return alloc_mmu_pages(vcpu);
2995 } 2997 }
2996 2998
2997 int kvm_mmu_setup(struct kvm_vcpu *vcpu) 2999 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2998 { 3000 {
2999 ASSERT(vcpu); 3001 ASSERT(vcpu);
3000 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3002 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3001 3003
3002 return init_kvm_mmu(vcpu); 3004 return init_kvm_mmu(vcpu);
3003 } 3005 }
3004 3006
3005 void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3007 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3006 { 3008 {
3007 ASSERT(vcpu); 3009 ASSERT(vcpu);
3008 3010
3009 destroy_kvm_mmu(vcpu); 3011 destroy_kvm_mmu(vcpu);
3010 free_mmu_pages(vcpu); 3012 free_mmu_pages(vcpu);
3011 mmu_free_memory_caches(vcpu); 3013 mmu_free_memory_caches(vcpu);
3012 } 3014 }
3013 3015
3014 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3016 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3015 { 3017 {
3016 struct kvm_mmu_page *sp; 3018 struct kvm_mmu_page *sp;
3017 3019
3018 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 3020 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3019 int i; 3021 int i;
3020 u64 *pt; 3022 u64 *pt;
3021 3023
3022 if (!test_bit(slot, sp->slot_bitmap)) 3024 if (!test_bit(slot, sp->slot_bitmap))
3023 continue; 3025 continue;
3024 3026
3025 pt = sp->spt; 3027 pt = sp->spt;
3026 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3028 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3027 /* avoid RMW */ 3029 /* avoid RMW */
3028 if (is_writable_pte(pt[i])) 3030 if (is_writable_pte(pt[i]))
3029 pt[i] &= ~PT_WRITABLE_MASK; 3031 pt[i] &= ~PT_WRITABLE_MASK;
3030 } 3032 }
3031 kvm_flush_remote_tlbs(kvm); 3033 kvm_flush_remote_tlbs(kvm);
3032 } 3034 }
3033 3035
3034 void kvm_mmu_zap_all(struct kvm *kvm) 3036 void kvm_mmu_zap_all(struct kvm *kvm)
3035 { 3037 {
3036 struct kvm_mmu_page *sp, *node; 3038 struct kvm_mmu_page *sp, *node;
3037 LIST_HEAD(invalid_list); 3039 LIST_HEAD(invalid_list);
3038 3040
3039 spin_lock(&kvm->mmu_lock); 3041 spin_lock(&kvm->mmu_lock);
3040 restart: 3042 restart:
3041 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3043 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3042 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) 3044 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3043 goto restart; 3045 goto restart;
3044 3046
3045 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3047 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3046 spin_unlock(&kvm->mmu_lock); 3048 spin_unlock(&kvm->mmu_lock);
3047 } 3049 }
3048 3050
3049 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 3051 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3050 struct list_head *invalid_list) 3052 struct list_head *invalid_list)
3051 { 3053 {
3052 struct kvm_mmu_page *page; 3054 struct kvm_mmu_page *page;
3053 3055
3054 page = container_of(kvm->arch.active_mmu_pages.prev, 3056 page = container_of(kvm->arch.active_mmu_pages.prev,
3055 struct kvm_mmu_page, link); 3057 struct kvm_mmu_page, link);
3056 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3058 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3057 } 3059 }
3058 3060
3059 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3061 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3060 { 3062 {
3061 struct kvm *kvm; 3063 struct kvm *kvm;
3062 struct kvm *kvm_freed = NULL; 3064 struct kvm *kvm_freed = NULL;
3063 int cache_count = 0; 3065 int cache_count = 0;
3064 3066
3065 spin_lock(&kvm_lock); 3067 spin_lock(&kvm_lock);
3066 3068
3067 list_for_each_entry(kvm, &vm_list, vm_list) { 3069 list_for_each_entry(kvm, &vm_list, vm_list) {
3068 int npages, idx, freed_pages; 3070 int npages, idx, freed_pages;
3069 LIST_HEAD(invalid_list); 3071 LIST_HEAD(invalid_list);
3070 3072
3071 idx = srcu_read_lock(&kvm->srcu); 3073 idx = srcu_read_lock(&kvm->srcu);
3072 spin_lock(&kvm->mmu_lock); 3074 spin_lock(&kvm->mmu_lock);
3073 npages = kvm->arch.n_alloc_mmu_pages - 3075 npages = kvm->arch.n_alloc_mmu_pages -
3074 kvm->arch.n_free_mmu_pages; 3076 kvm->arch.n_free_mmu_pages;
3075 cache_count += npages; 3077 cache_count += npages;
3076 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3078 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3077 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3079 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3078 &invalid_list); 3080 &invalid_list);
3079 cache_count -= freed_pages; 3081 cache_count -= freed_pages;
3080 kvm_freed = kvm; 3082 kvm_freed = kvm;
3081 } 3083 }
3082 nr_to_scan--; 3084 nr_to_scan--;
3083 3085
3084 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3086 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3085 spin_unlock(&kvm->mmu_lock); 3087 spin_unlock(&kvm->mmu_lock);
3086 srcu_read_unlock(&kvm->srcu, idx); 3088 srcu_read_unlock(&kvm->srcu, idx);
3087 } 3089 }
3088 if (kvm_freed) 3090 if (kvm_freed)
3089 list_move_tail(&kvm_freed->vm_list, &vm_list); 3091 list_move_tail(&kvm_freed->vm_list, &vm_list);
3090 3092
3091 spin_unlock(&kvm_lock); 3093 spin_unlock(&kvm_lock);
3092 3094
3093 return cache_count; 3095 return cache_count;
3094 } 3096 }
3095 3097
3096 static struct shrinker mmu_shrinker = { 3098 static struct shrinker mmu_shrinker = {
3097 .shrink = mmu_shrink, 3099 .shrink = mmu_shrink,
3098 .seeks = DEFAULT_SEEKS * 10, 3100 .seeks = DEFAULT_SEEKS * 10,
3099 }; 3101 };
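
The shrinker is registered with seeks set to DEFAULT_SEEKS * 10, so the VM reclaims KVM shadow pages reluctantly. Under the shrinker API of this kernel generation the core is assumed to call the callback with nr_to_scan == 0 purely to query the cache size and with a positive count to request reclaim, which is why mmu_shrink() only zaps when nr_to_scan > 0. A tiny hypothetical helper to illustrate the query side:

/* Hypothetical query helper: nr_to_scan == 0 means "report size only",
 * so mmu_shrink() skips the zap branch and just returns cache_count. */
static int kvm_mmu_cache_size(gfp_t gfp_mask)
{
	return mmu_shrink(&mmu_shrinker, 0, gfp_mask);
}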
3100 3102
3101 static void mmu_destroy_caches(void) 3103 static void mmu_destroy_caches(void)
3102 { 3104 {
3103 if (pte_chain_cache) 3105 if (pte_chain_cache)
3104 kmem_cache_destroy(pte_chain_cache); 3106 kmem_cache_destroy(pte_chain_cache);
3105 if (rmap_desc_cache) 3107 if (rmap_desc_cache)
3106 kmem_cache_destroy(rmap_desc_cache); 3108 kmem_cache_destroy(rmap_desc_cache);
3107 if (mmu_page_header_cache) 3109 if (mmu_page_header_cache)
3108 kmem_cache_destroy(mmu_page_header_cache); 3110 kmem_cache_destroy(mmu_page_header_cache);
3109 } 3111 }
3110 3112
3111 void kvm_mmu_module_exit(void) 3113 void kvm_mmu_module_exit(void)
3112 { 3114 {
3113 mmu_destroy_caches(); 3115 mmu_destroy_caches();
3114 unregister_shrinker(&mmu_shrinker); 3116 unregister_shrinker(&mmu_shrinker);
3115 } 3117 }
3116 3118
3117 int kvm_mmu_module_init(void) 3119 int kvm_mmu_module_init(void)
3118 { 3120 {
3119 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3121 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3120 sizeof(struct kvm_pte_chain), 3122 sizeof(struct kvm_pte_chain),
3121 0, 0, NULL); 3123 0, 0, NULL);
3122 if (!pte_chain_cache) 3124 if (!pte_chain_cache)
3123 goto nomem; 3125 goto nomem;
3124 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", 3126 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3125 sizeof(struct kvm_rmap_desc), 3127 sizeof(struct kvm_rmap_desc),
3126 0, 0, NULL); 3128 0, 0, NULL);
3127 if (!rmap_desc_cache) 3129 if (!rmap_desc_cache)
3128 goto nomem; 3130 goto nomem;
3129 3131
3130 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3132 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3131 sizeof(struct kvm_mmu_page), 3133 sizeof(struct kvm_mmu_page),
3132 0, 0, NULL); 3134 0, 0, NULL);
3133 if (!mmu_page_header_cache) 3135 if (!mmu_page_header_cache)
3134 goto nomem; 3136 goto nomem;
3135 3137
3136 register_shrinker(&mmu_shrinker); 3138 register_shrinker(&mmu_shrinker);
3137 3139
3138 return 0; 3140 return 0;
3139 3141
3140 nomem: 3142 nomem:
3141 mmu_destroy_caches(); 3143 mmu_destroy_caches();
3142 return -ENOMEM; 3144 return -ENOMEM;
3143 } 3145 }
3144 3146
3145 /* 3147 /*
3146 * Calculate mmu pages needed for kvm. 3148 * Calculate mmu pages needed for kvm.
3147 */ 3149 */
3148 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) 3150 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3149 { 3151 {
3150 int i; 3152 int i;
3151 unsigned int nr_mmu_pages; 3153 unsigned int nr_mmu_pages;
3152 unsigned int nr_pages = 0; 3154 unsigned int nr_pages = 0;
3153 struct kvm_memslots *slots; 3155 struct kvm_memslots *slots;
3154 3156
3155 slots = kvm_memslots(kvm); 3157 slots = kvm_memslots(kvm);
3156 3158
3157 for (i = 0; i < slots->nmemslots; i++) 3159 for (i = 0; i < slots->nmemslots; i++)
3158 nr_pages += slots->memslots[i].npages; 3160 nr_pages += slots->memslots[i].npages;
3159 3161
3160 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3162 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3161 nr_mmu_pages = max(nr_mmu_pages, 3163 nr_mmu_pages = max(nr_mmu_pages,
3162 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); 3164 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3163 3165
3164 return nr_mmu_pages; 3166 return nr_mmu_pages;
3165 } 3167 }
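
As a worked example of the sizing rule above (standalone user-space sketch; the constants are assumptions matching contemporaneous x86 headers, where KVM_PERMILLE_MMU_PAGES is 20 and KVM_MIN_ALLOC_MMU_PAGES is 64): a 4 GiB guest with 1,048,576 pages is budgeted 20,971 shadow pages, while a tiny 2 MiB guest computes 10 and is clamped up to the 64-page floor.

#include <stdio.h>

#define KVM_PERMILLE_MMU_PAGES	20	/* assumed: 2% of guest pages */
#define KVM_MIN_ALLOC_MMU_PAGES	64	/* assumed floor */

static unsigned int calc_mmu_pages(unsigned long nr_pages)
{
	unsigned int n = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;

	return n < KVM_MIN_ALLOC_MMU_PAGES ? KVM_MIN_ALLOC_MMU_PAGES : n;
}

int main(void)
{
	printf("%u\n", calc_mmu_pages(1048576));	/* 4 GiB guest -> 20971 */
	printf("%u\n", calc_mmu_pages(512));		/* 2 MiB guest -> 64 */
	return 0;
}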
3166 3168
3167 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, 3169 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3168 unsigned len) 3170 unsigned len)
3169 { 3171 {
3170 if (len > buffer->len) 3172 if (len > buffer->len)
3171 return NULL; 3173 return NULL;
3172 return buffer->ptr; 3174 return buffer->ptr;
3173 } 3175 }
3174 3176
3175 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, 3177 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3176 unsigned len) 3178 unsigned len)
3177 { 3179 {
3178 void *ret; 3180 void *ret;
3179 3181
3180 ret = pv_mmu_peek_buffer(buffer, len); 3182 ret = pv_mmu_peek_buffer(buffer, len);
3181 if (!ret) 3183 if (!ret)
3182 return ret; 3184 return ret;
3183 buffer->ptr += len; 3185 buffer->ptr += len;
3184 buffer->len -= len; 3186 buffer->len -= len;
3185 buffer->processed += len; 3187 buffer->processed += len;
3186 return ret; 3188 return ret;
3187 } 3189 }
3188 3190
3189 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, 3191 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3190 gpa_t addr, gpa_t value) 3192 gpa_t addr, gpa_t value)
3191 { 3193 {
3192 int bytes = 8; 3194 int bytes = 8;
3193 int r; 3195 int r;
3194 3196
3195 if (!is_long_mode(vcpu) && !is_pae(vcpu)) 3197 if (!is_long_mode(vcpu) && !is_pae(vcpu))
3196 bytes = 4; 3198 bytes = 4;
3197 3199
3198 r = mmu_topup_memory_caches(vcpu); 3200 r = mmu_topup_memory_caches(vcpu);
3199 if (r) 3201 if (r)
3200 return r; 3202 return r;
3201 3203
3202 if (!emulator_write_phys(vcpu, addr, &value, bytes)) 3204 if (!emulator_write_phys(vcpu, addr, &value, bytes))
3203 return -EFAULT; 3205 return -EFAULT;
3204 3206
3205 return 1; 3207 return 1;
3206 } 3208 }
3207 3209
3208 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3210 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3209 { 3211 {
3210 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3212 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3211 return 1; 3213 return 1;
3212 } 3214 }
3213 3215
3214 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) 3216 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3215 { 3217 {
3216 spin_lock(&vcpu->kvm->mmu_lock); 3218 spin_lock(&vcpu->kvm->mmu_lock);
3217 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); 3219 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3218 spin_unlock(&vcpu->kvm->mmu_lock); 3220 spin_unlock(&vcpu->kvm->mmu_lock);
3219 return 1; 3221 return 1;
3220 } 3222 }
3221 3223
3222 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, 3224 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3223 struct kvm_pv_mmu_op_buffer *buffer) 3225 struct kvm_pv_mmu_op_buffer *buffer)
3224 { 3226 {
3225 struct kvm_mmu_op_header *header; 3227 struct kvm_mmu_op_header *header;
3226 3228
3227 header = pv_mmu_peek_buffer(buffer, sizeof *header); 3229 header = pv_mmu_peek_buffer(buffer, sizeof *header);
3228 if (!header) 3230 if (!header)
3229 return 0; 3231 return 0;
3230 switch (header->op) { 3232 switch (header->op) {
3231 case KVM_MMU_OP_WRITE_PTE: { 3233 case KVM_MMU_OP_WRITE_PTE: {
3232 struct kvm_mmu_op_write_pte *wpte; 3234 struct kvm_mmu_op_write_pte *wpte;
3233 3235
3234 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); 3236 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3235 if (!wpte) 3237 if (!wpte)
3236 return 0; 3238 return 0;
3237 return kvm_pv_mmu_write(vcpu, wpte->pte_phys, 3239 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3238 wpte->pte_val); 3240 wpte->pte_val);
3239 } 3241 }
3240 case KVM_MMU_OP_FLUSH_TLB: { 3242 case KVM_MMU_OP_FLUSH_TLB: {
3241 struct kvm_mmu_op_flush_tlb *ftlb; 3243 struct kvm_mmu_op_flush_tlb *ftlb;
3242 3244
3243 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); 3245 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3244 if (!ftlb) 3246 if (!ftlb)
3245 return 0; 3247 return 0;
3246 return kvm_pv_mmu_flush_tlb(vcpu); 3248 return kvm_pv_mmu_flush_tlb(vcpu);
3247 } 3249 }
3248 case KVM_MMU_OP_RELEASE_PT: { 3250 case KVM_MMU_OP_RELEASE_PT: {
3249 struct kvm_mmu_op_release_pt *rpt; 3251 struct kvm_mmu_op_release_pt *rpt;
3250 3252
3251 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); 3253 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3252 if (!rpt) 3254 if (!rpt)
3253 return 0; 3255 return 0;
3254 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); 3256 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3255 } 3257 }
3256 default: return 0; 3258 default: return 0;
3257 } 3259 }
3258 } 3260 }
3259 3261
3260 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 3262 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3261 gpa_t addr, unsigned long *ret) 3263 gpa_t addr, unsigned long *ret)
3262 { 3264 {
3263 int r; 3265 int r;
3264 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; 3266 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3265 3267
3266 buffer->ptr = buffer->buf; 3268 buffer->ptr = buffer->buf;
3267 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); 3269 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3268 buffer->processed = 0; 3270 buffer->processed = 0;
3269 3271
3270 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); 3272 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3271 if (r) 3273 if (r)
3272 goto out; 3274 goto out;
3273 3275
3274 while (buffer->len) { 3276 while (buffer->len) {
3275 r = kvm_pv_mmu_op_one(vcpu, buffer); 3277 r = kvm_pv_mmu_op_one(vcpu, buffer);
3276 if (r < 0) 3278 if (r < 0)
3277 goto out; 3279 goto out;
3278 if (r == 0) 3280 if (r == 0)
3279 break; 3281 break;
3280 } 3282 }
3281 3283
3282 r = 1; 3284 r = 1;
3283 out: 3285 out:
3284 *ret = buffer->processed; 3286 *ret = buffer->processed;
3285 return r; 3287 return r;
3286 } 3288 }
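
kvm_pv_mmu_op() copies up to one buffer's worth of the guest's batched MMU ops with kvm_read_guest() and replays them one by one. A hedged sketch of the assumed entry point, where the paravirtual MMU hypercall dispatcher hands over the guest-supplied length and buffer address; handle_mmu_op_hypercall() is a hypothetical name:

/* Hypothetical dispatcher fragment: report how many buffer bytes were
 * consumed so the guest can resume a partially handled batch. */
static int handle_mmu_op_hypercall(struct kvm_vcpu *vcpu,
				   unsigned long nbytes, gpa_t buf_gpa)
{
	unsigned long processed;
	int r = kvm_pv_mmu_op(vcpu, nbytes, buf_gpa, &processed);

	return r < 0 ? r : (int)processed;
}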
3287 3289
3288 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 3290 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3289 { 3291 {
3290 struct kvm_shadow_walk_iterator iterator; 3292 struct kvm_shadow_walk_iterator iterator;
3291 int nr_sptes = 0; 3293 int nr_sptes = 0;
3292 3294
3293 spin_lock(&vcpu->kvm->mmu_lock); 3295 spin_lock(&vcpu->kvm->mmu_lock);
3294 for_each_shadow_entry(vcpu, addr, iterator) { 3296 for_each_shadow_entry(vcpu, addr, iterator) {
3295 sptes[iterator.level-1] = *iterator.sptep; 3297 sptes[iterator.level-1] = *iterator.sptep;
3296 nr_sptes++; 3298 nr_sptes++;
3297 if (!is_shadow_present_pte(*iterator.sptep)) 3299 if (!is_shadow_present_pte(*iterator.sptep))
3298 break; 3300 break;
3299 } 3301 }
3300 spin_unlock(&vcpu->kvm->mmu_lock); 3302 spin_unlock(&vcpu->kvm->mmu_lock);
3301 3303
3302 return nr_sptes; 3304 return nr_sptes;
3303 } 3305 }
3304 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3306 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
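
kvm_mmu_get_spte_hierarchy() fills sptes[level - 1] for each level it walks, starting at the root, and returns how many levels it visited. A minimal sketch of a consumer, modelled on what an EPT-misconfiguration report is assumed to do, assuming a four-level root:

/* Dump the shadow entries that translate a faulting guest physical
 * address; entries live at sptes[level - 1], top level first. */
static void dump_spte_hierarchy(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	u64 sptes[4];
	int nr, level;

	nr = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
	for (level = 4; level > 4 - nr; level--)	/* assumes 4-level walk */
		printk(KERN_ERR "spte level %d: 0x%llx\n",
		       level, sptes[level - 1]);
}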
3305 3307
3306 #ifdef AUDIT 3308 #ifdef AUDIT
3307 3309
3308 static const char *audit_msg; 3310 static const char *audit_msg;
3309 3311
3310 static gva_t canonicalize(gva_t gva) 3312 static gva_t canonicalize(gva_t gva)
3311 { 3313 {
3312 #ifdef CONFIG_X86_64 3314 #ifdef CONFIG_X86_64
3313 gva = (long long)(gva << 16) >> 16; 3315 gva = (long long)(gva << 16) >> 16;
3314 #endif 3316 #endif
3315 return gva; 3317 return gva;
3316 } 3318 }
3317 3319
3318 3320
3319 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); 3321 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3320 3322
3321 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3323 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3322 inspect_spte_fn fn) 3324 inspect_spte_fn fn)
3323 { 3325 {
3324 int i; 3326 int i;
3325 3327
3326 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 3328 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3327 u64 ent = sp->spt[i]; 3329 u64 ent = sp->spt[i];
3328 3330
3329 if (is_shadow_present_pte(ent)) { 3331 if (is_shadow_present_pte(ent)) {
3330 if (!is_last_spte(ent, sp->role.level)) { 3332 if (!is_last_spte(ent, sp->role.level)) {
3331 struct kvm_mmu_page *child; 3333 struct kvm_mmu_page *child;
3332 child = page_header(ent & PT64_BASE_ADDR_MASK); 3334 child = page_header(ent & PT64_BASE_ADDR_MASK);
3333 __mmu_spte_walk(kvm, child, fn); 3335 __mmu_spte_walk(kvm, child, fn);
3334 } else 3336 } else
3335 fn(kvm, &sp->spt[i]); 3337 fn(kvm, &sp->spt[i]);
3336 } 3338 }
3337 } 3339 }
3338 } 3340 }
3339 3341
3340 static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) 3342 static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3341 { 3343 {
3342 int i; 3344 int i;
3343 struct kvm_mmu_page *sp; 3345 struct kvm_mmu_page *sp;
3344 3346
3345 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3347 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3346 return; 3348 return;
3347 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3349 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3348 hpa_t root = vcpu->arch.mmu.root_hpa; 3350 hpa_t root = vcpu->arch.mmu.root_hpa;
3349 sp = page_header(root); 3351 sp = page_header(root);
3350 __mmu_spte_walk(vcpu->kvm, sp, fn); 3352 __mmu_spte_walk(vcpu->kvm, sp, fn);
3351 return; 3353 return;
3352 } 3354 }
3353 for (i = 0; i < 4; ++i) { 3355 for (i = 0; i < 4; ++i) {
3354 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3356 hpa_t root = vcpu->arch.mmu.pae_root[i];
3355 3357
3356 if (root && VALID_PAGE(root)) { 3358 if (root && VALID_PAGE(root)) {
3357 root &= PT64_BASE_ADDR_MASK; 3359 root &= PT64_BASE_ADDR_MASK;
3358 sp = page_header(root); 3360 sp = page_header(root);
3359 __mmu_spte_walk(vcpu->kvm, sp, fn); 3361 __mmu_spte_walk(vcpu->kvm, sp, fn);
3360 } 3362 }
3361 } 3363 }
3362 return; 3364 return;
3363 } 3365 }
3364 3366
3365 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, 3367 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3366 gva_t va, int level) 3368 gva_t va, int level)
3367 { 3369 {
3368 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); 3370 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3369 int i; 3371 int i;
3370 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); 3372 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3371 3373
3372 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { 3374 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3373 u64 ent = pt[i]; 3375 u64 ent = pt[i];
3374 3376
3375 if (ent == shadow_trap_nonpresent_pte) 3377 if (ent == shadow_trap_nonpresent_pte)
3376 continue; 3378 continue;
3377 3379
3378 va = canonicalize(va); 3380 va = canonicalize(va);
3379 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3381 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3380 audit_mappings_page(vcpu, ent, va, level - 1); 3382 audit_mappings_page(vcpu, ent, va, level - 1);
3381 else { 3383 else {
3382 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); 3384 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3383 gfn_t gfn = gpa >> PAGE_SHIFT; 3385 gfn_t gfn = gpa >> PAGE_SHIFT;
3384 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3386 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3385 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3387 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3386 3388
3387 if (is_error_pfn(pfn)) { 3389 if (is_error_pfn(pfn)) {
3388 kvm_release_pfn_clean(pfn); 3390 kvm_release_pfn_clean(pfn);
3389 continue; 3391 continue;
3390 } 3392 }
3391 3393
3392 if (is_shadow_present_pte(ent) 3394 if (is_shadow_present_pte(ent)
3393 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3395 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3394 printk(KERN_ERR "xx audit error: (%s) levels %d" 3396 printk(KERN_ERR "xx audit error: (%s) levels %d"
3395 " gva %lx gpa %llx hpa %llx ent %llx %d\n", 3397 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3396 audit_msg, vcpu->arch.mmu.root_level, 3398 audit_msg, vcpu->arch.mmu.root_level,
3397 va, gpa, hpa, ent, 3399 va, gpa, hpa, ent,
3398 is_shadow_present_pte(ent)); 3400 is_shadow_present_pte(ent));
3399 else if (ent == shadow_notrap_nonpresent_pte 3401 else if (ent == shadow_notrap_nonpresent_pte
3400 && !is_error_hpa(hpa)) 3402 && !is_error_hpa(hpa))
3401 printk(KERN_ERR "audit: (%s) notrap shadow," 3403 printk(KERN_ERR "audit: (%s) notrap shadow,"
3402 " valid guest gva %lx\n", audit_msg, va); 3404 " valid guest gva %lx\n", audit_msg, va);
3403 kvm_release_pfn_clean(pfn); 3405 kvm_release_pfn_clean(pfn);
3404 3406
3405 } 3407 }
3406 } 3408 }
3407 } 3409 }
3408 3410
3409 static void audit_mappings(struct kvm_vcpu *vcpu) 3411 static void audit_mappings(struct kvm_vcpu *vcpu)
3410 { 3412 {
3411 unsigned i; 3413 unsigned i;
3412 3414
3413 if (vcpu->arch.mmu.root_level == 4) 3415 if (vcpu->arch.mmu.root_level == 4)
3414 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); 3416 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3415 else 3417 else
3416 for (i = 0; i < 4; ++i) 3418 for (i = 0; i < 4; ++i)
3417 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) 3419 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3418 audit_mappings_page(vcpu, 3420 audit_mappings_page(vcpu,
3419 vcpu->arch.mmu.pae_root[i], 3421 vcpu->arch.mmu.pae_root[i],
3420 i << 30, 3422 i << 30,
3421 2); 3423 2);
3422 } 3424 }
3423 3425
3424 static int count_rmaps(struct kvm_vcpu *vcpu) 3426 static int count_rmaps(struct kvm_vcpu *vcpu)
3425 { 3427 {
3426 struct kvm *kvm = vcpu->kvm; 3428 struct kvm *kvm = vcpu->kvm;
3427 struct kvm_memslots *slots; 3429 struct kvm_memslots *slots;
3428 int nmaps = 0; 3430 int nmaps = 0;
3429 int i, j, k, idx; 3431 int i, j, k, idx;
3430 3432
3431 idx = srcu_read_lock(&kvm->srcu); 3433 idx = srcu_read_lock(&kvm->srcu);
3432 slots = kvm_memslots(kvm); 3434 slots = kvm_memslots(kvm);
3433 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3435 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3434 struct kvm_memory_slot *m = &slots->memslots[i]; 3436 struct kvm_memory_slot *m = &slots->memslots[i];
3435 struct kvm_rmap_desc *d; 3437 struct kvm_rmap_desc *d;
3436 3438
3437 for (j = 0; j < m->npages; ++j) { 3439 for (j = 0; j < m->npages; ++j) {
3438 unsigned long *rmapp = &m->rmap[j]; 3440 unsigned long *rmapp = &m->rmap[j];
3439 3441
3440 if (!*rmapp) 3442 if (!*rmapp)
3441 continue; 3443 continue;
3442 if (!(*rmapp & 1)) { 3444 if (!(*rmapp & 1)) {
3443 ++nmaps; 3445 ++nmaps;
3444 continue; 3446 continue;
3445 } 3447 }
3446 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 3448 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3447 while (d) { 3449 while (d) {
3448 for (k = 0; k < RMAP_EXT; ++k) 3450 for (k = 0; k < RMAP_EXT; ++k)
3449 if (d->sptes[k]) 3451 if (d->sptes[k])
3450 ++nmaps; 3452 ++nmaps;
3451 else 3453 else
3452 break; 3454 break;
3453 d = d->more; 3455 d = d->more;
3454 } 3456 }
3455 } 3457 }
3456 } 3458 }
3457 srcu_read_unlock(&kvm->srcu, idx); 3459 srcu_read_unlock(&kvm->srcu, idx);
3458 return nmaps; 3460 return nmaps;
3459 } 3461 }
3460 3462
3461 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 3463 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3462 { 3464 {
3463 unsigned long *rmapp; 3465 unsigned long *rmapp;
3464 struct kvm_mmu_page *rev_sp; 3466 struct kvm_mmu_page *rev_sp;
3465 gfn_t gfn; 3467 gfn_t gfn;
3466 3468
3467 if (is_writable_pte(*sptep)) { 3469 if (is_writable_pte(*sptep)) {
3468 rev_sp = page_header(__pa(sptep)); 3470 rev_sp = page_header(__pa(sptep));
3469 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 3471 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3470 3472
3471 if (!gfn_to_memslot(kvm, gfn)) { 3473 if (!gfn_to_memslot(kvm, gfn)) {
3472 if (!printk_ratelimit()) 3474 if (!printk_ratelimit())
3473 return; 3475 return;
3474 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3476 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3475 audit_msg, gfn); 3477 audit_msg, gfn);
3476 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3478 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3477 audit_msg, (long int)(sptep - rev_sp->spt), 3479 audit_msg, (long int)(sptep - rev_sp->spt),
3478 rev_sp->gfn); 3480 rev_sp->gfn);
3479 dump_stack(); 3481 dump_stack();
3480 return; 3482 return;
3481 } 3483 }
3482 3484
3483 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 3485 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3484 if (!*rmapp) { 3486 if (!*rmapp) {
3485 if (!printk_ratelimit()) 3487 if (!printk_ratelimit())
3486 return; 3488 return;
3487 printk(KERN_ERR "%s: no rmap for writable spte %llx\n", 3489 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3488 audit_msg, *sptep); 3490 audit_msg, *sptep);
3489 dump_stack(); 3491 dump_stack();
3490 } 3492 }
3491 } 3493 }
3492 3494
3493 } 3495 }
3494 3496
3495 void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) 3497 void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3496 { 3498 {
3497 mmu_spte_walk(vcpu, inspect_spte_has_rmap); 3499 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3498 } 3500 }
3499 3501
3500 static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) 3502 static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3501 { 3503 {
3502 struct kvm_mmu_page *sp; 3504 struct kvm_mmu_page *sp;
3503 int i; 3505 int i;
3504 3506
3505 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3507 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3506 u64 *pt = sp->spt; 3508 u64 *pt = sp->spt;
3507 3509
3508 if (sp->role.level != PT_PAGE_TABLE_LEVEL) 3510 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3509 continue; 3511 continue;
3510 3512
3511 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 3513 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3512 u64 ent = pt[i]; 3514 u64 ent = pt[i];
3513 3515
3514 if (!(ent & PT_PRESENT_MASK)) 3516 if (!(ent & PT_PRESENT_MASK))
3515 continue; 3517 continue;
3516 if (!is_writable_pte(ent)) 3518 if (!is_writable_pte(ent))
3517 continue; 3519 continue;
3518 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3520 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3519 } 3521 }
3520 } 3522 }
3521 return; 3523 return;
3522 } 3524 }
3523 3525
3524 static void audit_rmap(struct kvm_vcpu *vcpu) 3526 static void audit_rmap(struct kvm_vcpu *vcpu)
3525 { 3527 {
3526 check_writable_mappings_rmap(vcpu); 3528 check_writable_mappings_rmap(vcpu);
3527 count_rmaps(vcpu); 3529 count_rmaps(vcpu);
3528 } 3530 }
3529 3531
3530 static void audit_write_protection(struct kvm_vcpu *vcpu) 3532 static void audit_write_protection(struct kvm_vcpu *vcpu)
3531 { 3533 {
3532 struct kvm_mmu_page *sp; 3534 struct kvm_mmu_page *sp;
3533 struct kvm_memory_slot *slot; 3535 struct kvm_memory_slot *slot;
3534 unsigned long *rmapp; 3536 unsigned long *rmapp;
3535 u64 *spte; 3537 u64 *spte;
3536 gfn_t gfn; 3538 gfn_t gfn;
3537 3539
3538 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3540 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3539 if (sp->role.direct) 3541 if (sp->role.direct)
3540 continue; 3542 continue;
3541 if (sp->unsync) 3543 if (sp->unsync)
3542 continue; 3544 continue;
3543 3545
3544 slot = gfn_to_memslot(vcpu->kvm, sp->gfn); 3546 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3545 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3547 rmapp = &slot->rmap[gfn - slot->base_gfn];
3546 3548
3547 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3549 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3548 while (spte) { 3550 while (spte) {
3549 if (is_writable_pte(*spte)) 3551 if (is_writable_pte(*spte))
3550 printk(KERN_ERR "%s: (%s) shadow page has " 3552 printk(KERN_ERR "%s: (%s) shadow page has "
3551 "writable mappings: gfn %lx role %x\n", 3553 "writable mappings: gfn %lx role %x\n",
3552 __func__, audit_msg, sp->gfn, 3554 __func__, audit_msg, sp->gfn,
3553 sp->role.word); 3555 sp->role.word);
3554 spte = rmap_next(vcpu->kvm, rmapp, spte); 3556 spte = rmap_next(vcpu->kvm, rmapp, spte);
3555 } 3557 }
3556 } 3558 }
3557 } 3559 }
3558 3560
3559 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) 3561 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3560 { 3562 {
3561 int olddbg = dbg; 3563 int olddbg = dbg;
3562 3564
3563 dbg = 0; 3565 dbg = 0;
3564 audit_msg = msg; 3566 audit_msg = msg;
3565 audit_rmap(vcpu); 3567 audit_rmap(vcpu);
3566 audit_write_protection(vcpu); 3568 audit_write_protection(vcpu);
3567 if (strcmp("pre pte write", audit_msg) != 0) 3569 if (strcmp("pre pte write", audit_msg) != 0)
3568 audit_mappings(vcpu); 3570 audit_mappings(vcpu);
3569 audit_writable_sptes_have_rmaps(vcpu); 3571 audit_writable_sptes_have_rmaps(vcpu);
3570 dbg = olddbg; 3572 dbg = olddbg;
3571 } 3573 }
3572 3574
3573 #endif 3575 #endif
3574 3576
include/linux/kvm_host.h
1 #ifndef __KVM_HOST_H 1 #ifndef __KVM_HOST_H
2 #define __KVM_HOST_H 2 #define __KVM_HOST_H
3 3
4 /* 4 /*
5 * This work is licensed under the terms of the GNU GPL, version 2. See 5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory. 6 * the COPYING file in the top-level directory.
7 */ 7 */
8 8
9 #include <linux/types.h> 9 #include <linux/types.h>
10 #include <linux/hardirq.h> 10 #include <linux/hardirq.h>
11 #include <linux/list.h> 11 #include <linux/list.h>
12 #include <linux/mutex.h> 12 #include <linux/mutex.h>
13 #include <linux/spinlock.h> 13 #include <linux/spinlock.h>
14 #include <linux/signal.h> 14 #include <linux/signal.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/mm.h> 16 #include <linux/mm.h>
17 #include <linux/preempt.h> 17 #include <linux/preempt.h>
18 #include <linux/msi.h> 18 #include <linux/msi.h>
19 #include <asm/signal.h> 19 #include <asm/signal.h>
20 20
21 #include <linux/kvm.h> 21 #include <linux/kvm.h>
22 #include <linux/kvm_para.h> 22 #include <linux/kvm_para.h>
23 23
24 #include <linux/kvm_types.h> 24 #include <linux/kvm_types.h>
25 25
26 #include <asm/kvm_host.h> 26 #include <asm/kvm_host.h>
27 27
28 /* 28 /*
29 * vcpu->requests bit members 29 * vcpu->requests bit members
30 */ 30 */
31 #define KVM_REQ_TLB_FLUSH 0 31 #define KVM_REQ_TLB_FLUSH 0
32 #define KVM_REQ_MIGRATE_TIMER 1 32 #define KVM_REQ_MIGRATE_TIMER 1
33 #define KVM_REQ_REPORT_TPR_ACCESS 2 33 #define KVM_REQ_REPORT_TPR_ACCESS 2
34 #define KVM_REQ_MMU_RELOAD 3 34 #define KVM_REQ_MMU_RELOAD 3
35 #define KVM_REQ_TRIPLE_FAULT 4 35 #define KVM_REQ_TRIPLE_FAULT 4
36 #define KVM_REQ_PENDING_TIMER 5 36 #define KVM_REQ_PENDING_TIMER 5
37 #define KVM_REQ_UNHALT 6 37 #define KVM_REQ_UNHALT 6
38 #define KVM_REQ_MMU_SYNC 7 38 #define KVM_REQ_MMU_SYNC 7
39 #define KVM_REQ_KVMCLOCK_UPDATE 8 39 #define KVM_REQ_KVMCLOCK_UPDATE 8
40 #define KVM_REQ_KICK 9 40 #define KVM_REQ_KICK 9
41 #define KVM_REQ_DEACTIVATE_FPU 10 41 #define KVM_REQ_DEACTIVATE_FPU 10
42 42
43 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 43 #define KVM_USERSPACE_IRQ_SOURCE_ID 0
44 44
45 struct kvm; 45 struct kvm;
46 struct kvm_vcpu; 46 struct kvm_vcpu;
47 extern struct kmem_cache *kvm_vcpu_cache; 47 extern struct kmem_cache *kvm_vcpu_cache;
48 48
49 /* 49 /*
50 * It would be nice to use something smarter than a linear search, TBD... 50 * It would be nice to use something smarter than a linear search, TBD...
51 * Thankfully we don't expect many devices to register (famous last words :), 51 * Thankfully we don't expect many devices to register (famous last words :),
52 * so until then it will suffice. At least it's abstracted so we can change 52 * so until then it will suffice. At least it's abstracted so we can change
53 * in one place. 53 * in one place.
54 */ 54 */
55 struct kvm_io_bus { 55 struct kvm_io_bus {
56 int dev_count; 56 int dev_count;
57 #define NR_IOBUS_DEVS 200 57 #define NR_IOBUS_DEVS 200
58 struct kvm_io_device *devs[NR_IOBUS_DEVS]; 58 struct kvm_io_device *devs[NR_IOBUS_DEVS];
59 }; 59 };
60 60
61 enum kvm_bus { 61 enum kvm_bus {
62 KVM_MMIO_BUS, 62 KVM_MMIO_BUS,
63 KVM_PIO_BUS, 63 KVM_PIO_BUS,
64 KVM_NR_BUSES 64 KVM_NR_BUSES
65 }; 65 };
66 66
67 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 67 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
68 int len, const void *val); 68 int len, const void *val);
69 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, 69 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
70 void *val); 70 void *val);
71 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, 71 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
72 struct kvm_io_device *dev); 72 struct kvm_io_device *dev);
73 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 73 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
74 struct kvm_io_device *dev); 74 struct kvm_io_device *dev);
75 75
76 struct kvm_vcpu { 76 struct kvm_vcpu {
77 struct kvm *kvm; 77 struct kvm *kvm;
78 #ifdef CONFIG_PREEMPT_NOTIFIERS 78 #ifdef CONFIG_PREEMPT_NOTIFIERS
79 struct preempt_notifier preempt_notifier; 79 struct preempt_notifier preempt_notifier;
80 #endif 80 #endif
81 int vcpu_id; 81 int vcpu_id;
82 struct mutex mutex; 82 struct mutex mutex;
83 int cpu; 83 int cpu;
84 atomic_t guest_mode; 84 atomic_t guest_mode;
85 struct kvm_run *run; 85 struct kvm_run *run;
86 unsigned long requests; 86 unsigned long requests;
87 unsigned long guest_debug; 87 unsigned long guest_debug;
88 int srcu_idx; 88 int srcu_idx;
89 89
90 int fpu_active; 90 int fpu_active;
91 int guest_fpu_loaded, guest_xcr0_loaded; 91 int guest_fpu_loaded, guest_xcr0_loaded;
92 wait_queue_head_t wq; 92 wait_queue_head_t wq;
93 int sigset_active; 93 int sigset_active;
94 sigset_t sigset; 94 sigset_t sigset;
95 struct kvm_vcpu_stat stat; 95 struct kvm_vcpu_stat stat;
96 96
97 #ifdef CONFIG_HAS_IOMEM 97 #ifdef CONFIG_HAS_IOMEM
98 int mmio_needed; 98 int mmio_needed;
99 int mmio_read_completed; 99 int mmio_read_completed;
100 int mmio_is_write; 100 int mmio_is_write;
101 int mmio_size; 101 int mmio_size;
102 unsigned char mmio_data[8]; 102 unsigned char mmio_data[8];
103 gpa_t mmio_phys_addr; 103 gpa_t mmio_phys_addr;
104 #endif 104 #endif
105 105
106 struct kvm_vcpu_arch arch; 106 struct kvm_vcpu_arch arch;
107 }; 107 };
108 108
109 /* 109 /*
110 * Some of the bitops functions do not support too long bitmaps. 110 * Some of the bitops functions do not support too long bitmaps.
111 * This number must be determined not to exceed such limits. 111 * This number must be determined not to exceed such limits.
112 */ 112 */
113 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) 113 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
114 114
115 struct kvm_memory_slot { 115 struct kvm_memory_slot {
116 gfn_t base_gfn; 116 gfn_t base_gfn;
117 unsigned long npages; 117 unsigned long npages;
118 unsigned long flags; 118 unsigned long flags;
119 unsigned long *rmap; 119 unsigned long *rmap;
120 unsigned long *dirty_bitmap; 120 unsigned long *dirty_bitmap;
121 struct { 121 struct {
122 unsigned long rmap_pde; 122 unsigned long rmap_pde;
123 int write_count; 123 int write_count;
124 } *lpage_info[KVM_NR_PAGE_SIZES - 1]; 124 } *lpage_info[KVM_NR_PAGE_SIZES - 1];
125 unsigned long userspace_addr; 125 unsigned long userspace_addr;
126 int user_alloc; 126 int user_alloc;
127 int id; 127 int id;
128 }; 128 };
129 129
130 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) 130 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
131 { 131 {
132 return ALIGN(memslot->npages, BITS_PER_LONG) / 8; 132 return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
133 } 133 }
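
kvm_dirty_bitmap_bytes() rounds the per-page dirty bits up to whole longs. As a standalone worked example (assuming a 64-bit host, so BITS_PER_LONG is 64, and using an equivalent round-up macro), a 100-page slot needs 128 bits, i.e. 16 bytes of bitmap:

#include <stdio.h>

#define BITS_PER_LONG	64				/* assumed 64-bit host */
#define ALIGN(x, a)	((((x) + (a) - 1) / (a)) * (a))	/* round up */

int main(void)
{
	unsigned long npages = 100;

	/* 100 dirty bits -> rounded to 128 bits -> 2 longs -> 16 bytes */
	printf("%lu bytes\n", ALIGN(npages, BITS_PER_LONG) / 8);
	return 0;
}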
134 134
135 struct kvm_kernel_irq_routing_entry { 135 struct kvm_kernel_irq_routing_entry {
136 u32 gsi; 136 u32 gsi;
137 u32 type; 137 u32 type;
138 int (*set)(struct kvm_kernel_irq_routing_entry *e, 138 int (*set)(struct kvm_kernel_irq_routing_entry *e,
139 struct kvm *kvm, int irq_source_id, int level); 139 struct kvm *kvm, int irq_source_id, int level);
140 union { 140 union {
141 struct { 141 struct {
142 unsigned irqchip; 142 unsigned irqchip;
143 unsigned pin; 143 unsigned pin;
144 } irqchip; 144 } irqchip;
145 struct msi_msg msi; 145 struct msi_msg msi;
146 }; 146 };
147 struct hlist_node link; 147 struct hlist_node link;
148 }; 148 };
149 149
150 #ifdef __KVM_HAVE_IOAPIC 150 #ifdef __KVM_HAVE_IOAPIC
151 151
152 struct kvm_irq_routing_table { 152 struct kvm_irq_routing_table {
153 int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; 153 int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
154 struct kvm_kernel_irq_routing_entry *rt_entries; 154 struct kvm_kernel_irq_routing_entry *rt_entries;
155 u32 nr_rt_entries; 155 u32 nr_rt_entries;
156 /* 156 /*
157 * Array indexed by gsi. Each entry contains list of irq chips 157 * Array indexed by gsi. Each entry contains list of irq chips
158 * the gsi is connected to. 158 * the gsi is connected to.
159 */ 159 */
160 struct hlist_head map[0]; 160 struct hlist_head map[0];
161 }; 161 };
162 162
163 #else 163 #else
164 164
165 struct kvm_irq_routing_table {}; 165 struct kvm_irq_routing_table {};
166 166
167 #endif 167 #endif
168 168
169 struct kvm_memslots { 169 struct kvm_memslots {
170 int nmemslots; 170 int nmemslots;
171 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + 171 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
172 KVM_PRIVATE_MEM_SLOTS]; 172 KVM_PRIVATE_MEM_SLOTS];
173 }; 173 };
174 174
175 struct kvm { 175 struct kvm {
176 spinlock_t mmu_lock; 176 spinlock_t mmu_lock;
177 raw_spinlock_t requests_lock; 177 raw_spinlock_t requests_lock;
178 struct mutex slots_lock; 178 struct mutex slots_lock;
179 struct mm_struct *mm; /* userspace tied to this vm */ 179 struct mm_struct *mm; /* userspace tied to this vm */
180 struct kvm_memslots *memslots; 180 struct kvm_memslots *memslots;
181 struct srcu_struct srcu; 181 struct srcu_struct srcu;
182 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 182 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
183 u32 bsp_vcpu_id; 183 u32 bsp_vcpu_id;
184 struct kvm_vcpu *bsp_vcpu; 184 struct kvm_vcpu *bsp_vcpu;
185 #endif 185 #endif
186 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 186 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
187 atomic_t online_vcpus; 187 atomic_t online_vcpus;
188 struct list_head vm_list; 188 struct list_head vm_list;
189 struct mutex lock; 189 struct mutex lock;
190 struct kvm_io_bus *buses[KVM_NR_BUSES]; 190 struct kvm_io_bus *buses[KVM_NR_BUSES];
191 #ifdef CONFIG_HAVE_KVM_EVENTFD 191 #ifdef CONFIG_HAVE_KVM_EVENTFD
192 struct { 192 struct {
193 spinlock_t lock; 193 spinlock_t lock;
194 struct list_head items; 194 struct list_head items;
195 } irqfds; 195 } irqfds;
196 struct list_head ioeventfds; 196 struct list_head ioeventfds;
197 #endif 197 #endif
198 struct kvm_vm_stat stat; 198 struct kvm_vm_stat stat;
199 struct kvm_arch arch; 199 struct kvm_arch arch;
200 atomic_t users_count; 200 atomic_t users_count;
201 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 201 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
202 struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; 202 struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
203 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 203 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
204 #endif 204 #endif
205 205
206 struct mutex irq_lock; 206 struct mutex irq_lock;
207 #ifdef CONFIG_HAVE_KVM_IRQCHIP 207 #ifdef CONFIG_HAVE_KVM_IRQCHIP
208 struct kvm_irq_routing_table *irq_routing; 208 struct kvm_irq_routing_table *irq_routing;
209 struct hlist_head mask_notifier_list; 209 struct hlist_head mask_notifier_list;
210 struct hlist_head irq_ack_notifier_list; 210 struct hlist_head irq_ack_notifier_list;
211 #endif 211 #endif
212 212
213 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER 213 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
214 struct mmu_notifier mmu_notifier; 214 struct mmu_notifier mmu_notifier;
215 unsigned long mmu_notifier_seq; 215 unsigned long mmu_notifier_seq;
216 long mmu_notifier_count; 216 long mmu_notifier_count;
217 #endif 217 #endif
218 }; 218 };
219 219
220 /* The guest did something we don't support. */ 220 /* The guest did something we don't support. */
221 #define pr_unimpl(vcpu, fmt, ...) \ 221 #define pr_unimpl(vcpu, fmt, ...) \
222 do { \ 222 do { \
223 if (printk_ratelimit()) \ 223 if (printk_ratelimit()) \
224 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ 224 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
225 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ 225 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
226 } while (0) 226 } while (0)
227 227
228 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) 228 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
229 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) 229 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
230 230
231 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) 231 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
232 { 232 {
233 smp_rmb(); 233 smp_rmb();
234 return kvm->vcpus[i]; 234 return kvm->vcpus[i];
235 } 235 }
236 236
237 #define kvm_for_each_vcpu(idx, vcpup, kvm) \ 237 #define kvm_for_each_vcpu(idx, vcpup, kvm) \
238 for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ 238 for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
239 idx < atomic_read(&kvm->online_vcpus) && vcpup; \ 239 idx < atomic_read(&kvm->online_vcpus) && vcpup; \
240 vcpup = kvm_get_vcpu(kvm, ++idx)) 240 vcpup = kvm_get_vcpu(kvm, ++idx))
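
A minimal usage sketch for the iterator above (kernel context assumed; it only uses helpers declared in this header): walk every online vcpu and kick it out of guest mode.

static void kick_all_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	/* online_vcpus bounds the walk; kvm_get_vcpu() does the smp_rmb() */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}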
241 241
242 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); 242 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
243 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); 243 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
244 244
245 void vcpu_load(struct kvm_vcpu *vcpu); 245 void vcpu_load(struct kvm_vcpu *vcpu);
246 void vcpu_put(struct kvm_vcpu *vcpu); 246 void vcpu_put(struct kvm_vcpu *vcpu);
247 247
248 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 248 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
249 struct module *module); 249 struct module *module);
250 void kvm_exit(void); 250 void kvm_exit(void);
251 251
252 void kvm_get_kvm(struct kvm *kvm); 252 void kvm_get_kvm(struct kvm *kvm);
253 void kvm_put_kvm(struct kvm *kvm); 253 void kvm_put_kvm(struct kvm *kvm);
254 254
255 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) 255 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
256 { 256 {
257 return rcu_dereference_check(kvm->memslots, 257 return rcu_dereference_check(kvm->memslots,
258 srcu_read_lock_held(&kvm->srcu) 258 srcu_read_lock_held(&kvm->srcu)
259 || lockdep_is_held(&kvm->slots_lock)); 259 || lockdep_is_held(&kvm->slots_lock));
260 } 260 }
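
kvm_memslots() is only safe inside an SRCU read-side critical section (or with slots_lock held), as the rcu_dereference_check() above encodes. A sketch of the expected read-side pattern, the same one mmu_shrink() and count_rmaps() follow in the mmu.c hunk above:

static unsigned long count_guest_pages(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	unsigned long nr_pages = 0;
	int i, idx;

	idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	for (i = 0; i < slots->nmemslots; i++)
		nr_pages += slots->memslots[i].npages;
	srcu_read_unlock(&kvm->srcu, idx);

	return nr_pages;
}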
261 261
262 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) 262 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
263 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) 263 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
264 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } 264 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
265 265
266 extern struct page *bad_page; 266 extern struct page *bad_page;
267 extern pfn_t bad_pfn; 267 extern pfn_t bad_pfn;
268 268
269 int is_error_page(struct page *page); 269 int is_error_page(struct page *page);
270 int is_error_pfn(pfn_t pfn); 270 int is_error_pfn(pfn_t pfn);
271 int is_hwpoison_pfn(pfn_t pfn); 271 int is_hwpoison_pfn(pfn_t pfn);
272 int is_fault_pfn(pfn_t pfn);
272 int kvm_is_error_hva(unsigned long addr); 273 int kvm_is_error_hva(unsigned long addr);
273 int kvm_set_memory_region(struct kvm *kvm, 274 int kvm_set_memory_region(struct kvm *kvm,
274 struct kvm_userspace_memory_region *mem, 275 struct kvm_userspace_memory_region *mem,
275 int user_alloc); 276 int user_alloc);
276 int __kvm_set_memory_region(struct kvm *kvm, 277 int __kvm_set_memory_region(struct kvm *kvm,
277 struct kvm_userspace_memory_region *mem, 278 struct kvm_userspace_memory_region *mem,
278 int user_alloc); 279 int user_alloc);
279 int kvm_arch_prepare_memory_region(struct kvm *kvm, 280 int kvm_arch_prepare_memory_region(struct kvm *kvm,
280 struct kvm_memory_slot *memslot, 281 struct kvm_memory_slot *memslot,
281 struct kvm_memory_slot old, 282 struct kvm_memory_slot old,
282 struct kvm_userspace_memory_region *mem, 283 struct kvm_userspace_memory_region *mem,
283 int user_alloc); 284 int user_alloc);
284 void kvm_arch_commit_memory_region(struct kvm *kvm, 285 void kvm_arch_commit_memory_region(struct kvm *kvm,
285 struct kvm_userspace_memory_region *mem, 286 struct kvm_userspace_memory_region *mem,
286 struct kvm_memory_slot old, 287 struct kvm_memory_slot old,
287 int user_alloc); 288 int user_alloc);
288 void kvm_disable_largepages(void); 289 void kvm_disable_largepages(void);
289 void kvm_arch_flush_shadow(struct kvm *kvm); 290 void kvm_arch_flush_shadow(struct kvm *kvm);
290 291
291 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 292 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
292 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); 293 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
293 void kvm_release_page_clean(struct page *page); 294 void kvm_release_page_clean(struct page *page);
294 void kvm_release_page_dirty(struct page *page); 295 void kvm_release_page_dirty(struct page *page);
295 void kvm_set_page_dirty(struct page *page); 296 void kvm_set_page_dirty(struct page *page);
296 void kvm_set_page_accessed(struct page *page); 297 void kvm_set_page_accessed(struct page *page);
297 298
298 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 299 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
299 pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 300 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
300 struct kvm_memory_slot *slot, gfn_t gfn); 301 struct kvm_memory_slot *slot, gfn_t gfn);
301 int memslot_id(struct kvm *kvm, gfn_t gfn); 302 int memslot_id(struct kvm *kvm, gfn_t gfn);
302 void kvm_release_pfn_dirty(pfn_t); 303 void kvm_release_pfn_dirty(pfn_t);
303 void kvm_release_pfn_clean(pfn_t pfn); 304 void kvm_release_pfn_clean(pfn_t pfn);
304 void kvm_set_pfn_dirty(pfn_t pfn); 305 void kvm_set_pfn_dirty(pfn_t pfn);
305 void kvm_set_pfn_accessed(pfn_t pfn); 306 void kvm_set_pfn_accessed(pfn_t pfn);
306 void kvm_get_pfn(pfn_t pfn); 307 void kvm_get_pfn(pfn_t pfn);
307 308
308 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 309 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
309 int len); 310 int len);
310 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 311 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
311 unsigned long len); 312 unsigned long len);
312 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); 313 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
313 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 314 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
314 int offset, int len); 315 int offset, int len);
315 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 316 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
316 unsigned long len); 317 unsigned long len);
317 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); 318 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
318 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); 319 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
319 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); 320 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
320 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); 321 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
321 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); 322 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
322 void mark_page_dirty(struct kvm *kvm, gfn_t gfn); 323 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
323 324
324 void kvm_vcpu_block(struct kvm_vcpu *vcpu); 325 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
325 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); 326 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
326 void kvm_resched(struct kvm_vcpu *vcpu); 327 void kvm_resched(struct kvm_vcpu *vcpu);
327 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 328 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
328 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 329 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
329 void kvm_flush_remote_tlbs(struct kvm *kvm); 330 void kvm_flush_remote_tlbs(struct kvm *kvm);
330 void kvm_reload_remote_mmus(struct kvm *kvm); 331 void kvm_reload_remote_mmus(struct kvm *kvm);
331 332
332 long kvm_arch_dev_ioctl(struct file *filp, 333 long kvm_arch_dev_ioctl(struct file *filp,
333 unsigned int ioctl, unsigned long arg); 334 unsigned int ioctl, unsigned long arg);
334 long kvm_arch_vcpu_ioctl(struct file *filp, 335 long kvm_arch_vcpu_ioctl(struct file *filp,
335 unsigned int ioctl, unsigned long arg); 336 unsigned int ioctl, unsigned long arg);
336 337
337 int kvm_dev_ioctl_check_extension(long ext); 338 int kvm_dev_ioctl_check_extension(long ext);
338 339
339 int kvm_get_dirty_log(struct kvm *kvm, 340 int kvm_get_dirty_log(struct kvm *kvm,
340 struct kvm_dirty_log *log, int *is_dirty); 341 struct kvm_dirty_log *log, int *is_dirty);
341 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 342 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
342 struct kvm_dirty_log *log); 343 struct kvm_dirty_log *log);
343 344
344 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 345 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
345 struct 346 struct
346 kvm_userspace_memory_region *mem, 347 kvm_userspace_memory_region *mem,
347 int user_alloc); 348 int user_alloc);
348 long kvm_arch_vm_ioctl(struct file *filp, 349 long kvm_arch_vm_ioctl(struct file *filp,
349 unsigned int ioctl, unsigned long arg); 350 unsigned int ioctl, unsigned long arg);
350 351
351 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); 352 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
352 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); 353 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
353 354
354 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 355 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
355 struct kvm_translation *tr); 356 struct kvm_translation *tr);
356 357
357 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); 358 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
358 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); 359 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
359 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 360 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
360 struct kvm_sregs *sregs); 361 struct kvm_sregs *sregs);
361 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 362 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
362 struct kvm_sregs *sregs); 363 struct kvm_sregs *sregs);
363 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 364 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
364 struct kvm_mp_state *mp_state); 365 struct kvm_mp_state *mp_state);
365 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 366 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
366 struct kvm_mp_state *mp_state); 367 struct kvm_mp_state *mp_state);
367 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 368 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
368 struct kvm_guest_debug *dbg); 369 struct kvm_guest_debug *dbg);
369 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); 370 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
370 371
371 int kvm_arch_init(void *opaque); 372 int kvm_arch_init(void *opaque);
372 void kvm_arch_exit(void); 373 void kvm_arch_exit(void);
373 374
374 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); 375 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
375 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); 376 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
376 377
377 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); 378 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
378 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); 379 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
379 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); 380 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
380 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); 381 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
381 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); 382 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
382 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); 383 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
383 384
384 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); 385 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
385 int kvm_arch_hardware_enable(void *garbage); 386 int kvm_arch_hardware_enable(void *garbage);
386 void kvm_arch_hardware_disable(void *garbage); 387 void kvm_arch_hardware_disable(void *garbage);
387 int kvm_arch_hardware_setup(void); 388 int kvm_arch_hardware_setup(void);
388 void kvm_arch_hardware_unsetup(void); 389 void kvm_arch_hardware_unsetup(void);
389 void kvm_arch_check_processor_compat(void *rtn); 390 void kvm_arch_check_processor_compat(void *rtn);
390 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 391 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
391 392
392 void kvm_free_physmem(struct kvm *kvm); 393 void kvm_free_physmem(struct kvm *kvm);
393 394
394 struct kvm *kvm_arch_create_vm(void); 395 struct kvm *kvm_arch_create_vm(void);
395 void kvm_arch_destroy_vm(struct kvm *kvm); 396 void kvm_arch_destroy_vm(struct kvm *kvm);
396 void kvm_free_all_assigned_devices(struct kvm *kvm); 397 void kvm_free_all_assigned_devices(struct kvm *kvm);
397 void kvm_arch_sync_events(struct kvm *kvm); 398 void kvm_arch_sync_events(struct kvm *kvm);
398 399
399 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 400 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
400 void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 401 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
401 402
402 int kvm_is_mmio_pfn(pfn_t pfn); 403 int kvm_is_mmio_pfn(pfn_t pfn);
403 404
404 struct kvm_irq_ack_notifier { 405 struct kvm_irq_ack_notifier {
405 struct hlist_node link; 406 struct hlist_node link;
406 unsigned gsi; 407 unsigned gsi;
407 void (*irq_acked)(struct kvm_irq_ack_notifier *kian); 408 void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
408 }; 409 };
409 410
410 #define KVM_ASSIGNED_MSIX_PENDING 0x1 411 #define KVM_ASSIGNED_MSIX_PENDING 0x1
411 struct kvm_guest_msix_entry { 412 struct kvm_guest_msix_entry {
412 u32 vector; 413 u32 vector;
413 u16 entry; 414 u16 entry;
414 u16 flags; 415 u16 flags;
415 }; 416 };
416 417
417 struct kvm_assigned_dev_kernel { 418 struct kvm_assigned_dev_kernel {
418 struct kvm_irq_ack_notifier ack_notifier; 419 struct kvm_irq_ack_notifier ack_notifier;
419 struct work_struct interrupt_work; 420 struct work_struct interrupt_work;
420 struct list_head list; 421 struct list_head list;
421 int assigned_dev_id; 422 int assigned_dev_id;
422 int host_segnr; 423 int host_segnr;
423 int host_busnr; 424 int host_busnr;
424 int host_devfn; 425 int host_devfn;
425 unsigned int entries_nr; 426 unsigned int entries_nr;
426 int host_irq; 427 int host_irq;
427 bool host_irq_disabled; 428 bool host_irq_disabled;
428 struct msix_entry *host_msix_entries; 429 struct msix_entry *host_msix_entries;
429 int guest_irq; 430 int guest_irq;
430 struct kvm_guest_msix_entry *guest_msix_entries; 431 struct kvm_guest_msix_entry *guest_msix_entries;
431 unsigned long irq_requested_type; 432 unsigned long irq_requested_type;
432 int irq_source_id; 433 int irq_source_id;
433 int flags; 434 int flags;
434 struct pci_dev *dev; 435 struct pci_dev *dev;
435 struct kvm *kvm; 436 struct kvm *kvm;
436 spinlock_t assigned_dev_lock; 437 spinlock_t assigned_dev_lock;
437 }; 438 };
438 439
439 struct kvm_irq_mask_notifier { 440 struct kvm_irq_mask_notifier {
440 void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); 441 void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
441 int irq; 442 int irq;
442 struct hlist_node link; 443 struct hlist_node link;
443 }; 444 };
444 445
445 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 446 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
446 struct kvm_irq_mask_notifier *kimn); 447 struct kvm_irq_mask_notifier *kimn);
447 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 448 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
448 struct kvm_irq_mask_notifier *kimn); 449 struct kvm_irq_mask_notifier *kimn);
449 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); 450 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
450 451
451 #ifdef __KVM_HAVE_IOAPIC 452 #ifdef __KVM_HAVE_IOAPIC
452 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, 453 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
453 union kvm_ioapic_redirect_entry *entry, 454 union kvm_ioapic_redirect_entry *entry,
454 unsigned long *deliver_bitmask); 455 unsigned long *deliver_bitmask);
455 #endif 456 #endif
456 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); 457 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
457 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 458 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
458 void kvm_register_irq_ack_notifier(struct kvm *kvm, 459 void kvm_register_irq_ack_notifier(struct kvm *kvm,
459 struct kvm_irq_ack_notifier *kian); 460 struct kvm_irq_ack_notifier *kian);
460 void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 461 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
461 struct kvm_irq_ack_notifier *kian); 462 struct kvm_irq_ack_notifier *kian);
462 int kvm_request_irq_source_id(struct kvm *kvm); 463 int kvm_request_irq_source_id(struct kvm *kvm);
463 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); 464 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
464 465
465 /* For vcpu->arch.iommu_flags */ 466 /* For vcpu->arch.iommu_flags */
466 #define KVM_IOMMU_CACHE_COHERENCY 0x1 467 #define KVM_IOMMU_CACHE_COHERENCY 0x1
467 468
468 #ifdef CONFIG_IOMMU_API 469 #ifdef CONFIG_IOMMU_API
469 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); 470 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
470 int kvm_iommu_map_guest(struct kvm *kvm); 471 int kvm_iommu_map_guest(struct kvm *kvm);
471 int kvm_iommu_unmap_guest(struct kvm *kvm); 472 int kvm_iommu_unmap_guest(struct kvm *kvm);
472 int kvm_assign_device(struct kvm *kvm, 473 int kvm_assign_device(struct kvm *kvm,
473 struct kvm_assigned_dev_kernel *assigned_dev); 474 struct kvm_assigned_dev_kernel *assigned_dev);
474 int kvm_deassign_device(struct kvm *kvm, 475 int kvm_deassign_device(struct kvm *kvm,
475 struct kvm_assigned_dev_kernel *assigned_dev); 476 struct kvm_assigned_dev_kernel *assigned_dev);
476 #else /* CONFIG_IOMMU_API */ 477 #else /* CONFIG_IOMMU_API */
477 static inline int kvm_iommu_map_pages(struct kvm *kvm, 478 static inline int kvm_iommu_map_pages(struct kvm *kvm,
478 gfn_t base_gfn, 479 gfn_t base_gfn,
479 unsigned long npages) 480 unsigned long npages)
480 { 481 {
481 return 0; 482 return 0;
482 } 483 }
483 484
484 static inline int kvm_iommu_map_guest(struct kvm *kvm) 485 static inline int kvm_iommu_map_guest(struct kvm *kvm)
485 { 486 {
486 return -ENODEV; 487 return -ENODEV;
487 } 488 }
488 489
489 static inline int kvm_iommu_unmap_guest(struct kvm *kvm) 490 static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
490 { 491 {
491 return 0; 492 return 0;
492 } 493 }
493 494
494 static inline int kvm_assign_device(struct kvm *kvm, 495 static inline int kvm_assign_device(struct kvm *kvm,
495 struct kvm_assigned_dev_kernel *assigned_dev) 496 struct kvm_assigned_dev_kernel *assigned_dev)
496 { 497 {
497 return 0; 498 return 0;
498 } 499 }
499 500
500 static inline int kvm_deassign_device(struct kvm *kvm, 501 static inline int kvm_deassign_device(struct kvm *kvm,
501 struct kvm_assigned_dev_kernel *assigned_dev) 502 struct kvm_assigned_dev_kernel *assigned_dev)
502 { 503 {
503 return 0; 504 return 0;
504 } 505 }
505 #endif /* CONFIG_IOMMU_API */ 506 #endif /* CONFIG_IOMMU_API */
506 507
507 static inline void kvm_guest_enter(void) 508 static inline void kvm_guest_enter(void)
508 { 509 {
509 account_system_vtime(current); 510 account_system_vtime(current);
510 current->flags |= PF_VCPU; 511 current->flags |= PF_VCPU;
511 } 512 }
512 513
513 static inline void kvm_guest_exit(void) 514 static inline void kvm_guest_exit(void)
514 { 515 {
515 account_system_vtime(current); 516 account_system_vtime(current);
516 current->flags &= ~PF_VCPU; 517 current->flags &= ~PF_VCPU;
517 } 518 }
518 519
519 static inline gpa_t gfn_to_gpa(gfn_t gfn) 520 static inline gpa_t gfn_to_gpa(gfn_t gfn)
520 { 521 {
521 return (gpa_t)gfn << PAGE_SHIFT; 522 return (gpa_t)gfn << PAGE_SHIFT;
522 } 523 }
523 524
524 static inline hpa_t pfn_to_hpa(pfn_t pfn) 525 static inline hpa_t pfn_to_hpa(pfn_t pfn)
525 { 526 {
526 return (hpa_t)pfn << PAGE_SHIFT; 527 return (hpa_t)pfn << PAGE_SHIFT;
527 } 528 }
528 529
529 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) 530 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
530 { 531 {
531 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); 532 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
532 } 533 }
533 534
534 enum kvm_stat_kind { 535 enum kvm_stat_kind {
535 KVM_STAT_VM, 536 KVM_STAT_VM,
536 KVM_STAT_VCPU, 537 KVM_STAT_VCPU,
537 }; 538 };
538 539
539 struct kvm_stats_debugfs_item { 540 struct kvm_stats_debugfs_item {
540 const char *name; 541 const char *name;
541 int offset; 542 int offset;
542 enum kvm_stat_kind kind; 543 enum kvm_stat_kind kind;
543 struct dentry *dentry; 544 struct dentry *dentry;
544 }; 545 };
545 extern struct kvm_stats_debugfs_item debugfs_entries[]; 546 extern struct kvm_stats_debugfs_item debugfs_entries[];
546 extern struct dentry *kvm_debugfs_dir; 547 extern struct dentry *kvm_debugfs_dir;
547 548
548 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER 549 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
549 static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) 550 static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
550 { 551 {
551 if (unlikely(vcpu->kvm->mmu_notifier_count)) 552 if (unlikely(vcpu->kvm->mmu_notifier_count))
552 return 1; 553 return 1;
553 /* 554 /*
554 * Both reads happen under the mmu_lock and both values are 555 * Both reads happen under the mmu_lock and both values are
555 * modified under mmu_lock, so there's no need for smp_rmb() 556 * modified under mmu_lock, so there's no need for smp_rmb()
556 * here in between, otherwise mmu_notifier_count should be 557 * here in between, otherwise mmu_notifier_count should be
557 * read before mmu_notifier_seq, see 558 * read before mmu_notifier_seq, see
558 * mmu_notifier_invalidate_range_end write side. 559 * mmu_notifier_invalidate_range_end write side.
559 */ 560 */
560 if (vcpu->kvm->mmu_notifier_seq != mmu_seq) 561 if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
561 return 1; 562 return 1;
562 return 0; 563 return 0;
563 } 564 }
564 #endif 565 #endif
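mmu_notifier_retry() only makes sense together with the snapshot-then-recheck pattern used by the architecture page-fault code. A minimal sketch of that consumer side follows (not part of this commit's diff; the function name and the -EAGAIN retry convention are made up for illustration, the fields and helpers are the ones declared in this header):

/*
 * Sketch: snapshot the sequence counter before resolving the host page,
 * then recheck under mmu_lock before installing the spte.  If an
 * invalidation ran in between, back out and let the caller retry.
 */
static int example_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	unsigned long mmu_seq;
	pfn_t pfn;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();			/* read the seq before touching host page tables */

	pfn = gfn_to_pfn(vcpu->kvm, gfn);	/* may sleep */

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq)) {
		spin_unlock(&vcpu->kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		return -EAGAIN;			/* hypothetical: caller retries the fault */
	}
	/* ... install the spte here ... */
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 0;
}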
565 566
566 #ifdef CONFIG_HAVE_KVM_IRQCHIP 567 #ifdef CONFIG_HAVE_KVM_IRQCHIP
567 568
568 #define KVM_MAX_IRQ_ROUTES 1024 569 #define KVM_MAX_IRQ_ROUTES 1024
569 570
570 int kvm_setup_default_irq_routing(struct kvm *kvm); 571 int kvm_setup_default_irq_routing(struct kvm *kvm);
571 int kvm_set_irq_routing(struct kvm *kvm, 572 int kvm_set_irq_routing(struct kvm *kvm,
572 const struct kvm_irq_routing_entry *entries, 573 const struct kvm_irq_routing_entry *entries,
573 unsigned nr, 574 unsigned nr,
574 unsigned flags); 575 unsigned flags);
575 void kvm_free_irq_routing(struct kvm *kvm); 576 void kvm_free_irq_routing(struct kvm *kvm);
576 577
577 #else 578 #else
578 579
579 static inline void kvm_free_irq_routing(struct kvm *kvm) {} 580 static inline void kvm_free_irq_routing(struct kvm *kvm) {}
580 581
581 #endif 582 #endif
582 583
583 #ifdef CONFIG_HAVE_KVM_EVENTFD 584 #ifdef CONFIG_HAVE_KVM_EVENTFD
584 585
585 void kvm_eventfd_init(struct kvm *kvm); 586 void kvm_eventfd_init(struct kvm *kvm);
586 int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); 587 int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
587 void kvm_irqfd_release(struct kvm *kvm); 588 void kvm_irqfd_release(struct kvm *kvm);
588 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); 589 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
589 590
590 #else 591 #else
591 592
592 static inline void kvm_eventfd_init(struct kvm *kvm) {} 593 static inline void kvm_eventfd_init(struct kvm *kvm) {}
593 static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) 594 static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
594 { 595 {
595 return -EINVAL; 596 return -EINVAL;
596 } 597 }
597 598
598 static inline void kvm_irqfd_release(struct kvm *kvm) {} 599 static inline void kvm_irqfd_release(struct kvm *kvm) {}
599 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 600 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
600 { 601 {
601 return -ENOSYS; 602 return -ENOSYS;
602 } 603 }
603 604
604 #endif /* CONFIG_HAVE_KVM_EVENTFD */ 605 #endif /* CONFIG_HAVE_KVM_EVENTFD */
605 606
606 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 607 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
607 static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) 608 static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
608 { 609 {
609 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; 610 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
610 } 611 }
611 #endif 612 #endif
612 613
613 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT 614 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
614 615
615 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 616 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
616 unsigned long arg); 617 unsigned long arg);
617 618
618 #else 619 #else
619 620
620 static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 621 static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
621 unsigned long arg) 622 unsigned long arg)
622 { 623 {
623 return -ENOTTY; 624 return -ENOTTY;
624 } 625 }
625 626
626 #endif 627 #endif
627 628
628 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) 629 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
629 { 630 {
630 set_bit(req, &vcpu->requests); 631 set_bit(req, &vcpu->requests);
631 } 632 }
632 633
633 static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu) 634 static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
634 { 635 {
635 return test_and_set_bit(req, &vcpu->requests); 636 return test_and_set_bit(req, &vcpu->requests);
636 } 637 }
637 638
638 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) 639 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
639 { 640 {
640 if (test_bit(req, &vcpu->requests)) { 641 if (test_bit(req, &vcpu->requests)) {
641 clear_bit(req, &vcpu->requests); 642 clear_bit(req, &vcpu->requests);
642 return true; 643 return true;
643 } else { 644 } else {
644 return false; 645 return false;
645 } 646 }
646 } 647 }
647 648
648 #endif 649 #endif
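The request bits above are the usual way to ask a vcpu to perform work the next time it enters its run loop. A short sketch of both sides (illustrative only; KVM_REQ_TLB_FLUSH is an existing request, the flush itself is left schematic):

/* Producer: flag the vcpu; a kick/IPI normally follows so it notices soon. */
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

/* Consumer, in the arch vcpu entry path: act on the bit exactly once. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
	/* ... flush this vcpu's TLB ... */
}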
649 650
650 651
1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * 13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory. 15 * the COPYING file in the top-level directory.
16 * 16 *
17 */ 17 */
18 18
19 #include "iodev.h" 19 #include "iodev.h"
20 20
21 #include <linux/kvm_host.h> 21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h> 22 #include <linux/kvm.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/errno.h> 24 #include <linux/errno.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/miscdevice.h> 27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h> 28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h> 29 #include <linux/reboot.h>
30 #include <linux/debugfs.h> 30 #include <linux/debugfs.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/file.h> 32 #include <linux/file.h>
33 #include <linux/sysdev.h> 33 #include <linux/sysdev.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <linux/cpumask.h> 36 #include <linux/cpumask.h>
37 #include <linux/smp.h> 37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h> 38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h> 39 #include <linux/profile.h>
40 #include <linux/kvm_para.h> 40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h> 41 #include <linux/pagemap.h>
42 #include <linux/mman.h> 42 #include <linux/mman.h>
43 #include <linux/swap.h> 43 #include <linux/swap.h>
44 #include <linux/bitops.h> 44 #include <linux/bitops.h>
45 #include <linux/spinlock.h> 45 #include <linux/spinlock.h>
46 #include <linux/compat.h> 46 #include <linux/compat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/hugetlb.h> 48 #include <linux/hugetlb.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 50
51 #include <asm/processor.h> 51 #include <asm/processor.h>
52 #include <asm/io.h> 52 #include <asm/io.h>
53 #include <asm/uaccess.h> 53 #include <asm/uaccess.h>
54 #include <asm/pgtable.h> 54 #include <asm/pgtable.h>
55 #include <asm-generic/bitops/le.h> 55 #include <asm-generic/bitops/le.h>
56 56
57 #include "coalesced_mmio.h" 57 #include "coalesced_mmio.h"
58 58
59 #define CREATE_TRACE_POINTS 59 #define CREATE_TRACE_POINTS
60 #include <trace/events/kvm.h> 60 #include <trace/events/kvm.h>
61 61
62 MODULE_AUTHOR("Qumranet"); 62 MODULE_AUTHOR("Qumranet");
63 MODULE_LICENSE("GPL"); 63 MODULE_LICENSE("GPL");
64 64
65 /* 65 /*
66 * Ordering of locks: 66 * Ordering of locks:
67 * 67 *
68 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 68 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
69 */ 69 */
70 70
71 DEFINE_SPINLOCK(kvm_lock); 71 DEFINE_SPINLOCK(kvm_lock);
72 LIST_HEAD(vm_list); 72 LIST_HEAD(vm_list);
73 73
74 static cpumask_var_t cpus_hardware_enabled; 74 static cpumask_var_t cpus_hardware_enabled;
75 static int kvm_usage_count = 0; 75 static int kvm_usage_count = 0;
76 static atomic_t hardware_enable_failed; 76 static atomic_t hardware_enable_failed;
77 77
78 struct kmem_cache *kvm_vcpu_cache; 78 struct kmem_cache *kvm_vcpu_cache;
79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
80 80
81 static __read_mostly struct preempt_ops kvm_preempt_ops; 81 static __read_mostly struct preempt_ops kvm_preempt_ops;
82 82
83 struct dentry *kvm_debugfs_dir; 83 struct dentry *kvm_debugfs_dir;
84 84
85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
86 unsigned long arg); 86 unsigned long arg);
87 static int hardware_enable_all(void); 87 static int hardware_enable_all(void);
88 static void hardware_disable_all(void); 88 static void hardware_disable_all(void);
89 89
90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91 91
92 static bool kvm_rebooting; 92 static bool kvm_rebooting;
93 93
94 static bool largepages_enabled = true; 94 static bool largepages_enabled = true;
95 95
96 static struct page *hwpoison_page; 96 static struct page *hwpoison_page;
97 static pfn_t hwpoison_pfn; 97 static pfn_t hwpoison_pfn;
98 98
99 static struct page *fault_page;
100 static pfn_t fault_pfn;
101
99 inline int kvm_is_mmio_pfn(pfn_t pfn) 102 inline int kvm_is_mmio_pfn(pfn_t pfn)
100 { 103 {
101 if (pfn_valid(pfn)) { 104 if (pfn_valid(pfn)) {
102 struct page *page = compound_head(pfn_to_page(pfn)); 105 struct page *page = compound_head(pfn_to_page(pfn));
103 return PageReserved(page); 106 return PageReserved(page);
104 } 107 }
105 108
106 return true; 109 return true;
107 } 110 }
108 111
109 /* 112 /*
110 * Switches to specified vcpu, until a matching vcpu_put() 113 * Switches to specified vcpu, until a matching vcpu_put()
111 */ 114 */
112 void vcpu_load(struct kvm_vcpu *vcpu) 115 void vcpu_load(struct kvm_vcpu *vcpu)
113 { 116 {
114 int cpu; 117 int cpu;
115 118
116 mutex_lock(&vcpu->mutex); 119 mutex_lock(&vcpu->mutex);
117 cpu = get_cpu(); 120 cpu = get_cpu();
118 preempt_notifier_register(&vcpu->preempt_notifier); 121 preempt_notifier_register(&vcpu->preempt_notifier);
119 kvm_arch_vcpu_load(vcpu, cpu); 122 kvm_arch_vcpu_load(vcpu, cpu);
120 put_cpu(); 123 put_cpu();
121 } 124 }
122 125
123 void vcpu_put(struct kvm_vcpu *vcpu) 126 void vcpu_put(struct kvm_vcpu *vcpu)
124 { 127 {
125 preempt_disable(); 128 preempt_disable();
126 kvm_arch_vcpu_put(vcpu); 129 kvm_arch_vcpu_put(vcpu);
127 preempt_notifier_unregister(&vcpu->preempt_notifier); 130 preempt_notifier_unregister(&vcpu->preempt_notifier);
128 preempt_enable(); 131 preempt_enable();
129 mutex_unlock(&vcpu->mutex); 132 mutex_unlock(&vcpu->mutex);
130 } 133 }
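vcpu_load() and vcpu_put() are meant to bracket any operation that touches per-vcpu hardware state, typically a vcpu ioctl handler. A schematic pairing (not part of the diff; the wrapper function is hypothetical):

static long example_vcpu_op(struct kvm_vcpu *vcpu)
{
	long r;

	vcpu_load(vcpu);	/* takes vcpu->mutex and loads arch state on this cpu */
	r = 0;			/* ... operate on the loaded vcpu ... */
	vcpu_put(vcpu);		/* unloads arch state and drops the mutex */
	return r;
}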
131 134
132 static void ack_flush(void *_completed) 135 static void ack_flush(void *_completed)
133 { 136 {
134 } 137 }
135 138
136 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 139 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
137 { 140 {
138 int i, cpu, me; 141 int i, cpu, me;
139 cpumask_var_t cpus; 142 cpumask_var_t cpus;
140 bool called = true; 143 bool called = true;
141 struct kvm_vcpu *vcpu; 144 struct kvm_vcpu *vcpu;
142 145
143 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 146 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
144 147
145 raw_spin_lock(&kvm->requests_lock); 148 raw_spin_lock(&kvm->requests_lock);
146 me = smp_processor_id(); 149 me = smp_processor_id();
147 kvm_for_each_vcpu(i, vcpu, kvm) { 150 kvm_for_each_vcpu(i, vcpu, kvm) {
148 if (kvm_make_check_request(req, vcpu)) 151 if (kvm_make_check_request(req, vcpu))
149 continue; 152 continue;
150 cpu = vcpu->cpu; 153 cpu = vcpu->cpu;
151 if (cpus != NULL && cpu != -1 && cpu != me) 154 if (cpus != NULL && cpu != -1 && cpu != me)
152 cpumask_set_cpu(cpu, cpus); 155 cpumask_set_cpu(cpu, cpus);
153 } 156 }
154 if (unlikely(cpus == NULL)) 157 if (unlikely(cpus == NULL))
155 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 158 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
156 else if (!cpumask_empty(cpus)) 159 else if (!cpumask_empty(cpus))
157 smp_call_function_many(cpus, ack_flush, NULL, 1); 160 smp_call_function_many(cpus, ack_flush, NULL, 1);
158 else 161 else
159 called = false; 162 called = false;
160 raw_spin_unlock(&kvm->requests_lock); 163 raw_spin_unlock(&kvm->requests_lock);
161 free_cpumask_var(cpus); 164 free_cpumask_var(cpus);
162 return called; 165 return called;
163 } 166 }
164 167
165 void kvm_flush_remote_tlbs(struct kvm *kvm) 168 void kvm_flush_remote_tlbs(struct kvm *kvm)
166 { 169 {
167 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 170 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
168 ++kvm->stat.remote_tlb_flush; 171 ++kvm->stat.remote_tlb_flush;
169 } 172 }
170 173
171 void kvm_reload_remote_mmus(struct kvm *kvm) 174 void kvm_reload_remote_mmus(struct kvm *kvm)
172 { 175 {
173 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 176 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
174 } 177 }
175 178
176 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 179 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
177 { 180 {
178 struct page *page; 181 struct page *page;
179 int r; 182 int r;
180 183
181 mutex_init(&vcpu->mutex); 184 mutex_init(&vcpu->mutex);
182 vcpu->cpu = -1; 185 vcpu->cpu = -1;
183 vcpu->kvm = kvm; 186 vcpu->kvm = kvm;
184 vcpu->vcpu_id = id; 187 vcpu->vcpu_id = id;
185 init_waitqueue_head(&vcpu->wq); 188 init_waitqueue_head(&vcpu->wq);
186 189
187 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 190 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
188 if (!page) { 191 if (!page) {
189 r = -ENOMEM; 192 r = -ENOMEM;
190 goto fail; 193 goto fail;
191 } 194 }
192 vcpu->run = page_address(page); 195 vcpu->run = page_address(page);
193 196
194 r = kvm_arch_vcpu_init(vcpu); 197 r = kvm_arch_vcpu_init(vcpu);
195 if (r < 0) 198 if (r < 0)
196 goto fail_free_run; 199 goto fail_free_run;
197 return 0; 200 return 0;
198 201
199 fail_free_run: 202 fail_free_run:
200 free_page((unsigned long)vcpu->run); 203 free_page((unsigned long)vcpu->run);
201 fail: 204 fail:
202 return r; 205 return r;
203 } 206 }
204 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 207 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
205 208
206 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 209 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
207 { 210 {
208 kvm_arch_vcpu_uninit(vcpu); 211 kvm_arch_vcpu_uninit(vcpu);
209 free_page((unsigned long)vcpu->run); 212 free_page((unsigned long)vcpu->run);
210 } 213 }
211 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 214 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
212 215
213 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 216 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
214 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 217 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
215 { 218 {
216 return container_of(mn, struct kvm, mmu_notifier); 219 return container_of(mn, struct kvm, mmu_notifier);
217 } 220 }
218 221
219 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 222 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
220 struct mm_struct *mm, 223 struct mm_struct *mm,
221 unsigned long address) 224 unsigned long address)
222 { 225 {
223 struct kvm *kvm = mmu_notifier_to_kvm(mn); 226 struct kvm *kvm = mmu_notifier_to_kvm(mn);
224 int need_tlb_flush, idx; 227 int need_tlb_flush, idx;
225 228
226 /* 229 /*
227 * When ->invalidate_page runs, the linux pte has been zapped 230 * When ->invalidate_page runs, the linux pte has been zapped
228 * already but the page is still allocated until 231 * already but the page is still allocated until
229 * ->invalidate_page returns. So if we increase the sequence 232 * ->invalidate_page returns. So if we increase the sequence
230 * here the kvm page fault will notice if the spte can't be 233 * here the kvm page fault will notice if the spte can't be
231 * established because the page is going to be freed. If 234 * established because the page is going to be freed. If
232 * instead the kvm page fault establishes the spte before 235 * instead the kvm page fault establishes the spte before
233 * ->invalidate_page runs, kvm_unmap_hva will release it 236 * ->invalidate_page runs, kvm_unmap_hva will release it
234 * before returning. 237 * before returning.
235 * 238 *
236 * The sequence increase only needs to be seen at spin_unlock 239 * The sequence increase only needs to be seen at spin_unlock
237 * time, and not at spin_lock time. 240 * time, and not at spin_lock time.
238 * 241 *
239 * Increasing the sequence after the spin_unlock would be 242 * Increasing the sequence after the spin_unlock would be
240 * unsafe because the kvm page fault could then establish the 243 * unsafe because the kvm page fault could then establish the
241 * pte after kvm_unmap_hva returned, without noticing the page 244 * pte after kvm_unmap_hva returned, without noticing the page
242 * is going to be freed. 245 * is going to be freed.
243 */ 246 */
244 idx = srcu_read_lock(&kvm->srcu); 247 idx = srcu_read_lock(&kvm->srcu);
245 spin_lock(&kvm->mmu_lock); 248 spin_lock(&kvm->mmu_lock);
246 kvm->mmu_notifier_seq++; 249 kvm->mmu_notifier_seq++;
247 need_tlb_flush = kvm_unmap_hva(kvm, address); 250 need_tlb_flush = kvm_unmap_hva(kvm, address);
248 spin_unlock(&kvm->mmu_lock); 251 spin_unlock(&kvm->mmu_lock);
249 srcu_read_unlock(&kvm->srcu, idx); 252 srcu_read_unlock(&kvm->srcu, idx);
250 253
251 /* we have to flush the tlb before the pages can be freed */ 254 /* we have to flush the tlb before the pages can be freed */
252 if (need_tlb_flush) 255 if (need_tlb_flush)
253 kvm_flush_remote_tlbs(kvm); 256 kvm_flush_remote_tlbs(kvm);
254 257
255 } 258 }
256 259
257 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 260 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
258 struct mm_struct *mm, 261 struct mm_struct *mm,
259 unsigned long address, 262 unsigned long address,
260 pte_t pte) 263 pte_t pte)
261 { 264 {
262 struct kvm *kvm = mmu_notifier_to_kvm(mn); 265 struct kvm *kvm = mmu_notifier_to_kvm(mn);
263 int idx; 266 int idx;
264 267
265 idx = srcu_read_lock(&kvm->srcu); 268 idx = srcu_read_lock(&kvm->srcu);
266 spin_lock(&kvm->mmu_lock); 269 spin_lock(&kvm->mmu_lock);
267 kvm->mmu_notifier_seq++; 270 kvm->mmu_notifier_seq++;
268 kvm_set_spte_hva(kvm, address, pte); 271 kvm_set_spte_hva(kvm, address, pte);
269 spin_unlock(&kvm->mmu_lock); 272 spin_unlock(&kvm->mmu_lock);
270 srcu_read_unlock(&kvm->srcu, idx); 273 srcu_read_unlock(&kvm->srcu, idx);
271 } 274 }
272 275
273 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 276 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
274 struct mm_struct *mm, 277 struct mm_struct *mm,
275 unsigned long start, 278 unsigned long start,
276 unsigned long end) 279 unsigned long end)
277 { 280 {
278 struct kvm *kvm = mmu_notifier_to_kvm(mn); 281 struct kvm *kvm = mmu_notifier_to_kvm(mn);
279 int need_tlb_flush = 0, idx; 282 int need_tlb_flush = 0, idx;
280 283
281 idx = srcu_read_lock(&kvm->srcu); 284 idx = srcu_read_lock(&kvm->srcu);
282 spin_lock(&kvm->mmu_lock); 285 spin_lock(&kvm->mmu_lock);
283 /* 286 /*
284 * The count increase must become visible at unlock time as no 287 * The count increase must become visible at unlock time as no
285 * spte can be established without taking the mmu_lock and 288 * spte can be established without taking the mmu_lock and
286 * count is also read inside the mmu_lock critical section. 289 * count is also read inside the mmu_lock critical section.
287 */ 290 */
288 kvm->mmu_notifier_count++; 291 kvm->mmu_notifier_count++;
289 for (; start < end; start += PAGE_SIZE) 292 for (; start < end; start += PAGE_SIZE)
290 need_tlb_flush |= kvm_unmap_hva(kvm, start); 293 need_tlb_flush |= kvm_unmap_hva(kvm, start);
291 spin_unlock(&kvm->mmu_lock); 294 spin_unlock(&kvm->mmu_lock);
292 srcu_read_unlock(&kvm->srcu, idx); 295 srcu_read_unlock(&kvm->srcu, idx);
293 296
294 /* we have to flush the tlb before the pages can be freed */ 297 /* we have to flush the tlb before the pages can be freed */
295 if (need_tlb_flush) 298 if (need_tlb_flush)
296 kvm_flush_remote_tlbs(kvm); 299 kvm_flush_remote_tlbs(kvm);
297 } 300 }
298 301
299 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 302 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
300 struct mm_struct *mm, 303 struct mm_struct *mm,
301 unsigned long start, 304 unsigned long start,
302 unsigned long end) 305 unsigned long end)
303 { 306 {
304 struct kvm *kvm = mmu_notifier_to_kvm(mn); 307 struct kvm *kvm = mmu_notifier_to_kvm(mn);
305 308
306 spin_lock(&kvm->mmu_lock); 309 spin_lock(&kvm->mmu_lock);
307 /* 310 /*
308 * This sequence increase will notify the kvm page fault that 311 * This sequence increase will notify the kvm page fault that
309 * the page that is going to be mapped in the spte could have 312 * the page that is going to be mapped in the spte could have
310 * been freed. 313 * been freed.
311 */ 314 */
312 kvm->mmu_notifier_seq++; 315 kvm->mmu_notifier_seq++;
313 /* 316 /*
314 * The above sequence increase must be visible before the 317 * The above sequence increase must be visible before the
315 * below count decrease but both values are read by the kvm 318 * below count decrease but both values are read by the kvm
316 * page fault under mmu_lock spinlock so we don't need to add 319 * page fault under mmu_lock spinlock so we don't need to add
317 * an smp_wmb() here in between the two. 320 * an smp_wmb() here in between the two.
318 */ 321 */
319 kvm->mmu_notifier_count--; 322 kvm->mmu_notifier_count--;
320 spin_unlock(&kvm->mmu_lock); 323 spin_unlock(&kvm->mmu_lock);
321 324
322 BUG_ON(kvm->mmu_notifier_count < 0); 325 BUG_ON(kvm->mmu_notifier_count < 0);
323 } 326 }
324 327
325 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 328 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
326 struct mm_struct *mm, 329 struct mm_struct *mm,
327 unsigned long address) 330 unsigned long address)
328 { 331 {
329 struct kvm *kvm = mmu_notifier_to_kvm(mn); 332 struct kvm *kvm = mmu_notifier_to_kvm(mn);
330 int young, idx; 333 int young, idx;
331 334
332 idx = srcu_read_lock(&kvm->srcu); 335 idx = srcu_read_lock(&kvm->srcu);
333 spin_lock(&kvm->mmu_lock); 336 spin_lock(&kvm->mmu_lock);
334 young = kvm_age_hva(kvm, address); 337 young = kvm_age_hva(kvm, address);
335 spin_unlock(&kvm->mmu_lock); 338 spin_unlock(&kvm->mmu_lock);
336 srcu_read_unlock(&kvm->srcu, idx); 339 srcu_read_unlock(&kvm->srcu, idx);
337 340
338 if (young) 341 if (young)
339 kvm_flush_remote_tlbs(kvm); 342 kvm_flush_remote_tlbs(kvm);
340 343
341 return young; 344 return young;
342 } 345 }
343 346
344 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 347 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
345 struct mm_struct *mm) 348 struct mm_struct *mm)
346 { 349 {
347 struct kvm *kvm = mmu_notifier_to_kvm(mn); 350 struct kvm *kvm = mmu_notifier_to_kvm(mn);
348 int idx; 351 int idx;
349 352
350 idx = srcu_read_lock(&kvm->srcu); 353 idx = srcu_read_lock(&kvm->srcu);
351 kvm_arch_flush_shadow(kvm); 354 kvm_arch_flush_shadow(kvm);
352 srcu_read_unlock(&kvm->srcu, idx); 355 srcu_read_unlock(&kvm->srcu, idx);
353 } 356 }
354 357
355 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 358 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
356 .invalidate_page = kvm_mmu_notifier_invalidate_page, 359 .invalidate_page = kvm_mmu_notifier_invalidate_page,
357 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 360 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
358 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 361 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
359 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 362 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
360 .change_pte = kvm_mmu_notifier_change_pte, 363 .change_pte = kvm_mmu_notifier_change_pte,
361 .release = kvm_mmu_notifier_release, 364 .release = kvm_mmu_notifier_release,
362 }; 365 };
363 366
364 static int kvm_init_mmu_notifier(struct kvm *kvm) 367 static int kvm_init_mmu_notifier(struct kvm *kvm)
365 { 368 {
366 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 369 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
367 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 370 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
368 } 371 }
369 372
370 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 373 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
371 374
372 static int kvm_init_mmu_notifier(struct kvm *kvm) 375 static int kvm_init_mmu_notifier(struct kvm *kvm)
373 { 376 {
374 return 0; 377 return 0;
375 } 378 }
376 379
377 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 380 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
378 381
379 static struct kvm *kvm_create_vm(void) 382 static struct kvm *kvm_create_vm(void)
380 { 383 {
381 int r = 0, i; 384 int r = 0, i;
382 struct kvm *kvm = kvm_arch_create_vm(); 385 struct kvm *kvm = kvm_arch_create_vm();
383 386
384 if (IS_ERR(kvm)) 387 if (IS_ERR(kvm))
385 goto out; 388 goto out;
386 389
387 r = hardware_enable_all(); 390 r = hardware_enable_all();
388 if (r) 391 if (r)
389 goto out_err_nodisable; 392 goto out_err_nodisable;
390 393
391 #ifdef CONFIG_HAVE_KVM_IRQCHIP 394 #ifdef CONFIG_HAVE_KVM_IRQCHIP
392 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 395 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
393 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 396 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
394 #endif 397 #endif
395 398
396 r = -ENOMEM; 399 r = -ENOMEM;
397 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 400 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
398 if (!kvm->memslots) 401 if (!kvm->memslots)
399 goto out_err; 402 goto out_err;
400 if (init_srcu_struct(&kvm->srcu)) 403 if (init_srcu_struct(&kvm->srcu))
401 goto out_err; 404 goto out_err;
402 for (i = 0; i < KVM_NR_BUSES; i++) { 405 for (i = 0; i < KVM_NR_BUSES; i++) {
403 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 406 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
404 GFP_KERNEL); 407 GFP_KERNEL);
405 if (!kvm->buses[i]) { 408 if (!kvm->buses[i]) {
406 cleanup_srcu_struct(&kvm->srcu); 409 cleanup_srcu_struct(&kvm->srcu);
407 goto out_err; 410 goto out_err;
408 } 411 }
409 } 412 }
410 413
411 r = kvm_init_mmu_notifier(kvm); 414 r = kvm_init_mmu_notifier(kvm);
412 if (r) { 415 if (r) {
413 cleanup_srcu_struct(&kvm->srcu); 416 cleanup_srcu_struct(&kvm->srcu);
414 goto out_err; 417 goto out_err;
415 } 418 }
416 419
417 kvm->mm = current->mm; 420 kvm->mm = current->mm;
418 atomic_inc(&kvm->mm->mm_count); 421 atomic_inc(&kvm->mm->mm_count);
419 spin_lock_init(&kvm->mmu_lock); 422 spin_lock_init(&kvm->mmu_lock);
420 raw_spin_lock_init(&kvm->requests_lock); 423 raw_spin_lock_init(&kvm->requests_lock);
421 kvm_eventfd_init(kvm); 424 kvm_eventfd_init(kvm);
422 mutex_init(&kvm->lock); 425 mutex_init(&kvm->lock);
423 mutex_init(&kvm->irq_lock); 426 mutex_init(&kvm->irq_lock);
424 mutex_init(&kvm->slots_lock); 427 mutex_init(&kvm->slots_lock);
425 atomic_set(&kvm->users_count, 1); 428 atomic_set(&kvm->users_count, 1);
426 spin_lock(&kvm_lock); 429 spin_lock(&kvm_lock);
427 list_add(&kvm->vm_list, &vm_list); 430 list_add(&kvm->vm_list, &vm_list);
428 spin_unlock(&kvm_lock); 431 spin_unlock(&kvm_lock);
429 out: 432 out:
430 return kvm; 433 return kvm;
431 434
432 out_err: 435 out_err:
433 hardware_disable_all(); 436 hardware_disable_all();
434 out_err_nodisable: 437 out_err_nodisable:
435 for (i = 0; i < KVM_NR_BUSES; i++) 438 for (i = 0; i < KVM_NR_BUSES; i++)
436 kfree(kvm->buses[i]); 439 kfree(kvm->buses[i]);
437 kfree(kvm->memslots); 440 kfree(kvm->memslots);
438 kfree(kvm); 441 kfree(kvm);
439 return ERR_PTR(r); 442 return ERR_PTR(r);
440 } 443 }
441 444
442 /* 445 /*
443 * Free any memory in @free but not in @dont. 446 * Free any memory in @free but not in @dont.
444 */ 447 */
445 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 448 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
446 struct kvm_memory_slot *dont) 449 struct kvm_memory_slot *dont)
447 { 450 {
448 int i; 451 int i;
449 452
450 if (!dont || free->rmap != dont->rmap) 453 if (!dont || free->rmap != dont->rmap)
451 vfree(free->rmap); 454 vfree(free->rmap);
452 455
453 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 456 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
454 vfree(free->dirty_bitmap); 457 vfree(free->dirty_bitmap);
455 458
456 459
457 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 460 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
458 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 461 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
459 vfree(free->lpage_info[i]); 462 vfree(free->lpage_info[i]);
460 free->lpage_info[i] = NULL; 463 free->lpage_info[i] = NULL;
461 } 464 }
462 } 465 }
463 466
464 free->npages = 0; 467 free->npages = 0;
465 free->dirty_bitmap = NULL; 468 free->dirty_bitmap = NULL;
466 free->rmap = NULL; 469 free->rmap = NULL;
467 } 470 }
468 471
469 void kvm_free_physmem(struct kvm *kvm) 472 void kvm_free_physmem(struct kvm *kvm)
470 { 473 {
471 int i; 474 int i;
472 struct kvm_memslots *slots = kvm->memslots; 475 struct kvm_memslots *slots = kvm->memslots;
473 476
474 for (i = 0; i < slots->nmemslots; ++i) 477 for (i = 0; i < slots->nmemslots; ++i)
475 kvm_free_physmem_slot(&slots->memslots[i], NULL); 478 kvm_free_physmem_slot(&slots->memslots[i], NULL);
476 479
477 kfree(kvm->memslots); 480 kfree(kvm->memslots);
478 } 481 }
479 482
480 static void kvm_destroy_vm(struct kvm *kvm) 483 static void kvm_destroy_vm(struct kvm *kvm)
481 { 484 {
482 int i; 485 int i;
483 struct mm_struct *mm = kvm->mm; 486 struct mm_struct *mm = kvm->mm;
484 487
485 kvm_arch_sync_events(kvm); 488 kvm_arch_sync_events(kvm);
486 spin_lock(&kvm_lock); 489 spin_lock(&kvm_lock);
487 list_del(&kvm->vm_list); 490 list_del(&kvm->vm_list);
488 spin_unlock(&kvm_lock); 491 spin_unlock(&kvm_lock);
489 kvm_free_irq_routing(kvm); 492 kvm_free_irq_routing(kvm);
490 for (i = 0; i < KVM_NR_BUSES; i++) 493 for (i = 0; i < KVM_NR_BUSES; i++)
491 kvm_io_bus_destroy(kvm->buses[i]); 494 kvm_io_bus_destroy(kvm->buses[i]);
492 kvm_coalesced_mmio_free(kvm); 495 kvm_coalesced_mmio_free(kvm);
493 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 496 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
494 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 497 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
495 #else 498 #else
496 kvm_arch_flush_shadow(kvm); 499 kvm_arch_flush_shadow(kvm);
497 #endif 500 #endif
498 kvm_arch_destroy_vm(kvm); 501 kvm_arch_destroy_vm(kvm);
499 hardware_disable_all(); 502 hardware_disable_all();
500 mmdrop(mm); 503 mmdrop(mm);
501 } 504 }
502 505
503 void kvm_get_kvm(struct kvm *kvm) 506 void kvm_get_kvm(struct kvm *kvm)
504 { 507 {
505 atomic_inc(&kvm->users_count); 508 atomic_inc(&kvm->users_count);
506 } 509 }
507 EXPORT_SYMBOL_GPL(kvm_get_kvm); 510 EXPORT_SYMBOL_GPL(kvm_get_kvm);
508 511
509 void kvm_put_kvm(struct kvm *kvm) 512 void kvm_put_kvm(struct kvm *kvm)
510 { 513 {
511 if (atomic_dec_and_test(&kvm->users_count)) 514 if (atomic_dec_and_test(&kvm->users_count))
512 kvm_destroy_vm(kvm); 515 kvm_destroy_vm(kvm);
513 } 516 }
514 EXPORT_SYMBOL_GPL(kvm_put_kvm); 517 EXPORT_SYMBOL_GPL(kvm_put_kvm);
515 518
516 519
517 static int kvm_vm_release(struct inode *inode, struct file *filp) 520 static int kvm_vm_release(struct inode *inode, struct file *filp)
518 { 521 {
519 struct kvm *kvm = filp->private_data; 522 struct kvm *kvm = filp->private_data;
520 523
521 kvm_irqfd_release(kvm); 524 kvm_irqfd_release(kvm);
522 525
523 kvm_put_kvm(kvm); 526 kvm_put_kvm(kvm);
524 return 0; 527 return 0;
525 } 528 }
526 529
527 /* 530 /*
528 * Allocate some memory and give it an address in the guest physical address 531 * Allocate some memory and give it an address in the guest physical address
529 * space. 532 * space.
530 * 533 *
531 * Discontiguous memory is allowed, mostly for framebuffers. 534 * Discontiguous memory is allowed, mostly for framebuffers.
532 * 535 *
533 * Must be called holding mmap_sem for write. 536 * Must be called holding mmap_sem for write.
534 */ 537 */
535 int __kvm_set_memory_region(struct kvm *kvm, 538 int __kvm_set_memory_region(struct kvm *kvm,
536 struct kvm_userspace_memory_region *mem, 539 struct kvm_userspace_memory_region *mem,
537 int user_alloc) 540 int user_alloc)
538 { 541 {
539 int r, flush_shadow = 0; 542 int r, flush_shadow = 0;
540 gfn_t base_gfn; 543 gfn_t base_gfn;
541 unsigned long npages; 544 unsigned long npages;
542 unsigned long i; 545 unsigned long i;
543 struct kvm_memory_slot *memslot; 546 struct kvm_memory_slot *memslot;
544 struct kvm_memory_slot old, new; 547 struct kvm_memory_slot old, new;
545 struct kvm_memslots *slots, *old_memslots; 548 struct kvm_memslots *slots, *old_memslots;
546 549
547 r = -EINVAL; 550 r = -EINVAL;
548 /* General sanity checks */ 551 /* General sanity checks */
549 if (mem->memory_size & (PAGE_SIZE - 1)) 552 if (mem->memory_size & (PAGE_SIZE - 1))
550 goto out; 553 goto out;
551 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 554 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
552 goto out; 555 goto out;
553 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 556 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
554 goto out; 557 goto out;
555 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 558 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
556 goto out; 559 goto out;
557 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 560 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
558 goto out; 561 goto out;
559 562
560 memslot = &kvm->memslots->memslots[mem->slot]; 563 memslot = &kvm->memslots->memslots[mem->slot];
561 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 564 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
562 npages = mem->memory_size >> PAGE_SHIFT; 565 npages = mem->memory_size >> PAGE_SHIFT;
563 566
564 r = -EINVAL; 567 r = -EINVAL;
565 if (npages > KVM_MEM_MAX_NR_PAGES) 568 if (npages > KVM_MEM_MAX_NR_PAGES)
566 goto out; 569 goto out;
567 570
568 if (!npages) 571 if (!npages)
569 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 572 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
570 573
571 new = old = *memslot; 574 new = old = *memslot;
572 575
573 new.id = mem->slot; 576 new.id = mem->slot;
574 new.base_gfn = base_gfn; 577 new.base_gfn = base_gfn;
575 new.npages = npages; 578 new.npages = npages;
576 new.flags = mem->flags; 579 new.flags = mem->flags;
577 580
578 /* Disallow changing a memory slot's size. */ 581 /* Disallow changing a memory slot's size. */
579 r = -EINVAL; 582 r = -EINVAL;
580 if (npages && old.npages && npages != old.npages) 583 if (npages && old.npages && npages != old.npages)
581 goto out_free; 584 goto out_free;
582 585
583 /* Check for overlaps */ 586 /* Check for overlaps */
584 r = -EEXIST; 587 r = -EEXIST;
585 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 588 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
586 struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; 589 struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
587 590
588 if (s == memslot || !s->npages) 591 if (s == memslot || !s->npages)
589 continue; 592 continue;
590 if (!((base_gfn + npages <= s->base_gfn) || 593 if (!((base_gfn + npages <= s->base_gfn) ||
591 (base_gfn >= s->base_gfn + s->npages))) 594 (base_gfn >= s->base_gfn + s->npages)))
592 goto out_free; 595 goto out_free;
593 } 596 }
594 597
595 /* Free page dirty bitmap if unneeded */ 598 /* Free page dirty bitmap if unneeded */
596 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 599 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
597 new.dirty_bitmap = NULL; 600 new.dirty_bitmap = NULL;
598 601
599 r = -ENOMEM; 602 r = -ENOMEM;
600 603
601 /* Allocate if a slot is being created */ 604 /* Allocate if a slot is being created */
602 #ifndef CONFIG_S390 605 #ifndef CONFIG_S390
603 if (npages && !new.rmap) { 606 if (npages && !new.rmap) {
604 new.rmap = vmalloc(npages * sizeof(*new.rmap)); 607 new.rmap = vmalloc(npages * sizeof(*new.rmap));
605 608
606 if (!new.rmap) 609 if (!new.rmap)
607 goto out_free; 610 goto out_free;
608 611
609 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 612 memset(new.rmap, 0, npages * sizeof(*new.rmap));
610 613
611 new.user_alloc = user_alloc; 614 new.user_alloc = user_alloc;
612 new.userspace_addr = mem->userspace_addr; 615 new.userspace_addr = mem->userspace_addr;
613 } 616 }
614 if (!npages) 617 if (!npages)
615 goto skip_lpage; 618 goto skip_lpage;
616 619
617 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 620 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
618 unsigned long ugfn; 621 unsigned long ugfn;
619 unsigned long j; 622 unsigned long j;
620 int lpages; 623 int lpages;
621 int level = i + 2; 624 int level = i + 2;
622 625
623 /* Avoid unused variable warning if no large pages */ 626 /* Avoid unused variable warning if no large pages */
624 (void)level; 627 (void)level;
625 628
626 if (new.lpage_info[i]) 629 if (new.lpage_info[i])
627 continue; 630 continue;
628 631
629 lpages = 1 + ((base_gfn + npages - 1) 632 lpages = 1 + ((base_gfn + npages - 1)
630 >> KVM_HPAGE_GFN_SHIFT(level)); 633 >> KVM_HPAGE_GFN_SHIFT(level));
631 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 634 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
632 635
633 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 636 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
634 637
635 if (!new.lpage_info[i]) 638 if (!new.lpage_info[i])
636 goto out_free; 639 goto out_free;
637 640
638 memset(new.lpage_info[i], 0, 641 memset(new.lpage_info[i], 0,
639 lpages * sizeof(*new.lpage_info[i])); 642 lpages * sizeof(*new.lpage_info[i]));
640 643
641 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 644 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
642 new.lpage_info[i][0].write_count = 1; 645 new.lpage_info[i][0].write_count = 1;
643 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 646 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
644 new.lpage_info[i][lpages - 1].write_count = 1; 647 new.lpage_info[i][lpages - 1].write_count = 1;
645 ugfn = new.userspace_addr >> PAGE_SHIFT; 648 ugfn = new.userspace_addr >> PAGE_SHIFT;
646 /* 649 /*
647 * If the gfn and userspace address are not aligned wrt each 650 * If the gfn and userspace address are not aligned wrt each
648 * other, or if explicitly asked to, disable large page 651 * other, or if explicitly asked to, disable large page
649 * support for this slot 652 * support for this slot
650 */ 653 */
651 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 654 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
652 !largepages_enabled) 655 !largepages_enabled)
653 for (j = 0; j < lpages; ++j) 656 for (j = 0; j < lpages; ++j)
654 new.lpage_info[i][j].write_count = 1; 657 new.lpage_info[i][j].write_count = 1;
655 } 658 }
656 659
657 skip_lpage: 660 skip_lpage:
658 661
659 /* Allocate page dirty bitmap if needed */ 662 /* Allocate page dirty bitmap if needed */
660 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 663 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
661 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); 664 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
662 665
663 new.dirty_bitmap = vmalloc(dirty_bytes); 666 new.dirty_bitmap = vmalloc(dirty_bytes);
664 if (!new.dirty_bitmap) 667 if (!new.dirty_bitmap)
665 goto out_free; 668 goto out_free;
666 memset(new.dirty_bitmap, 0, dirty_bytes); 669 memset(new.dirty_bitmap, 0, dirty_bytes);
667 /* destroy any largepage mappings for dirty tracking */ 670 /* destroy any largepage mappings for dirty tracking */
668 if (old.npages) 671 if (old.npages)
669 flush_shadow = 1; 672 flush_shadow = 1;
670 } 673 }
671 #else /* not defined CONFIG_S390 */ 674 #else /* not defined CONFIG_S390 */
672 new.user_alloc = user_alloc; 675 new.user_alloc = user_alloc;
673 if (user_alloc) 676 if (user_alloc)
674 new.userspace_addr = mem->userspace_addr; 677 new.userspace_addr = mem->userspace_addr;
675 #endif /* not defined CONFIG_S390 */ 678 #endif /* not defined CONFIG_S390 */
676 679
677 if (!npages) { 680 if (!npages) {
678 r = -ENOMEM; 681 r = -ENOMEM;
679 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 682 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
680 if (!slots) 683 if (!slots)
681 goto out_free; 684 goto out_free;
682 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 685 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
683 if (mem->slot >= slots->nmemslots) 686 if (mem->slot >= slots->nmemslots)
684 slots->nmemslots = mem->slot + 1; 687 slots->nmemslots = mem->slot + 1;
685 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 688 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
686 689
687 old_memslots = kvm->memslots; 690 old_memslots = kvm->memslots;
688 rcu_assign_pointer(kvm->memslots, slots); 691 rcu_assign_pointer(kvm->memslots, slots);
689 synchronize_srcu_expedited(&kvm->srcu); 692 synchronize_srcu_expedited(&kvm->srcu);
690 /* From this point no new shadow pages pointing to a deleted 693 /* From this point no new shadow pages pointing to a deleted
691 * memslot will be created. 694 * memslot will be created.
692 * 695 *
693 * validation of sp->gfn happens in: 696 * validation of sp->gfn happens in:
694 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 697 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
695 * - kvm_is_visible_gfn (mmu_check_roots) 698 * - kvm_is_visible_gfn (mmu_check_roots)
696 */ 699 */
697 kvm_arch_flush_shadow(kvm); 700 kvm_arch_flush_shadow(kvm);
698 kfree(old_memslots); 701 kfree(old_memslots);
699 } 702 }
700 703
701 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 704 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
702 if (r) 705 if (r)
703 goto out_free; 706 goto out_free;
704 707
705 #ifdef CONFIG_DMAR 708 #ifdef CONFIG_DMAR
706 /* map the pages in iommu page table */ 709 /* map the pages in iommu page table */
707 if (npages) { 710 if (npages) {
708 r = kvm_iommu_map_pages(kvm, &new); 711 r = kvm_iommu_map_pages(kvm, &new);
709 if (r) 712 if (r)
710 goto out_free; 713 goto out_free;
711 } 714 }
712 #endif 715 #endif
713 716
714 r = -ENOMEM; 717 r = -ENOMEM;
715 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 718 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
716 if (!slots) 719 if (!slots)
717 goto out_free; 720 goto out_free;
718 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 721 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
719 if (mem->slot >= slots->nmemslots) 722 if (mem->slot >= slots->nmemslots)
720 slots->nmemslots = mem->slot + 1; 723 slots->nmemslots = mem->slot + 1;
721 724
722 /* actual memory is freed via old in kvm_free_physmem_slot below */ 725 /* actual memory is freed via old in kvm_free_physmem_slot below */
723 if (!npages) { 726 if (!npages) {
724 new.rmap = NULL; 727 new.rmap = NULL;
725 new.dirty_bitmap = NULL; 728 new.dirty_bitmap = NULL;
726 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) 729 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
727 new.lpage_info[i] = NULL; 730 new.lpage_info[i] = NULL;
728 } 731 }
729 732
730 slots->memslots[mem->slot] = new; 733 slots->memslots[mem->slot] = new;
731 old_memslots = kvm->memslots; 734 old_memslots = kvm->memslots;
732 rcu_assign_pointer(kvm->memslots, slots); 735 rcu_assign_pointer(kvm->memslots, slots);
733 synchronize_srcu_expedited(&kvm->srcu); 736 synchronize_srcu_expedited(&kvm->srcu);
734 737
735 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 738 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
736 739
737 kvm_free_physmem_slot(&old, &new); 740 kvm_free_physmem_slot(&old, &new);
738 kfree(old_memslots); 741 kfree(old_memslots);
739 742
740 if (flush_shadow) 743 if (flush_shadow)
741 kvm_arch_flush_shadow(kvm); 744 kvm_arch_flush_shadow(kvm);
742 745
743 return 0; 746 return 0;
744 747
745 out_free: 748 out_free:
746 kvm_free_physmem_slot(&new, &old); 749 kvm_free_physmem_slot(&new, &old);
747 out: 750 out:
748 return r; 751 return r;
749 752
750 } 753 }
751 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 754 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
752 755
753 int kvm_set_memory_region(struct kvm *kvm, 756 int kvm_set_memory_region(struct kvm *kvm,
754 struct kvm_userspace_memory_region *mem, 757 struct kvm_userspace_memory_region *mem,
755 int user_alloc) 758 int user_alloc)
756 { 759 {
757 int r; 760 int r;
758 761
759 mutex_lock(&kvm->slots_lock); 762 mutex_lock(&kvm->slots_lock);
760 r = __kvm_set_memory_region(kvm, mem, user_alloc); 763 r = __kvm_set_memory_region(kvm, mem, user_alloc);
761 mutex_unlock(&kvm->slots_lock); 764 mutex_unlock(&kvm->slots_lock);
762 return r; 765 return r;
763 } 766 }
764 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 767 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
765 768
766 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 769 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
767 struct 770 struct
768 kvm_userspace_memory_region *mem, 771 kvm_userspace_memory_region *mem,
769 int user_alloc) 772 int user_alloc)
770 { 773 {
771 if (mem->slot >= KVM_MEMORY_SLOTS) 774 if (mem->slot >= KVM_MEMORY_SLOTS)
772 return -EINVAL; 775 return -EINVAL;
773 return kvm_set_memory_region(kvm, mem, user_alloc); 776 return kvm_set_memory_region(kvm, mem, user_alloc);
774 } 777 }
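From userspace, this path is reached through the KVM_SET_USER_MEMORY_REGION vm ioctl. A minimal sketch of registering one slot (not part of the diff; vm_fd is assumed to be a file descriptor obtained from KVM_CREATE_VM, and the slot number and size are arbitrary):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int example_add_slot(int vm_fd)
{
	size_t size = 0x100000;		/* 1 MiB of guest RAM */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = 0,		/* or KVM_MEM_LOG_DIRTY_PAGES for dirty tracking */
		.guest_phys_addr = 0,
		.memory_size = size,
		.userspace_addr = (unsigned long)mem,
	};

	if (mem == MAP_FAILED)
		return -1;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}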
775 778
776 int kvm_get_dirty_log(struct kvm *kvm, 779 int kvm_get_dirty_log(struct kvm *kvm,
777 struct kvm_dirty_log *log, int *is_dirty) 780 struct kvm_dirty_log *log, int *is_dirty)
778 { 781 {
779 struct kvm_memory_slot *memslot; 782 struct kvm_memory_slot *memslot;
780 int r, i; 783 int r, i;
781 unsigned long n; 784 unsigned long n;
782 unsigned long any = 0; 785 unsigned long any = 0;
783 786
784 r = -EINVAL; 787 r = -EINVAL;
785 if (log->slot >= KVM_MEMORY_SLOTS) 788 if (log->slot >= KVM_MEMORY_SLOTS)
786 goto out; 789 goto out;
787 790
788 memslot = &kvm->memslots->memslots[log->slot]; 791 memslot = &kvm->memslots->memslots[log->slot];
789 r = -ENOENT; 792 r = -ENOENT;
790 if (!memslot->dirty_bitmap) 793 if (!memslot->dirty_bitmap)
791 goto out; 794 goto out;
792 795
793 n = kvm_dirty_bitmap_bytes(memslot); 796 n = kvm_dirty_bitmap_bytes(memslot);
794 797
795 for (i = 0; !any && i < n/sizeof(long); ++i) 798 for (i = 0; !any && i < n/sizeof(long); ++i)
796 any = memslot->dirty_bitmap[i]; 799 any = memslot->dirty_bitmap[i];
797 800
798 r = -EFAULT; 801 r = -EFAULT;
799 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 802 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
800 goto out; 803 goto out;
801 804
802 if (any) 805 if (any)
803 *is_dirty = 1; 806 *is_dirty = 1;
804 807
805 r = 0; 808 r = 0;
806 out: 809 out:
807 return r; 810 return r;
808 } 811 }
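The dirty log reaches userspace through the KVM_GET_DIRTY_LOG vm ioctl; the caller supplies a bitmap with one bit per page of the slot. A sketch under those assumptions (not part of the diff; the bitmap is sized for 64-bit longs, matching kvm_dirty_bitmap_bytes() on 64-bit hosts):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* npages: number of guest pages in the slot being queried */
int example_get_dirty_log(int vm_fd, int slot, unsigned long npages)
{
	size_t bytes = ((npages + 63) / 64) * 8;	/* bitmap rounded up to 64-bit words */
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = calloc(1, bytes),
	};
	int r;

	if (!log.dirty_bitmap)
		return -1;
	r = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	/* ... walk the bitmap: bit n set means page n of the slot was written ... */
	free(log.dirty_bitmap);
	return r;
}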
809 812
810 void kvm_disable_largepages(void) 813 void kvm_disable_largepages(void)
811 { 814 {
812 largepages_enabled = false; 815 largepages_enabled = false;
813 } 816 }
814 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 817 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
815 818
816 int is_error_page(struct page *page) 819 int is_error_page(struct page *page)
817 { 820 {
818 return page == bad_page || page == hwpoison_page; 821 return page == bad_page || page == hwpoison_page || page == fault_page;
819 } 822 }
820 EXPORT_SYMBOL_GPL(is_error_page); 823 EXPORT_SYMBOL_GPL(is_error_page);
821 824
822 int is_error_pfn(pfn_t pfn) 825 int is_error_pfn(pfn_t pfn)
823 { 826 {
824 return pfn == bad_pfn || pfn == hwpoison_pfn; 827 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
825 } 828 }
826 EXPORT_SYMBOL_GPL(is_error_pfn); 829 EXPORT_SYMBOL_GPL(is_error_pfn);
827 830
828 int is_hwpoison_pfn(pfn_t pfn) 831 int is_hwpoison_pfn(pfn_t pfn)
829 { 832 {
830 return pfn == hwpoison_pfn; 833 return pfn == hwpoison_pfn;
831 } 834 }
832 EXPORT_SYMBOL_GPL(is_hwpoison_pfn); 835 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
833 836
837 int is_fault_pfn(pfn_t pfn)
838 {
839 return pfn == fault_pfn;
840 }
841 EXPORT_SYMBOL_GPL(is_fault_pfn);
842
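The new is_fault_pfn() lets callers tell "the memslot exists but the host mapping is unusable" apart from MMIO and from hwpoison; the architecture fault handlers are the intended consumers. A hypothetical check along those lines (not part of this file's diff; the helper name and return convention are invented for illustration):

/* Hypothetical arch-side check after resolving a guest frame. */
static int example_check_pfn(pfn_t pfn)
{
	if (is_hwpoison_pfn(pfn)) {
		kvm_release_pfn_clean(pfn);
		/* ... report the poisoned page to the guest/userspace ... */
		return 0;
	}
	if (is_fault_pfn(pfn)) {
		kvm_release_pfn_clean(pfn);
		return -EFAULT;		/* propagated out of the kvm ioctl */
	}
	return 1;			/* pfn is usable */
}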
834 static inline unsigned long bad_hva(void) 843 static inline unsigned long bad_hva(void)
835 { 844 {
836 return PAGE_OFFSET; 845 return PAGE_OFFSET;
837 } 846 }
838 847
839 int kvm_is_error_hva(unsigned long addr) 848 int kvm_is_error_hva(unsigned long addr)
840 { 849 {
841 return addr == bad_hva(); 850 return addr == bad_hva();
842 } 851 }
843 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 852 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
844 853
845 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 854 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
846 { 855 {
847 int i; 856 int i;
848 struct kvm_memslots *slots = kvm_memslots(kvm); 857 struct kvm_memslots *slots = kvm_memslots(kvm);
849 858
850 for (i = 0; i < slots->nmemslots; ++i) { 859 for (i = 0; i < slots->nmemslots; ++i) {
851 struct kvm_memory_slot *memslot = &slots->memslots[i]; 860 struct kvm_memory_slot *memslot = &slots->memslots[i];
852 861
853 if (gfn >= memslot->base_gfn 862 if (gfn >= memslot->base_gfn
854 && gfn < memslot->base_gfn + memslot->npages) 863 && gfn < memslot->base_gfn + memslot->npages)
855 return memslot; 864 return memslot;
856 } 865 }
857 return NULL; 866 return NULL;
858 } 867 }
859 EXPORT_SYMBOL_GPL(gfn_to_memslot); 868 EXPORT_SYMBOL_GPL(gfn_to_memslot);
860 869
861 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 870 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
862 { 871 {
863 int i; 872 int i;
864 struct kvm_memslots *slots = kvm_memslots(kvm); 873 struct kvm_memslots *slots = kvm_memslots(kvm);
865 874
866 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 875 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
867 struct kvm_memory_slot *memslot = &slots->memslots[i]; 876 struct kvm_memory_slot *memslot = &slots->memslots[i];
868 877
869 if (memslot->flags & KVM_MEMSLOT_INVALID) 878 if (memslot->flags & KVM_MEMSLOT_INVALID)
870 continue; 879 continue;
871 880
872 if (gfn >= memslot->base_gfn 881 if (gfn >= memslot->base_gfn
873 && gfn < memslot->base_gfn + memslot->npages) 882 && gfn < memslot->base_gfn + memslot->npages)
874 return 1; 883 return 1;
875 } 884 }
876 return 0; 885 return 0;
877 } 886 }
878 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 887 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
879 888
880 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 889 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
881 { 890 {
882 struct vm_area_struct *vma; 891 struct vm_area_struct *vma;
883 unsigned long addr, size; 892 unsigned long addr, size;
884 893
885 size = PAGE_SIZE; 894 size = PAGE_SIZE;
886 895
887 addr = gfn_to_hva(kvm, gfn); 896 addr = gfn_to_hva(kvm, gfn);
888 if (kvm_is_error_hva(addr)) 897 if (kvm_is_error_hva(addr))
889 return PAGE_SIZE; 898 return PAGE_SIZE;
890 899
891 down_read(&current->mm->mmap_sem); 900 down_read(&current->mm->mmap_sem);
892 vma = find_vma(current->mm, addr); 901 vma = find_vma(current->mm, addr);
893 if (!vma) 902 if (!vma)
894 goto out; 903 goto out;
895 904
896 size = vma_kernel_pagesize(vma); 905 size = vma_kernel_pagesize(vma);
897 906
898 out: 907 out:
899 up_read(&current->mm->mmap_sem); 908 up_read(&current->mm->mmap_sem);
900 909
901 return size; 910 return size;
902 } 911 }
903 912
904 int memslot_id(struct kvm *kvm, gfn_t gfn) 913 int memslot_id(struct kvm *kvm, gfn_t gfn)
905 { 914 {
906 int i; 915 int i;
907 struct kvm_memslots *slots = kvm_memslots(kvm); 916 struct kvm_memslots *slots = kvm_memslots(kvm);
908 struct kvm_memory_slot *memslot = NULL; 917 struct kvm_memory_slot *memslot = NULL;
909 918
910 for (i = 0; i < slots->nmemslots; ++i) { 919 for (i = 0; i < slots->nmemslots; ++i) {
911 memslot = &slots->memslots[i]; 920 memslot = &slots->memslots[i];
912 921
913 if (gfn >= memslot->base_gfn 922 if (gfn >= memslot->base_gfn
914 && gfn < memslot->base_gfn + memslot->npages) 923 && gfn < memslot->base_gfn + memslot->npages)
915 break; 924 break;
916 } 925 }
917 926
918 return memslot - slots->memslots; 927 return memslot - slots->memslots;
919 } 928 }
920 929
921 static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 930 static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
922 { 931 {
923 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; 932 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
924 } 933 }
925 934
926 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 935 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
927 { 936 {
928 struct kvm_memory_slot *slot; 937 struct kvm_memory_slot *slot;
929 938
930 slot = gfn_to_memslot(kvm, gfn); 939 slot = gfn_to_memslot(kvm, gfn);
931 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 940 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
932 return bad_hva(); 941 return bad_hva();
933 return gfn_to_hva_memslot(slot, gfn); 942 return gfn_to_hva_memslot(slot, gfn);
934 } 943 }
935 EXPORT_SYMBOL_GPL(gfn_to_hva); 944 EXPORT_SYMBOL_GPL(gfn_to_hva);
936 945
937 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) 946 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
938 { 947 {
939 struct page *page[1]; 948 struct page *page[1];
940 int npages; 949 int npages;
941 pfn_t pfn; 950 pfn_t pfn;
942 951
943 might_sleep(); 952 might_sleep();
944 953
945 npages = get_user_pages_fast(addr, 1, 1, page); 954 npages = get_user_pages_fast(addr, 1, 1, page);
946 955
947 if (unlikely(npages != 1)) { 956 if (unlikely(npages != 1)) {
948 struct vm_area_struct *vma; 957 struct vm_area_struct *vma;
949 958
950 down_read(&current->mm->mmap_sem); 959 down_read(&current->mm->mmap_sem);
951 if (is_hwpoison_address(addr)) { 960 if (is_hwpoison_address(addr)) {
952 up_read(&current->mm->mmap_sem); 961 up_read(&current->mm->mmap_sem);
953 get_page(hwpoison_page); 962 get_page(hwpoison_page);
954 return page_to_pfn(hwpoison_page); 963 return page_to_pfn(hwpoison_page);
955 } 964 }
956 965
957 vma = find_vma(current->mm, addr); 966 vma = find_vma(current->mm, addr);
958 967
959 if (vma == NULL || addr < vma->vm_start || 968 if (vma == NULL || addr < vma->vm_start ||
960 !(vma->vm_flags & VM_PFNMAP)) { 969 !(vma->vm_flags & VM_PFNMAP)) {
961 up_read(&current->mm->mmap_sem); 970 up_read(&current->mm->mmap_sem);
962 get_page(bad_page); 971 get_page(fault_page);
963 return page_to_pfn(bad_page); 972 return page_to_pfn(fault_page);
964 } 973 }
965 974
966 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 975 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
967 up_read(&current->mm->mmap_sem); 976 up_read(&current->mm->mmap_sem);
968 BUG_ON(!kvm_is_mmio_pfn(pfn)); 977 BUG_ON(!kvm_is_mmio_pfn(pfn));
969 } else 978 } else
970 pfn = page_to_pfn(page[0]); 979 pfn = page_to_pfn(page[0]);
971 980
972 return pfn; 981 return pfn;
973 } 982 }
974 983
975 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 984 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
976 { 985 {
977 unsigned long addr; 986 unsigned long addr;
978 987
979 addr = gfn_to_hva(kvm, gfn); 988 addr = gfn_to_hva(kvm, gfn);
980 if (kvm_is_error_hva(addr)) { 989 if (kvm_is_error_hva(addr)) {
981 get_page(bad_page); 990 get_page(bad_page);
982 return page_to_pfn(bad_page); 991 return page_to_pfn(bad_page);
983 } 992 }
984 993
985 return hva_to_pfn(kvm, addr); 994 return hva_to_pfn(kvm, addr);
986 } 995 }
987 EXPORT_SYMBOL_GPL(gfn_to_pfn); 996 EXPORT_SYMBOL_GPL(gfn_to_pfn);
988 997
989 pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 998 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
990 struct kvm_memory_slot *slot, gfn_t gfn) 999 struct kvm_memory_slot *slot, gfn_t gfn)
991 { 1000 {
992 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1001 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
993 return hva_to_pfn(kvm, addr); 1002 return hva_to_pfn(kvm, addr);
994 } 1003 }
995 1004
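Two things are visible in the hunk above: when get_user_pages_fast() fails and the address is not hwpoisoned and not covered by a VM_PFNMAP area, hva_to_pfn() now hands back fault_page where it used to hand back bad_page, while gfn_to_pfn() keeps returning bad_page for an hva outside any memslot; and for VM_PFNMAP areas the pfn is still computed directly from the VMA. A small stand-alone sketch of that VM_PFNMAP calculation with invented numbers (not from the patch; PAGE_SHIFT of 12 and a 64-bit unsigned long are assumed):

/*
 * Illustration only: the VM_PFNMAP branch of hva_to_pfn() above computes the
 * pfn straight from the VMA.  Numbers are invented; PAGE_SHIFT of 12 and a
 * 64-bit unsigned long are assumed.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long vm_start = 0x7f0000200000UL; /* start of the VMA */
        unsigned long vm_pgoff = 0x80000;          /* first pfn the VMA maps */
        unsigned long addr     = 0x7f0000203000UL; /* hva, three pages into the VMA */
        unsigned long pfn;

        pfn = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
        printf("pfn = 0x%lx\n", pfn); /* 0x80003 */
        return 0;
}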
996 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1005 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
997 { 1006 {
998 pfn_t pfn; 1007 pfn_t pfn;
999 1008
1000 pfn = gfn_to_pfn(kvm, gfn); 1009 pfn = gfn_to_pfn(kvm, gfn);
1001 if (!kvm_is_mmio_pfn(pfn)) 1010 if (!kvm_is_mmio_pfn(pfn))
1002 return pfn_to_page(pfn); 1011 return pfn_to_page(pfn);
1003 1012
1004 WARN_ON(kvm_is_mmio_pfn(pfn)); 1013 WARN_ON(kvm_is_mmio_pfn(pfn));
1005 1014
1006 get_page(bad_page); 1015 get_page(bad_page);
1007 return bad_page; 1016 return bad_page;
1008 } 1017 }
1009 1018
1010 EXPORT_SYMBOL_GPL(gfn_to_page); 1019 EXPORT_SYMBOL_GPL(gfn_to_page);
1011 1020
1012 void kvm_release_page_clean(struct page *page) 1021 void kvm_release_page_clean(struct page *page)
1013 { 1022 {
1014 kvm_release_pfn_clean(page_to_pfn(page)); 1023 kvm_release_pfn_clean(page_to_pfn(page));
1015 } 1024 }
1016 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1025 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1017 1026
1018 void kvm_release_pfn_clean(pfn_t pfn) 1027 void kvm_release_pfn_clean(pfn_t pfn)
1019 { 1028 {
1020 if (!kvm_is_mmio_pfn(pfn)) 1029 if (!kvm_is_mmio_pfn(pfn))
1021 put_page(pfn_to_page(pfn)); 1030 put_page(pfn_to_page(pfn));
1022 } 1031 }
1023 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1032 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1024 1033
1025 void kvm_release_page_dirty(struct page *page) 1034 void kvm_release_page_dirty(struct page *page)
1026 { 1035 {
1027 kvm_release_pfn_dirty(page_to_pfn(page)); 1036 kvm_release_pfn_dirty(page_to_pfn(page));
1028 } 1037 }
1029 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1038 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1030 1039
1031 void kvm_release_pfn_dirty(pfn_t pfn) 1040 void kvm_release_pfn_dirty(pfn_t pfn)
1032 { 1041 {
1033 kvm_set_pfn_dirty(pfn); 1042 kvm_set_pfn_dirty(pfn);
1034 kvm_release_pfn_clean(pfn); 1043 kvm_release_pfn_clean(pfn);
1035 } 1044 }
1036 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1045 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1037 1046
1038 void kvm_set_page_dirty(struct page *page) 1047 void kvm_set_page_dirty(struct page *page)
1039 { 1048 {
1040 kvm_set_pfn_dirty(page_to_pfn(page)); 1049 kvm_set_pfn_dirty(page_to_pfn(page));
1041 } 1050 }
1042 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1051 EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1043 1052
1044 void kvm_set_pfn_dirty(pfn_t pfn) 1053 void kvm_set_pfn_dirty(pfn_t pfn)
1045 { 1054 {
1046 if (!kvm_is_mmio_pfn(pfn)) { 1055 if (!kvm_is_mmio_pfn(pfn)) {
1047 struct page *page = pfn_to_page(pfn); 1056 struct page *page = pfn_to_page(pfn);
1048 if (!PageReserved(page)) 1057 if (!PageReserved(page))
1049 SetPageDirty(page); 1058 SetPageDirty(page);
1050 } 1059 }
1051 } 1060 }
1052 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1061 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1053 1062
1054 void kvm_set_pfn_accessed(pfn_t pfn) 1063 void kvm_set_pfn_accessed(pfn_t pfn)
1055 { 1064 {
1056 if (!kvm_is_mmio_pfn(pfn)) 1065 if (!kvm_is_mmio_pfn(pfn))
1057 mark_page_accessed(pfn_to_page(pfn)); 1066 mark_page_accessed(pfn_to_page(pfn));
1058 } 1067 }
1059 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1068 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1060 1069
1061 void kvm_get_pfn(pfn_t pfn) 1070 void kvm_get_pfn(pfn_t pfn)
1062 { 1071 {
1063 if (!kvm_is_mmio_pfn(pfn)) 1072 if (!kvm_is_mmio_pfn(pfn))
1064 get_page(pfn_to_page(pfn)); 1073 get_page(pfn_to_page(pfn));
1065 } 1074 }
1066 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1075 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1067 1076
1068 static int next_segment(unsigned long len, int offset) 1077 static int next_segment(unsigned long len, int offset)
1069 { 1078 {
1070 if (len > PAGE_SIZE - offset) 1079 if (len > PAGE_SIZE - offset)
1071 return PAGE_SIZE - offset; 1080 return PAGE_SIZE - offset;
1072 else 1081 else
1073 return len; 1082 return len;
1074 } 1083 }
1075 1084
1076 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1085 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1077 int len) 1086 int len)
1078 { 1087 {
1079 int r; 1088 int r;
1080 unsigned long addr; 1089 unsigned long addr;
1081 1090
1082 addr = gfn_to_hva(kvm, gfn); 1091 addr = gfn_to_hva(kvm, gfn);
1083 if (kvm_is_error_hva(addr)) 1092 if (kvm_is_error_hva(addr))
1084 return -EFAULT; 1093 return -EFAULT;
1085 r = copy_from_user(data, (void __user *)addr + offset, len); 1094 r = copy_from_user(data, (void __user *)addr + offset, len);
1086 if (r) 1095 if (r)
1087 return -EFAULT; 1096 return -EFAULT;
1088 return 0; 1097 return 0;
1089 } 1098 }
1090 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1099 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1091 1100
1092 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1101 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1093 { 1102 {
1094 gfn_t gfn = gpa >> PAGE_SHIFT; 1103 gfn_t gfn = gpa >> PAGE_SHIFT;
1095 int seg; 1104 int seg;
1096 int offset = offset_in_page(gpa); 1105 int offset = offset_in_page(gpa);
1097 int ret; 1106 int ret;
1098 1107
1099 while ((seg = next_segment(len, offset)) != 0) { 1108 while ((seg = next_segment(len, offset)) != 0) {
1100 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1109 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1101 if (ret < 0) 1110 if (ret < 0)
1102 return ret; 1111 return ret;
1103 offset = 0; 1112 offset = 0;
1104 len -= seg; 1113 len -= seg;
1105 data += seg; 1114 data += seg;
1106 ++gfn; 1115 ++gfn;
1107 } 1116 }
1108 return 0; 1117 return 0;
1109 } 1118 }
1110 EXPORT_SYMBOL_GPL(kvm_read_guest); 1119 EXPORT_SYMBOL_GPL(kvm_read_guest);
1111 1120
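kvm_read_guest() walks the range one guest page at a time, letting next_segment() cap each chunk at the page boundary. A minimal user-space mock of that loop, reusing the next_segment() logic above with made-up gpa/len values (illustration only, not part of the patch):

/*
 * Illustration only: a user-space mock of the kvm_read_guest() loop above,
 * showing how next_segment() splits a (gpa, len) range on page boundaries.
 * The gpa and len values are made up.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static int next_segment(unsigned long len, int offset)
{
        if (len > PAGE_SIZE - offset)
                return PAGE_SIZE - offset;
        else
                return len;
}

int main(void)
{
        unsigned long gpa = 0x1ff0;  /* 16 bytes before a page boundary */
        unsigned long len = 0x30;    /* 48 bytes, so the read spans two pages */
        unsigned long gfn = gpa / PAGE_SIZE;
        int offset = gpa & (PAGE_SIZE - 1);
        int seg;

        while ((seg = next_segment(len, offset)) != 0) {
                printf("chunk: gfn 0x%lx offset 0x%x len 0x%x\n",
                       gfn, offset, seg);
                offset = 0;
                len -= seg;
                ++gfn;
        }
        return 0;
}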
1112 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1121 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1113 unsigned long len) 1122 unsigned long len)
1114 { 1123 {
1115 int r; 1124 int r;
1116 unsigned long addr; 1125 unsigned long addr;
1117 gfn_t gfn = gpa >> PAGE_SHIFT; 1126 gfn_t gfn = gpa >> PAGE_SHIFT;
1118 int offset = offset_in_page(gpa); 1127 int offset = offset_in_page(gpa);
1119 1128
1120 addr = gfn_to_hva(kvm, gfn); 1129 addr = gfn_to_hva(kvm, gfn);
1121 if (kvm_is_error_hva(addr)) 1130 if (kvm_is_error_hva(addr))
1122 return -EFAULT; 1131 return -EFAULT;
1123 pagefault_disable(); 1132 pagefault_disable();
1124 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1133 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1125 pagefault_enable(); 1134 pagefault_enable();
1126 if (r) 1135 if (r)
1127 return -EFAULT; 1136 return -EFAULT;
1128 return 0; 1137 return 0;
1129 } 1138 }
1130 EXPORT_SYMBOL(kvm_read_guest_atomic); 1139 EXPORT_SYMBOL(kvm_read_guest_atomic);
1131 1140
1132 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1141 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1133 int offset, int len) 1142 int offset, int len)
1134 { 1143 {
1135 int r; 1144 int r;
1136 unsigned long addr; 1145 unsigned long addr;
1137 1146
1138 addr = gfn_to_hva(kvm, gfn); 1147 addr = gfn_to_hva(kvm, gfn);
1139 if (kvm_is_error_hva(addr)) 1148 if (kvm_is_error_hva(addr))
1140 return -EFAULT; 1149 return -EFAULT;
1141 r = copy_to_user((void __user *)addr + offset, data, len); 1150 r = copy_to_user((void __user *)addr + offset, data, len);
1142 if (r) 1151 if (r)
1143 return -EFAULT; 1152 return -EFAULT;
1144 mark_page_dirty(kvm, gfn); 1153 mark_page_dirty(kvm, gfn);
1145 return 0; 1154 return 0;
1146 } 1155 }
1147 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1156 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1148 1157
1149 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1158 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1150 unsigned long len) 1159 unsigned long len)
1151 { 1160 {
1152 gfn_t gfn = gpa >> PAGE_SHIFT; 1161 gfn_t gfn = gpa >> PAGE_SHIFT;
1153 int seg; 1162 int seg;
1154 int offset = offset_in_page(gpa); 1163 int offset = offset_in_page(gpa);
1155 int ret; 1164 int ret;
1156 1165
1157 while ((seg = next_segment(len, offset)) != 0) { 1166 while ((seg = next_segment(len, offset)) != 0) {
1158 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1167 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1159 if (ret < 0) 1168 if (ret < 0)
1160 return ret; 1169 return ret;
1161 offset = 0; 1170 offset = 0;
1162 len -= seg; 1171 len -= seg;
1163 data += seg; 1172 data += seg;
1164 ++gfn; 1173 ++gfn;
1165 } 1174 }
1166 return 0; 1175 return 0;
1167 } 1176 }
1168 1177
1169 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1178 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1170 { 1179 {
1171 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1180 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
1172 } 1181 }
1173 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1182 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1174 1183
1175 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1184 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1176 { 1185 {
1177 gfn_t gfn = gpa >> PAGE_SHIFT; 1186 gfn_t gfn = gpa >> PAGE_SHIFT;
1178 int seg; 1187 int seg;
1179 int offset = offset_in_page(gpa); 1188 int offset = offset_in_page(gpa);
1180 int ret; 1189 int ret;
1181 1190
1182 while ((seg = next_segment(len, offset)) != 0) { 1191 while ((seg = next_segment(len, offset)) != 0) {
1183 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1192 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1184 if (ret < 0) 1193 if (ret < 0)
1185 return ret; 1194 return ret;
1186 offset = 0; 1195 offset = 0;
1187 len -= seg; 1196 len -= seg;
1188 ++gfn; 1197 ++gfn;
1189 } 1198 }
1190 return 0; 1199 return 0;
1191 } 1200 }
1192 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1201 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1193 1202
1194 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1203 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1195 { 1204 {
1196 struct kvm_memory_slot *memslot; 1205 struct kvm_memory_slot *memslot;
1197 1206
1198 memslot = gfn_to_memslot(kvm, gfn); 1207 memslot = gfn_to_memslot(kvm, gfn);
1199 if (memslot && memslot->dirty_bitmap) { 1208 if (memslot && memslot->dirty_bitmap) {
1200 unsigned long rel_gfn = gfn - memslot->base_gfn; 1209 unsigned long rel_gfn = gfn - memslot->base_gfn;
1201 1210
1202 generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); 1211 generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
1203 } 1212 }
1204 } 1213 }
1205 1214
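mark_page_dirty() indexes the slot's dirty bitmap by the gfn's offset from base_gfn. A tiny stand-alone sketch of that indexing, with a plain bit operation standing in for generic___set_le_bit() and invented gfn values (assuming 64-bit longs; illustration only):

/*
 * Illustration only: the dirty-bitmap indexing done by mark_page_dirty()
 * above, with a plain bit operation standing in for generic___set_le_bit().
 * The gfn values are invented and 64-bit longs are assumed.
 */
#include <stdio.h>

#define BITS_PER_LONG 64UL

int main(void)
{
        unsigned long dirty_bitmap[2] = { 0, 0 };  /* enough for a 128-page slot */
        unsigned long base_gfn = 0x100;
        unsigned long gfn      = 0x147;
        unsigned long rel_gfn  = gfn - base_gfn;   /* page index inside the slot */

        dirty_bitmap[rel_gfn / BITS_PER_LONG] |= 1UL << (rel_gfn % BITS_PER_LONG);
        printf("set bit %lu of word %lu\n",
               rel_gfn % BITS_PER_LONG, rel_gfn / BITS_PER_LONG);
        return 0;
}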
1206 /* 1215 /*
1207 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1216 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1208 */ 1217 */
1209 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1218 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1210 { 1219 {
1211 DEFINE_WAIT(wait); 1220 DEFINE_WAIT(wait);
1212 1221
1213 for (;;) { 1222 for (;;) {
1214 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1223 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1215 1224
1216 if (kvm_arch_vcpu_runnable(vcpu)) { 1225 if (kvm_arch_vcpu_runnable(vcpu)) {
1217 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1226 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1218 break; 1227 break;
1219 } 1228 }
1220 if (kvm_cpu_has_pending_timer(vcpu)) 1229 if (kvm_cpu_has_pending_timer(vcpu))
1221 break; 1230 break;
1222 if (signal_pending(current)) 1231 if (signal_pending(current))
1223 break; 1232 break;
1224 1233
1225 schedule(); 1234 schedule();
1226 } 1235 }
1227 1236
1228 finish_wait(&vcpu->wq, &wait); 1237 finish_wait(&vcpu->wq, &wait);
1229 } 1238 }
1230 1239
1231 void kvm_resched(struct kvm_vcpu *vcpu) 1240 void kvm_resched(struct kvm_vcpu *vcpu)
1232 { 1241 {
1233 if (!need_resched()) 1242 if (!need_resched())
1234 return; 1243 return;
1235 cond_resched(); 1244 cond_resched();
1236 } 1245 }
1237 EXPORT_SYMBOL_GPL(kvm_resched); 1246 EXPORT_SYMBOL_GPL(kvm_resched);
1238 1247
1239 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) 1248 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
1240 { 1249 {
1241 ktime_t expires; 1250 ktime_t expires;
1242 DEFINE_WAIT(wait); 1251 DEFINE_WAIT(wait);
1243 1252
1244 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1253 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1245 1254
1246 /* Sleep for 100 us, and hope lock-holder got scheduled */ 1255 /* Sleep for 100 us, and hope lock-holder got scheduled */
1247 expires = ktime_add_ns(ktime_get(), 100000UL); 1256 expires = ktime_add_ns(ktime_get(), 100000UL);
1248 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); 1257 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1249 1258
1250 finish_wait(&vcpu->wq, &wait); 1259 finish_wait(&vcpu->wq, &wait);
1251 } 1260 }
1252 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1261 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1253 1262
1254 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1263 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1255 { 1264 {
1256 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1265 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1257 struct page *page; 1266 struct page *page;
1258 1267
1259 if (vmf->pgoff == 0) 1268 if (vmf->pgoff == 0)
1260 page = virt_to_page(vcpu->run); 1269 page = virt_to_page(vcpu->run);
1261 #ifdef CONFIG_X86 1270 #ifdef CONFIG_X86
1262 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1271 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1263 page = virt_to_page(vcpu->arch.pio_data); 1272 page = virt_to_page(vcpu->arch.pio_data);
1264 #endif 1273 #endif
1265 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1274 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1266 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1275 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1267 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1276 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1268 #endif 1277 #endif
1269 else 1278 else
1270 return VM_FAULT_SIGBUS; 1279 return VM_FAULT_SIGBUS;
1271 get_page(page); 1280 get_page(page);
1272 vmf->page = page; 1281 vmf->page = page;
1273 return 0; 1282 return 0;
1274 } 1283 }
1275 1284
1276 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1285 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1277 .fault = kvm_vcpu_fault, 1286 .fault = kvm_vcpu_fault,
1278 }; 1287 };
1279 1288
1280 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1289 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1281 { 1290 {
1282 vma->vm_ops = &kvm_vcpu_vm_ops; 1291 vma->vm_ops = &kvm_vcpu_vm_ops;
1283 return 0; 1292 return 0;
1284 } 1293 }
1285 1294
1286 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1295 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1287 { 1296 {
1288 struct kvm_vcpu *vcpu = filp->private_data; 1297 struct kvm_vcpu *vcpu = filp->private_data;
1289 1298
1290 kvm_put_kvm(vcpu->kvm); 1299 kvm_put_kvm(vcpu->kvm);
1291 return 0; 1300 return 0;
1292 } 1301 }
1293 1302
1294 static struct file_operations kvm_vcpu_fops = { 1303 static struct file_operations kvm_vcpu_fops = {
1295 .release = kvm_vcpu_release, 1304 .release = kvm_vcpu_release,
1296 .unlocked_ioctl = kvm_vcpu_ioctl, 1305 .unlocked_ioctl = kvm_vcpu_ioctl,
1297 .compat_ioctl = kvm_vcpu_ioctl, 1306 .compat_ioctl = kvm_vcpu_ioctl,
1298 .mmap = kvm_vcpu_mmap, 1307 .mmap = kvm_vcpu_mmap,
1299 }; 1308 };
1300 1309
1301 /* 1310 /*
1302 * Allocates an inode for the vcpu. 1311 * Allocates an inode for the vcpu.
1303 */ 1312 */
1304 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1313 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1305 { 1314 {
1306 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1315 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1307 } 1316 }
1308 1317
1309 /* 1318 /*
1310 * Creates some virtual cpus. Good luck creating more than one. 1319 * Creates some virtual cpus. Good luck creating more than one.
1311 */ 1320 */
1312 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1321 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1313 { 1322 {
1314 int r; 1323 int r;
1315 struct kvm_vcpu *vcpu, *v; 1324 struct kvm_vcpu *vcpu, *v;
1316 1325
1317 vcpu = kvm_arch_vcpu_create(kvm, id); 1326 vcpu = kvm_arch_vcpu_create(kvm, id);
1318 if (IS_ERR(vcpu)) 1327 if (IS_ERR(vcpu))
1319 return PTR_ERR(vcpu); 1328 return PTR_ERR(vcpu);
1320 1329
1321 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1330 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1322 1331
1323 r = kvm_arch_vcpu_setup(vcpu); 1332 r = kvm_arch_vcpu_setup(vcpu);
1324 if (r) 1333 if (r)
1325 return r; 1334 return r;
1326 1335
1327 mutex_lock(&kvm->lock); 1336 mutex_lock(&kvm->lock);
1328 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1337 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1329 r = -EINVAL; 1338 r = -EINVAL;
1330 goto vcpu_destroy; 1339 goto vcpu_destroy;
1331 } 1340 }
1332 1341
1333 kvm_for_each_vcpu(r, v, kvm) 1342 kvm_for_each_vcpu(r, v, kvm)
1334 if (v->vcpu_id == id) { 1343 if (v->vcpu_id == id) {
1335 r = -EEXIST; 1344 r = -EEXIST;
1336 goto vcpu_destroy; 1345 goto vcpu_destroy;
1337 } 1346 }
1338 1347
1339 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1348 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1340 1349
1341 /* Now it's all set up, let userspace reach it */ 1350 /* Now it's all set up, let userspace reach it */
1342 kvm_get_kvm(kvm); 1351 kvm_get_kvm(kvm);
1343 r = create_vcpu_fd(vcpu); 1352 r = create_vcpu_fd(vcpu);
1344 if (r < 0) { 1353 if (r < 0) {
1345 kvm_put_kvm(kvm); 1354 kvm_put_kvm(kvm);
1346 goto vcpu_destroy; 1355 goto vcpu_destroy;
1347 } 1356 }
1348 1357
1349 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1358 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1350 smp_wmb(); 1359 smp_wmb();
1351 atomic_inc(&kvm->online_vcpus); 1360 atomic_inc(&kvm->online_vcpus);
1352 1361
1353 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1362 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1354 if (kvm->bsp_vcpu_id == id) 1363 if (kvm->bsp_vcpu_id == id)
1355 kvm->bsp_vcpu = vcpu; 1364 kvm->bsp_vcpu = vcpu;
1356 #endif 1365 #endif
1357 mutex_unlock(&kvm->lock); 1366 mutex_unlock(&kvm->lock);
1358 return r; 1367 return r;
1359 1368
1360 vcpu_destroy: 1369 vcpu_destroy:
1361 mutex_unlock(&kvm->lock); 1370 mutex_unlock(&kvm->lock);
1362 kvm_arch_vcpu_destroy(vcpu); 1371 kvm_arch_vcpu_destroy(vcpu);
1363 return r; 1372 return r;
1364 } 1373 }
1365 1374
1366 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1375 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1367 { 1376 {
1368 if (sigset) { 1377 if (sigset) {
1369 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1378 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1370 vcpu->sigset_active = 1; 1379 vcpu->sigset_active = 1;
1371 vcpu->sigset = *sigset; 1380 vcpu->sigset = *sigset;
1372 } else 1381 } else
1373 vcpu->sigset_active = 0; 1382 vcpu->sigset_active = 0;
1374 return 0; 1383 return 0;
1375 } 1384 }
1376 1385
1377 static long kvm_vcpu_ioctl(struct file *filp, 1386 static long kvm_vcpu_ioctl(struct file *filp,
1378 unsigned int ioctl, unsigned long arg) 1387 unsigned int ioctl, unsigned long arg)
1379 { 1388 {
1380 struct kvm_vcpu *vcpu = filp->private_data; 1389 struct kvm_vcpu *vcpu = filp->private_data;
1381 void __user *argp = (void __user *)arg; 1390 void __user *argp = (void __user *)arg;
1382 int r; 1391 int r;
1383 struct kvm_fpu *fpu = NULL; 1392 struct kvm_fpu *fpu = NULL;
1384 struct kvm_sregs *kvm_sregs = NULL; 1393 struct kvm_sregs *kvm_sregs = NULL;
1385 1394
1386 if (vcpu->kvm->mm != current->mm) 1395 if (vcpu->kvm->mm != current->mm)
1387 return -EIO; 1396 return -EIO;
1388 1397
1389 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1398 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1390 /* 1399 /*
1391 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1400 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1392 * so vcpu_load() would break it. 1401 * so vcpu_load() would break it.
1393 */ 1402 */
1394 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1403 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1395 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1404 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1396 #endif 1405 #endif
1397 1406
1398 1407
1399 vcpu_load(vcpu); 1408 vcpu_load(vcpu);
1400 switch (ioctl) { 1409 switch (ioctl) {
1401 case KVM_RUN: 1410 case KVM_RUN:
1402 r = -EINVAL; 1411 r = -EINVAL;
1403 if (arg) 1412 if (arg)
1404 goto out; 1413 goto out;
1405 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1414 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1406 break; 1415 break;
1407 case KVM_GET_REGS: { 1416 case KVM_GET_REGS: {
1408 struct kvm_regs *kvm_regs; 1417 struct kvm_regs *kvm_regs;
1409 1418
1410 r = -ENOMEM; 1419 r = -ENOMEM;
1411 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1420 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1412 if (!kvm_regs) 1421 if (!kvm_regs)
1413 goto out; 1422 goto out;
1414 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1423 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1415 if (r) 1424 if (r)
1416 goto out_free1; 1425 goto out_free1;
1417 r = -EFAULT; 1426 r = -EFAULT;
1418 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1427 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1419 goto out_free1; 1428 goto out_free1;
1420 r = 0; 1429 r = 0;
1421 out_free1: 1430 out_free1:
1422 kfree(kvm_regs); 1431 kfree(kvm_regs);
1423 break; 1432 break;
1424 } 1433 }
1425 case KVM_SET_REGS: { 1434 case KVM_SET_REGS: {
1426 struct kvm_regs *kvm_regs; 1435 struct kvm_regs *kvm_regs;
1427 1436
1428 r = -ENOMEM; 1437 r = -ENOMEM;
1429 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1438 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1430 if (!kvm_regs) 1439 if (!kvm_regs)
1431 goto out; 1440 goto out;
1432 r = -EFAULT; 1441 r = -EFAULT;
1433 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1442 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1434 goto out_free2; 1443 goto out_free2;
1435 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1444 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1436 if (r) 1445 if (r)
1437 goto out_free2; 1446 goto out_free2;
1438 r = 0; 1447 r = 0;
1439 out_free2: 1448 out_free2:
1440 kfree(kvm_regs); 1449 kfree(kvm_regs);
1441 break; 1450 break;
1442 } 1451 }
1443 case KVM_GET_SREGS: { 1452 case KVM_GET_SREGS: {
1444 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1453 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1445 r = -ENOMEM; 1454 r = -ENOMEM;
1446 if (!kvm_sregs) 1455 if (!kvm_sregs)
1447 goto out; 1456 goto out;
1448 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1457 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1449 if (r) 1458 if (r)
1450 goto out; 1459 goto out;
1451 r = -EFAULT; 1460 r = -EFAULT;
1452 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1461 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1453 goto out; 1462 goto out;
1454 r = 0; 1463 r = 0;
1455 break; 1464 break;
1456 } 1465 }
1457 case KVM_SET_SREGS: { 1466 case KVM_SET_SREGS: {
1458 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1467 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1459 r = -ENOMEM; 1468 r = -ENOMEM;
1460 if (!kvm_sregs) 1469 if (!kvm_sregs)
1461 goto out; 1470 goto out;
1462 r = -EFAULT; 1471 r = -EFAULT;
1463 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1472 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1464 goto out; 1473 goto out;
1465 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1474 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1466 if (r) 1475 if (r)
1467 goto out; 1476 goto out;
1468 r = 0; 1477 r = 0;
1469 break; 1478 break;
1470 } 1479 }
1471 case KVM_GET_MP_STATE: { 1480 case KVM_GET_MP_STATE: {
1472 struct kvm_mp_state mp_state; 1481 struct kvm_mp_state mp_state;
1473 1482
1474 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1483 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1475 if (r) 1484 if (r)
1476 goto out; 1485 goto out;
1477 r = -EFAULT; 1486 r = -EFAULT;
1478 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1487 if (copy_to_user(argp, &mp_state, sizeof mp_state))
1479 goto out; 1488 goto out;
1480 r = 0; 1489 r = 0;
1481 break; 1490 break;
1482 } 1491 }
1483 case KVM_SET_MP_STATE: { 1492 case KVM_SET_MP_STATE: {
1484 struct kvm_mp_state mp_state; 1493 struct kvm_mp_state mp_state;
1485 1494
1486 r = -EFAULT; 1495 r = -EFAULT;
1487 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1496 if (copy_from_user(&mp_state, argp, sizeof mp_state))
1488 goto out; 1497 goto out;
1489 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1498 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1490 if (r) 1499 if (r)
1491 goto out; 1500 goto out;
1492 r = 0; 1501 r = 0;
1493 break; 1502 break;
1494 } 1503 }
1495 case KVM_TRANSLATE: { 1504 case KVM_TRANSLATE: {
1496 struct kvm_translation tr; 1505 struct kvm_translation tr;
1497 1506
1498 r = -EFAULT; 1507 r = -EFAULT;
1499 if (copy_from_user(&tr, argp, sizeof tr)) 1508 if (copy_from_user(&tr, argp, sizeof tr))
1500 goto out; 1509 goto out;
1501 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1510 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1502 if (r) 1511 if (r)
1503 goto out; 1512 goto out;
1504 r = -EFAULT; 1513 r = -EFAULT;
1505 if (copy_to_user(argp, &tr, sizeof tr)) 1514 if (copy_to_user(argp, &tr, sizeof tr))
1506 goto out; 1515 goto out;
1507 r = 0; 1516 r = 0;
1508 break; 1517 break;
1509 } 1518 }
1510 case KVM_SET_GUEST_DEBUG: { 1519 case KVM_SET_GUEST_DEBUG: {
1511 struct kvm_guest_debug dbg; 1520 struct kvm_guest_debug dbg;
1512 1521
1513 r = -EFAULT; 1522 r = -EFAULT;
1514 if (copy_from_user(&dbg, argp, sizeof dbg)) 1523 if (copy_from_user(&dbg, argp, sizeof dbg))
1515 goto out; 1524 goto out;
1516 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1525 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1517 if (r) 1526 if (r)
1518 goto out; 1527 goto out;
1519 r = 0; 1528 r = 0;
1520 break; 1529 break;
1521 } 1530 }
1522 case KVM_SET_SIGNAL_MASK: { 1531 case KVM_SET_SIGNAL_MASK: {
1523 struct kvm_signal_mask __user *sigmask_arg = argp; 1532 struct kvm_signal_mask __user *sigmask_arg = argp;
1524 struct kvm_signal_mask kvm_sigmask; 1533 struct kvm_signal_mask kvm_sigmask;
1525 sigset_t sigset, *p; 1534 sigset_t sigset, *p;
1526 1535
1527 p = NULL; 1536 p = NULL;
1528 if (argp) { 1537 if (argp) {
1529 r = -EFAULT; 1538 r = -EFAULT;
1530 if (copy_from_user(&kvm_sigmask, argp, 1539 if (copy_from_user(&kvm_sigmask, argp,
1531 sizeof kvm_sigmask)) 1540 sizeof kvm_sigmask))
1532 goto out; 1541 goto out;
1533 r = -EINVAL; 1542 r = -EINVAL;
1534 if (kvm_sigmask.len != sizeof sigset) 1543 if (kvm_sigmask.len != sizeof sigset)
1535 goto out; 1544 goto out;
1536 r = -EFAULT; 1545 r = -EFAULT;
1537 if (copy_from_user(&sigset, sigmask_arg->sigset, 1546 if (copy_from_user(&sigset, sigmask_arg->sigset,
1538 sizeof sigset)) 1547 sizeof sigset))
1539 goto out; 1548 goto out;
1540 p = &sigset; 1549 p = &sigset;
1541 } 1550 }
1542 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 1551 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1543 break; 1552 break;
1544 } 1553 }
1545 case KVM_GET_FPU: { 1554 case KVM_GET_FPU: {
1546 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1555 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1547 r = -ENOMEM; 1556 r = -ENOMEM;
1548 if (!fpu) 1557 if (!fpu)
1549 goto out; 1558 goto out;
1550 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1559 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1551 if (r) 1560 if (r)
1552 goto out; 1561 goto out;
1553 r = -EFAULT; 1562 r = -EFAULT;
1554 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1563 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1555 goto out; 1564 goto out;
1556 r = 0; 1565 r = 0;
1557 break; 1566 break;
1558 } 1567 }
1559 case KVM_SET_FPU: { 1568 case KVM_SET_FPU: {
1560 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1569 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1561 r = -ENOMEM; 1570 r = -ENOMEM;
1562 if (!fpu) 1571 if (!fpu)
1563 goto out; 1572 goto out;
1564 r = -EFAULT; 1573 r = -EFAULT;
1565 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1574 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
1566 goto out; 1575 goto out;
1567 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1576 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1568 if (r) 1577 if (r)
1569 goto out; 1578 goto out;
1570 r = 0; 1579 r = 0;
1571 break; 1580 break;
1572 } 1581 }
1573 default: 1582 default:
1574 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1583 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1575 } 1584 }
1576 out: 1585 out:
1577 vcpu_put(vcpu); 1586 vcpu_put(vcpu);
1578 kfree(fpu); 1587 kfree(fpu);
1579 kfree(kvm_sregs); 1588 kfree(kvm_sregs);
1580 return r; 1589 return r;
1581 } 1590 }
1582 1591
1583 static long kvm_vm_ioctl(struct file *filp, 1592 static long kvm_vm_ioctl(struct file *filp,
1584 unsigned int ioctl, unsigned long arg) 1593 unsigned int ioctl, unsigned long arg)
1585 { 1594 {
1586 struct kvm *kvm = filp->private_data; 1595 struct kvm *kvm = filp->private_data;
1587 void __user *argp = (void __user *)arg; 1596 void __user *argp = (void __user *)arg;
1588 int r; 1597 int r;
1589 1598
1590 if (kvm->mm != current->mm) 1599 if (kvm->mm != current->mm)
1591 return -EIO; 1600 return -EIO;
1592 switch (ioctl) { 1601 switch (ioctl) {
1593 case KVM_CREATE_VCPU: 1602 case KVM_CREATE_VCPU:
1594 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1603 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1595 if (r < 0) 1604 if (r < 0)
1596 goto out; 1605 goto out;
1597 break; 1606 break;
1598 case KVM_SET_USER_MEMORY_REGION: { 1607 case KVM_SET_USER_MEMORY_REGION: {
1599 struct kvm_userspace_memory_region kvm_userspace_mem; 1608 struct kvm_userspace_memory_region kvm_userspace_mem;
1600 1609
1601 r = -EFAULT; 1610 r = -EFAULT;
1602 if (copy_from_user(&kvm_userspace_mem, argp, 1611 if (copy_from_user(&kvm_userspace_mem, argp,
1603 sizeof kvm_userspace_mem)) 1612 sizeof kvm_userspace_mem))
1604 goto out; 1613 goto out;
1605 1614
1606 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1615 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1607 if (r) 1616 if (r)
1608 goto out; 1617 goto out;
1609 break; 1618 break;
1610 } 1619 }
1611 case KVM_GET_DIRTY_LOG: { 1620 case KVM_GET_DIRTY_LOG: {
1612 struct kvm_dirty_log log; 1621 struct kvm_dirty_log log;
1613 1622
1614 r = -EFAULT; 1623 r = -EFAULT;
1615 if (copy_from_user(&log, argp, sizeof log)) 1624 if (copy_from_user(&log, argp, sizeof log))
1616 goto out; 1625 goto out;
1617 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1626 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1618 if (r) 1627 if (r)
1619 goto out; 1628 goto out;
1620 break; 1629 break;
1621 } 1630 }
1622 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1631 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1623 case KVM_REGISTER_COALESCED_MMIO: { 1632 case KVM_REGISTER_COALESCED_MMIO: {
1624 struct kvm_coalesced_mmio_zone zone; 1633 struct kvm_coalesced_mmio_zone zone;
1625 r = -EFAULT; 1634 r = -EFAULT;
1626 if (copy_from_user(&zone, argp, sizeof zone)) 1635 if (copy_from_user(&zone, argp, sizeof zone))
1627 goto out; 1636 goto out;
1628 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 1637 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1629 if (r) 1638 if (r)
1630 goto out; 1639 goto out;
1631 r = 0; 1640 r = 0;
1632 break; 1641 break;
1633 } 1642 }
1634 case KVM_UNREGISTER_COALESCED_MMIO: { 1643 case KVM_UNREGISTER_COALESCED_MMIO: {
1635 struct kvm_coalesced_mmio_zone zone; 1644 struct kvm_coalesced_mmio_zone zone;
1636 r = -EFAULT; 1645 r = -EFAULT;
1637 if (copy_from_user(&zone, argp, sizeof zone)) 1646 if (copy_from_user(&zone, argp, sizeof zone))
1638 goto out; 1647 goto out;
1639 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1648 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1640 if (r) 1649 if (r)
1641 goto out; 1650 goto out;
1642 r = 0; 1651 r = 0;
1643 break; 1652 break;
1644 } 1653 }
1645 #endif 1654 #endif
1646 case KVM_IRQFD: { 1655 case KVM_IRQFD: {
1647 struct kvm_irqfd data; 1656 struct kvm_irqfd data;
1648 1657
1649 r = -EFAULT; 1658 r = -EFAULT;
1650 if (copy_from_user(&data, argp, sizeof data)) 1659 if (copy_from_user(&data, argp, sizeof data))
1651 goto out; 1660 goto out;
1652 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); 1661 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1653 break; 1662 break;
1654 } 1663 }
1655 case KVM_IOEVENTFD: { 1664 case KVM_IOEVENTFD: {
1656 struct kvm_ioeventfd data; 1665 struct kvm_ioeventfd data;
1657 1666
1658 r = -EFAULT; 1667 r = -EFAULT;
1659 if (copy_from_user(&data, argp, sizeof data)) 1668 if (copy_from_user(&data, argp, sizeof data))
1660 goto out; 1669 goto out;
1661 r = kvm_ioeventfd(kvm, &data); 1670 r = kvm_ioeventfd(kvm, &data);
1662 break; 1671 break;
1663 } 1672 }
1664 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1673 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1665 case KVM_SET_BOOT_CPU_ID: 1674 case KVM_SET_BOOT_CPU_ID:
1666 r = 0; 1675 r = 0;
1667 mutex_lock(&kvm->lock); 1676 mutex_lock(&kvm->lock);
1668 if (atomic_read(&kvm->online_vcpus) != 0) 1677 if (atomic_read(&kvm->online_vcpus) != 0)
1669 r = -EBUSY; 1678 r = -EBUSY;
1670 else 1679 else
1671 kvm->bsp_vcpu_id = arg; 1680 kvm->bsp_vcpu_id = arg;
1672 mutex_unlock(&kvm->lock); 1681 mutex_unlock(&kvm->lock);
1673 break; 1682 break;
1674 #endif 1683 #endif
1675 default: 1684 default:
1676 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1685 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1677 if (r == -ENOTTY) 1686 if (r == -ENOTTY)
1678 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 1687 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
1679 } 1688 }
1680 out: 1689 out:
1681 return r; 1690 return r;
1682 } 1691 }
1683 1692
1684 #ifdef CONFIG_COMPAT 1693 #ifdef CONFIG_COMPAT
1685 struct compat_kvm_dirty_log { 1694 struct compat_kvm_dirty_log {
1686 __u32 slot; 1695 __u32 slot;
1687 __u32 padding1; 1696 __u32 padding1;
1688 union { 1697 union {
1689 compat_uptr_t dirty_bitmap; /* one bit per page */ 1698 compat_uptr_t dirty_bitmap; /* one bit per page */
1690 __u64 padding2; 1699 __u64 padding2;
1691 }; 1700 };
1692 }; 1701 };
1693 1702
1694 static long kvm_vm_compat_ioctl(struct file *filp, 1703 static long kvm_vm_compat_ioctl(struct file *filp,
1695 unsigned int ioctl, unsigned long arg) 1704 unsigned int ioctl, unsigned long arg)
1696 { 1705 {
1697 struct kvm *kvm = filp->private_data; 1706 struct kvm *kvm = filp->private_data;
1698 int r; 1707 int r;
1699 1708
1700 if (kvm->mm != current->mm) 1709 if (kvm->mm != current->mm)
1701 return -EIO; 1710 return -EIO;
1702 switch (ioctl) { 1711 switch (ioctl) {
1703 case KVM_GET_DIRTY_LOG: { 1712 case KVM_GET_DIRTY_LOG: {
1704 struct compat_kvm_dirty_log compat_log; 1713 struct compat_kvm_dirty_log compat_log;
1705 struct kvm_dirty_log log; 1714 struct kvm_dirty_log log;
1706 1715
1707 r = -EFAULT; 1716 r = -EFAULT;
1708 if (copy_from_user(&compat_log, (void __user *)arg, 1717 if (copy_from_user(&compat_log, (void __user *)arg,
1709 sizeof(compat_log))) 1718 sizeof(compat_log)))
1710 goto out; 1719 goto out;
1711 log.slot = compat_log.slot; 1720 log.slot = compat_log.slot;
1712 log.padding1 = compat_log.padding1; 1721 log.padding1 = compat_log.padding1;
1713 log.padding2 = compat_log.padding2; 1722 log.padding2 = compat_log.padding2;
1714 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 1723 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
1715 1724
1716 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1725 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1717 if (r) 1726 if (r)
1718 goto out; 1727 goto out;
1719 break; 1728 break;
1720 } 1729 }
1721 default: 1730 default:
1722 r = kvm_vm_ioctl(filp, ioctl, arg); 1731 r = kvm_vm_ioctl(filp, ioctl, arg);
1723 } 1732 }
1724 1733
1725 out: 1734 out:
1726 return r; 1735 return r;
1727 } 1736 }
1728 #endif 1737 #endif
1729 1738
1730 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1739 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1731 { 1740 {
1732 struct page *page[1]; 1741 struct page *page[1];
1733 unsigned long addr; 1742 unsigned long addr;
1734 int npages; 1743 int npages;
1735 gfn_t gfn = vmf->pgoff; 1744 gfn_t gfn = vmf->pgoff;
1736 struct kvm *kvm = vma->vm_file->private_data; 1745 struct kvm *kvm = vma->vm_file->private_data;
1737 1746
1738 addr = gfn_to_hva(kvm, gfn); 1747 addr = gfn_to_hva(kvm, gfn);
1739 if (kvm_is_error_hva(addr)) 1748 if (kvm_is_error_hva(addr))
1740 return VM_FAULT_SIGBUS; 1749 return VM_FAULT_SIGBUS;
1741 1750
1742 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 1751 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
1743 NULL); 1752 NULL);
1744 if (unlikely(npages != 1)) 1753 if (unlikely(npages != 1))
1745 return VM_FAULT_SIGBUS; 1754 return VM_FAULT_SIGBUS;
1746 1755
1747 vmf->page = page[0]; 1756 vmf->page = page[0];
1748 return 0; 1757 return 0;
1749 } 1758 }
1750 1759
1751 static const struct vm_operations_struct kvm_vm_vm_ops = { 1760 static const struct vm_operations_struct kvm_vm_vm_ops = {
1752 .fault = kvm_vm_fault, 1761 .fault = kvm_vm_fault,
1753 }; 1762 };
1754 1763
1755 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 1764 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1756 { 1765 {
1757 vma->vm_ops = &kvm_vm_vm_ops; 1766 vma->vm_ops = &kvm_vm_vm_ops;
1758 return 0; 1767 return 0;
1759 } 1768 }
1760 1769
1761 static struct file_operations kvm_vm_fops = { 1770 static struct file_operations kvm_vm_fops = {
1762 .release = kvm_vm_release, 1771 .release = kvm_vm_release,
1763 .unlocked_ioctl = kvm_vm_ioctl, 1772 .unlocked_ioctl = kvm_vm_ioctl,
1764 #ifdef CONFIG_COMPAT 1773 #ifdef CONFIG_COMPAT
1765 .compat_ioctl = kvm_vm_compat_ioctl, 1774 .compat_ioctl = kvm_vm_compat_ioctl,
1766 #endif 1775 #endif
1767 .mmap = kvm_vm_mmap, 1776 .mmap = kvm_vm_mmap,
1768 }; 1777 };
1769 1778
1770 static int kvm_dev_ioctl_create_vm(void) 1779 static int kvm_dev_ioctl_create_vm(void)
1771 { 1780 {
1772 int fd, r; 1781 int fd, r;
1773 struct kvm *kvm; 1782 struct kvm *kvm;
1774 1783
1775 kvm = kvm_create_vm(); 1784 kvm = kvm_create_vm();
1776 if (IS_ERR(kvm)) 1785 if (IS_ERR(kvm))
1777 return PTR_ERR(kvm); 1786 return PTR_ERR(kvm);
1778 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1787 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1779 r = kvm_coalesced_mmio_init(kvm); 1788 r = kvm_coalesced_mmio_init(kvm);
1780 if (r < 0) { 1789 if (r < 0) {
1781 kvm_put_kvm(kvm); 1790 kvm_put_kvm(kvm);
1782 return r; 1791 return r;
1783 } 1792 }
1784 #endif 1793 #endif
1785 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 1794 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
1786 if (fd < 0) 1795 if (fd < 0)
1787 kvm_put_kvm(kvm); 1796 kvm_put_kvm(kvm);
1788 1797
1789 return fd; 1798 return fd;
1790 } 1799 }
1791 1800
1792 static long kvm_dev_ioctl_check_extension_generic(long arg) 1801 static long kvm_dev_ioctl_check_extension_generic(long arg)
1793 { 1802 {
1794 switch (arg) { 1803 switch (arg) {
1795 case KVM_CAP_USER_MEMORY: 1804 case KVM_CAP_USER_MEMORY:
1796 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 1805 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
1797 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 1806 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
1798 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1807 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1799 case KVM_CAP_SET_BOOT_CPU_ID: 1808 case KVM_CAP_SET_BOOT_CPU_ID:
1800 #endif 1809 #endif
1801 case KVM_CAP_INTERNAL_ERROR_DATA: 1810 case KVM_CAP_INTERNAL_ERROR_DATA:
1802 return 1; 1811 return 1;
1803 #ifdef CONFIG_HAVE_KVM_IRQCHIP 1812 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1804 case KVM_CAP_IRQ_ROUTING: 1813 case KVM_CAP_IRQ_ROUTING:
1805 return KVM_MAX_IRQ_ROUTES; 1814 return KVM_MAX_IRQ_ROUTES;
1806 #endif 1815 #endif
1807 default: 1816 default:
1808 break; 1817 break;
1809 } 1818 }
1810 return kvm_dev_ioctl_check_extension(arg); 1819 return kvm_dev_ioctl_check_extension(arg);
1811 } 1820 }
1812 1821
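The capabilities listed above are what user space sees through KVM_CHECK_EXTENSION on the /dev/kvm fd. A minimal user-space sketch of such a query (illustration only, not part of the patch; error handling is trimmed):

/*
 * Illustration only, not part of the patch: a user-space query of one of the
 * capabilities handled above.  Error handling is trimmed.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvmfd = open("/dev/kvm", O_RDWR);

        if (kvmfd < 0)
                return 1;
        /* 1 if supported, 0 if not, per the switch above */
        printf("KVM_CAP_USER_MEMORY: %d\n",
               ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY));
        return 0;
}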
1813 static long kvm_dev_ioctl(struct file *filp, 1822 static long kvm_dev_ioctl(struct file *filp,
1814 unsigned int ioctl, unsigned long arg) 1823 unsigned int ioctl, unsigned long arg)
1815 { 1824 {
1816 long r = -EINVAL; 1825 long r = -EINVAL;
1817 1826
1818 switch (ioctl) { 1827 switch (ioctl) {
1819 case KVM_GET_API_VERSION: 1828 case KVM_GET_API_VERSION:
1820 r = -EINVAL; 1829 r = -EINVAL;
1821 if (arg) 1830 if (arg)
1822 goto out; 1831 goto out;
1823 r = KVM_API_VERSION; 1832 r = KVM_API_VERSION;
1824 break; 1833 break;
1825 case KVM_CREATE_VM: 1834 case KVM_CREATE_VM:
1826 r = -EINVAL; 1835 r = -EINVAL;
1827 if (arg) 1836 if (arg)
1828 goto out; 1837 goto out;
1829 r = kvm_dev_ioctl_create_vm(); 1838 r = kvm_dev_ioctl_create_vm();
1830 break; 1839 break;
1831 case KVM_CHECK_EXTENSION: 1840 case KVM_CHECK_EXTENSION:
1832 r = kvm_dev_ioctl_check_extension_generic(arg); 1841 r = kvm_dev_ioctl_check_extension_generic(arg);
1833 break; 1842 break;
1834 case KVM_GET_VCPU_MMAP_SIZE: 1843 case KVM_GET_VCPU_MMAP_SIZE:
1835 r = -EINVAL; 1844 r = -EINVAL;
1836 if (arg) 1845 if (arg)
1837 goto out; 1846 goto out;
1838 r = PAGE_SIZE; /* struct kvm_run */ 1847 r = PAGE_SIZE; /* struct kvm_run */
1839 #ifdef CONFIG_X86 1848 #ifdef CONFIG_X86
1840 r += PAGE_SIZE; /* pio data page */ 1849 r += PAGE_SIZE; /* pio data page */
1841 #endif 1850 #endif
1842 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1851 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1843 r += PAGE_SIZE; /* coalesced mmio ring page */ 1852 r += PAGE_SIZE; /* coalesced mmio ring page */
1844 #endif 1853 #endif
1845 break; 1854 break;
1846 case KVM_TRACE_ENABLE: 1855 case KVM_TRACE_ENABLE:
1847 case KVM_TRACE_PAUSE: 1856 case KVM_TRACE_PAUSE:
1848 case KVM_TRACE_DISABLE: 1857 case KVM_TRACE_DISABLE:
1849 r = -EOPNOTSUPP; 1858 r = -EOPNOTSUPP;
1850 break; 1859 break;
1851 default: 1860 default:
1852 return kvm_arch_dev_ioctl(filp, ioctl, arg); 1861 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1853 } 1862 }
1854 out: 1863 out:
1855 return r; 1864 return r;
1856 } 1865 }
1857 1866
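KVM_GET_VCPU_MMAP_SIZE above is queried on the /dev/kvm fd and sized so that page offset 0 of a vcpu fd reaches struct kvm_run (see kvm_vcpu_fault() earlier in this file). A minimal user-space sketch of how that size is typically consumed; kvmfd and vcpufd are assumed to have been created already, and error handling is minimal (illustration only, not part of the patch):

/*
 * Illustration only, not part of the patch: how user space typically consumes
 * the size returned above.  kvmfd (/dev/kvm) and vcpufd (from KVM_CREATE_VCPU)
 * are assumed to exist already; error handling is minimal.
 */
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static struct kvm_run *map_vcpu_run(int kvmfd, int vcpufd)
{
        long size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, 0);
        void *p;

        if (size < 0)
                return NULL;
        /* page offset 0 of the vcpu fd is struct kvm_run, per kvm_vcpu_fault() */
        p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
        return p == MAP_FAILED ? NULL : p;
}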
1858 static struct file_operations kvm_chardev_ops = { 1867 static struct file_operations kvm_chardev_ops = {
1859 .unlocked_ioctl = kvm_dev_ioctl, 1868 .unlocked_ioctl = kvm_dev_ioctl,
1860 .compat_ioctl = kvm_dev_ioctl, 1869 .compat_ioctl = kvm_dev_ioctl,
1861 }; 1870 };
1862 1871
1863 static struct miscdevice kvm_dev = { 1872 static struct miscdevice kvm_dev = {
1864 KVM_MINOR, 1873 KVM_MINOR,
1865 "kvm", 1874 "kvm",
1866 &kvm_chardev_ops, 1875 &kvm_chardev_ops,
1867 }; 1876 };
1868 1877
1869 static void hardware_enable(void *junk) 1878 static void hardware_enable(void *junk)
1870 { 1879 {
1871 int cpu = raw_smp_processor_id(); 1880 int cpu = raw_smp_processor_id();
1872 int r; 1881 int r;
1873 1882
1874 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 1883 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
1875 return; 1884 return;
1876 1885
1877 cpumask_set_cpu(cpu, cpus_hardware_enabled); 1886 cpumask_set_cpu(cpu, cpus_hardware_enabled);
1878 1887
1879 r = kvm_arch_hardware_enable(NULL); 1888 r = kvm_arch_hardware_enable(NULL);
1880 1889
1881 if (r) { 1890 if (r) {
1882 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 1891 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
1883 atomic_inc(&hardware_enable_failed); 1892 atomic_inc(&hardware_enable_failed);
1884 printk(KERN_INFO "kvm: enabling virtualization on " 1893 printk(KERN_INFO "kvm: enabling virtualization on "
1885 "CPU%d failed\n", cpu); 1894 "CPU%d failed\n", cpu);
1886 } 1895 }
1887 } 1896 }
1888 1897
1889 static void hardware_disable(void *junk) 1898 static void hardware_disable(void *junk)
1890 { 1899 {
1891 int cpu = raw_smp_processor_id(); 1900 int cpu = raw_smp_processor_id();
1892 1901
1893 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 1902 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
1894 return; 1903 return;
1895 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 1904 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
1896 kvm_arch_hardware_disable(NULL); 1905 kvm_arch_hardware_disable(NULL);
1897 } 1906 }
1898 1907
1899 static void hardware_disable_all_nolock(void) 1908 static void hardware_disable_all_nolock(void)
1900 { 1909 {
1901 BUG_ON(!kvm_usage_count); 1910 BUG_ON(!kvm_usage_count);
1902 1911
1903 kvm_usage_count--; 1912 kvm_usage_count--;
1904 if (!kvm_usage_count) 1913 if (!kvm_usage_count)
1905 on_each_cpu(hardware_disable, NULL, 1); 1914 on_each_cpu(hardware_disable, NULL, 1);
1906 } 1915 }
1907 1916
1908 static void hardware_disable_all(void) 1917 static void hardware_disable_all(void)
1909 { 1918 {
1910 spin_lock(&kvm_lock); 1919 spin_lock(&kvm_lock);
1911 hardware_disable_all_nolock(); 1920 hardware_disable_all_nolock();
1912 spin_unlock(&kvm_lock); 1921 spin_unlock(&kvm_lock);
1913 } 1922 }
1914 1923
1915 static int hardware_enable_all(void) 1924 static int hardware_enable_all(void)
1916 { 1925 {
1917 int r = 0; 1926 int r = 0;
1918 1927
1919 spin_lock(&kvm_lock); 1928 spin_lock(&kvm_lock);
1920 1929
1921 kvm_usage_count++; 1930 kvm_usage_count++;
1922 if (kvm_usage_count == 1) { 1931 if (kvm_usage_count == 1) {
1923 atomic_set(&hardware_enable_failed, 0); 1932 atomic_set(&hardware_enable_failed, 0);
1924 on_each_cpu(hardware_enable, NULL, 1); 1933 on_each_cpu(hardware_enable, NULL, 1);
1925 1934
1926 if (atomic_read(&hardware_enable_failed)) { 1935 if (atomic_read(&hardware_enable_failed)) {
1927 hardware_disable_all_nolock(); 1936 hardware_disable_all_nolock();
1928 r = -EBUSY; 1937 r = -EBUSY;
1929 } 1938 }
1930 } 1939 }
1931 1940
1932 spin_unlock(&kvm_lock); 1941 spin_unlock(&kvm_lock);
1933 1942
1934 return r; 1943 return r;
1935 } 1944 }
1936 1945
1937 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 1946 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1938 void *v) 1947 void *v)
1939 { 1948 {
1940 int cpu = (long)v; 1949 int cpu = (long)v;
1941 1950
1942 if (!kvm_usage_count) 1951 if (!kvm_usage_count)
1943 return NOTIFY_OK; 1952 return NOTIFY_OK;
1944 1953
1945 val &= ~CPU_TASKS_FROZEN; 1954 val &= ~CPU_TASKS_FROZEN;
1946 switch (val) { 1955 switch (val) {
1947 case CPU_DYING: 1956 case CPU_DYING:
1948 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 1957 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1949 cpu); 1958 cpu);
1950 hardware_disable(NULL); 1959 hardware_disable(NULL);
1951 break; 1960 break;
1952 case CPU_ONLINE: 1961 case CPU_ONLINE:
1953 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 1962 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1954 cpu); 1963 cpu);
1955 smp_call_function_single(cpu, hardware_enable, NULL, 1); 1964 smp_call_function_single(cpu, hardware_enable, NULL, 1);
1956 break; 1965 break;
1957 } 1966 }
1958 return NOTIFY_OK; 1967 return NOTIFY_OK;
1959 } 1968 }
1960 1969
1961 1970
1962 asmlinkage void kvm_handle_fault_on_reboot(void) 1971 asmlinkage void kvm_handle_fault_on_reboot(void)
1963 { 1972 {
1964 if (kvm_rebooting) 1973 if (kvm_rebooting)
1965 /* spin while reset goes on */ 1974 /* spin while reset goes on */
1966 while (true) 1975 while (true)
1967 ; 1976 ;
1968 /* Fault while not rebooting. We want the trace. */ 1977 /* Fault while not rebooting. We want the trace. */
1969 BUG(); 1978 BUG();
1970 } 1979 }
1971 EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 1980 EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
1972 1981
1973 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 1982 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1974 void *v) 1983 void *v)
1975 { 1984 {
1976 /* 1985 /*
1977 * Some (well, at least mine) BIOSes hang on reboot if 1986 * Some (well, at least mine) BIOSes hang on reboot if
1978 * in vmx root mode. 1987 * in vmx root mode.
1979 * 1988 *
1980 * And Intel TXT required VMX off for all cpu when system shutdown. 1989 * And Intel TXT required VMX off for all cpu when system shutdown.
1981 */ 1990 */
1982 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 1991 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1983 kvm_rebooting = true; 1992 kvm_rebooting = true;
1984 on_each_cpu(hardware_disable, NULL, 1); 1993 on_each_cpu(hardware_disable, NULL, 1);
1985 return NOTIFY_OK; 1994 return NOTIFY_OK;
1986 } 1995 }
1987 1996
1988 static struct notifier_block kvm_reboot_notifier = { 1997 static struct notifier_block kvm_reboot_notifier = {
1989 .notifier_call = kvm_reboot, 1998 .notifier_call = kvm_reboot,
1990 .priority = 0, 1999 .priority = 0,
1991 }; 2000 };
1992 2001
1993 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2002 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1994 { 2003 {
1995 int i; 2004 int i;
1996 2005
1997 for (i = 0; i < bus->dev_count; i++) { 2006 for (i = 0; i < bus->dev_count; i++) {
1998 struct kvm_io_device *pos = bus->devs[i]; 2007 struct kvm_io_device *pos = bus->devs[i];
1999 2008
2000 kvm_iodevice_destructor(pos); 2009 kvm_iodevice_destructor(pos);
2001 } 2010 }
2002 kfree(bus); 2011 kfree(bus);
2003 } 2012 }
2004 2013
2005 /* kvm_io_bus_write - called under kvm->slots_lock */ 2014 /* kvm_io_bus_write - called under kvm->slots_lock */
2006 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2015 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2007 int len, const void *val) 2016 int len, const void *val)
2008 { 2017 {
2009 int i; 2018 int i;
2010 struct kvm_io_bus *bus; 2019 struct kvm_io_bus *bus;
2011 2020
2012 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2021 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2013 for (i = 0; i < bus->dev_count; i++) 2022 for (i = 0; i < bus->dev_count; i++)
2014 if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) 2023 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2015 return 0; 2024 return 0;
2016 return -EOPNOTSUPP; 2025 return -EOPNOTSUPP;
2017 } 2026 }
2018 2027
2019 /* kvm_io_bus_read - called under kvm->slots_lock */ 2028 /* kvm_io_bus_read - called under kvm->slots_lock */
2020 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2029 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2021 int len, void *val) 2030 int len, void *val)
2022 { 2031 {
2023 int i; 2032 int i;
2024 struct kvm_io_bus *bus; 2033 struct kvm_io_bus *bus;
2025 2034
2026 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2035 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2027 for (i = 0; i < bus->dev_count; i++) 2036 for (i = 0; i < bus->dev_count; i++)
2028 if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) 2037 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2029 return 0; 2038 return 0;
2030 return -EOPNOTSUPP; 2039 return -EOPNOTSUPP;
2031 } 2040 }
2032 2041
2033 /* Caller must hold slots_lock. */ 2042 /* Caller must hold slots_lock. */
2034 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, 2043 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2035 struct kvm_io_device *dev) 2044 struct kvm_io_device *dev)
2036 { 2045 {
2037 struct kvm_io_bus *new_bus, *bus; 2046 struct kvm_io_bus *new_bus, *bus;
2038 2047
2039 bus = kvm->buses[bus_idx]; 2048 bus = kvm->buses[bus_idx];
2040 if (bus->dev_count > NR_IOBUS_DEVS-1) 2049 if (bus->dev_count > NR_IOBUS_DEVS-1)
2041 return -ENOSPC; 2050 return -ENOSPC;
2042 2051
2043 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); 2052 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2044 if (!new_bus) 2053 if (!new_bus)
2045 return -ENOMEM; 2054 return -ENOMEM;
2046 memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); 2055 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2047 new_bus->devs[new_bus->dev_count++] = dev; 2056 new_bus->devs[new_bus->dev_count++] = dev;
2048 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2057 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2049 synchronize_srcu_expedited(&kvm->srcu); 2058 synchronize_srcu_expedited(&kvm->srcu);
2050 kfree(bus); 2059 kfree(bus);
2051 2060
2052 return 0; 2061 return 0;
2053 } 2062 }
2054 2063
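kvm_io_bus_register_dev() never edits the live bus: it copies it, appends to the copy, publishes the copy with rcu_assign_pointer(), waits out readers with synchronize_srcu_expedited(), and only then frees the old bus. The single-threaded sketch below keeps just the copy/append/swap/free shape and deliberately drops the RCU machinery, so it illustrates the data flow only, not the locking (nothing in it comes from the patch):

/*
 * Illustration only: the copy/append/swap/free shape of
 * kvm_io_bus_register_dev() above, shrunk to a single-threaded demo.  The
 * real code publishes the copy with rcu_assign_pointer() and waits for
 * readers with synchronize_srcu_expedited() before freeing the old bus;
 * none of that concurrency handling is modelled here.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_DEVS 4

struct bus {
        int dev_count;
        const char *devs[NR_DEVS];
};

static struct bus *registered_bus;

static int register_dev(const char *name)
{
        struct bus *old = registered_bus, *new_bus;

        if (old->dev_count >= NR_DEVS)
                return -1;
        new_bus = malloc(sizeof(*new_bus));
        if (!new_bus)
                return -1;
        memcpy(new_bus, old, sizeof(*old));         /* copy the whole bus */
        new_bus->devs[new_bus->dev_count++] = name; /* append to the copy */
        registered_bus = new_bus;                   /* publish the new copy */
        free(old);                                  /* real code defers this */
        return 0;
}

int main(void)
{
        registered_bus = calloc(1, sizeof(*registered_bus));
        if (!registered_bus)
                return 1;
        register_dev("pit");
        register_dev("ioapic");
        printf("%d devices registered\n", registered_bus->dev_count);
        return 0;
}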
2055 /* Caller must hold slots_lock. */ 2064 /* Caller must hold slots_lock. */
2056 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 2065 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2057 struct kvm_io_device *dev) 2066 struct kvm_io_device *dev)
2058 { 2067 {
2059 int i, r; 2068 int i, r;
2060 struct kvm_io_bus *new_bus, *bus; 2069 struct kvm_io_bus *new_bus, *bus;
2061 2070
2062 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); 2071 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2063 if (!new_bus) 2072 if (!new_bus)
2064 return -ENOMEM; 2073 return -ENOMEM;
2065 2074
2066 bus = kvm->buses[bus_idx]; 2075 bus = kvm->buses[bus_idx];
2067 memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); 2076 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2068 2077
2069 r = -ENOENT; 2078 r = -ENOENT;
2070 for (i = 0; i < new_bus->dev_count; i++) 2079 for (i = 0; i < new_bus->dev_count; i++)
2071 if (new_bus->devs[i] == dev) { 2080 if (new_bus->devs[i] == dev) {
2072 r = 0; 2081 r = 0;
2073 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; 2082 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2074 break; 2083 break;
2075 } 2084 }
2076 2085
2077 if (r) { 2086 if (r) {
2078 kfree(new_bus); 2087 kfree(new_bus);
2079 return r; 2088 return r;
2080 } 2089 }
2081 2090
2082 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2091 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2083 synchronize_srcu_expedited(&kvm->srcu); 2092 synchronize_srcu_expedited(&kvm->srcu);
2084 kfree(bus); 2093 kfree(bus);
2085 return r; 2094 return r;
2086 } 2095 }

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
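
/*
 * The debugfs stat getters receive the byte offset of a u32 counter as their
 * private data: vm_stat_get() sums that counter across every VM on vm_list,
 * and vcpu_stat_get() below does the same across every vCPU of every VM.
 */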

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};
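
/*
 * stat_fops[] is indexed by the item's kind so that kvm_init_debug() below
 * picks the per-VM or per-vCPU aggregator for each debugfs entry.
 */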

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}
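
/*
 * Each entry in debugfs_entries becomes a read-only (0444) file under the
 * "kvm" debugfs directory; the counter's offset is stashed in the file's
 * private data for the getters above.
 */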

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	if (kvm_usage_count)
		hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	if (kvm_usage_count)
		hardware_enable(NULL);
	return 0;
}
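
/*
 * Suspend/resume only touch the virtualization hardware state when
 * kvm_usage_count is non-zero, i.e. when at least one VM currently holds
 * hardware enabled.
 */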

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}
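
/*
 * The preempt notifiers hook the scheduler: whenever a vCPU thread is
 * scheduled in or out, the architecture code gets a chance to load or save
 * per-vCPU hardware state via kvm_arch_vcpu_load()/kvm_arch_vcpu_put().
 */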

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

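	/*
	 * New in this patch: a third sentinel page, fault_page, allocated and
	 * tracked the same way as bad_page and hwpoison_page above.
	 */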
	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
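	/* New in this patch: release the sentinel page on the error path, too. */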
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
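	/*
	 * Note: fault_page allocated in kvm_init() is not released here; a
	 * matching __free_page(fault_page) would be needed to avoid leaking
	 * it on module unload.
	 */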
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);

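For context, a minimal sketch of how an architecture module typically drives kvm_init()/kvm_exit(), modelled on kvm-intel (vmx.c) of this era; the names below come from arch code and are shown for illustration only, they are not part of this diff:

/* Hypothetical excerpt, for illustration only. */
static int __init vmx_init(void)
{
	/* vcpu_size/vcpu_align size the "kvm_vcpu" kmem cache created in kvm_init(). */
	return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
			__alignof__(struct vcpu_vmx), THIS_MODULE);
}

static void __exit vmx_exit(void)
{
	kvm_exit();
}

module_init(vmx_init);
module_exit(vmx_exit);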