Commit c972f3b125d8818748429b94cd2e59f473943a33

Authored by Takuya Yoshikawa
Committed by Gleb Natapov
1 parent aa11e3a8a6

KVM: Write protect the updated slot only when dirty logging is enabled

Calling kvm_mmu_slot_remove_write_access() for a deleted slot does
nothing but search for non-existent mmu pages which have mappings to
that deleted memory; this is safe but a waste of time.

Since we want to make the function rmap based in a later patch, in a
manner which makes it unsafe to be called for a deleted slot, we makes
the caller see if the slot is non-zero and being dirty logged.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>

Showing 2 changed files with 7 additions and 2 deletions Inline Diff

1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
14 * Amit Shah <amit.shah@qumranet.com> 14 * Amit Shah <amit.shah@qumranet.com>
15 * Ben-Ami Yassour <benami@il.ibm.com> 15 * Ben-Ami Yassour <benami@il.ibm.com>
16 * 16 *
17 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * This work is licensed under the terms of the GNU GPL, version 2. See
18 * the COPYING file in the top-level directory. 18 * the COPYING file in the top-level directory.
19 * 19 *
20 */ 20 */
21 21
22 #include <linux/kvm_host.h> 22 #include <linux/kvm_host.h>
23 #include "irq.h" 23 #include "irq.h"
24 #include "mmu.h" 24 #include "mmu.h"
25 #include "i8254.h" 25 #include "i8254.h"
26 #include "tss.h" 26 #include "tss.h"
27 #include "kvm_cache_regs.h" 27 #include "kvm_cache_regs.h"
28 #include "x86.h" 28 #include "x86.h"
29 #include "cpuid.h" 29 #include "cpuid.h"
30 30
31 #include <linux/clocksource.h> 31 #include <linux/clocksource.h>
32 #include <linux/interrupt.h> 32 #include <linux/interrupt.h>
33 #include <linux/kvm.h> 33 #include <linux/kvm.h>
34 #include <linux/fs.h> 34 #include <linux/fs.h>
35 #include <linux/vmalloc.h> 35 #include <linux/vmalloc.h>
36 #include <linux/module.h> 36 #include <linux/module.h>
37 #include <linux/mman.h> 37 #include <linux/mman.h>
38 #include <linux/highmem.h> 38 #include <linux/highmem.h>
39 #include <linux/iommu.h> 39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h> 40 #include <linux/intel-iommu.h>
41 #include <linux/cpufreq.h> 41 #include <linux/cpufreq.h>
42 #include <linux/user-return-notifier.h> 42 #include <linux/user-return-notifier.h>
43 #include <linux/srcu.h> 43 #include <linux/srcu.h>
44 #include <linux/slab.h> 44 #include <linux/slab.h>
45 #include <linux/perf_event.h> 45 #include <linux/perf_event.h>
46 #include <linux/uaccess.h> 46 #include <linux/uaccess.h>
47 #include <linux/hash.h> 47 #include <linux/hash.h>
48 #include <linux/pci.h> 48 #include <linux/pci.h>
49 #include <linux/timekeeper_internal.h> 49 #include <linux/timekeeper_internal.h>
50 #include <linux/pvclock_gtod.h> 50 #include <linux/pvclock_gtod.h>
51 #include <trace/events/kvm.h> 51 #include <trace/events/kvm.h>
52 52
53 #define CREATE_TRACE_POINTS 53 #define CREATE_TRACE_POINTS
54 #include "trace.h" 54 #include "trace.h"
55 55
56 #include <asm/debugreg.h> 56 #include <asm/debugreg.h>
57 #include <asm/msr.h> 57 #include <asm/msr.h>
58 #include <asm/desc.h> 58 #include <asm/desc.h>
59 #include <asm/mtrr.h> 59 #include <asm/mtrr.h>
60 #include <asm/mce.h> 60 #include <asm/mce.h>
61 #include <asm/i387.h> 61 #include <asm/i387.h>
62 #include <asm/fpu-internal.h> /* Ugh! */ 62 #include <asm/fpu-internal.h> /* Ugh! */
63 #include <asm/xcr.h> 63 #include <asm/xcr.h>
64 #include <asm/pvclock.h> 64 #include <asm/pvclock.h>
65 #include <asm/div64.h> 65 #include <asm/div64.h>
66 66
67 #define MAX_IO_MSRS 256 67 #define MAX_IO_MSRS 256
68 #define KVM_MAX_MCE_BANKS 32 68 #define KVM_MAX_MCE_BANKS 32
69 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) 69 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
70 70
71 #define emul_to_vcpu(ctxt) \ 71 #define emul_to_vcpu(ctxt) \
72 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) 72 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
73 73
74 /* EFER defaults: 74 /* EFER defaults:
75 * - enable syscall per default because its emulated by KVM 75 * - enable syscall per default because its emulated by KVM
76 * - enable LME and LMA per default on 64 bit KVM 76 * - enable LME and LMA per default on 64 bit KVM
77 */ 77 */
78 #ifdef CONFIG_X86_64 78 #ifdef CONFIG_X86_64
79 static 79 static
80 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); 80 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
81 #else 81 #else
82 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); 82 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
83 #endif 83 #endif
84 84
85 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 85 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
86 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 86 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
87 87
88 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 88 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
89 static void process_nmi(struct kvm_vcpu *vcpu); 89 static void process_nmi(struct kvm_vcpu *vcpu);
90 90
91 struct kvm_x86_ops *kvm_x86_ops; 91 struct kvm_x86_ops *kvm_x86_ops;
92 EXPORT_SYMBOL_GPL(kvm_x86_ops); 92 EXPORT_SYMBOL_GPL(kvm_x86_ops);
93 93
94 static bool ignore_msrs = 0; 94 static bool ignore_msrs = 0;
95 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 95 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
96 96
97 bool kvm_has_tsc_control; 97 bool kvm_has_tsc_control;
98 EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 98 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
99 u32 kvm_max_guest_tsc_khz; 99 u32 kvm_max_guest_tsc_khz;
100 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 100 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
101 101
102 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 102 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
103 static u32 tsc_tolerance_ppm = 250; 103 static u32 tsc_tolerance_ppm = 250;
104 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 104 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
105 105
106 #define KVM_NR_SHARED_MSRS 16 106 #define KVM_NR_SHARED_MSRS 16
107 107
108 struct kvm_shared_msrs_global { 108 struct kvm_shared_msrs_global {
109 int nr; 109 int nr;
110 u32 msrs[KVM_NR_SHARED_MSRS]; 110 u32 msrs[KVM_NR_SHARED_MSRS];
111 }; 111 };
112 112
113 struct kvm_shared_msrs { 113 struct kvm_shared_msrs {
114 struct user_return_notifier urn; 114 struct user_return_notifier urn;
115 bool registered; 115 bool registered;
116 struct kvm_shared_msr_values { 116 struct kvm_shared_msr_values {
117 u64 host; 117 u64 host;
118 u64 curr; 118 u64 curr;
119 } values[KVM_NR_SHARED_MSRS]; 119 } values[KVM_NR_SHARED_MSRS];
120 }; 120 };
121 121
122 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 122 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
123 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); 123 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
124 124
125 struct kvm_stats_debugfs_item debugfs_entries[] = { 125 struct kvm_stats_debugfs_item debugfs_entries[] = {
126 { "pf_fixed", VCPU_STAT(pf_fixed) }, 126 { "pf_fixed", VCPU_STAT(pf_fixed) },
127 { "pf_guest", VCPU_STAT(pf_guest) }, 127 { "pf_guest", VCPU_STAT(pf_guest) },
128 { "tlb_flush", VCPU_STAT(tlb_flush) }, 128 { "tlb_flush", VCPU_STAT(tlb_flush) },
129 { "invlpg", VCPU_STAT(invlpg) }, 129 { "invlpg", VCPU_STAT(invlpg) },
130 { "exits", VCPU_STAT(exits) }, 130 { "exits", VCPU_STAT(exits) },
131 { "io_exits", VCPU_STAT(io_exits) }, 131 { "io_exits", VCPU_STAT(io_exits) },
132 { "mmio_exits", VCPU_STAT(mmio_exits) }, 132 { "mmio_exits", VCPU_STAT(mmio_exits) },
133 { "signal_exits", VCPU_STAT(signal_exits) }, 133 { "signal_exits", VCPU_STAT(signal_exits) },
134 { "irq_window", VCPU_STAT(irq_window_exits) }, 134 { "irq_window", VCPU_STAT(irq_window_exits) },
135 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 135 { "nmi_window", VCPU_STAT(nmi_window_exits) },
136 { "halt_exits", VCPU_STAT(halt_exits) }, 136 { "halt_exits", VCPU_STAT(halt_exits) },
137 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 137 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
138 { "hypercalls", VCPU_STAT(hypercalls) }, 138 { "hypercalls", VCPU_STAT(hypercalls) },
139 { "request_irq", VCPU_STAT(request_irq_exits) }, 139 { "request_irq", VCPU_STAT(request_irq_exits) },
140 { "irq_exits", VCPU_STAT(irq_exits) }, 140 { "irq_exits", VCPU_STAT(irq_exits) },
141 { "host_state_reload", VCPU_STAT(host_state_reload) }, 141 { "host_state_reload", VCPU_STAT(host_state_reload) },
142 { "efer_reload", VCPU_STAT(efer_reload) }, 142 { "efer_reload", VCPU_STAT(efer_reload) },
143 { "fpu_reload", VCPU_STAT(fpu_reload) }, 143 { "fpu_reload", VCPU_STAT(fpu_reload) },
144 { "insn_emulation", VCPU_STAT(insn_emulation) }, 144 { "insn_emulation", VCPU_STAT(insn_emulation) },
145 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 145 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
146 { "irq_injections", VCPU_STAT(irq_injections) }, 146 { "irq_injections", VCPU_STAT(irq_injections) },
147 { "nmi_injections", VCPU_STAT(nmi_injections) }, 147 { "nmi_injections", VCPU_STAT(nmi_injections) },
148 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 148 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
149 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 149 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
150 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 150 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
151 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 151 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
152 { "mmu_flooded", VM_STAT(mmu_flooded) }, 152 { "mmu_flooded", VM_STAT(mmu_flooded) },
153 { "mmu_recycled", VM_STAT(mmu_recycled) }, 153 { "mmu_recycled", VM_STAT(mmu_recycled) },
154 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 154 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
155 { "mmu_unsync", VM_STAT(mmu_unsync) }, 155 { "mmu_unsync", VM_STAT(mmu_unsync) },
156 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 156 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
157 { "largepages", VM_STAT(lpages) }, 157 { "largepages", VM_STAT(lpages) },
158 { NULL } 158 { NULL }
159 }; 159 };
160 160
161 u64 __read_mostly host_xcr0; 161 u64 __read_mostly host_xcr0;
162 162
163 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164 164
165 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); 165 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
166 166
167 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
168 { 168 {
169 int i; 169 int i;
170 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) 170 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
171 vcpu->arch.apf.gfns[i] = ~0; 171 vcpu->arch.apf.gfns[i] = ~0;
172 } 172 }
173 173
174 static void kvm_on_user_return(struct user_return_notifier *urn) 174 static void kvm_on_user_return(struct user_return_notifier *urn)
175 { 175 {
176 unsigned slot; 176 unsigned slot;
177 struct kvm_shared_msrs *locals 177 struct kvm_shared_msrs *locals
178 = container_of(urn, struct kvm_shared_msrs, urn); 178 = container_of(urn, struct kvm_shared_msrs, urn);
179 struct kvm_shared_msr_values *values; 179 struct kvm_shared_msr_values *values;
180 180
181 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 181 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
182 values = &locals->values[slot]; 182 values = &locals->values[slot];
183 if (values->host != values->curr) { 183 if (values->host != values->curr) {
184 wrmsrl(shared_msrs_global.msrs[slot], values->host); 184 wrmsrl(shared_msrs_global.msrs[slot], values->host);
185 values->curr = values->host; 185 values->curr = values->host;
186 } 186 }
187 } 187 }
188 locals->registered = false; 188 locals->registered = false;
189 user_return_notifier_unregister(urn); 189 user_return_notifier_unregister(urn);
190 } 190 }
191 191
192 static void shared_msr_update(unsigned slot, u32 msr) 192 static void shared_msr_update(unsigned slot, u32 msr)
193 { 193 {
194 struct kvm_shared_msrs *smsr; 194 struct kvm_shared_msrs *smsr;
195 u64 value; 195 u64 value;
196 196
197 smsr = &__get_cpu_var(shared_msrs); 197 smsr = &__get_cpu_var(shared_msrs);
198 /* only read, and nobody should modify it at this time, 198 /* only read, and nobody should modify it at this time,
199 * so don't need lock */ 199 * so don't need lock */
200 if (slot >= shared_msrs_global.nr) { 200 if (slot >= shared_msrs_global.nr) {
201 printk(KERN_ERR "kvm: invalid MSR slot!"); 201 printk(KERN_ERR "kvm: invalid MSR slot!");
202 return; 202 return;
203 } 203 }
204 rdmsrl_safe(msr, &value); 204 rdmsrl_safe(msr, &value);
205 smsr->values[slot].host = value; 205 smsr->values[slot].host = value;
206 smsr->values[slot].curr = value; 206 smsr->values[slot].curr = value;
207 } 207 }
208 208
209 void kvm_define_shared_msr(unsigned slot, u32 msr) 209 void kvm_define_shared_msr(unsigned slot, u32 msr)
210 { 210 {
211 if (slot >= shared_msrs_global.nr) 211 if (slot >= shared_msrs_global.nr)
212 shared_msrs_global.nr = slot + 1; 212 shared_msrs_global.nr = slot + 1;
213 shared_msrs_global.msrs[slot] = msr; 213 shared_msrs_global.msrs[slot] = msr;
214 /* we need ensured the shared_msr_global have been updated */ 214 /* we need ensured the shared_msr_global have been updated */
215 smp_wmb(); 215 smp_wmb();
216 } 216 }
217 EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 217 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
218 218
219 static void kvm_shared_msr_cpu_online(void) 219 static void kvm_shared_msr_cpu_online(void)
220 { 220 {
221 unsigned i; 221 unsigned i;
222 222
223 for (i = 0; i < shared_msrs_global.nr; ++i) 223 for (i = 0; i < shared_msrs_global.nr; ++i)
224 shared_msr_update(i, shared_msrs_global.msrs[i]); 224 shared_msr_update(i, shared_msrs_global.msrs[i]);
225 } 225 }
226 226
227 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 227 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
228 { 228 {
229 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 229 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
230 230
231 if (((value ^ smsr->values[slot].curr) & mask) == 0) 231 if (((value ^ smsr->values[slot].curr) & mask) == 0)
232 return; 232 return;
233 smsr->values[slot].curr = value; 233 smsr->values[slot].curr = value;
234 wrmsrl(shared_msrs_global.msrs[slot], value); 234 wrmsrl(shared_msrs_global.msrs[slot], value);
235 if (!smsr->registered) { 235 if (!smsr->registered) {
236 smsr->urn.on_user_return = kvm_on_user_return; 236 smsr->urn.on_user_return = kvm_on_user_return;
237 user_return_notifier_register(&smsr->urn); 237 user_return_notifier_register(&smsr->urn);
238 smsr->registered = true; 238 smsr->registered = true;
239 } 239 }
240 } 240 }
241 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 241 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
242 242
243 static void drop_user_return_notifiers(void *ignore) 243 static void drop_user_return_notifiers(void *ignore)
244 { 244 {
245 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 245 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
246 246
247 if (smsr->registered) 247 if (smsr->registered)
248 kvm_on_user_return(&smsr->urn); 248 kvm_on_user_return(&smsr->urn);
249 } 249 }
250 250
251 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 251 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
252 { 252 {
253 return vcpu->arch.apic_base; 253 return vcpu->arch.apic_base;
254 } 254 }
255 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 255 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
256 256
257 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 257 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
258 { 258 {
259 /* TODO: reserve bits check */ 259 /* TODO: reserve bits check */
260 kvm_lapic_set_base(vcpu, data); 260 kvm_lapic_set_base(vcpu, data);
261 } 261 }
262 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 262 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
263 263
264 #define EXCPT_BENIGN 0 264 #define EXCPT_BENIGN 0
265 #define EXCPT_CONTRIBUTORY 1 265 #define EXCPT_CONTRIBUTORY 1
266 #define EXCPT_PF 2 266 #define EXCPT_PF 2
267 267
268 static int exception_class(int vector) 268 static int exception_class(int vector)
269 { 269 {
270 switch (vector) { 270 switch (vector) {
271 case PF_VECTOR: 271 case PF_VECTOR:
272 return EXCPT_PF; 272 return EXCPT_PF;
273 case DE_VECTOR: 273 case DE_VECTOR:
274 case TS_VECTOR: 274 case TS_VECTOR:
275 case NP_VECTOR: 275 case NP_VECTOR:
276 case SS_VECTOR: 276 case SS_VECTOR:
277 case GP_VECTOR: 277 case GP_VECTOR:
278 return EXCPT_CONTRIBUTORY; 278 return EXCPT_CONTRIBUTORY;
279 default: 279 default:
280 break; 280 break;
281 } 281 }
282 return EXCPT_BENIGN; 282 return EXCPT_BENIGN;
283 } 283 }
284 284
285 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 285 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
286 unsigned nr, bool has_error, u32 error_code, 286 unsigned nr, bool has_error, u32 error_code,
287 bool reinject) 287 bool reinject)
288 { 288 {
289 u32 prev_nr; 289 u32 prev_nr;
290 int class1, class2; 290 int class1, class2;
291 291
292 kvm_make_request(KVM_REQ_EVENT, vcpu); 292 kvm_make_request(KVM_REQ_EVENT, vcpu);
293 293
294 if (!vcpu->arch.exception.pending) { 294 if (!vcpu->arch.exception.pending) {
295 queue: 295 queue:
296 vcpu->arch.exception.pending = true; 296 vcpu->arch.exception.pending = true;
297 vcpu->arch.exception.has_error_code = has_error; 297 vcpu->arch.exception.has_error_code = has_error;
298 vcpu->arch.exception.nr = nr; 298 vcpu->arch.exception.nr = nr;
299 vcpu->arch.exception.error_code = error_code; 299 vcpu->arch.exception.error_code = error_code;
300 vcpu->arch.exception.reinject = reinject; 300 vcpu->arch.exception.reinject = reinject;
301 return; 301 return;
302 } 302 }
303 303
304 /* to check exception */ 304 /* to check exception */
305 prev_nr = vcpu->arch.exception.nr; 305 prev_nr = vcpu->arch.exception.nr;
306 if (prev_nr == DF_VECTOR) { 306 if (prev_nr == DF_VECTOR) {
307 /* triple fault -> shutdown */ 307 /* triple fault -> shutdown */
308 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 308 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
309 return; 309 return;
310 } 310 }
311 class1 = exception_class(prev_nr); 311 class1 = exception_class(prev_nr);
312 class2 = exception_class(nr); 312 class2 = exception_class(nr);
313 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 313 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
314 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 314 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
315 /* generate double fault per SDM Table 5-5 */ 315 /* generate double fault per SDM Table 5-5 */
316 vcpu->arch.exception.pending = true; 316 vcpu->arch.exception.pending = true;
317 vcpu->arch.exception.has_error_code = true; 317 vcpu->arch.exception.has_error_code = true;
318 vcpu->arch.exception.nr = DF_VECTOR; 318 vcpu->arch.exception.nr = DF_VECTOR;
319 vcpu->arch.exception.error_code = 0; 319 vcpu->arch.exception.error_code = 0;
320 } else 320 } else
321 /* replace previous exception with a new one in a hope 321 /* replace previous exception with a new one in a hope
322 that instruction re-execution will regenerate lost 322 that instruction re-execution will regenerate lost
323 exception */ 323 exception */
324 goto queue; 324 goto queue;
325 } 325 }
326 326
327 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 327 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
328 { 328 {
329 kvm_multiple_exception(vcpu, nr, false, 0, false); 329 kvm_multiple_exception(vcpu, nr, false, 0, false);
330 } 330 }
331 EXPORT_SYMBOL_GPL(kvm_queue_exception); 331 EXPORT_SYMBOL_GPL(kvm_queue_exception);
332 332
333 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 333 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
334 { 334 {
335 kvm_multiple_exception(vcpu, nr, false, 0, true); 335 kvm_multiple_exception(vcpu, nr, false, 0, true);
336 } 336 }
337 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 337 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
338 338
339 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) 339 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
340 { 340 {
341 if (err) 341 if (err)
342 kvm_inject_gp(vcpu, 0); 342 kvm_inject_gp(vcpu, 0);
343 else 343 else
344 kvm_x86_ops->skip_emulated_instruction(vcpu); 344 kvm_x86_ops->skip_emulated_instruction(vcpu);
345 } 345 }
346 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); 346 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
347 347
348 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 348 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
349 { 349 {
350 ++vcpu->stat.pf_guest; 350 ++vcpu->stat.pf_guest;
351 vcpu->arch.cr2 = fault->address; 351 vcpu->arch.cr2 = fault->address;
352 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 352 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
353 } 353 }
354 EXPORT_SYMBOL_GPL(kvm_inject_page_fault); 354 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
355 355
356 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 356 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
357 { 357 {
358 if (mmu_is_nested(vcpu) && !fault->nested_page_fault) 358 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
359 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); 359 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
360 else 360 else
361 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 361 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
362 } 362 }
363 363
364 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 364 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
365 { 365 {
366 atomic_inc(&vcpu->arch.nmi_queued); 366 atomic_inc(&vcpu->arch.nmi_queued);
367 kvm_make_request(KVM_REQ_NMI, vcpu); 367 kvm_make_request(KVM_REQ_NMI, vcpu);
368 } 368 }
369 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 369 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
370 370
371 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 371 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
372 { 372 {
373 kvm_multiple_exception(vcpu, nr, true, error_code, false); 373 kvm_multiple_exception(vcpu, nr, true, error_code, false);
374 } 374 }
375 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 375 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
376 376
377 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 377 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
378 { 378 {
379 kvm_multiple_exception(vcpu, nr, true, error_code, true); 379 kvm_multiple_exception(vcpu, nr, true, error_code, true);
380 } 380 }
381 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); 381 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
382 382
383 /* 383 /*
384 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 384 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
385 * a #GP and return false. 385 * a #GP and return false.
386 */ 386 */
387 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 387 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
388 { 388 {
389 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) 389 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
390 return true; 390 return true;
391 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 391 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
392 return false; 392 return false;
393 } 393 }
394 EXPORT_SYMBOL_GPL(kvm_require_cpl); 394 EXPORT_SYMBOL_GPL(kvm_require_cpl);
395 395
396 /* 396 /*
397 * This function will be used to read from the physical memory of the currently 397 * This function will be used to read from the physical memory of the currently
398 * running guest. The difference to kvm_read_guest_page is that this function 398 * running guest. The difference to kvm_read_guest_page is that this function
399 * can read from guest physical or from the guest's guest physical memory. 399 * can read from guest physical or from the guest's guest physical memory.
400 */ 400 */
401 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 401 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
402 gfn_t ngfn, void *data, int offset, int len, 402 gfn_t ngfn, void *data, int offset, int len,
403 u32 access) 403 u32 access)
404 { 404 {
405 gfn_t real_gfn; 405 gfn_t real_gfn;
406 gpa_t ngpa; 406 gpa_t ngpa;
407 407
408 ngpa = gfn_to_gpa(ngfn); 408 ngpa = gfn_to_gpa(ngfn);
409 real_gfn = mmu->translate_gpa(vcpu, ngpa, access); 409 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
410 if (real_gfn == UNMAPPED_GVA) 410 if (real_gfn == UNMAPPED_GVA)
411 return -EFAULT; 411 return -EFAULT;
412 412
413 real_gfn = gpa_to_gfn(real_gfn); 413 real_gfn = gpa_to_gfn(real_gfn);
414 414
415 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); 415 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
416 } 416 }
417 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 417 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
418 418
419 int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 419 int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
420 void *data, int offset, int len, u32 access) 420 void *data, int offset, int len, u32 access)
421 { 421 {
422 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 422 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
423 data, offset, len, access); 423 data, offset, len, access);
424 } 424 }
425 425
426 /* 426 /*
427 * Load the pae pdptrs. Return true is they are all valid. 427 * Load the pae pdptrs. Return true is they are all valid.
428 */ 428 */
429 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 429 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
430 { 430 {
431 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 431 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
432 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 432 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
433 int i; 433 int i;
434 int ret; 434 int ret;
435 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; 435 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
436 436
437 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, 437 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
438 offset * sizeof(u64), sizeof(pdpte), 438 offset * sizeof(u64), sizeof(pdpte),
439 PFERR_USER_MASK|PFERR_WRITE_MASK); 439 PFERR_USER_MASK|PFERR_WRITE_MASK);
440 if (ret < 0) { 440 if (ret < 0) {
441 ret = 0; 441 ret = 0;
442 goto out; 442 goto out;
443 } 443 }
444 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 444 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
445 if (is_present_gpte(pdpte[i]) && 445 if (is_present_gpte(pdpte[i]) &&
446 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 446 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
447 ret = 0; 447 ret = 0;
448 goto out; 448 goto out;
449 } 449 }
450 } 450 }
451 ret = 1; 451 ret = 1;
452 452
453 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 453 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
454 __set_bit(VCPU_EXREG_PDPTR, 454 __set_bit(VCPU_EXREG_PDPTR,
455 (unsigned long *)&vcpu->arch.regs_avail); 455 (unsigned long *)&vcpu->arch.regs_avail);
456 __set_bit(VCPU_EXREG_PDPTR, 456 __set_bit(VCPU_EXREG_PDPTR,
457 (unsigned long *)&vcpu->arch.regs_dirty); 457 (unsigned long *)&vcpu->arch.regs_dirty);
458 out: 458 out:
459 459
460 return ret; 460 return ret;
461 } 461 }
462 EXPORT_SYMBOL_GPL(load_pdptrs); 462 EXPORT_SYMBOL_GPL(load_pdptrs);
463 463
464 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 464 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
465 { 465 {
466 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; 466 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
467 bool changed = true; 467 bool changed = true;
468 int offset; 468 int offset;
469 gfn_t gfn; 469 gfn_t gfn;
470 int r; 470 int r;
471 471
472 if (is_long_mode(vcpu) || !is_pae(vcpu)) 472 if (is_long_mode(vcpu) || !is_pae(vcpu))
473 return false; 473 return false;
474 474
475 if (!test_bit(VCPU_EXREG_PDPTR, 475 if (!test_bit(VCPU_EXREG_PDPTR,
476 (unsigned long *)&vcpu->arch.regs_avail)) 476 (unsigned long *)&vcpu->arch.regs_avail))
477 return true; 477 return true;
478 478
479 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; 479 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
480 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); 480 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
481 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 481 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
482 PFERR_USER_MASK | PFERR_WRITE_MASK); 482 PFERR_USER_MASK | PFERR_WRITE_MASK);
483 if (r < 0) 483 if (r < 0)
484 goto out; 484 goto out;
485 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; 485 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
486 out: 486 out:
487 487
488 return changed; 488 return changed;
489 } 489 }
490 490
491 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 491 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
492 { 492 {
493 unsigned long old_cr0 = kvm_read_cr0(vcpu); 493 unsigned long old_cr0 = kvm_read_cr0(vcpu);
494 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 494 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
495 X86_CR0_CD | X86_CR0_NW; 495 X86_CR0_CD | X86_CR0_NW;
496 496
497 cr0 |= X86_CR0_ET; 497 cr0 |= X86_CR0_ET;
498 498
499 #ifdef CONFIG_X86_64 499 #ifdef CONFIG_X86_64
500 if (cr0 & 0xffffffff00000000UL) 500 if (cr0 & 0xffffffff00000000UL)
501 return 1; 501 return 1;
502 #endif 502 #endif
503 503
504 cr0 &= ~CR0_RESERVED_BITS; 504 cr0 &= ~CR0_RESERVED_BITS;
505 505
506 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 506 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
507 return 1; 507 return 1;
508 508
509 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 509 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
510 return 1; 510 return 1;
511 511
512 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 512 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
513 #ifdef CONFIG_X86_64 513 #ifdef CONFIG_X86_64
514 if ((vcpu->arch.efer & EFER_LME)) { 514 if ((vcpu->arch.efer & EFER_LME)) {
515 int cs_db, cs_l; 515 int cs_db, cs_l;
516 516
517 if (!is_pae(vcpu)) 517 if (!is_pae(vcpu))
518 return 1; 518 return 1;
519 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 519 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
520 if (cs_l) 520 if (cs_l)
521 return 1; 521 return 1;
522 } else 522 } else
523 #endif 523 #endif
524 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 524 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
525 kvm_read_cr3(vcpu))) 525 kvm_read_cr3(vcpu)))
526 return 1; 526 return 1;
527 } 527 }
528 528
529 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) 529 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
530 return 1; 530 return 1;
531 531
532 kvm_x86_ops->set_cr0(vcpu, cr0); 532 kvm_x86_ops->set_cr0(vcpu, cr0);
533 533
534 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 534 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
535 kvm_clear_async_pf_completion_queue(vcpu); 535 kvm_clear_async_pf_completion_queue(vcpu);
536 kvm_async_pf_hash_reset(vcpu); 536 kvm_async_pf_hash_reset(vcpu);
537 } 537 }
538 538
539 if ((cr0 ^ old_cr0) & update_bits) 539 if ((cr0 ^ old_cr0) & update_bits)
540 kvm_mmu_reset_context(vcpu); 540 kvm_mmu_reset_context(vcpu);
541 return 0; 541 return 0;
542 } 542 }
543 EXPORT_SYMBOL_GPL(kvm_set_cr0); 543 EXPORT_SYMBOL_GPL(kvm_set_cr0);
544 544
545 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 545 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
546 { 546 {
547 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 547 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
548 } 548 }
549 EXPORT_SYMBOL_GPL(kvm_lmsw); 549 EXPORT_SYMBOL_GPL(kvm_lmsw);
550 550
551 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 551 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
552 { 552 {
553 u64 xcr0; 553 u64 xcr0;
554 554
555 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 555 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
556 if (index != XCR_XFEATURE_ENABLED_MASK) 556 if (index != XCR_XFEATURE_ENABLED_MASK)
557 return 1; 557 return 1;
558 xcr0 = xcr; 558 xcr0 = xcr;
559 if (kvm_x86_ops->get_cpl(vcpu) != 0) 559 if (kvm_x86_ops->get_cpl(vcpu) != 0)
560 return 1; 560 return 1;
561 if (!(xcr0 & XSTATE_FP)) 561 if (!(xcr0 & XSTATE_FP))
562 return 1; 562 return 1;
563 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 563 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
564 return 1; 564 return 1;
565 if (xcr0 & ~host_xcr0) 565 if (xcr0 & ~host_xcr0)
566 return 1; 566 return 1;
567 vcpu->arch.xcr0 = xcr0; 567 vcpu->arch.xcr0 = xcr0;
568 vcpu->guest_xcr0_loaded = 0; 568 vcpu->guest_xcr0_loaded = 0;
569 return 0; 569 return 0;
570 } 570 }
571 571
572 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 572 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
573 { 573 {
574 if (__kvm_set_xcr(vcpu, index, xcr)) { 574 if (__kvm_set_xcr(vcpu, index, xcr)) {
575 kvm_inject_gp(vcpu, 0); 575 kvm_inject_gp(vcpu, 0);
576 return 1; 576 return 1;
577 } 577 }
578 return 0; 578 return 0;
579 } 579 }
580 EXPORT_SYMBOL_GPL(kvm_set_xcr); 580 EXPORT_SYMBOL_GPL(kvm_set_xcr);
581 581
582 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 582 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
583 { 583 {
584 unsigned long old_cr4 = kvm_read_cr4(vcpu); 584 unsigned long old_cr4 = kvm_read_cr4(vcpu);
585 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | 585 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
586 X86_CR4_PAE | X86_CR4_SMEP; 586 X86_CR4_PAE | X86_CR4_SMEP;
587 if (cr4 & CR4_RESERVED_BITS) 587 if (cr4 & CR4_RESERVED_BITS)
588 return 1; 588 return 1;
589 589
590 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 590 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
591 return 1; 591 return 1;
592 592
593 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) 593 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
594 return 1; 594 return 1;
595 595
596 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) 596 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
597 return 1; 597 return 1;
598 598
599 if (is_long_mode(vcpu)) { 599 if (is_long_mode(vcpu)) {
600 if (!(cr4 & X86_CR4_PAE)) 600 if (!(cr4 & X86_CR4_PAE))
601 return 1; 601 return 1;
602 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 602 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
603 && ((cr4 ^ old_cr4) & pdptr_bits) 603 && ((cr4 ^ old_cr4) & pdptr_bits)
604 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 604 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
605 kvm_read_cr3(vcpu))) 605 kvm_read_cr3(vcpu)))
606 return 1; 606 return 1;
607 607
608 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { 608 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
609 if (!guest_cpuid_has_pcid(vcpu)) 609 if (!guest_cpuid_has_pcid(vcpu))
610 return 1; 610 return 1;
611 611
612 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ 612 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
613 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) 613 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
614 return 1; 614 return 1;
615 } 615 }
616 616
617 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 617 if (kvm_x86_ops->set_cr4(vcpu, cr4))
618 return 1; 618 return 1;
619 619
620 if (((cr4 ^ old_cr4) & pdptr_bits) || 620 if (((cr4 ^ old_cr4) & pdptr_bits) ||
621 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) 621 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
622 kvm_mmu_reset_context(vcpu); 622 kvm_mmu_reset_context(vcpu);
623 623
624 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 624 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
625 kvm_update_cpuid(vcpu); 625 kvm_update_cpuid(vcpu);
626 626
627 return 0; 627 return 0;
628 } 628 }
629 EXPORT_SYMBOL_GPL(kvm_set_cr4); 629 EXPORT_SYMBOL_GPL(kvm_set_cr4);
630 630
631 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 631 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
632 { 632 {
633 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 633 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
634 kvm_mmu_sync_roots(vcpu); 634 kvm_mmu_sync_roots(vcpu);
635 kvm_mmu_flush_tlb(vcpu); 635 kvm_mmu_flush_tlb(vcpu);
636 return 0; 636 return 0;
637 } 637 }
638 638
639 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
642 return 1; 642 return 1;
643 } else 643 } else
644 if (cr3 & CR3_L_MODE_RESERVED_BITS) 644 if (cr3 & CR3_L_MODE_RESERVED_BITS)
645 return 1; 645 return 1;
646 } else { 646 } else {
647 if (is_pae(vcpu)) { 647 if (is_pae(vcpu)) {
648 if (cr3 & CR3_PAE_RESERVED_BITS) 648 if (cr3 & CR3_PAE_RESERVED_BITS)
649 return 1; 649 return 1;
650 if (is_paging(vcpu) && 650 if (is_paging(vcpu) &&
651 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 651 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
652 return 1; 652 return 1;
653 } 653 }
654 /* 654 /*
655 * We don't check reserved bits in nonpae mode, because 655 * We don't check reserved bits in nonpae mode, because
656 * this isn't enforced, and VMware depends on this. 656 * this isn't enforced, and VMware depends on this.
657 */ 657 */
658 } 658 }
659 659
660 /* 660 /*
661 * Does the new cr3 value map to physical memory? (Note, we 661 * Does the new cr3 value map to physical memory? (Note, we
662 * catch an invalid cr3 even in real-mode, because it would 662 * catch an invalid cr3 even in real-mode, because it would
663 * cause trouble later on when we turn on paging anyway.) 663 * cause trouble later on when we turn on paging anyway.)
664 * 664 *
665 * A real CPU would silently accept an invalid cr3 and would 665 * A real CPU would silently accept an invalid cr3 and would
666 * attempt to use it - with largely undefined (and often hard 666 * attempt to use it - with largely undefined (and often hard
667 * to debug) behavior on the guest side. 667 * to debug) behavior on the guest side.
668 */ 668 */
669 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 669 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
670 return 1; 670 return 1;
671 vcpu->arch.cr3 = cr3; 671 vcpu->arch.cr3 = cr3;
672 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 672 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
673 vcpu->arch.mmu.new_cr3(vcpu); 673 vcpu->arch.mmu.new_cr3(vcpu);
674 return 0; 674 return 0;
675 } 675 }
676 EXPORT_SYMBOL_GPL(kvm_set_cr3); 676 EXPORT_SYMBOL_GPL(kvm_set_cr3);
677 677
678 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 678 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
679 { 679 {
680 if (cr8 & CR8_RESERVED_BITS) 680 if (cr8 & CR8_RESERVED_BITS)
681 return 1; 681 return 1;
682 if (irqchip_in_kernel(vcpu->kvm)) 682 if (irqchip_in_kernel(vcpu->kvm))
683 kvm_lapic_set_tpr(vcpu, cr8); 683 kvm_lapic_set_tpr(vcpu, cr8);
684 else 684 else
685 vcpu->arch.cr8 = cr8; 685 vcpu->arch.cr8 = cr8;
686 return 0; 686 return 0;
687 } 687 }
688 EXPORT_SYMBOL_GPL(kvm_set_cr8); 688 EXPORT_SYMBOL_GPL(kvm_set_cr8);
689 689
690 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 690 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
691 { 691 {
692 if (irqchip_in_kernel(vcpu->kvm)) 692 if (irqchip_in_kernel(vcpu->kvm))
693 return kvm_lapic_get_cr8(vcpu); 693 return kvm_lapic_get_cr8(vcpu);
694 else 694 else
695 return vcpu->arch.cr8; 695 return vcpu->arch.cr8;
696 } 696 }
697 EXPORT_SYMBOL_GPL(kvm_get_cr8); 697 EXPORT_SYMBOL_GPL(kvm_get_cr8);
698 698
699 static void kvm_update_dr7(struct kvm_vcpu *vcpu) 699 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
700 { 700 {
701 unsigned long dr7; 701 unsigned long dr7;
702 702
703 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 703 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
704 dr7 = vcpu->arch.guest_debug_dr7; 704 dr7 = vcpu->arch.guest_debug_dr7;
705 else 705 else
706 dr7 = vcpu->arch.dr7; 706 dr7 = vcpu->arch.dr7;
707 kvm_x86_ops->set_dr7(vcpu, dr7); 707 kvm_x86_ops->set_dr7(vcpu, dr7);
708 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); 708 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
709 } 709 }
710 710
711 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 711 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
712 { 712 {
713 switch (dr) { 713 switch (dr) {
714 case 0 ... 3: 714 case 0 ... 3:
715 vcpu->arch.db[dr] = val; 715 vcpu->arch.db[dr] = val;
716 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 716 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
717 vcpu->arch.eff_db[dr] = val; 717 vcpu->arch.eff_db[dr] = val;
718 break; 718 break;
719 case 4: 719 case 4:
720 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 720 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
721 return 1; /* #UD */ 721 return 1; /* #UD */
722 /* fall through */ 722 /* fall through */
723 case 6: 723 case 6:
724 if (val & 0xffffffff00000000ULL) 724 if (val & 0xffffffff00000000ULL)
725 return -1; /* #GP */ 725 return -1; /* #GP */
726 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 726 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
727 break; 727 break;
728 case 5: 728 case 5:
729 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 729 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
730 return 1; /* #UD */ 730 return 1; /* #UD */
731 /* fall through */ 731 /* fall through */
732 default: /* 7 */ 732 default: /* 7 */
733 if (val & 0xffffffff00000000ULL) 733 if (val & 0xffffffff00000000ULL)
734 return -1; /* #GP */ 734 return -1; /* #GP */
735 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 735 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
736 kvm_update_dr7(vcpu); 736 kvm_update_dr7(vcpu);
737 break; 737 break;
738 } 738 }
739 739
740 return 0; 740 return 0;
741 } 741 }
742 742
743 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 743 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
744 { 744 {
745 int res; 745 int res;
746 746
747 res = __kvm_set_dr(vcpu, dr, val); 747 res = __kvm_set_dr(vcpu, dr, val);
748 if (res > 0) 748 if (res > 0)
749 kvm_queue_exception(vcpu, UD_VECTOR); 749 kvm_queue_exception(vcpu, UD_VECTOR);
750 else if (res < 0) 750 else if (res < 0)
751 kvm_inject_gp(vcpu, 0); 751 kvm_inject_gp(vcpu, 0);
752 752
753 return res; 753 return res;
754 } 754 }
755 EXPORT_SYMBOL_GPL(kvm_set_dr); 755 EXPORT_SYMBOL_GPL(kvm_set_dr);
756 756
757 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 757 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
758 { 758 {
759 switch (dr) { 759 switch (dr) {
760 case 0 ... 3: 760 case 0 ... 3:
761 *val = vcpu->arch.db[dr]; 761 *val = vcpu->arch.db[dr];
762 break; 762 break;
763 case 4: 763 case 4:
764 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 764 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
765 return 1; 765 return 1;
766 /* fall through */ 766 /* fall through */
767 case 6: 767 case 6:
768 *val = vcpu->arch.dr6; 768 *val = vcpu->arch.dr6;
769 break; 769 break;
770 case 5: 770 case 5:
771 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 771 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
772 return 1; 772 return 1;
773 /* fall through */ 773 /* fall through */
774 default: /* 7 */ 774 default: /* 7 */
775 *val = vcpu->arch.dr7; 775 *val = vcpu->arch.dr7;
776 break; 776 break;
777 } 777 }
778 778
779 return 0; 779 return 0;
780 } 780 }
781 781
782 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 782 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
783 { 783 {
784 if (_kvm_get_dr(vcpu, dr, val)) { 784 if (_kvm_get_dr(vcpu, dr, val)) {
785 kvm_queue_exception(vcpu, UD_VECTOR); 785 kvm_queue_exception(vcpu, UD_VECTOR);
786 return 1; 786 return 1;
787 } 787 }
788 return 0; 788 return 0;
789 } 789 }
790 EXPORT_SYMBOL_GPL(kvm_get_dr); 790 EXPORT_SYMBOL_GPL(kvm_get_dr);
791 791
792 bool kvm_rdpmc(struct kvm_vcpu *vcpu) 792 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
793 { 793 {
794 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 794 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
795 u64 data; 795 u64 data;
796 int err; 796 int err;
797 797
798 err = kvm_pmu_read_pmc(vcpu, ecx, &data); 798 err = kvm_pmu_read_pmc(vcpu, ecx, &data);
799 if (err) 799 if (err)
800 return err; 800 return err;
801 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); 801 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
802 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); 802 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
803 return err; 803 return err;
804 } 804 }
805 EXPORT_SYMBOL_GPL(kvm_rdpmc); 805 EXPORT_SYMBOL_GPL(kvm_rdpmc);
806 806
807 /* 807 /*
808 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 808 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
809 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 809 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
810 * 810 *
811 * This list is modified at module load time to reflect the 811 * This list is modified at module load time to reflect the
812 * capabilities of the host cpu. This capabilities test skips MSRs that are 812 * capabilities of the host cpu. This capabilities test skips MSRs that are
813 * kvm-specific. Those are put in the beginning of the list. 813 * kvm-specific. Those are put in the beginning of the list.
814 */ 814 */
815 815
816 #define KVM_SAVE_MSRS_BEGIN 10 816 #define KVM_SAVE_MSRS_BEGIN 10
817 static u32 msrs_to_save[] = { 817 static u32 msrs_to_save[] = {
818 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 818 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
819 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 819 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
820 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 820 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
821 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 821 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
822 MSR_KVM_PV_EOI_EN, 822 MSR_KVM_PV_EOI_EN,
823 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 823 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
824 MSR_STAR, 824 MSR_STAR,
825 #ifdef CONFIG_X86_64 825 #ifdef CONFIG_X86_64
826 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 826 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
827 #endif 827 #endif
828 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 828 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
829 }; 829 };
830 830
831 static unsigned num_msrs_to_save; 831 static unsigned num_msrs_to_save;
832 832
833 static const u32 emulated_msrs[] = { 833 static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST, 834 MSR_IA32_TSC_ADJUST,
835 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
836 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
837 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
838 MSR_IA32_MCG_CTL, 838 MSR_IA32_MCG_CTL,
839 }; 839 };
840 840
841 static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 841 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
842 { 842 {
843 u64 old_efer = vcpu->arch.efer; 843 u64 old_efer = vcpu->arch.efer;
844 844
845 if (efer & efer_reserved_bits) 845 if (efer & efer_reserved_bits)
846 return 1; 846 return 1;
847 847
848 if (is_paging(vcpu) 848 if (is_paging(vcpu)
849 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 849 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
850 return 1; 850 return 1;
851 851
852 if (efer & EFER_FFXSR) { 852 if (efer & EFER_FFXSR) {
853 struct kvm_cpuid_entry2 *feat; 853 struct kvm_cpuid_entry2 *feat;
854 854
855 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 855 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
856 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 856 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
857 return 1; 857 return 1;
858 } 858 }
859 859
860 if (efer & EFER_SVME) { 860 if (efer & EFER_SVME) {
861 struct kvm_cpuid_entry2 *feat; 861 struct kvm_cpuid_entry2 *feat;
862 862
863 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 863 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
864 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 864 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
865 return 1; 865 return 1;
866 } 866 }
867 867
868 efer &= ~EFER_LMA; 868 efer &= ~EFER_LMA;
869 efer |= vcpu->arch.efer & EFER_LMA; 869 efer |= vcpu->arch.efer & EFER_LMA;
870 870
871 kvm_x86_ops->set_efer(vcpu, efer); 871 kvm_x86_ops->set_efer(vcpu, efer);
872 872
873 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 873 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
874 874
875 /* Update reserved bits */ 875 /* Update reserved bits */
876 if ((efer ^ old_efer) & EFER_NX) 876 if ((efer ^ old_efer) & EFER_NX)
877 kvm_mmu_reset_context(vcpu); 877 kvm_mmu_reset_context(vcpu);
878 878
879 return 0; 879 return 0;
880 } 880 }
881 881
882 void kvm_enable_efer_bits(u64 mask) 882 void kvm_enable_efer_bits(u64 mask)
883 { 883 {
884 efer_reserved_bits &= ~mask; 884 efer_reserved_bits &= ~mask;
885 } 885 }
886 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 886 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
887 887
888 888
889 /* 889 /*
890 * Writes msr value into into the appropriate "register". 890 * Writes msr value into into the appropriate "register".
891 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
892 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
893 */ 893 */
894 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 894 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
895 { 895 {
896 return kvm_x86_ops->set_msr(vcpu, msr); 896 return kvm_x86_ops->set_msr(vcpu, msr);
897 } 897 }
898 898
899 /* 899 /*
900 * Adapt set_msr() to msr_io()'s calling convention 900 * Adapt set_msr() to msr_io()'s calling convention
901 */ 901 */
902 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
903 { 903 {
904 struct msr_data msr; 904 struct msr_data msr;
905 905
906 msr.data = *data; 906 msr.data = *data;
907 msr.index = index; 907 msr.index = index;
908 msr.host_initiated = true; 908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr); 909 return kvm_set_msr(vcpu, &msr);
910 } 910 }
911 911
912 #ifdef CONFIG_X86_64 912 #ifdef CONFIG_X86_64
913 struct pvclock_gtod_data { 913 struct pvclock_gtod_data {
914 seqcount_t seq; 914 seqcount_t seq;
915 915
916 struct { /* extract of a clocksource struct */ 916 struct { /* extract of a clocksource struct */
917 int vclock_mode; 917 int vclock_mode;
918 cycle_t cycle_last; 918 cycle_t cycle_last;
919 cycle_t mask; 919 cycle_t mask;
920 u32 mult; 920 u32 mult;
921 u32 shift; 921 u32 shift;
922 } clock; 922 } clock;
923 923
924 /* open coded 'struct timespec' */ 924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec; 925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec; 926 time_t monotonic_time_sec;
927 }; 927 };
928 928
929 static struct pvclock_gtod_data pvclock_gtod_data; 929 static struct pvclock_gtod_data pvclock_gtod_data;
930 930
931 static void update_pvclock_gtod(struct timekeeper *tk) 931 static void update_pvclock_gtod(struct timekeeper *tk)
932 { 932 {
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934 934
935 write_seqcount_begin(&vdata->seq); 935 write_seqcount_begin(&vdata->seq);
936 936
937 /* copy pvclock gtod data */ 937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; 938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last; 939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask; 940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult; 941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift; 942 vdata->clock.shift = tk->shift;
943 943
944 vdata->monotonic_time_sec = tk->xtime_sec 944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec; 945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec 946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec 947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift); 948 << tk->shift);
949 while (vdata->monotonic_time_snsec >= 949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) { 950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -= 951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift; 952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++; 953 vdata->monotonic_time_sec++;
954 } 954 }
955 955
956 write_seqcount_end(&vdata->seq); 956 write_seqcount_end(&vdata->seq);
957 } 957 }
958 #endif 958 #endif
959 959
960 960
961 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
962 { 962 {
963 int version; 963 int version;
964 int r; 964 int r;
965 struct pvclock_wall_clock wc; 965 struct pvclock_wall_clock wc;
966 struct timespec boot; 966 struct timespec boot;
967 967
968 if (!wall_clock) 968 if (!wall_clock)
969 return; 969 return;
970 970
971 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 971 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
972 if (r) 972 if (r)
973 return; 973 return;
974 974
975 if (version & 1) 975 if (version & 1)
976 ++version; /* first time write, random junk */ 976 ++version; /* first time write, random junk */
977 977
978 ++version; 978 ++version;
979 979
980 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 980 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
981 981
982 /* 982 /*
983 * The guest calculates current wall clock time by adding 983 * The guest calculates current wall clock time by adding
984 * system time (updated by kvm_guest_time_update below) to the 984 * system time (updated by kvm_guest_time_update below) to the
985 * wall clock specified here. guest system time equals host 985 * wall clock specified here. guest system time equals host
986 * system time for us, thus we must fill in host boot time here. 986 * system time for us, thus we must fill in host boot time here.
987 */ 987 */
988 getboottime(&boot); 988 getboottime(&boot);
989 989
990 if (kvm->arch.kvmclock_offset) { 990 if (kvm->arch.kvmclock_offset) {
991 struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); 991 struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
992 boot = timespec_sub(boot, ts); 992 boot = timespec_sub(boot, ts);
993 } 993 }
994 wc.sec = boot.tv_sec; 994 wc.sec = boot.tv_sec;
995 wc.nsec = boot.tv_nsec; 995 wc.nsec = boot.tv_nsec;
996 wc.version = version; 996 wc.version = version;
997 997
998 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 998 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
999 999
1000 version++; 1000 version++;
1001 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 1001 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1002 } 1002 }
1003 1003
1004 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 1004 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1005 { 1005 {
1006 uint32_t quotient, remainder; 1006 uint32_t quotient, remainder;
1007 1007
1008 /* Don't try to replace with do_div(), this one calculates 1008 /* Don't try to replace with do_div(), this one calculates
1009 * "(dividend << 32) / divisor" */ 1009 * "(dividend << 32) / divisor" */
1010 __asm__ ( "divl %4" 1010 __asm__ ( "divl %4"
1011 : "=a" (quotient), "=d" (remainder) 1011 : "=a" (quotient), "=d" (remainder)
1012 : "0" (0), "1" (dividend), "r" (divisor) ); 1012 : "0" (0), "1" (dividend), "r" (divisor) );
1013 return quotient; 1013 return quotient;
1014 } 1014 }
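
The inline divl above computes the 32.32 fixed-point fraction "(dividend << 32) / divisor" with the dividend loaded into EDX and zero in EAX. A portable C sketch of the same operation (div_frac_portable is a made-up name; the caller must guarantee a nonzero divisor and a quotient that fits in 32 bits):

#include <stdint.h>

static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
{
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}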
1015 1015
1016 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, 1016 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1017 s8 *pshift, u32 *pmultiplier) 1017 s8 *pshift, u32 *pmultiplier)
1018 { 1018 {
1019 uint64_t scaled64; 1019 uint64_t scaled64;
1020 int32_t shift = 0; 1020 int32_t shift = 0;
1021 uint64_t tps64; 1021 uint64_t tps64;
1022 uint32_t tps32; 1022 uint32_t tps32;
1023 1023
1024 tps64 = base_khz * 1000LL; 1024 tps64 = base_khz * 1000LL;
1025 scaled64 = scaled_khz * 1000LL; 1025 scaled64 = scaled_khz * 1000LL;
1026 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { 1026 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1027 tps64 >>= 1; 1027 tps64 >>= 1;
1028 shift--; 1028 shift--;
1029 } 1029 }
1030 1030
1031 tps32 = (uint32_t)tps64; 1031 tps32 = (uint32_t)tps64;
1032 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { 1032 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1033 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) 1033 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1034 scaled64 >>= 1; 1034 scaled64 >>= 1;
1035 else 1035 else
1036 tps32 <<= 1; 1036 tps32 <<= 1;
1037 shift++; 1037 shift++;
1038 } 1038 }
1039 1039
1040 *pshift = shift; 1040 *pshift = shift;
1041 *pmultiplier = div_frac(scaled64, tps32); 1041 *pmultiplier = div_frac(scaled64, tps32);
1042 1042
1043 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", 1043 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1044 __func__, base_khz, scaled_khz, shift, *pmultiplier); 1044 __func__, base_khz, scaled_khz, shift, *pmultiplier);
1045 } 1045 }
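
kvm_get_time_scale() produces a (shift, multiplier) pair meant to be consumed in the pvclock style: shift the delta, then multiply by the 32.32 fixed-point multiplier and keep the upper 64 bits of the product. A rough sketch of that consumption follows; scale_delta() is a hand-written stand-in that mirrors pvclock_scale_delta() conceptually rather than reproducing the kernel implementation, and it relies on the unsigned __int128 GCC/Clang extension.

#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* high part of a 64x32 multiply: (delta * mul_frac) >> 32 */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}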
1046 1046
1047 static inline u64 get_kernel_ns(void) 1047 static inline u64 get_kernel_ns(void)
1048 { 1048 {
1049 struct timespec ts; 1049 struct timespec ts;
1050 1050
1051 WARN_ON(preemptible()); 1051 WARN_ON(preemptible());
1052 ktime_get_ts(&ts); 1052 ktime_get_ts(&ts);
1053 monotonic_to_bootbased(&ts); 1053 monotonic_to_bootbased(&ts);
1054 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
1055 } 1055 }
1056 1056
1057 #ifdef CONFIG_X86_64 1057 #ifdef CONFIG_X86_64
1058 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); 1058 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059 #endif 1059 #endif
1060 1060
1061 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1062 unsigned long max_tsc_khz; 1062 unsigned long max_tsc_khz;
1063 1063
1064 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 1064 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1065 { 1065 {
1066 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, 1066 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
1067 vcpu->arch.virtual_tsc_shift); 1067 vcpu->arch.virtual_tsc_shift);
1068 } 1068 }
1069 1069
1070 static u32 adjust_tsc_khz(u32 khz, s32 ppm) 1070 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1071 { 1071 {
1072 u64 v = (u64)khz * (1000000 + ppm); 1072 u64 v = (u64)khz * (1000000 + ppm);
1073 do_div(v, 1000000); 1073 do_div(v, 1000000);
1074 return v; 1074 return v;
1075 } 1075 }
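
adjust_tsc_khz() applies a parts-per-million correction, khz * (1000000 + ppm) / 1000000, which kvm_set_tsc_khz() uses to build the tolerance window around the host rate. A worked example; adjust_khz() and the 2.6 GHz host rate are made up for illustration.

#include <stdint.h>
#include <stdio.h>

static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
{
        return (uint32_t)(((uint64_t)khz * (uint64_t)(1000000 + ppm)) / 1000000);
}

int main(void)
{
        /* 2,600,000 kHz +/- 250 ppm => [2599350, 2600650] kHz */
        printf("[%u, %u]\n", adjust_khz(2600000, -250), adjust_khz(2600000, 250));
        return 0;
}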
1076 1076
1077 static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) 1077 static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1078 { 1078 {
1079 u32 thresh_lo, thresh_hi; 1079 u32 thresh_lo, thresh_hi;
1080 int use_scaling = 0; 1080 int use_scaling = 0;
1081 1081
1082 /* Compute a scale to convert nanoseconds in TSC cycles */ 1082 /* Compute a scale to convert nanoseconds in TSC cycles */
1083 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1083 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1084 &vcpu->arch.virtual_tsc_shift, 1084 &vcpu->arch.virtual_tsc_shift,
1085 &vcpu->arch.virtual_tsc_mult); 1085 &vcpu->arch.virtual_tsc_mult);
1086 vcpu->arch.virtual_tsc_khz = this_tsc_khz; 1086 vcpu->arch.virtual_tsc_khz = this_tsc_khz;
1087 1087
1088 /* 1088 /*
1089 * Compute the variation in TSC rate which is acceptable 1089 * Compute the variation in TSC rate which is acceptable
1090 * within the range of tolerance and decide if the 1090 * within the range of tolerance and decide if the
1091 * rate being applied is within the bounds of the hardware 1091 * rate being applied is within the bounds of the hardware
1092 * rate. If so, no scaling or compensation need be done. 1092 * rate. If so, no scaling or compensation need be done.
1093 */ 1093 */
1094 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); 1094 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1095 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); 1095 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1096 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { 1096 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1097 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); 1097 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1098 use_scaling = 1; 1098 use_scaling = 1;
1099 } 1099 }
1100 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); 1100 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1101 } 1101 }
1102 1102
1103 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1103 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1104 { 1104 {
1105 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, 1105 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1106 vcpu->arch.virtual_tsc_mult, 1106 vcpu->arch.virtual_tsc_mult,
1107 vcpu->arch.virtual_tsc_shift); 1107 vcpu->arch.virtual_tsc_shift);
1108 tsc += vcpu->arch.this_tsc_write; 1108 tsc += vcpu->arch.this_tsc_write;
1109 return tsc; 1109 return tsc;
1110 } 1110 }
1111 1111
1112 void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1112 void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113 { 1113 {
1114 #ifdef CONFIG_X86_64 1114 #ifdef CONFIG_X86_64
1115 bool vcpus_matched; 1115 bool vcpus_matched;
1116 bool do_request = false; 1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch; 1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119 1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus)); 1121 atomic_read(&vcpu->kvm->online_vcpus));
1122 1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) 1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock) 1124 if (!ka->use_master_clock)
1125 do_request = 1; 1125 do_request = 1;
1126 1126
1127 if (!vcpus_matched && ka->use_master_clock) 1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1; 1128 do_request = 1;
1129 1129
1130 if (do_request) 1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132 1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, 1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus), 1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode); 1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136 #endif 1136 #endif
1137 } 1137 }
1138 1138
1139 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) 1139 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140 { 1140 {
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); 1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; 1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143 } 1143 }
1144 1144
1145 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) 1145 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1146 { 1146 {
1147 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1148 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1149 unsigned long flags; 1149 unsigned long flags;
1150 s64 usdiff; 1150 s64 usdiff;
1151 bool matched; 1151 bool matched;
1152 u64 data = msr->data; 1152 u64 data = msr->data;
1153 1153
1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1156 ns = get_kernel_ns(); 1156 ns = get_kernel_ns();
1157 elapsed = ns - kvm->arch.last_tsc_nsec; 1157 elapsed = ns - kvm->arch.last_tsc_nsec;
1158 1158
1159 /* n.b - signed multiplication and division required */ 1159 /* n.b - signed multiplication and division required */
1160 usdiff = data - kvm->arch.last_tsc_write; 1160 usdiff = data - kvm->arch.last_tsc_write;
1161 #ifdef CONFIG_X86_64 1161 #ifdef CONFIG_X86_64
1162 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1162 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1163 #else 1163 #else
1164 /* do_div() only does unsigned */ 1164 /* do_div() only does unsigned */
1165 asm("idivl %2; xor %%edx, %%edx" 1165 asm("idivl %2; xor %%edx, %%edx"
1166 : "=A"(usdiff) 1166 : "=A"(usdiff)
1167 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1167 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1168 #endif 1168 #endif
1169 do_div(elapsed, 1000); 1169 do_div(elapsed, 1000);
1170 usdiff -= elapsed; 1170 usdiff -= elapsed;
1171 if (usdiff < 0) 1171 if (usdiff < 0)
1172 usdiff = -usdiff; 1172 usdiff = -usdiff;
1173 1173
1174 /* 1174 /*
1175 * Special case: TSC write with a small delta (1 second) of virtual 1175 * Special case: TSC write with a small delta (1 second) of virtual
1176 * cycle time against real time is interpreted as an attempt to 1176 * cycle time against real time is interpreted as an attempt to
1177 * synchronize the CPU. 1177 * synchronize the CPU.
1178 * 1178 *
1179 * For a reliable TSC, we can match TSC offsets, and for an unstable 1179 * For a reliable TSC, we can match TSC offsets, and for an unstable
1180 * TSC, we add elapsed time in this computation. We could let the 1180 * TSC, we add elapsed time in this computation. We could let the
1181 * compensation code attempt to catch up if we fall behind, but 1181 * compensation code attempt to catch up if we fall behind, but
1182 * it's better to try to match offsets from the beginning. 1182 * it's better to try to match offsets from the beginning.
1183 */ 1183 */
1184 if (usdiff < USEC_PER_SEC && 1184 if (usdiff < USEC_PER_SEC &&
1185 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { 1185 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1186 if (!check_tsc_unstable()) { 1186 if (!check_tsc_unstable()) {
1187 offset = kvm->arch.cur_tsc_offset; 1187 offset = kvm->arch.cur_tsc_offset;
1188 pr_debug("kvm: matched tsc offset for %llu\n", data); 1188 pr_debug("kvm: matched tsc offset for %llu\n", data);
1189 } else { 1189 } else {
1190 u64 delta = nsec_to_cycles(vcpu, elapsed); 1190 u64 delta = nsec_to_cycles(vcpu, elapsed);
1191 data += delta; 1191 data += delta;
1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1194 } 1194 }
1195 matched = true; 1195 matched = true;
1196 } else { 1196 } else {
1197 /* 1197 /*
1198 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
1199 * For each generation, we track the original measured 1199 * For each generation, we track the original measured
1200 * nanosecond time, offset, and write, so if TSCs are in 1200 * nanosecond time, offset, and write, so if TSCs are in
1201 * sync, we can match exact offset, and if not, we can match 1201 * sync, we can match exact offset, and if not, we can match
1202 * exact software computation in compute_guest_tsc() 1202 * exact software computation in compute_guest_tsc()
1203 * 1203 *
1204 * These values are tracked in kvm->arch.cur_xxx variables. 1204 * These values are tracked in kvm->arch.cur_xxx variables.
1205 */ 1205 */
1206 kvm->arch.cur_tsc_generation++; 1206 kvm->arch.cur_tsc_generation++;
1207 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1208 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1209 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false; 1210 matched = false;
1211 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1212 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1213 } 1213 }
1214 1214
1215 /* 1215 /*
1216 * We also track the most recent recorded KHZ, write and time to 1216 * We also track the most recent recorded KHZ, write and time to
1217 * allow the matching interval to be extended at each write. 1217 * allow the matching interval to be extended at each write.
1218 */ 1218 */
1219 kvm->arch.last_tsc_nsec = ns; 1219 kvm->arch.last_tsc_nsec = ns;
1220 kvm->arch.last_tsc_write = data; 1220 kvm->arch.last_tsc_write = data;
1221 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 1221 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1222 1222
1223 /* Reset of TSC must disable overshoot protection below */ 1223 /* Reset of TSC must disable overshoot protection below */
1224 vcpu->arch.hv_clock.tsc_timestamp = 0; 1224 vcpu->arch.hv_clock.tsc_timestamp = 0;
1225 vcpu->arch.last_guest_tsc = data; 1225 vcpu->arch.last_guest_tsc = data;
1226 1226
1227 /* Keep track of which generation this VCPU has synchronized to */ 1227 /* Keep track of which generation this VCPU has synchronized to */
1228 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; 1228 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1231 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) 1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset); 1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1234 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236 1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock); 1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched) 1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++; 1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else 1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0; 1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242 1242
1243 kvm_track_tsc_matching(vcpu); 1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); 1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1245 } 1245 }
1246 1246
1247 EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247 EXPORT_SYMBOL_GPL(kvm_write_tsc);
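
A stripped-down restatement of the matching rule above, assuming the caller has already reduced the write delta (minus elapsed host time) to microseconds. tsc_write_matches() is a hypothetical helper that only makes the decision explicit; it does not mirror the locking or generation bookkeeping.

#include <stdbool.h>
#include <stdint.h>

#define USEC_PER_SEC 1000000LL

/* Treat the write as a synchronization attempt only if it lands within
 * one second of virtual time and the virtual TSC frequency is unchanged. */
static bool tsc_write_matches(int64_t usdiff, uint32_t this_khz,
                              uint32_t last_khz)
{
        if (usdiff < 0)
                usdiff = -usdiff;
        return usdiff < USEC_PER_SEC && this_khz == last_khz;
}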
1248 1248
1249 #ifdef CONFIG_X86_64 1249 #ifdef CONFIG_X86_64
1250 1250
1251 static cycle_t read_tsc(void) 1251 static cycle_t read_tsc(void)
1252 { 1252 {
1253 cycle_t ret; 1253 cycle_t ret;
1254 u64 last; 1254 u64 last;
1255 1255
1256 /* 1256 /*
1257 * Empirically, a fence (of type that depends on the CPU) 1257 * Empirically, a fence (of type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered 1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear 1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads, 1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen. 1261 * but no one has ever seen it happen.
1262 */ 1262 */
1263 rdtsc_barrier(); 1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles(); 1264 ret = (cycle_t)vget_cycles();
1265 1265
1266 last = pvclock_gtod_data.clock.cycle_last; 1266 last = pvclock_gtod_data.clock.cycle_last;
1267 1267
1268 if (likely(ret >= last)) 1268 if (likely(ret >= last))
1269 return ret; 1269 return ret;
1270 1270
1271 /* 1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely 1272 * GCC likes to generate cmov here, but this branch is extremely
1273 * predictable (it's just a function of time and the likely is 1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC 1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because 1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function 1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code. 1277 * ever gets inlined it will generate worse code.
1278 */ 1278 */
1279 asm volatile (""); 1279 asm volatile ("");
1280 return last; 1280 return last;
1281 } 1281 }
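
The essential part of the function above, minus the barrier and the code-generation concern, is a clamp against cycle_last so that a slightly lagging CPU never reports a cycle count older than the last published one. A minimal sketch; clamp_to_cycle_last() is a made-up name.

#include <stdint.h>

static uint64_t clamp_to_cycle_last(uint64_t now, uint64_t cycle_last)
{
        return now >= cycle_last ? now : cycle_last;
}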
1282 1282
1283 static inline u64 vgettsc(cycle_t *cycle_now) 1283 static inline u64 vgettsc(cycle_t *cycle_now)
1284 { 1284 {
1285 long v; 1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287 1287
1288 *cycle_now = read_tsc(); 1288 *cycle_now = read_tsc();
1289 1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; 1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult; 1291 return v * gtod->clock.mult;
1292 } 1292 }
1293 1293
1294 static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) 1294 static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295 { 1295 {
1296 unsigned long seq; 1296 unsigned long seq;
1297 u64 ns; 1297 u64 ns;
1298 int mode; 1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300 1300
1301 ts->tv_nsec = 0; 1301 ts->tv_nsec = 0;
1302 do { 1302 do {
1303 seq = read_seqcount_begin(&gtod->seq); 1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode; 1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec; 1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec; 1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now); 1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift; 1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns); 1310 timespec_add_ns(ts, ns);
1311 1311
1312 return mode; 1312 return mode;
1313 } 1313 }
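
do_monotonic() is a standard seqcount reader against the copy filled in by update_pvclock_gtod(): snapshot the fields and retry if the sequence number was odd or changed. A generic userspace sketch of that retry loop, using C11 atomics in place of the kernel's seqcount API; struct clock_copy and read_consistent() are invented for the sketch, and the plain struct copy is the racy part that the retry masks (the kernel's primitives handle the ordering properly).

#include <stdatomic.h>
#include <stdint.h>

struct snapshot {
        uint64_t sec;
        uint64_t nsec;
};

struct clock_copy {
        _Atomic unsigned int seq;       /* odd while a writer is updating */
        struct snapshot snap;
};

static struct snapshot read_consistent(const struct clock_copy *c)
{
        struct snapshot s;
        unsigned int start;

        do {
                start = atomic_load_explicit(&c->seq, memory_order_acquire);
                s = c->snap;
                atomic_thread_fence(memory_order_acquire);
        } while ((start & 1) ||
                 start != atomic_load_explicit(&c->seq, memory_order_relaxed));
        return s;
}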
1314 1314
1315 /* returns true if host is using tsc clocksource */ 1315 /* returns true if host is using tsc clocksource */
1316 static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) 1316 static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317 { 1317 {
1318 struct timespec ts; 1318 struct timespec ts;
1319 1319
1320 /* checked again under seqlock below */ 1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) 1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false; 1322 return false;
1323 1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) 1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false; 1325 return false;
1326 1326
1327 monotonic_to_bootbased(&ts); 1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts); 1328 *kernel_ns = timespec_to_ns(&ts);
1329 1329
1330 return true; 1330 return true;
1331 } 1331 }
1332 #endif 1332 #endif
1333 1333
1334 /* 1334 /*
1335 * 1335 *
1336 * Assuming a stable TSC across physical CPUs, and a stable TSC 1336 * Assuming a stable TSC across physical CPUs, and a stable TSC
1337 * across virtual CPUs, the following condition is possible. 1337 * across virtual CPUs, the following condition is possible.
1338 * Each numbered line represents an event visible to both 1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event. 1339 * CPUs at the next numbered event.
1340 * 1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents 1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value. 1342 * RDTSC value.
1343 * 1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1 1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 * 1345 *
1346 * 1. read timespec0,tsc0 1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N 1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M 1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest 1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) | 1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1) 1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) 1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 * 1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: 1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 * 1355 *
1356 * - ret0 < ret1 1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) 1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ... 1358 * ...
1359 * - 0 < N - M => M < N 1359 * - 0 < N - M => M < N
1360 * 1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not 1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances 1362 * always the case (the difference between two distinct xtime instances
1363 * might be smaller than the difference between corresponding TSC reads, 1363 * might be smaller than the difference between corresponding TSC reads,
1364 * when updating guest vcpus pvclock areas). 1364 * when updating guest vcpus pvclock areas).
1365 * 1365 *
1366 * To avoid that problem, do not allow visibility of distinct 1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master 1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy 1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep. 1369 * in lockstep.
1370 * 1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity. 1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 * 1372 *
1373 */ 1373 */
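
A quick numeric instance of the derivation above, with made-up values: if the host monotonic delta is N = 10 us but the TSC delta M converts to 12 us, VCPU1 computes a time 2 us earlier than the value VCPU0 already exposed.

#include <stdio.h>

int main(void)
{
        /* hypothetical values, all in microseconds; timespec0 = 0 */
        long long N = 10, M = 12, rdtsc_minus_tsc0 = 100;
        long long ret0 = rdtsc_minus_tsc0;
        long long ret1 = N + (rdtsc_minus_tsc0 - M);
        printf("ret0=%lld ret1=%lld -> not monotonic when M > N\n", ret0, ret1);
        return 0;
}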
1374 1374
1375 static void pvclock_update_vm_gtod_copy(struct kvm *kvm) 1375 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376 { 1376 {
1377 #ifdef CONFIG_X86_64 1377 #ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch; 1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode; 1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched; 1380 bool host_tsc_clocksource, vcpus_matched;
1381 1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus)); 1383 atomic_read(&kvm->online_vcpus));
1384 1384
1385 /* 1385 /*
1386 * If the host uses TSC clock, then passthrough TSC as stable 1386 * If the host uses TSC clock, then passthrough TSC as stable
1387 * to the guest. 1387 * to the guest.
1388 */ 1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread( 1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns, 1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now); 1391 &ka->master_cycle_now);
1392 1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched; 1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394 1394
1395 if (ka->use_master_clock) 1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1); 1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397 1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode; 1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, 1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched); 1400 vcpus_matched);
1401 #endif 1401 #endif
1402 } 1402 }
1403 1403
1404 static int kvm_guest_time_update(struct kvm_vcpu *v) 1404 static int kvm_guest_time_update(struct kvm_vcpu *v)
1405 { 1405 {
1406 unsigned long flags, this_tsc_khz; 1406 unsigned long flags, this_tsc_khz;
1407 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch; 1408 struct kvm_arch *ka = &v->kvm->arch;
1409 void *shared_kaddr; 1409 void *shared_kaddr;
1410 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1411 u64 tsc_timestamp, host_tsc; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock; 1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1413 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock; 1414 bool use_master_clock;
1415 1415
1416 kernel_ns = 0; 1416 kernel_ns = 0;
1417 host_tsc = 0; 1417 host_tsc = 0;
1418 1418
1419 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1420 local_irq_save(flags); 1420 local_irq_save(flags);
1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1422 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1423 local_irq_restore(flags); 1423 local_irq_restore(flags);
1424 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1424 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1425 return 1; 1425 return 1;
1426 } 1426 }
1427 1427
1428 /* 1428 /*
1429 * If the host uses TSC clock, then passthrough TSC as stable 1429 * If the host uses TSC clock, then passthrough TSC as stable
1430 * to the guest. 1430 * to the guest.
1431 */ 1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock); 1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock; 1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) { 1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now; 1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns; 1436 kernel_ns = ka->master_kernel_ns;
1437 } 1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock); 1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) { 1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc(); 1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns(); 1441 kernel_ns = get_kernel_ns();
1442 } 1442 }
1443 1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); 1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445 1445
1446 /* 1446 /*
1447 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1448 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1449 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
1450 * 2) Broken TSC compensation resets the base at each VCPU 1450 * 2) Broken TSC compensation resets the base at each VCPU
1451 * entry to avoid unknown leaps of TSC even when running 1451 * entry to avoid unknown leaps of TSC even when running
1452 * again on the same CPU. This may cause apparent elapsed 1452 * again on the same CPU. This may cause apparent elapsed
1453 * time to disappear, and the guest to stand still or run 1453 * time to disappear, and the guest to stand still or run
1454 * very slowly. 1454 * very slowly.
1455 */ 1455 */
1456 if (vcpu->tsc_catchup) { 1456 if (vcpu->tsc_catchup) {
1457 u64 tsc = compute_guest_tsc(v, kernel_ns); 1457 u64 tsc = compute_guest_tsc(v, kernel_ns);
1458 if (tsc > tsc_timestamp) { 1458 if (tsc > tsc_timestamp) {
1459 adjust_tsc_offset_guest(v, tsc - tsc_timestamp); 1459 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1460 tsc_timestamp = tsc; 1460 tsc_timestamp = tsc;
1461 } 1461 }
1462 } 1462 }
1463 1463
1464 local_irq_restore(flags); 1464 local_irq_restore(flags);
1465 1465
1466 if (!vcpu->time_page) 1466 if (!vcpu->time_page)
1467 return 0; 1467 return 0;
1468 1468
1469 /* 1469 /*
1470 * Time as measured by the TSC may go backwards when resetting the base 1470 * Time as measured by the TSC may go backwards when resetting the base
1471 * tsc_timestamp. The reason for this is that the TSC resolution is 1471 * tsc_timestamp. The reason for this is that the TSC resolution is
1472 * higher than the resolution of the other clock scales. Thus, many 1472 * higher than the resolution of the other clock scales. Thus, many
1473 * possible measurements of the TSC correspond to one measurement of any 1473 * possible measurements of the TSC correspond to one measurement of any
1474 * other clock, and so a spread of values is possible. This is not a 1474 * other clock, and so a spread of values is possible. This is not a
1475 * problem for the computation of the nanosecond clock; with TSC rates 1475 * problem for the computation of the nanosecond clock; with TSC rates
1476 * around 1GHz, there can only be a few cycles which correspond to one 1476 * around 1GHz, there can only be a few cycles which correspond to one
1477 * nanosecond value, and any path through this code will inevitably 1477 * nanosecond value, and any path through this code will inevitably
1478 * take longer than that. However, with the kernel_ns value itself, 1478 * take longer than that. However, with the kernel_ns value itself,
1479 * the precision may be much lower, down to HZ granularity. If the 1479 * the precision may be much lower, down to HZ granularity. If the
1480 * first sampling of TSC against kernel_ns ends in the low part of the 1480 * first sampling of TSC against kernel_ns ends in the low part of the
1481 * range, and the second in the high end of the range, we can get: 1481 * range, and the second in the high end of the range, we can get:
1482 * 1482 *
1483 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new 1483 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1484 * 1484 *
1485 * As the sampling errors potentially range in the thousands of cycles, 1485 * As the sampling errors potentially range in the thousands of cycles,
1486 * it is possible such a time value has already been observed by the 1486 * it is possible such a time value has already been observed by the
1487 * guest. To protect against this, we must compute the system time as 1487 * guest. To protect against this, we must compute the system time as
1488 * observed by the guest and ensure the new system time is greater. 1488 * observed by the guest and ensure the new system time is greater.
1489 */ 1489 */
1490 max_kernel_ns = 0; 1490 max_kernel_ns = 0;
1491 if (vcpu->hv_clock.tsc_timestamp) { 1491 if (vcpu->hv_clock.tsc_timestamp) {
1492 max_kernel_ns = vcpu->last_guest_tsc - 1492 max_kernel_ns = vcpu->last_guest_tsc -
1493 vcpu->hv_clock.tsc_timestamp; 1493 vcpu->hv_clock.tsc_timestamp;
1494 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1494 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1495 vcpu->hv_clock.tsc_to_system_mul, 1495 vcpu->hv_clock.tsc_to_system_mul,
1496 vcpu->hv_clock.tsc_shift); 1496 vcpu->hv_clock.tsc_shift);
1497 max_kernel_ns += vcpu->last_kernel_ns; 1497 max_kernel_ns += vcpu->last_kernel_ns;
1498 } 1498 }
1499 1499
1500 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1500 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1501 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1501 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1502 &vcpu->hv_clock.tsc_shift, 1502 &vcpu->hv_clock.tsc_shift,
1503 &vcpu->hv_clock.tsc_to_system_mul); 1503 &vcpu->hv_clock.tsc_to_system_mul);
1504 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1505 } 1505 }
1506 1506
1507 /* with a master <monotonic time, tsc value> tuple, 1507 /* with a master <monotonic time, tsc value> tuple,
1508 * pvclock clock reads always increase at the (scaled) rate 1508 * pvclock clock reads always increase at the (scaled) rate
1509 * of guest TSC - no need to deal with sampling errors. 1509 * of guest TSC - no need to deal with sampling errors.
1510 */ 1510 */
1511 if (!use_master_clock) { 1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns) 1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns; 1513 kernel_ns = max_kernel_ns;
1514 } 1514 }
1515 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1518 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1519 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1520 1520
1521 /* 1521 /*
1522 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1523 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
1524 * state, we just increase by 2 at the end. 1524 * state, we just increase by 2 at the end.
1525 */ 1525 */
1526 vcpu->hv_clock.version += 2; 1526 vcpu->hv_clock.version += 2;
1527 1527
1528 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1529 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset; 1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531 1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); 1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534 1534
1535 if (vcpu->pvclock_set_guest_stopped_request) { 1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED; 1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false; 1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 } 1538 }
1539 1539
1540 /* If the host uses TSC clocksource, then it is stable */ 1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock) 1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543 1543
1544 vcpu->hv_clock.flags = pvclock_flags; 1544 vcpu->hv_clock.flags = pvclock_flags;
1545 1545
1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1547 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1548 1548
1549 kunmap_atomic(shared_kaddr); 1549 kunmap_atomic(shared_kaddr);
1550 1550
1551 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1551 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
1552 return 0; 1552 return 0;
1553 } 1553 }
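
The version handling above relies on the pvclock convention that an odd version means an update is in progress and an even version means the data is stable; this function can simply add 2 because the guest never observes the intermediate state of the kmap'd copy. A sketch of the general publication protocol follows, with struct time_info and publish_time() as hypothetical stand-ins for pvclock_vcpu_time_info and the barriers only noted in comments.

#include <stdint.h>

/* Hypothetical stand-in for the guest-visible time structure. */
struct time_info {
        uint32_t version;       /* odd: update in progress, even: stable */
        uint64_t tsc_timestamp;
        uint64_t system_time;
};

static void publish_time(struct time_info *shared, const struct time_info *src)
{
        shared->version++;              /* now odd */
        /* writer barrier here (smp_wmb() in the kernel) */
        shared->tsc_timestamp = src->tsc_timestamp;
        shared->system_time = src->system_time;
        /* writer barrier here (smp_wmb() in the kernel) */
        shared->version++;              /* even again: update complete */
}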
1554 1554
1555 static bool msr_mtrr_valid(unsigned msr) 1555 static bool msr_mtrr_valid(unsigned msr)
1556 { 1556 {
1557 switch (msr) { 1557 switch (msr) {
1558 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: 1558 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1559 case MSR_MTRRfix64K_00000: 1559 case MSR_MTRRfix64K_00000:
1560 case MSR_MTRRfix16K_80000: 1560 case MSR_MTRRfix16K_80000:
1561 case MSR_MTRRfix16K_A0000: 1561 case MSR_MTRRfix16K_A0000:
1562 case MSR_MTRRfix4K_C0000: 1562 case MSR_MTRRfix4K_C0000:
1563 case MSR_MTRRfix4K_C8000: 1563 case MSR_MTRRfix4K_C8000:
1564 case MSR_MTRRfix4K_D0000: 1564 case MSR_MTRRfix4K_D0000:
1565 case MSR_MTRRfix4K_D8000: 1565 case MSR_MTRRfix4K_D8000:
1566 case MSR_MTRRfix4K_E0000: 1566 case MSR_MTRRfix4K_E0000:
1567 case MSR_MTRRfix4K_E8000: 1567 case MSR_MTRRfix4K_E8000:
1568 case MSR_MTRRfix4K_F0000: 1568 case MSR_MTRRfix4K_F0000:
1569 case MSR_MTRRfix4K_F8000: 1569 case MSR_MTRRfix4K_F8000:
1570 case MSR_MTRRdefType: 1570 case MSR_MTRRdefType:
1571 case MSR_IA32_CR_PAT: 1571 case MSR_IA32_CR_PAT:
1572 return true; 1572 return true;
1573 case 0x2f8: 1573 case 0x2f8:
1574 return true; 1574 return true;
1575 } 1575 }
1576 return false; 1576 return false;
1577 } 1577 }
1578 1578
1579 static bool valid_pat_type(unsigned t) 1579 static bool valid_pat_type(unsigned t)
1580 { 1580 {
1581 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 1581 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1582 } 1582 }
1583 1583
1584 static bool valid_mtrr_type(unsigned t) 1584 static bool valid_mtrr_type(unsigned t)
1585 { 1585 {
1586 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 1586 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1587 } 1587 }
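
valid_pat_type() and valid_mtrr_type() use a constant as a small membership set: bit t is set exactly when memory type t is allowed (0xf3 encodes {0, 1, 4, 5, 6, 7}; 0x73 encodes {0, 1, 4, 5, 6}). The idiom in isolation; in_set() is a made-up helper.

#include <stdbool.h>
#include <stdio.h>

/* allowed_mask has one bit per permitted value below limit */
static bool in_set(unsigned int v, unsigned int allowed_mask, unsigned int limit)
{
        return v < limit && ((1u << v) & allowed_mask);
}

int main(void)
{
        printf("%d %d\n", in_set(4, 0xf3, 8), in_set(2, 0xf3, 8)); /* 1 0 */
        return 0;
}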
1588 1588
1589 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1589 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1590 { 1590 {
1591 int i; 1591 int i;
1592 1592
1593 if (!msr_mtrr_valid(msr)) 1593 if (!msr_mtrr_valid(msr))
1594 return false; 1594 return false;
1595 1595
1596 if (msr == MSR_IA32_CR_PAT) { 1596 if (msr == MSR_IA32_CR_PAT) {
1597 for (i = 0; i < 8; i++) 1597 for (i = 0; i < 8; i++)
1598 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 1598 if (!valid_pat_type((data >> (i * 8)) & 0xff))
1599 return false; 1599 return false;
1600 return true; 1600 return true;
1601 } else if (msr == MSR_MTRRdefType) { 1601 } else if (msr == MSR_MTRRdefType) {
1602 if (data & ~0xcff) 1602 if (data & ~0xcff)
1603 return false; 1603 return false;
1604 return valid_mtrr_type(data & 0xff); 1604 return valid_mtrr_type(data & 0xff);
1605 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 1605 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1606 for (i = 0; i < 8 ; i++) 1606 for (i = 0; i < 8 ; i++)
1607 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 1607 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1608 return false; 1608 return false;
1609 return true; 1609 return true;
1610 } 1610 }
1611 1611
1612 /* variable MTRRs */ 1612 /* variable MTRRs */
1613 return valid_mtrr_type(data & 0xff); 1613 return valid_mtrr_type(data & 0xff);
1614 } 1614 }
1615 1615
1616 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1616 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1617 { 1617 {
1618 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1618 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1619 1619
1620 if (!mtrr_valid(vcpu, msr, data)) 1620 if (!mtrr_valid(vcpu, msr, data))
1621 return 1; 1621 return 1;
1622 1622
1623 if (msr == MSR_MTRRdefType) { 1623 if (msr == MSR_MTRRdefType) {
1624 vcpu->arch.mtrr_state.def_type = data; 1624 vcpu->arch.mtrr_state.def_type = data;
1625 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 1625 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1626 } else if (msr == MSR_MTRRfix64K_00000) 1626 } else if (msr == MSR_MTRRfix64K_00000)
1627 p[0] = data; 1627 p[0] = data;
1628 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1628 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1629 p[1 + msr - MSR_MTRRfix16K_80000] = data; 1629 p[1 + msr - MSR_MTRRfix16K_80000] = data;
1630 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1630 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1631 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 1631 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1632 else if (msr == MSR_IA32_CR_PAT) 1632 else if (msr == MSR_IA32_CR_PAT)
1633 vcpu->arch.pat = data; 1633 vcpu->arch.pat = data;
1634 else { /* Variable MTRRs */ 1634 else { /* Variable MTRRs */
1635 int idx, is_mtrr_mask; 1635 int idx, is_mtrr_mask;
1636 u64 *pt; 1636 u64 *pt;
1637 1637
1638 idx = (msr - 0x200) / 2; 1638 idx = (msr - 0x200) / 2;
1639 is_mtrr_mask = msr - 0x200 - 2 * idx; 1639 is_mtrr_mask = msr - 0x200 - 2 * idx;
1640 if (!is_mtrr_mask) 1640 if (!is_mtrr_mask)
1641 pt = 1641 pt =
1642 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1642 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1643 else 1643 else
1644 pt = 1644 pt =
1645 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1645 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1646 *pt = data; 1646 *pt = data;
1647 } 1647 }
1648 1648
1649 kvm_mmu_reset_context(vcpu); 1649 kvm_mmu_reset_context(vcpu);
1650 return 0; 1650 return 0;
1651 } 1651 }
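
The variable-MTRR branch above relies on the MSR layout: base/mask registers alternate starting at 0x200, so (msr - 0x200) / 2 is the range index and the low bit selects base versus mask. A tiny decode sketch; decode_var_mtrr() and the MTRR_PHYSBASE0 macro are local to the sketch (0x200 is MTRRphysBase0).

#include <stdint.h>
#include <stdio.h>

#define MTRR_PHYSBASE0 0x200u

static void decode_var_mtrr(uint32_t msr, unsigned int *idx, int *is_mask)
{
        *idx = (msr - MTRR_PHYSBASE0) / 2;
        *is_mask = (msr - MTRR_PHYSBASE0) & 1;
}

int main(void)
{
        unsigned int idx;
        int is_mask;

        decode_var_mtrr(0x203, &idx, &is_mask);
        printf("idx=%u is_mask=%d\n", idx, is_mask);    /* idx=1 is_mask=1 */
        return 0;
}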
1652 1652
1653 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1653 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1654 { 1654 {
1655 u64 mcg_cap = vcpu->arch.mcg_cap; 1655 u64 mcg_cap = vcpu->arch.mcg_cap;
1656 unsigned bank_num = mcg_cap & 0xff; 1656 unsigned bank_num = mcg_cap & 0xff;
1657 1657
1658 switch (msr) { 1658 switch (msr) {
1659 case MSR_IA32_MCG_STATUS: 1659 case MSR_IA32_MCG_STATUS:
1660 vcpu->arch.mcg_status = data; 1660 vcpu->arch.mcg_status = data;
1661 break; 1661 break;
1662 case MSR_IA32_MCG_CTL: 1662 case MSR_IA32_MCG_CTL:
1663 if (!(mcg_cap & MCG_CTL_P)) 1663 if (!(mcg_cap & MCG_CTL_P))
1664 return 1; 1664 return 1;
1665 if (data != 0 && data != ~(u64)0) 1665 if (data != 0 && data != ~(u64)0)
1666 return -1; 1666 return -1;
1667 vcpu->arch.mcg_ctl = data; 1667 vcpu->arch.mcg_ctl = data;
1668 break; 1668 break;
1669 default: 1669 default:
1670 if (msr >= MSR_IA32_MC0_CTL && 1670 if (msr >= MSR_IA32_MC0_CTL &&
1671 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1671 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1672 u32 offset = msr - MSR_IA32_MC0_CTL; 1672 u32 offset = msr - MSR_IA32_MC0_CTL;
1673 /* only 0 or all 1s can be written to IA32_MCi_CTL 1673 /* only 0 or all 1s can be written to IA32_MCi_CTL
1674 * some Linux kernels though clear bit 10 in bank 4 to 1674 * some Linux kernels though clear bit 10 in bank 4 to
1675 * work around a BIOS/GART TBL issue on AMD K8s, ignore 1675 * work around a BIOS/GART TBL issue on AMD K8s, ignore
1676 * this to avoid an uncaught #GP in the guest 1676 * this to avoid an uncaught #GP in the guest
1677 */ 1677 */
1678 if ((offset & 0x3) == 0 && 1678 if ((offset & 0x3) == 0 &&
1679 data != 0 && (data | (1 << 10)) != ~(u64)0) 1679 data != 0 && (data | (1 << 10)) != ~(u64)0)
1680 return -1; 1680 return -1;
1681 vcpu->arch.mce_banks[offset] = data; 1681 vcpu->arch.mce_banks[offset] = data;
1682 break; 1682 break;
1683 } 1683 }
1684 return 1; 1684 return 1;
1685 } 1685 }
1686 return 0; 1686 return 0;
1687 } 1687 }
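
In isolation, the IA32_MCi_CTL value check above accepts only 0 or all-ones, with bit 10 treated as don't-care for the AMD K8 GART workaround mentioned in the comment. mci_ctl_value_ok() is an invented name for this sketch.

#include <stdbool.h>
#include <stdint.h>

static bool mci_ctl_value_ok(uint64_t data)
{
        return data == 0 || (data | (1ull << 10)) == ~(uint64_t)0;
}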
1688 1688
1689 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 1689 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1690 { 1690 {
1691 struct kvm *kvm = vcpu->kvm; 1691 struct kvm *kvm = vcpu->kvm;
1692 int lm = is_long_mode(vcpu); 1692 int lm = is_long_mode(vcpu);
1693 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 1693 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1694 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 1694 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1695 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 1695 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1696 : kvm->arch.xen_hvm_config.blob_size_32; 1696 : kvm->arch.xen_hvm_config.blob_size_32;
1697 u32 page_num = data & ~PAGE_MASK; 1697 u32 page_num = data & ~PAGE_MASK;
1698 u64 page_addr = data & PAGE_MASK; 1698 u64 page_addr = data & PAGE_MASK;
1699 u8 *page; 1699 u8 *page;
1700 int r; 1700 int r;
1701 1701
1702 r = -E2BIG; 1702 r = -E2BIG;
1703 if (page_num >= blob_size) 1703 if (page_num >= blob_size)
1704 goto out; 1704 goto out;
1705 r = -ENOMEM; 1705 r = -ENOMEM;
1706 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); 1706 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1707 if (IS_ERR(page)) { 1707 if (IS_ERR(page)) {
1708 r = PTR_ERR(page); 1708 r = PTR_ERR(page);
1709 goto out; 1709 goto out;
1710 } 1710 }
1711 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1711 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1712 goto out_free; 1712 goto out_free;
1713 r = 0; 1713 r = 0;
1714 out_free: 1714 out_free:
1715 kfree(page); 1715 kfree(page);
1716 out: 1716 out:
1717 return r; 1717 return r;
1718 } 1718 }
1719 1719
1720 static bool kvm_hv_hypercall_enabled(struct kvm *kvm) 1720 static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1721 { 1721 {
1722 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; 1722 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1723 } 1723 }
1724 1724
1725 static bool kvm_hv_msr_partition_wide(u32 msr) 1725 static bool kvm_hv_msr_partition_wide(u32 msr)
1726 { 1726 {
1727 bool r = false; 1727 bool r = false;
1728 switch (msr) { 1728 switch (msr) {
1729 case HV_X64_MSR_GUEST_OS_ID: 1729 case HV_X64_MSR_GUEST_OS_ID:
1730 case HV_X64_MSR_HYPERCALL: 1730 case HV_X64_MSR_HYPERCALL:
1731 r = true; 1731 r = true;
1732 break; 1732 break;
1733 } 1733 }
1734 1734
1735 return r; 1735 return r;
1736 } 1736 }
1737 1737
1738 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1738 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1739 { 1739 {
1740 struct kvm *kvm = vcpu->kvm; 1740 struct kvm *kvm = vcpu->kvm;
1741 1741
1742 switch (msr) { 1742 switch (msr) {
1743 case HV_X64_MSR_GUEST_OS_ID: 1743 case HV_X64_MSR_GUEST_OS_ID:
1744 kvm->arch.hv_guest_os_id = data; 1744 kvm->arch.hv_guest_os_id = data;
1745 /* setting guest os id to zero disables hypercall page */ 1745 /* setting guest os id to zero disables hypercall page */
1746 if (!kvm->arch.hv_guest_os_id) 1746 if (!kvm->arch.hv_guest_os_id)
1747 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; 1747 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1748 break; 1748 break;
1749 case HV_X64_MSR_HYPERCALL: { 1749 case HV_X64_MSR_HYPERCALL: {
1750 u64 gfn; 1750 u64 gfn;
1751 unsigned long addr; 1751 unsigned long addr;
1752 u8 instructions[4]; 1752 u8 instructions[4];
1753 1753
1754 /* if guest os id is not set hypercall should remain disabled */ 1754 /* if guest os id is not set hypercall should remain disabled */
1755 if (!kvm->arch.hv_guest_os_id) 1755 if (!kvm->arch.hv_guest_os_id)
1756 break; 1756 break;
1757 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { 1757 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1758 kvm->arch.hv_hypercall = data; 1758 kvm->arch.hv_hypercall = data;
1759 break; 1759 break;
1760 } 1760 }
1761 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; 1761 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1762 addr = gfn_to_hva(kvm, gfn); 1762 addr = gfn_to_hva(kvm, gfn);
1763 if (kvm_is_error_hva(addr)) 1763 if (kvm_is_error_hva(addr))
1764 return 1; 1764 return 1;
1765 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1765 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1766 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1766 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1767 if (__copy_to_user((void __user *)addr, instructions, 4)) 1767 if (__copy_to_user((void __user *)addr, instructions, 4))
1768 return 1; 1768 return 1;
1769 kvm->arch.hv_hypercall = data; 1769 kvm->arch.hv_hypercall = data;
1770 break; 1770 break;
1771 } 1771 }
1772 default: 1772 default:
1773 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1773 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1774 "data 0x%llx\n", msr, data); 1774 "data 0x%llx\n", msr, data);
1775 return 1; 1775 return 1;
1776 } 1776 }
1777 return 0; 1777 return 0;
1778 } 1778 }
1779 1779
1780 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1780 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1781 { 1781 {
1782 switch (msr) { 1782 switch (msr) {
1783 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1783 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1784 unsigned long addr; 1784 unsigned long addr;
1785 1785
1786 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1786 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1787 vcpu->arch.hv_vapic = data; 1787 vcpu->arch.hv_vapic = data;
1788 break; 1788 break;
1789 } 1789 }
1790 addr = gfn_to_hva(vcpu->kvm, data >> 1790 addr = gfn_to_hva(vcpu->kvm, data >>
1791 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1791 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1792 if (kvm_is_error_hva(addr)) 1792 if (kvm_is_error_hva(addr))
1793 return 1; 1793 return 1;
1794 if (__clear_user((void __user *)addr, PAGE_SIZE)) 1794 if (__clear_user((void __user *)addr, PAGE_SIZE))
1795 return 1; 1795 return 1;
1796 vcpu->arch.hv_vapic = data; 1796 vcpu->arch.hv_vapic = data;
1797 break; 1797 break;
1798 } 1798 }
1799 case HV_X64_MSR_EOI: 1799 case HV_X64_MSR_EOI:
1800 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); 1800 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1801 case HV_X64_MSR_ICR: 1801 case HV_X64_MSR_ICR:
1802 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 1802 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1803 case HV_X64_MSR_TPR: 1803 case HV_X64_MSR_TPR:
1804 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1804 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1805 default: 1805 default:
1806 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1806 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1807 "data 0x%llx\n", msr, data); 1807 "data 0x%llx\n", msr, data);
1808 return 1; 1808 return 1;
1809 } 1809 }
1810 1810
1811 return 0; 1811 return 0;
1812 } 1812 }
1813 1813
1814 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) 1814 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1815 { 1815 {
1816 gpa_t gpa = data & ~0x3f; 1816 gpa_t gpa = data & ~0x3f;
1817 1817
1818 /* Bits 2:5 are reserved, should be zero */ 1818 /* Bits 2:5 are reserved, should be zero */
1819 if (data & 0x3c) 1819 if (data & 0x3c)
1820 return 1; 1820 return 1;
1821 1821
1822 vcpu->arch.apf.msr_val = data; 1822 vcpu->arch.apf.msr_val = data;
1823 1823
1824 if (!(data & KVM_ASYNC_PF_ENABLED)) { 1824 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1825 kvm_clear_async_pf_completion_queue(vcpu); 1825 kvm_clear_async_pf_completion_queue(vcpu);
1826 kvm_async_pf_hash_reset(vcpu); 1826 kvm_async_pf_hash_reset(vcpu);
1827 return 0; 1827 return 0;
1828 } 1828 }
1829 1829
1830 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) 1830 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1831 return 1; 1831 return 1;
1832 1832
1833 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); 1833 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1834 kvm_async_pf_wakeup_all(vcpu); 1834 kvm_async_pf_wakeup_all(vcpu);
1835 return 0; 1835 return 0;
1836 } 1836 }
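
The async-PF MSR layout implied above: bit 0 enables the feature, bit 1 is the send-always flag, bits 2-5 are reserved, and the remaining high bits form the 64-byte-aligned GPA of the shared data area. A sketch of that split; the APF_* macros and parse_async_pf_msr() are invented for illustration (the kernel's flag names are KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS).

#include <stdbool.h>
#include <stdint.h>

#define APF_ENABLED     (1ull << 0)
#define APF_SEND_ALWAYS (1ull << 1)
#define APF_RESERVED    0x3cull         /* bits 2..5 */

static bool parse_async_pf_msr(uint64_t data, uint64_t *gpa,
                               bool *enabled, bool *send_always)
{
        if (data & APF_RESERVED)
                return false;
        *gpa = data & ~0x3full;
        *enabled = data & APF_ENABLED;
        *send_always = data & APF_SEND_ALWAYS;
        return true;
}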
1837 1837
1838 static void kvmclock_reset(struct kvm_vcpu *vcpu) 1838 static void kvmclock_reset(struct kvm_vcpu *vcpu)
1839 { 1839 {
1840 if (vcpu->arch.time_page) { 1840 if (vcpu->arch.time_page) {
1841 kvm_release_page_dirty(vcpu->arch.time_page); 1841 kvm_release_page_dirty(vcpu->arch.time_page);
1842 vcpu->arch.time_page = NULL; 1842 vcpu->arch.time_page = NULL;
1843 } 1843 }
1844 } 1844 }
1845 1845
1846 static void accumulate_steal_time(struct kvm_vcpu *vcpu) 1846 static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1847 { 1847 {
1848 u64 delta; 1848 u64 delta;
1849 1849
1850 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 1850 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1851 return; 1851 return;
1852 1852
1853 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; 1853 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1854 vcpu->arch.st.last_steal = current->sched_info.run_delay; 1854 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1855 vcpu->arch.st.accum_steal = delta; 1855 vcpu->arch.st.accum_steal = delta;
1856 } 1856 }
1857 1857
1858 static void record_steal_time(struct kvm_vcpu *vcpu) 1858 static void record_steal_time(struct kvm_vcpu *vcpu)
1859 { 1859 {
1860 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 1860 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1861 return; 1861 return;
1862 1862
1863 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 1863 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1864 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) 1864 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1865 return; 1865 return;
1866 1866
1867 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; 1867 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1868 vcpu->arch.st.steal.version += 2; 1868 vcpu->arch.st.steal.version += 2;
1869 vcpu->arch.st.accum_steal = 0; 1869 vcpu->arch.st.accum_steal = 0;
1870 1870
1871 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 1871 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1873 } 1873 }
1874 1874
1875 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1875 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1876 { 1876 {
1877 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index; 1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data; 1879 u64 data = msr_info->data;
1880 1880
1881 switch (msr) { 1881 switch (msr) {
1882 case MSR_EFER: 1882 case MSR_EFER:
1883 return set_efer(vcpu, data); 1883 return set_efer(vcpu, data);
1884 case MSR_K7_HWCR: 1884 case MSR_K7_HWCR:
1885 data &= ~(u64)0x40; /* ignore flush filter disable */ 1885 data &= ~(u64)0x40; /* ignore flush filter disable */
1886 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1886 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1887 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1887 data &= ~(u64)0x8; /* ignore TLB cache disable */
1888 if (data != 0) { 1888 if (data != 0) {
1889 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1889 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1890 data); 1890 data);
1891 return 1; 1891 return 1;
1892 } 1892 }
1893 break; 1893 break;
1894 case MSR_FAM10H_MMIO_CONF_BASE: 1894 case MSR_FAM10H_MMIO_CONF_BASE:
1895 if (data != 0) { 1895 if (data != 0) {
1896 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1896 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1897 "0x%llx\n", data); 1897 "0x%llx\n", data);
1898 return 1; 1898 return 1;
1899 } 1899 }
1900 break; 1900 break;
1901 case MSR_AMD64_NB_CFG: 1901 case MSR_AMD64_NB_CFG:
1902 break; 1902 break;
1903 case MSR_IA32_DEBUGCTLMSR: 1903 case MSR_IA32_DEBUGCTLMSR:
1904 if (!data) { 1904 if (!data) {
1905 /* We support the non-activated case already */ 1905 /* We support the non-activated case already */
1906 break; 1906 break;
1907 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 1907 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1908 /* Values other than LBR and BTF are vendor-specific, 1908 /* Values other than LBR and BTF are vendor-specific,
1909 thus reserved and should throw a #GP */ 1909 thus reserved and should throw a #GP */
1910 return 1; 1910 return 1;
1911 } 1911 }
1912 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1912 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1913 __func__, data); 1913 __func__, data);
1914 break; 1914 break;
1915 case MSR_IA32_UCODE_REV: 1915 case MSR_IA32_UCODE_REV:
1916 case MSR_IA32_UCODE_WRITE: 1916 case MSR_IA32_UCODE_WRITE:
1917 case MSR_VM_HSAVE_PA: 1917 case MSR_VM_HSAVE_PA:
1918 case MSR_AMD64_PATCH_LOADER: 1918 case MSR_AMD64_PATCH_LOADER:
1919 break; 1919 break;
1920 case 0x200 ... 0x2ff: 1920 case 0x200 ... 0x2ff:
1921 return set_msr_mtrr(vcpu, msr, data); 1921 return set_msr_mtrr(vcpu, msr, data);
1922 case MSR_IA32_APICBASE: 1922 case MSR_IA32_APICBASE:
1923 kvm_set_apic_base(vcpu, data); 1923 kvm_set_apic_base(vcpu, data);
1924 break; 1924 break;
1925 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1925 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1926 return kvm_x2apic_msr_write(vcpu, msr, data); 1926 return kvm_x2apic_msr_write(vcpu, msr, data);
1927 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1928 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1929 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST: 1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) { 1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) { 1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); 1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 } 1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data; 1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 } 1937 }
1938 break; 1938 break;
1939 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1940 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1941 break; 1941 break;
1942 case MSR_KVM_WALL_CLOCK_NEW: 1942 case MSR_KVM_WALL_CLOCK_NEW:
1943 case MSR_KVM_WALL_CLOCK: 1943 case MSR_KVM_WALL_CLOCK:
1944 vcpu->kvm->arch.wall_clock = data; 1944 vcpu->kvm->arch.wall_clock = data;
1945 kvm_write_wall_clock(vcpu->kvm, data); 1945 kvm_write_wall_clock(vcpu->kvm, data);
1946 break; 1946 break;
1947 case MSR_KVM_SYSTEM_TIME_NEW: 1947 case MSR_KVM_SYSTEM_TIME_NEW:
1948 case MSR_KVM_SYSTEM_TIME: { 1948 case MSR_KVM_SYSTEM_TIME: {
1949 kvmclock_reset(vcpu); 1949 kvmclock_reset(vcpu);
1950 1950
1951 vcpu->arch.time = data; 1951 vcpu->arch.time = data;
1952 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1952 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1953 1953
1954 /* we verify if the enable bit is set... */ 1954 /* we verify if the enable bit is set... */
1955 if (!(data & 1)) 1955 if (!(data & 1))
1956 break; 1956 break;
1957 1957
1958 /* ...but clean it before doing the actual write */ 1958 /* ...but clean it before doing the actual write */
1959 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 1959 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1960 1960
1961 vcpu->arch.time_page = 1961 vcpu->arch.time_page =
1962 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1962 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1963 1963
1964 if (is_error_page(vcpu->arch.time_page)) 1964 if (is_error_page(vcpu->arch.time_page))
1965 vcpu->arch.time_page = NULL; 1965 vcpu->arch.time_page = NULL;
1966 1966
1967 break; 1967 break;
1968 } 1968 }
1969 case MSR_KVM_ASYNC_PF_EN: 1969 case MSR_KVM_ASYNC_PF_EN:
1970 if (kvm_pv_enable_async_pf(vcpu, data)) 1970 if (kvm_pv_enable_async_pf(vcpu, data))
1971 return 1; 1971 return 1;
1972 break; 1972 break;
1973 case MSR_KVM_STEAL_TIME: 1973 case MSR_KVM_STEAL_TIME:
1974 1974
1975 if (unlikely(!sched_info_on())) 1975 if (unlikely(!sched_info_on()))
1976 return 1; 1976 return 1;
1977 1977
1978 if (data & KVM_STEAL_RESERVED_MASK) 1978 if (data & KVM_STEAL_RESERVED_MASK)
1979 return 1; 1979 return 1;
1980 1980
1981 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, 1981 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1982 data & KVM_STEAL_VALID_BITS)) 1982 data & KVM_STEAL_VALID_BITS))
1983 return 1; 1983 return 1;
1984 1984
1985 vcpu->arch.st.msr_val = data; 1985 vcpu->arch.st.msr_val = data;
1986 1986
1987 if (!(data & KVM_MSR_ENABLED)) 1987 if (!(data & KVM_MSR_ENABLED))
1988 break; 1988 break;
1989 1989
1990 vcpu->arch.st.last_steal = current->sched_info.run_delay; 1990 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1991 1991
1992 preempt_disable(); 1992 preempt_disable();
1993 accumulate_steal_time(vcpu); 1993 accumulate_steal_time(vcpu);
1994 preempt_enable(); 1994 preempt_enable();
1995 1995
1996 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1996 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1997 1997
1998 break; 1998 break;
1999 case MSR_KVM_PV_EOI_EN: 1999 case MSR_KVM_PV_EOI_EN:
2000 if (kvm_lapic_enable_pv_eoi(vcpu, data)) 2000 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2001 return 1; 2001 return 1;
2002 break; 2002 break;
2003 2003
2004 case MSR_IA32_MCG_CTL: 2004 case MSR_IA32_MCG_CTL:
2005 case MSR_IA32_MCG_STATUS: 2005 case MSR_IA32_MCG_STATUS:
2006 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 2006 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2007 return set_msr_mce(vcpu, msr, data); 2007 return set_msr_mce(vcpu, msr, data);
2008 2008
2009 /* Performance counters are not protected by a CPUID bit, 2009 /* Performance counters are not protected by a CPUID bit,
2010 * so we should check all of them in the generic path for the sake of 2010 * so we should check all of them in the generic path for the sake of
2011 * cross vendor migration. 2011 * cross vendor migration.
2012 * Writing a zero into the event select MSRs disables them, 2012 * Writing a zero into the event select MSRs disables them,
2013 * which we perfectly emulate ;-). Any other value should be at least 2013 * which we perfectly emulate ;-). Any other value should be at least
2014 * reported, some guests depend on them. 2014 * reported, some guests depend on them.
2015 */ 2015 */
2016 case MSR_K7_EVNTSEL0: 2016 case MSR_K7_EVNTSEL0:
2017 case MSR_K7_EVNTSEL1: 2017 case MSR_K7_EVNTSEL1:
2018 case MSR_K7_EVNTSEL2: 2018 case MSR_K7_EVNTSEL2:
2019 case MSR_K7_EVNTSEL3: 2019 case MSR_K7_EVNTSEL3:
2020 if (data != 0) 2020 if (data != 0)
2021 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " 2021 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2022 "0x%x data 0x%llx\n", msr, data); 2022 "0x%x data 0x%llx\n", msr, data);
2023 break; 2023 break;
2024 /* at least RHEL 4 unconditionally writes to the perfctr registers, 2024 /* at least RHEL 4 unconditionally writes to the perfctr registers,
2025 * so we ignore writes to make it happy. 2025 * so we ignore writes to make it happy.
2026 */ 2026 */
2027 case MSR_K7_PERFCTR0: 2027 case MSR_K7_PERFCTR0:
2028 case MSR_K7_PERFCTR1: 2028 case MSR_K7_PERFCTR1:
2029 case MSR_K7_PERFCTR2: 2029 case MSR_K7_PERFCTR2:
2030 case MSR_K7_PERFCTR3: 2030 case MSR_K7_PERFCTR3:
2031 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " 2031 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2032 "0x%x data 0x%llx\n", msr, data); 2032 "0x%x data 0x%llx\n", msr, data);
2033 break; 2033 break;
2034 case MSR_P6_PERFCTR0: 2034 case MSR_P6_PERFCTR0:
2035 case MSR_P6_PERFCTR1: 2035 case MSR_P6_PERFCTR1:
2036 pr = true; 2036 pr = true;
2037 case MSR_P6_EVNTSEL0: 2037 case MSR_P6_EVNTSEL0:
2038 case MSR_P6_EVNTSEL1: 2038 case MSR_P6_EVNTSEL1:
2039 if (kvm_pmu_msr(vcpu, msr)) 2039 if (kvm_pmu_msr(vcpu, msr))
2040 return kvm_pmu_set_msr(vcpu, msr, data); 2040 return kvm_pmu_set_msr(vcpu, msr, data);
2041 2041
2042 if (pr || data != 0) 2042 if (pr || data != 0)
2043 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 2043 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2044 "0x%x data 0x%llx\n", msr, data); 2044 "0x%x data 0x%llx\n", msr, data);
2045 break; 2045 break;
2046 case MSR_K7_CLK_CTL: 2046 case MSR_K7_CLK_CTL:
2047 /* 2047 /*
2048 * Ignore all writes to this no longer documented MSR. 2048 * Ignore all writes to this no longer documented MSR.
2049 * Writes are only relevant for old K7 processors, 2049 * Writes are only relevant for old K7 processors,
2050 * all pre-dating SVM, but a recommended workaround from 2050 * all pre-dating SVM, but a recommended workaround from
2051 * AMD for these chips. It is possible to specify the 2051 * AMD for these chips. It is possible to specify the
2052 * affected processor models on the command line, hence 2052 * affected processor models on the command line, hence
2053 * the need to ignore the workaround. 2053 * the need to ignore the workaround.
2054 */ 2054 */
2055 break; 2055 break;
2056 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 2056 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2057 if (kvm_hv_msr_partition_wide(msr)) { 2057 if (kvm_hv_msr_partition_wide(msr)) {
2058 int r; 2058 int r;
2059 mutex_lock(&vcpu->kvm->lock); 2059 mutex_lock(&vcpu->kvm->lock);
2060 r = set_msr_hyperv_pw(vcpu, msr, data); 2060 r = set_msr_hyperv_pw(vcpu, msr, data);
2061 mutex_unlock(&vcpu->kvm->lock); 2061 mutex_unlock(&vcpu->kvm->lock);
2062 return r; 2062 return r;
2063 } else 2063 } else
2064 return set_msr_hyperv(vcpu, msr, data); 2064 return set_msr_hyperv(vcpu, msr, data);
2065 break; 2065 break;
2066 case MSR_IA32_BBL_CR_CTL3: 2066 case MSR_IA32_BBL_CR_CTL3:
2067 /* Drop writes to this legacy MSR -- see rdmsr 2067 /* Drop writes to this legacy MSR -- see rdmsr
2068 * counterpart for further detail. 2068 * counterpart for further detail.
2069 */ 2069 */
2070 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 2070 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
2071 break; 2071 break;
2072 case MSR_AMD64_OSVW_ID_LENGTH: 2072 case MSR_AMD64_OSVW_ID_LENGTH:
2073 if (!guest_cpuid_has_osvw(vcpu)) 2073 if (!guest_cpuid_has_osvw(vcpu))
2074 return 1; 2074 return 1;
2075 vcpu->arch.osvw.length = data; 2075 vcpu->arch.osvw.length = data;
2076 break; 2076 break;
2077 case MSR_AMD64_OSVW_STATUS: 2077 case MSR_AMD64_OSVW_STATUS:
2078 if (!guest_cpuid_has_osvw(vcpu)) 2078 if (!guest_cpuid_has_osvw(vcpu))
2079 return 1; 2079 return 1;
2080 vcpu->arch.osvw.status = data; 2080 vcpu->arch.osvw.status = data;
2081 break; 2081 break;
2082 default: 2082 default:
2083 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 2083 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2084 return xen_hvm_config(vcpu, data); 2084 return xen_hvm_config(vcpu, data);
2085 if (kvm_pmu_msr(vcpu, msr)) 2085 if (kvm_pmu_msr(vcpu, msr))
2086 return kvm_pmu_set_msr(vcpu, msr, data); 2086 return kvm_pmu_set_msr(vcpu, msr, data);
2087 if (!ignore_msrs) { 2087 if (!ignore_msrs) {
2088 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2088 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2089 msr, data); 2089 msr, data);
2090 return 1; 2090 return 1;
2091 } else { 2091 } else {
2092 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 2092 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
2093 msr, data); 2093 msr, data);
2094 break; 2094 break;
2095 } 2095 }
2096 } 2096 }
2097 return 0; 2097 return 0;
2098 } 2098 }
2099 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 2099 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2100 2100
2101 2101
2102 /* 2102 /*
2103 * Reads an msr value (of 'msr_index') into 'pdata'. 2103 * Reads an msr value (of 'msr_index') into 'pdata'.
2104 * Returns 0 on success, non-0 otherwise. 2104 * Returns 0 on success, non-0 otherwise.
2105 * Assumes vcpu_load() was already called. 2105 * Assumes vcpu_load() was already called.
2106 */ 2106 */
2107 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2107 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2108 { 2108 {
2109 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2109 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2110 } 2110 }
2111 2111
2112 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2112 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2113 { 2113 {
2114 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 2114 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2115 2115
2116 if (!msr_mtrr_valid(msr)) 2116 if (!msr_mtrr_valid(msr))
2117 return 1; 2117 return 1;
2118 2118
2119 if (msr == MSR_MTRRdefType) 2119 if (msr == MSR_MTRRdefType)
2120 *pdata = vcpu->arch.mtrr_state.def_type + 2120 *pdata = vcpu->arch.mtrr_state.def_type +
2121 (vcpu->arch.mtrr_state.enabled << 10); 2121 (vcpu->arch.mtrr_state.enabled << 10);
2122 else if (msr == MSR_MTRRfix64K_00000) 2122 else if (msr == MSR_MTRRfix64K_00000)
2123 *pdata = p[0]; 2123 *pdata = p[0];
2124 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 2124 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2125 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 2125 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2126 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 2126 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2127 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 2127 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2128 else if (msr == MSR_IA32_CR_PAT) 2128 else if (msr == MSR_IA32_CR_PAT)
2129 *pdata = vcpu->arch.pat; 2129 *pdata = vcpu->arch.pat;
2130 else { /* Variable MTRRs */ 2130 else { /* Variable MTRRs */
2131 int idx, is_mtrr_mask; 2131 int idx, is_mtrr_mask;
2132 u64 *pt; 2132 u64 *pt;
2133 2133
2134 idx = (msr - 0x200) / 2; 2134 idx = (msr - 0x200) / 2;
2135 is_mtrr_mask = msr - 0x200 - 2 * idx; 2135 is_mtrr_mask = msr - 0x200 - 2 * idx;
2136 if (!is_mtrr_mask) 2136 if (!is_mtrr_mask)
2137 pt = 2137 pt =
2138 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 2138 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2139 else 2139 else
2140 pt = 2140 pt =
2141 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 2141 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2142 *pdata = *pt; 2142 *pdata = *pt;
2143 } 2143 }
2144 2144
2145 return 0; 2145 return 0;
2146 } 2146 }
2147 2147
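[Editor's note] The variable-MTRR branch of get_msr_mtrr() above relies on MSR 0x200 being MTRRphysBase0 and on base/mask registers alternating. A minimal stand-alone sketch of the same decode (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int msr = 0x205;                  /* MTRRphysMask2 */
	int idx = (msr - 0x200) / 2;               /* variable range index: 2 */
	int is_mtrr_mask = msr - 0x200 - 2 * idx;  /* 0 = base, 1 = mask */

	printf("MSR 0x%x -> var range %d, %s register\n",
	       msr, idx, is_mtrr_mask ? "mask" : "base");
	return 0;
}
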
2148 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2148 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2149 { 2149 {
2150 u64 data; 2150 u64 data;
2151 u64 mcg_cap = vcpu->arch.mcg_cap; 2151 u64 mcg_cap = vcpu->arch.mcg_cap;
2152 unsigned bank_num = mcg_cap & 0xff; 2152 unsigned bank_num = mcg_cap & 0xff;
2153 2153
2154 switch (msr) { 2154 switch (msr) {
2155 case MSR_IA32_P5_MC_ADDR: 2155 case MSR_IA32_P5_MC_ADDR:
2156 case MSR_IA32_P5_MC_TYPE: 2156 case MSR_IA32_P5_MC_TYPE:
2157 data = 0; 2157 data = 0;
2158 break; 2158 break;
2159 case MSR_IA32_MCG_CAP: 2159 case MSR_IA32_MCG_CAP:
2160 data = vcpu->arch.mcg_cap; 2160 data = vcpu->arch.mcg_cap;
2161 break; 2161 break;
2162 case MSR_IA32_MCG_CTL: 2162 case MSR_IA32_MCG_CTL:
2163 if (!(mcg_cap & MCG_CTL_P)) 2163 if (!(mcg_cap & MCG_CTL_P))
2164 return 1; 2164 return 1;
2165 data = vcpu->arch.mcg_ctl; 2165 data = vcpu->arch.mcg_ctl;
2166 break; 2166 break;
2167 case MSR_IA32_MCG_STATUS: 2167 case MSR_IA32_MCG_STATUS:
2168 data = vcpu->arch.mcg_status; 2168 data = vcpu->arch.mcg_status;
2169 break; 2169 break;
2170 default: 2170 default:
2171 if (msr >= MSR_IA32_MC0_CTL && 2171 if (msr >= MSR_IA32_MC0_CTL &&
2172 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 2172 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
2173 u32 offset = msr - MSR_IA32_MC0_CTL; 2173 u32 offset = msr - MSR_IA32_MC0_CTL;
2174 data = vcpu->arch.mce_banks[offset]; 2174 data = vcpu->arch.mce_banks[offset];
2175 break; 2175 break;
2176 } 2176 }
2177 return 1; 2177 return 1;
2178 } 2178 }
2179 *pdata = data; 2179 *pdata = data;
2180 return 0; 2180 return 0;
2181 } 2181 }
2182 2182
2183 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2183 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2184 { 2184 {
2185 u64 data = 0; 2185 u64 data = 0;
2186 struct kvm *kvm = vcpu->kvm; 2186 struct kvm *kvm = vcpu->kvm;
2187 2187
2188 switch (msr) { 2188 switch (msr) {
2189 case HV_X64_MSR_GUEST_OS_ID: 2189 case HV_X64_MSR_GUEST_OS_ID:
2190 data = kvm->arch.hv_guest_os_id; 2190 data = kvm->arch.hv_guest_os_id;
2191 break; 2191 break;
2192 case HV_X64_MSR_HYPERCALL: 2192 case HV_X64_MSR_HYPERCALL:
2193 data = kvm->arch.hv_hypercall; 2193 data = kvm->arch.hv_hypercall;
2194 break; 2194 break;
2195 default: 2195 default:
2196 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2196 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2197 return 1; 2197 return 1;
2198 } 2198 }
2199 2199
2200 *pdata = data; 2200 *pdata = data;
2201 return 0; 2201 return 0;
2202 } 2202 }
2203 2203
2204 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2204 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2205 { 2205 {
2206 u64 data = 0; 2206 u64 data = 0;
2207 2207
2208 switch (msr) { 2208 switch (msr) {
2209 case HV_X64_MSR_VP_INDEX: { 2209 case HV_X64_MSR_VP_INDEX: {
2210 int r; 2210 int r;
2211 struct kvm_vcpu *v; 2211 struct kvm_vcpu *v;
2212 kvm_for_each_vcpu(r, v, vcpu->kvm) 2212 kvm_for_each_vcpu(r, v, vcpu->kvm)
2213 if (v == vcpu) 2213 if (v == vcpu)
2214 data = r; 2214 data = r;
2215 break; 2215 break;
2216 } 2216 }
2217 case HV_X64_MSR_EOI: 2217 case HV_X64_MSR_EOI:
2218 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); 2218 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
2219 case HV_X64_MSR_ICR: 2219 case HV_X64_MSR_ICR:
2220 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 2220 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
2221 case HV_X64_MSR_TPR: 2221 case HV_X64_MSR_TPR:
2222 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 2222 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
2223 case HV_X64_MSR_APIC_ASSIST_PAGE: 2223 case HV_X64_MSR_APIC_ASSIST_PAGE:
2224 data = vcpu->arch.hv_vapic; 2224 data = vcpu->arch.hv_vapic;
2225 break; 2225 break;
2226 default: 2226 default:
2227 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2227 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2228 return 1; 2228 return 1;
2229 } 2229 }
2230 *pdata = data; 2230 *pdata = data;
2231 return 0; 2231 return 0;
2232 } 2232 }
2233 2233
2234 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2234 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2235 { 2235 {
2236 u64 data; 2236 u64 data;
2237 2237
2238 switch (msr) { 2238 switch (msr) {
2239 case MSR_IA32_PLATFORM_ID: 2239 case MSR_IA32_PLATFORM_ID:
2240 case MSR_IA32_EBL_CR_POWERON: 2240 case MSR_IA32_EBL_CR_POWERON:
2241 case MSR_IA32_DEBUGCTLMSR: 2241 case MSR_IA32_DEBUGCTLMSR:
2242 case MSR_IA32_LASTBRANCHFROMIP: 2242 case MSR_IA32_LASTBRANCHFROMIP:
2243 case MSR_IA32_LASTBRANCHTOIP: 2243 case MSR_IA32_LASTBRANCHTOIP:
2244 case MSR_IA32_LASTINTFROMIP: 2244 case MSR_IA32_LASTINTFROMIP:
2245 case MSR_IA32_LASTINTTOIP: 2245 case MSR_IA32_LASTINTTOIP:
2246 case MSR_K8_SYSCFG: 2246 case MSR_K8_SYSCFG:
2247 case MSR_K7_HWCR: 2247 case MSR_K7_HWCR:
2248 case MSR_VM_HSAVE_PA: 2248 case MSR_VM_HSAVE_PA:
2249 case MSR_K7_EVNTSEL0: 2249 case MSR_K7_EVNTSEL0:
2250 case MSR_K7_PERFCTR0: 2250 case MSR_K7_PERFCTR0:
2251 case MSR_K8_INT_PENDING_MSG: 2251 case MSR_K8_INT_PENDING_MSG:
2252 case MSR_AMD64_NB_CFG: 2252 case MSR_AMD64_NB_CFG:
2253 case MSR_FAM10H_MMIO_CONF_BASE: 2253 case MSR_FAM10H_MMIO_CONF_BASE:
2254 data = 0; 2254 data = 0;
2255 break; 2255 break;
2256 case MSR_P6_PERFCTR0: 2256 case MSR_P6_PERFCTR0:
2257 case MSR_P6_PERFCTR1: 2257 case MSR_P6_PERFCTR1:
2258 case MSR_P6_EVNTSEL0: 2258 case MSR_P6_EVNTSEL0:
2259 case MSR_P6_EVNTSEL1: 2259 case MSR_P6_EVNTSEL1:
2260 if (kvm_pmu_msr(vcpu, msr)) 2260 if (kvm_pmu_msr(vcpu, msr))
2261 return kvm_pmu_get_msr(vcpu, msr, pdata); 2261 return kvm_pmu_get_msr(vcpu, msr, pdata);
2262 data = 0; 2262 data = 0;
2263 break; 2263 break;
2264 case MSR_IA32_UCODE_REV: 2264 case MSR_IA32_UCODE_REV:
2265 data = 0x100000000ULL; 2265 data = 0x100000000ULL;
2266 break; 2266 break;
2267 case MSR_MTRRcap: 2267 case MSR_MTRRcap:
2268 data = 0x500 | KVM_NR_VAR_MTRR; 2268 data = 0x500 | KVM_NR_VAR_MTRR;
2269 break; 2269 break;
2270 case 0x200 ... 0x2ff: 2270 case 0x200 ... 0x2ff:
2271 return get_msr_mtrr(vcpu, msr, pdata); 2271 return get_msr_mtrr(vcpu, msr, pdata);
2272 case 0xcd: /* fsb frequency */ 2272 case 0xcd: /* fsb frequency */
2273 data = 3; 2273 data = 3;
2274 break; 2274 break;
2275 /* 2275 /*
2276 * MSR_EBC_FREQUENCY_ID 2276 * MSR_EBC_FREQUENCY_ID
2277 * Conservative value valid for even the basic CPU models. 2277 * Conservative value valid for even the basic CPU models.
2278 * Models 0,1: 000 in bits 23:21 indicating a bus speed of 2278 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2279 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, 2279 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
2280 * and 266MHz for model 3, or 4. Set Core Clock 2280 * and 266MHz for model 3, or 4. Set Core Clock
2281 * Frequency to System Bus Frequency Ratio to 1 (bits 2281 * Frequency to System Bus Frequency Ratio to 1 (bits
2282 * 31:24) even though these are only valid for CPU 2282 * 31:24) even though these are only valid for CPU
2283 * models > 2, however guests may end up dividing or 2283 * models > 2, however guests may end up dividing or
2284 * multiplying by zero otherwise. 2284 * multiplying by zero otherwise.
2285 */ 2285 */
2286 case MSR_EBC_FREQUENCY_ID: 2286 case MSR_EBC_FREQUENCY_ID:
2287 data = 1 << 24; 2287 data = 1 << 24;
2288 break; 2288 break;
2289 case MSR_IA32_APICBASE: 2289 case MSR_IA32_APICBASE:
2290 data = kvm_get_apic_base(vcpu); 2290 data = kvm_get_apic_base(vcpu);
2291 break; 2291 break;
2292 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 2292 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2293 return kvm_x2apic_msr_read(vcpu, msr, pdata); 2293 return kvm_x2apic_msr_read(vcpu, msr, pdata);
2294 break; 2294 break;
2295 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
2296 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
2297 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST: 2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break; 2300 break;
2301 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
2302 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
2303 break; 2303 break;
2304 case MSR_IA32_PERF_STATUS: 2304 case MSR_IA32_PERF_STATUS:
2305 /* TSC increment by tick */ 2305 /* TSC increment by tick */
2306 data = 1000ULL; 2306 data = 1000ULL;
2307 /* CPU multiplier */ 2307 /* CPU multiplier */
2308 data |= (((uint64_t)4ULL) << 40); 2308 data |= (((uint64_t)4ULL) << 40);
2309 break; 2309 break;
2310 case MSR_EFER: 2310 case MSR_EFER:
2311 data = vcpu->arch.efer; 2311 data = vcpu->arch.efer;
2312 break; 2312 break;
2313 case MSR_KVM_WALL_CLOCK: 2313 case MSR_KVM_WALL_CLOCK:
2314 case MSR_KVM_WALL_CLOCK_NEW: 2314 case MSR_KVM_WALL_CLOCK_NEW:
2315 data = vcpu->kvm->arch.wall_clock; 2315 data = vcpu->kvm->arch.wall_clock;
2316 break; 2316 break;
2317 case MSR_KVM_SYSTEM_TIME: 2317 case MSR_KVM_SYSTEM_TIME:
2318 case MSR_KVM_SYSTEM_TIME_NEW: 2318 case MSR_KVM_SYSTEM_TIME_NEW:
2319 data = vcpu->arch.time; 2319 data = vcpu->arch.time;
2320 break; 2320 break;
2321 case MSR_KVM_ASYNC_PF_EN: 2321 case MSR_KVM_ASYNC_PF_EN:
2322 data = vcpu->arch.apf.msr_val; 2322 data = vcpu->arch.apf.msr_val;
2323 break; 2323 break;
2324 case MSR_KVM_STEAL_TIME: 2324 case MSR_KVM_STEAL_TIME:
2325 data = vcpu->arch.st.msr_val; 2325 data = vcpu->arch.st.msr_val;
2326 break; 2326 break;
2327 case MSR_KVM_PV_EOI_EN: 2327 case MSR_KVM_PV_EOI_EN:
2328 data = vcpu->arch.pv_eoi.msr_val; 2328 data = vcpu->arch.pv_eoi.msr_val;
2329 break; 2329 break;
2330 case MSR_IA32_P5_MC_ADDR: 2330 case MSR_IA32_P5_MC_ADDR:
2331 case MSR_IA32_P5_MC_TYPE: 2331 case MSR_IA32_P5_MC_TYPE:
2332 case MSR_IA32_MCG_CAP: 2332 case MSR_IA32_MCG_CAP:
2333 case MSR_IA32_MCG_CTL: 2333 case MSR_IA32_MCG_CTL:
2334 case MSR_IA32_MCG_STATUS: 2334 case MSR_IA32_MCG_STATUS:
2335 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 2335 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2336 return get_msr_mce(vcpu, msr, pdata); 2336 return get_msr_mce(vcpu, msr, pdata);
2337 case MSR_K7_CLK_CTL: 2337 case MSR_K7_CLK_CTL:
2338 /* 2338 /*
2339 * Provide expected ramp-up count for K7. All other 2339 * Provide expected ramp-up count for K7. All other
2340 * are set to zero, indicating minimum divisors for 2340 * are set to zero, indicating minimum divisors for
2341 * every field. 2341 * every field.
2342 * 2342 *
2343 * This prevents guest kernels on AMD host with CPU 2343 * This prevents guest kernels on AMD host with CPU
2344 * type 6, model 8 and higher from exploding due to 2344 * type 6, model 8 and higher from exploding due to
2345 * the rdmsr failing. 2345 * the rdmsr failing.
2346 */ 2346 */
2347 data = 0x20000000; 2347 data = 0x20000000;
2348 break; 2348 break;
2349 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 2349 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2350 if (kvm_hv_msr_partition_wide(msr)) { 2350 if (kvm_hv_msr_partition_wide(msr)) {
2351 int r; 2351 int r;
2352 mutex_lock(&vcpu->kvm->lock); 2352 mutex_lock(&vcpu->kvm->lock);
2353 r = get_msr_hyperv_pw(vcpu, msr, pdata); 2353 r = get_msr_hyperv_pw(vcpu, msr, pdata);
2354 mutex_unlock(&vcpu->kvm->lock); 2354 mutex_unlock(&vcpu->kvm->lock);
2355 return r; 2355 return r;
2356 } else 2356 } else
2357 return get_msr_hyperv(vcpu, msr, pdata); 2357 return get_msr_hyperv(vcpu, msr, pdata);
2358 break; 2358 break;
2359 case MSR_IA32_BBL_CR_CTL3: 2359 case MSR_IA32_BBL_CR_CTL3:
2360 /* This legacy MSR exists but isn't fully documented in current 2360 /* This legacy MSR exists but isn't fully documented in current
2361 * silicon. It is however accessed by winxp in very narrow 2361 * silicon. It is however accessed by winxp in very narrow
2362 * scenarios where it sets bit #19, itself documented as 2362 * scenarios where it sets bit #19, itself documented as
2363 * a "reserved" bit. Best effort attempt to source coherent 2363 * a "reserved" bit. Best effort attempt to source coherent
2364 * read data here should the balance of the register be 2364 * read data here should the balance of the register be
2365 * interpreted by the guest: 2365 * interpreted by the guest:
2366 * 2366 *
2367 * L2 cache control register 3: 64GB range, 256KB size, 2367 * L2 cache control register 3: 64GB range, 256KB size,
2368 * enabled, latency 0x1, configured 2368 * enabled, latency 0x1, configured
2369 */ 2369 */
2370 data = 0xbe702111; 2370 data = 0xbe702111;
2371 break; 2371 break;
2372 case MSR_AMD64_OSVW_ID_LENGTH: 2372 case MSR_AMD64_OSVW_ID_LENGTH:
2373 if (!guest_cpuid_has_osvw(vcpu)) 2373 if (!guest_cpuid_has_osvw(vcpu))
2374 return 1; 2374 return 1;
2375 data = vcpu->arch.osvw.length; 2375 data = vcpu->arch.osvw.length;
2376 break; 2376 break;
2377 case MSR_AMD64_OSVW_STATUS: 2377 case MSR_AMD64_OSVW_STATUS:
2378 if (!guest_cpuid_has_osvw(vcpu)) 2378 if (!guest_cpuid_has_osvw(vcpu))
2379 return 1; 2379 return 1;
2380 data = vcpu->arch.osvw.status; 2380 data = vcpu->arch.osvw.status;
2381 break; 2381 break;
2382 default: 2382 default:
2383 if (kvm_pmu_msr(vcpu, msr)) 2383 if (kvm_pmu_msr(vcpu, msr))
2384 return kvm_pmu_get_msr(vcpu, msr, pdata); 2384 return kvm_pmu_get_msr(vcpu, msr, pdata);
2385 if (!ignore_msrs) { 2385 if (!ignore_msrs) {
2386 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2386 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2387 return 1; 2387 return 1;
2388 } else { 2388 } else {
2389 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2389 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2390 data = 0; 2390 data = 0;
2391 } 2391 }
2392 break; 2392 break;
2393 } 2393 }
2394 *pdata = data; 2394 *pdata = data;
2395 return 0; 2395 return 0;
2396 } 2396 }
2397 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 2397 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2398 2398
2399 /* 2399 /*
2400 * Read or write a bunch of msrs. All parameters are kernel addresses. 2400 * Read or write a bunch of msrs. All parameters are kernel addresses.
2401 * 2401 *
2402 * @return number of msrs set successfully. 2402 * @return number of msrs set successfully.
2403 */ 2403 */
2404 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 2404 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2405 struct kvm_msr_entry *entries, 2405 struct kvm_msr_entry *entries,
2406 int (*do_msr)(struct kvm_vcpu *vcpu, 2406 int (*do_msr)(struct kvm_vcpu *vcpu,
2407 unsigned index, u64 *data)) 2407 unsigned index, u64 *data))
2408 { 2408 {
2409 int i, idx; 2409 int i, idx;
2410 2410
2411 idx = srcu_read_lock(&vcpu->kvm->srcu); 2411 idx = srcu_read_lock(&vcpu->kvm->srcu);
2412 for (i = 0; i < msrs->nmsrs; ++i) 2412 for (i = 0; i < msrs->nmsrs; ++i)
2413 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 2413 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2414 break; 2414 break;
2415 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2415 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2416 2416
2417 return i; 2417 return i;
2418 } 2418 }
2419 2419
2420 /* 2420 /*
2421 * Read or write a bunch of msrs. Parameters are user addresses. 2421 * Read or write a bunch of msrs. Parameters are user addresses.
2422 * 2422 *
2423 * @return number of msrs set successfully. 2423 * @return number of msrs set successfully.
2424 */ 2424 */
2425 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 2425 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2426 int (*do_msr)(struct kvm_vcpu *vcpu, 2426 int (*do_msr)(struct kvm_vcpu *vcpu,
2427 unsigned index, u64 *data), 2427 unsigned index, u64 *data),
2428 int writeback) 2428 int writeback)
2429 { 2429 {
2430 struct kvm_msrs msrs; 2430 struct kvm_msrs msrs;
2431 struct kvm_msr_entry *entries; 2431 struct kvm_msr_entry *entries;
2432 int r, n; 2432 int r, n;
2433 unsigned size; 2433 unsigned size;
2434 2434
2435 r = -EFAULT; 2435 r = -EFAULT;
2436 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 2436 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2437 goto out; 2437 goto out;
2438 2438
2439 r = -E2BIG; 2439 r = -E2BIG;
2440 if (msrs.nmsrs >= MAX_IO_MSRS) 2440 if (msrs.nmsrs >= MAX_IO_MSRS)
2441 goto out; 2441 goto out;
2442 2442
2443 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 2443 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2444 entries = memdup_user(user_msrs->entries, size); 2444 entries = memdup_user(user_msrs->entries, size);
2445 if (IS_ERR(entries)) { 2445 if (IS_ERR(entries)) {
2446 r = PTR_ERR(entries); 2446 r = PTR_ERR(entries);
2447 goto out; 2447 goto out;
2448 } 2448 }
2449 2449
2450 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 2450 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2451 if (r < 0) 2451 if (r < 0)
2452 goto out_free; 2452 goto out_free;
2453 2453
2454 r = -EFAULT; 2454 r = -EFAULT;
2455 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 2455 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2456 goto out_free; 2456 goto out_free;
2457 2457
2458 r = n; 2458 r = n;
2459 2459
2460 out_free: 2460 out_free:
2461 kfree(entries); 2461 kfree(entries);
2462 out: 2462 out:
2463 return r; 2463 return r;
2464 } 2464 }
2465 2465
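[Editor's note] msr_io() above backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls, and its return value is the number of entries processed rather than a plain success code. A hedged userspace sketch of the read path, assuming vcpu_fd is a vCPU file descriptor already obtained via KVM_CREATE_VCPU:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static unsigned long long read_one_msr(int vcpu_fd, unsigned int index)
{
	struct kvm_msrs *msrs;
	unsigned long long data;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs) {
		perror("calloc");
		exit(1);
	}
	msrs->nmsrs = 1;
	msrs->entries[0].index = index;

	/* The ioctl returns how many MSRs were actually read. */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, msrs) != 1) {
		perror("KVM_GET_MSRS");
		exit(1);
	}
	data = msrs->entries[0].data;
	free(msrs);
	return data;
}
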
2466 int kvm_dev_ioctl_check_extension(long ext) 2466 int kvm_dev_ioctl_check_extension(long ext)
2467 { 2467 {
2468 int r; 2468 int r;
2469 2469
2470 switch (ext) { 2470 switch (ext) {
2471 case KVM_CAP_IRQCHIP: 2471 case KVM_CAP_IRQCHIP:
2472 case KVM_CAP_HLT: 2472 case KVM_CAP_HLT:
2473 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 2473 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2474 case KVM_CAP_SET_TSS_ADDR: 2474 case KVM_CAP_SET_TSS_ADDR:
2475 case KVM_CAP_EXT_CPUID: 2475 case KVM_CAP_EXT_CPUID:
2476 case KVM_CAP_CLOCKSOURCE: 2476 case KVM_CAP_CLOCKSOURCE:
2477 case KVM_CAP_PIT: 2477 case KVM_CAP_PIT:
2478 case KVM_CAP_NOP_IO_DELAY: 2478 case KVM_CAP_NOP_IO_DELAY:
2479 case KVM_CAP_MP_STATE: 2479 case KVM_CAP_MP_STATE:
2480 case KVM_CAP_SYNC_MMU: 2480 case KVM_CAP_SYNC_MMU:
2481 case KVM_CAP_USER_NMI: 2481 case KVM_CAP_USER_NMI:
2482 case KVM_CAP_REINJECT_CONTROL: 2482 case KVM_CAP_REINJECT_CONTROL:
2483 case KVM_CAP_IRQ_INJECT_STATUS: 2483 case KVM_CAP_IRQ_INJECT_STATUS:
2484 case KVM_CAP_ASSIGN_DEV_IRQ: 2484 case KVM_CAP_ASSIGN_DEV_IRQ:
2485 case KVM_CAP_IRQFD: 2485 case KVM_CAP_IRQFD:
2486 case KVM_CAP_IOEVENTFD: 2486 case KVM_CAP_IOEVENTFD:
2487 case KVM_CAP_PIT2: 2487 case KVM_CAP_PIT2:
2488 case KVM_CAP_PIT_STATE2: 2488 case KVM_CAP_PIT_STATE2:
2489 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 2489 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2490 case KVM_CAP_XEN_HVM: 2490 case KVM_CAP_XEN_HVM:
2491 case KVM_CAP_ADJUST_CLOCK: 2491 case KVM_CAP_ADJUST_CLOCK:
2492 case KVM_CAP_VCPU_EVENTS: 2492 case KVM_CAP_VCPU_EVENTS:
2493 case KVM_CAP_HYPERV: 2493 case KVM_CAP_HYPERV:
2494 case KVM_CAP_HYPERV_VAPIC: 2494 case KVM_CAP_HYPERV_VAPIC:
2495 case KVM_CAP_HYPERV_SPIN: 2495 case KVM_CAP_HYPERV_SPIN:
2496 case KVM_CAP_PCI_SEGMENT: 2496 case KVM_CAP_PCI_SEGMENT:
2497 case KVM_CAP_DEBUGREGS: 2497 case KVM_CAP_DEBUGREGS:
2498 case KVM_CAP_X86_ROBUST_SINGLESTEP: 2498 case KVM_CAP_X86_ROBUST_SINGLESTEP:
2499 case KVM_CAP_XSAVE: 2499 case KVM_CAP_XSAVE:
2500 case KVM_CAP_ASYNC_PF: 2500 case KVM_CAP_ASYNC_PF:
2501 case KVM_CAP_GET_TSC_KHZ: 2501 case KVM_CAP_GET_TSC_KHZ:
2502 case KVM_CAP_PCI_2_3: 2502 case KVM_CAP_PCI_2_3:
2503 case KVM_CAP_KVMCLOCK_CTRL: 2503 case KVM_CAP_KVMCLOCK_CTRL:
2504 case KVM_CAP_READONLY_MEM: 2504 case KVM_CAP_READONLY_MEM:
2505 case KVM_CAP_IRQFD_RESAMPLE: 2505 case KVM_CAP_IRQFD_RESAMPLE:
2506 r = 1; 2506 r = 1;
2507 break; 2507 break;
2508 case KVM_CAP_COALESCED_MMIO: 2508 case KVM_CAP_COALESCED_MMIO:
2509 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 2509 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2510 break; 2510 break;
2511 case KVM_CAP_VAPIC: 2511 case KVM_CAP_VAPIC:
2512 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 2512 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2513 break; 2513 break;
2514 case KVM_CAP_NR_VCPUS: 2514 case KVM_CAP_NR_VCPUS:
2515 r = KVM_SOFT_MAX_VCPUS; 2515 r = KVM_SOFT_MAX_VCPUS;
2516 break; 2516 break;
2517 case KVM_CAP_MAX_VCPUS: 2517 case KVM_CAP_MAX_VCPUS:
2518 r = KVM_MAX_VCPUS; 2518 r = KVM_MAX_VCPUS;
2519 break; 2519 break;
2520 case KVM_CAP_NR_MEMSLOTS: 2520 case KVM_CAP_NR_MEMSLOTS:
2521 r = KVM_USER_MEM_SLOTS; 2521 r = KVM_USER_MEM_SLOTS;
2522 break; 2522 break;
2523 case KVM_CAP_PV_MMU: /* obsolete */ 2523 case KVM_CAP_PV_MMU: /* obsolete */
2524 r = 0; 2524 r = 0;
2525 break; 2525 break;
2526 case KVM_CAP_IOMMU: 2526 case KVM_CAP_IOMMU:
2527 r = iommu_present(&pci_bus_type); 2527 r = iommu_present(&pci_bus_type);
2528 break; 2528 break;
2529 case KVM_CAP_MCE: 2529 case KVM_CAP_MCE:
2530 r = KVM_MAX_MCE_BANKS; 2530 r = KVM_MAX_MCE_BANKS;
2531 break; 2531 break;
2532 case KVM_CAP_XCRS: 2532 case KVM_CAP_XCRS:
2533 r = cpu_has_xsave; 2533 r = cpu_has_xsave;
2534 break; 2534 break;
2535 case KVM_CAP_TSC_CONTROL: 2535 case KVM_CAP_TSC_CONTROL:
2536 r = kvm_has_tsc_control; 2536 r = kvm_has_tsc_control;
2537 break; 2537 break;
2538 case KVM_CAP_TSC_DEADLINE_TIMER: 2538 case KVM_CAP_TSC_DEADLINE_TIMER:
2539 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); 2539 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2540 break; 2540 break;
2541 default: 2541 default:
2542 r = 0; 2542 r = 0;
2543 break; 2543 break;
2544 } 2544 }
2545 return r; 2545 return r;
2546 2546
2547 } 2547 }
2548 2548
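[Editor's note] The capability switch in kvm_dev_ioctl_check_extension() above is what userspace reaches through KVM_CHECK_EXTENSION. A hedged sketch, not from this commit, querying the memslot limit reported via KVM_CAP_NR_MEMSLOTS:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int slots;

	if (kvm_fd < 0) {
		perror("/dev/kvm");
		return 1;
	}

	/* Returns KVM_USER_MEM_SLOTS for this capability, 0 if unknown. */
	slots = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);
	printf("user-visible memory slots: %d\n", slots);
	return 0;
}
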
2549 long kvm_arch_dev_ioctl(struct file *filp, 2549 long kvm_arch_dev_ioctl(struct file *filp,
2550 unsigned int ioctl, unsigned long arg) 2550 unsigned int ioctl, unsigned long arg)
2551 { 2551 {
2552 void __user *argp = (void __user *)arg; 2552 void __user *argp = (void __user *)arg;
2553 long r; 2553 long r;
2554 2554
2555 switch (ioctl) { 2555 switch (ioctl) {
2556 case KVM_GET_MSR_INDEX_LIST: { 2556 case KVM_GET_MSR_INDEX_LIST: {
2557 struct kvm_msr_list __user *user_msr_list = argp; 2557 struct kvm_msr_list __user *user_msr_list = argp;
2558 struct kvm_msr_list msr_list; 2558 struct kvm_msr_list msr_list;
2559 unsigned n; 2559 unsigned n;
2560 2560
2561 r = -EFAULT; 2561 r = -EFAULT;
2562 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 2562 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2563 goto out; 2563 goto out;
2564 n = msr_list.nmsrs; 2564 n = msr_list.nmsrs;
2565 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 2565 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2566 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 2566 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2567 goto out; 2567 goto out;
2568 r = -E2BIG; 2568 r = -E2BIG;
2569 if (n < msr_list.nmsrs) 2569 if (n < msr_list.nmsrs)
2570 goto out; 2570 goto out;
2571 r = -EFAULT; 2571 r = -EFAULT;
2572 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 2572 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2573 num_msrs_to_save * sizeof(u32))) 2573 num_msrs_to_save * sizeof(u32)))
2574 goto out; 2574 goto out;
2575 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 2575 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2576 &emulated_msrs, 2576 &emulated_msrs,
2577 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 2577 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2578 goto out; 2578 goto out;
2579 r = 0; 2579 r = 0;
2580 break; 2580 break;
2581 } 2581 }
2582 case KVM_GET_SUPPORTED_CPUID: { 2582 case KVM_GET_SUPPORTED_CPUID: {
2583 struct kvm_cpuid2 __user *cpuid_arg = argp; 2583 struct kvm_cpuid2 __user *cpuid_arg = argp;
2584 struct kvm_cpuid2 cpuid; 2584 struct kvm_cpuid2 cpuid;
2585 2585
2586 r = -EFAULT; 2586 r = -EFAULT;
2587 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2587 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2588 goto out; 2588 goto out;
2589 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 2589 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
2590 cpuid_arg->entries); 2590 cpuid_arg->entries);
2591 if (r) 2591 if (r)
2592 goto out; 2592 goto out;
2593 2593
2594 r = -EFAULT; 2594 r = -EFAULT;
2595 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2595 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2596 goto out; 2596 goto out;
2597 r = 0; 2597 r = 0;
2598 break; 2598 break;
2599 } 2599 }
2600 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 2600 case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2601 u64 mce_cap; 2601 u64 mce_cap;
2602 2602
2603 mce_cap = KVM_MCE_CAP_SUPPORTED; 2603 mce_cap = KVM_MCE_CAP_SUPPORTED;
2604 r = -EFAULT; 2604 r = -EFAULT;
2605 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 2605 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2606 goto out; 2606 goto out;
2607 r = 0; 2607 r = 0;
2608 break; 2608 break;
2609 } 2609 }
2610 default: 2610 default:
2611 r = -EINVAL; 2611 r = -EINVAL;
2612 } 2612 }
2613 out: 2613 out:
2614 return r; 2614 return r;
2615 } 2615 }
2616 2616
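[Editor's note] The KVM_GET_MSR_INDEX_LIST case at the top of kvm_arch_dev_ioctl() above deliberately writes the required count back to userspace before failing with -E2BIG, so callers usually probe once with nmsrs = 0 and then retry with a right-sized buffer. A hedged userspace sketch of that pattern (kvm_fd is assumed to be an open /dev/kvm descriptor):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe, *list;

	/* First pass: learn how many indices the kernel wants to return. */
	memset(&probe, 0, sizeof(probe));
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe); /* fails with E2BIG */

	/* Second pass: provide enough room for all of them. */
	list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}
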
2617 static void wbinvd_ipi(void *garbage) 2617 static void wbinvd_ipi(void *garbage)
2618 { 2618 {
2619 wbinvd(); 2619 wbinvd();
2620 } 2620 }
2621 2621
2622 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 2622 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2623 { 2623 {
2624 return vcpu->kvm->arch.iommu_domain && 2624 return vcpu->kvm->arch.iommu_domain &&
2625 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); 2625 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
2626 } 2626 }
2627 2627
2628 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2628 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2629 { 2629 {
2630 /* Address WBINVD may be executed by guest */ 2630 /* Address WBINVD may be executed by guest */
2631 if (need_emulate_wbinvd(vcpu)) { 2631 if (need_emulate_wbinvd(vcpu)) {
2632 if (kvm_x86_ops->has_wbinvd_exit()) 2632 if (kvm_x86_ops->has_wbinvd_exit())
2633 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 2633 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2634 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 2634 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2635 smp_call_function_single(vcpu->cpu, 2635 smp_call_function_single(vcpu->cpu,
2636 wbinvd_ipi, NULL, 1); 2636 wbinvd_ipi, NULL, 1);
2637 } 2637 }
2638 2638
2639 kvm_x86_ops->vcpu_load(vcpu, cpu); 2639 kvm_x86_ops->vcpu_load(vcpu, cpu);
2640 2640
2641 /* Apply any externally detected TSC adjustments (due to suspend) */ 2641 /* Apply any externally detected TSC adjustments (due to suspend) */
2642 if (unlikely(vcpu->arch.tsc_offset_adjustment)) { 2642 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2643 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); 2643 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2644 vcpu->arch.tsc_offset_adjustment = 0; 2644 vcpu->arch.tsc_offset_adjustment = 0;
2645 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 2645 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2646 } 2646 }
2647 2647
2648 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2648 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2649 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 2649 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2650 native_read_tsc() - vcpu->arch.last_host_tsc; 2650 native_read_tsc() - vcpu->arch.last_host_tsc;
2651 if (tsc_delta < 0) 2651 if (tsc_delta < 0)
2652 mark_tsc_unstable("KVM discovered backwards TSC"); 2652 mark_tsc_unstable("KVM discovered backwards TSC");
2653 if (check_tsc_unstable()) { 2653 if (check_tsc_unstable()) {
2654 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, 2654 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2655 vcpu->arch.last_guest_tsc); 2655 vcpu->arch.last_guest_tsc);
2656 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2657 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2658 } 2658 }
2659 /* 2659 /*
2660 * On a host with synchronized TSC, there is no need to update 2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration 2661 * kvmclock on vcpu->cpu migration
2662 */ 2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2665 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2666 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2667 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
2668 } 2668 }
2669 2669
2670 accumulate_steal_time(vcpu); 2670 accumulate_steal_time(vcpu);
2671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2672 } 2672 }
2673 2673
2674 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2674 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2675 { 2675 {
2676 kvm_x86_ops->vcpu_put(vcpu); 2676 kvm_x86_ops->vcpu_put(vcpu);
2677 kvm_put_guest_fpu(vcpu); 2677 kvm_put_guest_fpu(vcpu);
2678 vcpu->arch.last_host_tsc = native_read_tsc(); 2678 vcpu->arch.last_host_tsc = native_read_tsc();
2679 } 2679 }
2680 2680
2681 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2681 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2682 struct kvm_lapic_state *s) 2682 struct kvm_lapic_state *s)
2683 { 2683 {
2684 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2684 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2685 2685
2686 return 0; 2686 return 0;
2687 } 2687 }
2688 2688
2689 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2689 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2690 struct kvm_lapic_state *s) 2690 struct kvm_lapic_state *s)
2691 { 2691 {
2692 kvm_apic_post_state_restore(vcpu, s); 2692 kvm_apic_post_state_restore(vcpu, s);
2693 update_cr8_intercept(vcpu); 2693 update_cr8_intercept(vcpu);
2694 2694
2695 return 0; 2695 return 0;
2696 } 2696 }
2697 2697
2698 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2698 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2699 struct kvm_interrupt *irq) 2699 struct kvm_interrupt *irq)
2700 { 2700 {
2701 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) 2701 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
2702 return -EINVAL; 2702 return -EINVAL;
2703 if (irqchip_in_kernel(vcpu->kvm)) 2703 if (irqchip_in_kernel(vcpu->kvm))
2704 return -ENXIO; 2704 return -ENXIO;
2705 2705
2706 kvm_queue_interrupt(vcpu, irq->irq, false); 2706 kvm_queue_interrupt(vcpu, irq->irq, false);
2707 kvm_make_request(KVM_REQ_EVENT, vcpu); 2707 kvm_make_request(KVM_REQ_EVENT, vcpu);
2708 2708
2709 return 0; 2709 return 0;
2710 } 2710 }
2711 2711
2712 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2712 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2713 { 2713 {
2714 kvm_inject_nmi(vcpu); 2714 kvm_inject_nmi(vcpu);
2715 2715
2716 return 0; 2716 return 0;
2717 } 2717 }
2718 2718
2719 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 2719 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2720 struct kvm_tpr_access_ctl *tac) 2720 struct kvm_tpr_access_ctl *tac)
2721 { 2721 {
2722 if (tac->flags) 2722 if (tac->flags)
2723 return -EINVAL; 2723 return -EINVAL;
2724 vcpu->arch.tpr_access_reporting = !!tac->enabled; 2724 vcpu->arch.tpr_access_reporting = !!tac->enabled;
2725 return 0; 2725 return 0;
2726 } 2726 }
2727 2727
2728 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 2728 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2729 u64 mcg_cap) 2729 u64 mcg_cap)
2730 { 2730 {
2731 int r; 2731 int r;
2732 unsigned bank_num = mcg_cap & 0xff, bank; 2732 unsigned bank_num = mcg_cap & 0xff, bank;
2733 2733
2734 r = -EINVAL; 2734 r = -EINVAL;
2735 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2735 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2736 goto out; 2736 goto out;
2737 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 2737 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2738 goto out; 2738 goto out;
2739 r = 0; 2739 r = 0;
2740 vcpu->arch.mcg_cap = mcg_cap; 2740 vcpu->arch.mcg_cap = mcg_cap;
2741 /* Init IA32_MCG_CTL to all 1s */ 2741 /* Init IA32_MCG_CTL to all 1s */
2742 if (mcg_cap & MCG_CTL_P) 2742 if (mcg_cap & MCG_CTL_P)
2743 vcpu->arch.mcg_ctl = ~(u64)0; 2743 vcpu->arch.mcg_ctl = ~(u64)0;
2744 /* Init IA32_MCi_CTL to all 1s */ 2744 /* Init IA32_MCi_CTL to all 1s */
2745 for (bank = 0; bank < bank_num; bank++) 2745 for (bank = 0; bank < bank_num; bank++)
2746 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2746 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2747 out: 2747 out:
2748 return r; 2748 return r;
2749 } 2749 }
2750 2750
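[Editor's note] kvm_vcpu_ioctl_x86_setup_mce() above validates the bank count carried in the low byte of mcg_cap and rejects capability bits the host does not advertise. A hedged sketch of how userspace might pair it with the KVM_X86_GET_MCE_CAP_SUPPORTED ioctl handled earlier (kvm_fd is the /dev/kvm descriptor, vcpu_fd a vCPU descriptor, both assumed already open):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sketch_setup_mce(int kvm_fd, int vcpu_fd)
{
	__u64 mcg_cap;

	if (ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap) < 0)
		return -1;

	/*
	 * The supported-cap value carries only the control/recovery flags;
	 * the caller still chooses a bank count (low byte, which must be
	 * non-zero and below KVM_MAX_MCE_BANKS) before handing it over.
	 */
	mcg_cap |= 10;	/* a modest, commonly used bank count */

	return ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
}
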
2751 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 2751 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2752 struct kvm_x86_mce *mce) 2752 struct kvm_x86_mce *mce)
2753 { 2753 {
2754 u64 mcg_cap = vcpu->arch.mcg_cap; 2754 u64 mcg_cap = vcpu->arch.mcg_cap;
2755 unsigned bank_num = mcg_cap & 0xff; 2755 unsigned bank_num = mcg_cap & 0xff;
2756 u64 *banks = vcpu->arch.mce_banks; 2756 u64 *banks = vcpu->arch.mce_banks;
2757 2757
2758 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 2758 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2759 return -EINVAL; 2759 return -EINVAL;
2760 /* 2760 /*
2761 * if IA32_MCG_CTL is not all 1s, the uncorrected error 2761 * if IA32_MCG_CTL is not all 1s, the uncorrected error
2762 * reporting is disabled 2762 * reporting is disabled
2763 */ 2763 */
2764 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 2764 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2765 vcpu->arch.mcg_ctl != ~(u64)0) 2765 vcpu->arch.mcg_ctl != ~(u64)0)
2766 return 0; 2766 return 0;
2767 banks += 4 * mce->bank; 2767 banks += 4 * mce->bank;
2768 /* 2768 /*
2769 * if IA32_MCi_CTL is not all 1s, the uncorrected error 2769 * if IA32_MCi_CTL is not all 1s, the uncorrected error
2770 * reporting is disabled for the bank 2770 * reporting is disabled for the bank
2771 */ 2771 */
2772 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 2772 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2773 return 0; 2773 return 0;
2774 if (mce->status & MCI_STATUS_UC) { 2774 if (mce->status & MCI_STATUS_UC) {
2775 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2775 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2776 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2776 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2777 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2777 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2778 return 0; 2778 return 0;
2779 } 2779 }
2780 if (banks[1] & MCI_STATUS_VAL) 2780 if (banks[1] & MCI_STATUS_VAL)
2781 mce->status |= MCI_STATUS_OVER; 2781 mce->status |= MCI_STATUS_OVER;
2782 banks[2] = mce->addr; 2782 banks[2] = mce->addr;
2783 banks[3] = mce->misc; 2783 banks[3] = mce->misc;
2784 vcpu->arch.mcg_status = mce->mcg_status; 2784 vcpu->arch.mcg_status = mce->mcg_status;
2785 banks[1] = mce->status; 2785 banks[1] = mce->status;
2786 kvm_queue_exception(vcpu, MC_VECTOR); 2786 kvm_queue_exception(vcpu, MC_VECTOR);
2787 } else if (!(banks[1] & MCI_STATUS_VAL) 2787 } else if (!(banks[1] & MCI_STATUS_VAL)
2788 || !(banks[1] & MCI_STATUS_UC)) { 2788 || !(banks[1] & MCI_STATUS_UC)) {
2789 if (banks[1] & MCI_STATUS_VAL) 2789 if (banks[1] & MCI_STATUS_VAL)
2790 mce->status |= MCI_STATUS_OVER; 2790 mce->status |= MCI_STATUS_OVER;
2791 banks[2] = mce->addr; 2791 banks[2] = mce->addr;
2792 banks[3] = mce->misc; 2792 banks[3] = mce->misc;
2793 banks[1] = mce->status; 2793 banks[1] = mce->status;
2794 } else 2794 } else
2795 banks[1] |= MCI_STATUS_OVER; 2795 banks[1] |= MCI_STATUS_OVER;
2796 return 0; 2796 return 0;
2797 } 2797 }
2798 2798
2799 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2799 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2800 struct kvm_vcpu_events *events) 2800 struct kvm_vcpu_events *events)
2801 { 2801 {
2802 process_nmi(vcpu); 2802 process_nmi(vcpu);
2803 events->exception.injected = 2803 events->exception.injected =
2804 vcpu->arch.exception.pending && 2804 vcpu->arch.exception.pending &&
2805 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2805 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2806 events->exception.nr = vcpu->arch.exception.nr; 2806 events->exception.nr = vcpu->arch.exception.nr;
2807 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2807 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2808 events->exception.pad = 0; 2808 events->exception.pad = 0;
2809 events->exception.error_code = vcpu->arch.exception.error_code; 2809 events->exception.error_code = vcpu->arch.exception.error_code;
2810 2810
2811 events->interrupt.injected = 2811 events->interrupt.injected =
2812 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; 2812 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2813 events->interrupt.nr = vcpu->arch.interrupt.nr; 2813 events->interrupt.nr = vcpu->arch.interrupt.nr;
2814 events->interrupt.soft = 0; 2814 events->interrupt.soft = 0;
2815 events->interrupt.shadow = 2815 events->interrupt.shadow =
2816 kvm_x86_ops->get_interrupt_shadow(vcpu, 2816 kvm_x86_ops->get_interrupt_shadow(vcpu,
2817 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); 2817 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2818 2818
2819 events->nmi.injected = vcpu->arch.nmi_injected; 2819 events->nmi.injected = vcpu->arch.nmi_injected;
2820 events->nmi.pending = vcpu->arch.nmi_pending != 0; 2820 events->nmi.pending = vcpu->arch.nmi_pending != 0;
2821 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2821 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2822 events->nmi.pad = 0; 2822 events->nmi.pad = 0;
2823 2823
2824 events->sipi_vector = vcpu->arch.sipi_vector; 2824 events->sipi_vector = vcpu->arch.sipi_vector;
2825 2825
2826 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2826 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2827 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2827 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2828 | KVM_VCPUEVENT_VALID_SHADOW); 2828 | KVM_VCPUEVENT_VALID_SHADOW);
2829 memset(&events->reserved, 0, sizeof(events->reserved)); 2829 memset(&events->reserved, 0, sizeof(events->reserved));
2830 } 2830 }
2831 2831
2832 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2832 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2833 struct kvm_vcpu_events *events) 2833 struct kvm_vcpu_events *events)
2834 { 2834 {
2835 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2835 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2836 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2836 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2837 | KVM_VCPUEVENT_VALID_SHADOW)) 2837 | KVM_VCPUEVENT_VALID_SHADOW))
2838 return -EINVAL; 2838 return -EINVAL;
2839 2839
2840 process_nmi(vcpu); 2840 process_nmi(vcpu);
2841 vcpu->arch.exception.pending = events->exception.injected; 2841 vcpu->arch.exception.pending = events->exception.injected;
2842 vcpu->arch.exception.nr = events->exception.nr; 2842 vcpu->arch.exception.nr = events->exception.nr;
2843 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2843 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2844 vcpu->arch.exception.error_code = events->exception.error_code; 2844 vcpu->arch.exception.error_code = events->exception.error_code;
2845 2845
2846 vcpu->arch.interrupt.pending = events->interrupt.injected; 2846 vcpu->arch.interrupt.pending = events->interrupt.injected;
2847 vcpu->arch.interrupt.nr = events->interrupt.nr; 2847 vcpu->arch.interrupt.nr = events->interrupt.nr;
2848 vcpu->arch.interrupt.soft = events->interrupt.soft; 2848 vcpu->arch.interrupt.soft = events->interrupt.soft;
2849 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2849 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2850 kvm_x86_ops->set_interrupt_shadow(vcpu, 2850 kvm_x86_ops->set_interrupt_shadow(vcpu,
2851 events->interrupt.shadow); 2851 events->interrupt.shadow);
2852 2852
2853 vcpu->arch.nmi_injected = events->nmi.injected; 2853 vcpu->arch.nmi_injected = events->nmi.injected;
2854 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2854 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2855 vcpu->arch.nmi_pending = events->nmi.pending; 2855 vcpu->arch.nmi_pending = events->nmi.pending;
2856 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2856 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2857 2857
2858 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2858 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2859 vcpu->arch.sipi_vector = events->sipi_vector; 2859 vcpu->arch.sipi_vector = events->sipi_vector;
2860 2860
2861 kvm_make_request(KVM_REQ_EVENT, vcpu); 2861 kvm_make_request(KVM_REQ_EVENT, vcpu);
2862 2862
2863 return 0; 2863 return 0;
2864 } 2864 }
2865 2865
2866 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2866 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2867 struct kvm_debugregs *dbgregs) 2867 struct kvm_debugregs *dbgregs)
2868 { 2868 {
2869 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2869 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2870 dbgregs->dr6 = vcpu->arch.dr6; 2870 dbgregs->dr6 = vcpu->arch.dr6;
2871 dbgregs->dr7 = vcpu->arch.dr7; 2871 dbgregs->dr7 = vcpu->arch.dr7;
2872 dbgregs->flags = 0; 2872 dbgregs->flags = 0;
2873 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 2873 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2874 } 2874 }
2875 2875
2876 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2876 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2877 struct kvm_debugregs *dbgregs) 2877 struct kvm_debugregs *dbgregs)
2878 { 2878 {
2879 if (dbgregs->flags) 2879 if (dbgregs->flags)
2880 return -EINVAL; 2880 return -EINVAL;
2881 2881
2882 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2882 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2883 vcpu->arch.dr6 = dbgregs->dr6; 2883 vcpu->arch.dr6 = dbgregs->dr6;
2884 vcpu->arch.dr7 = dbgregs->dr7; 2884 vcpu->arch.dr7 = dbgregs->dr7;
2885 2885
2886 return 0; 2886 return 0;
2887 } 2887 }
2888 2888
2889 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2889 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2890 struct kvm_xsave *guest_xsave) 2890 struct kvm_xsave *guest_xsave)
2891 { 2891 {
2892 if (cpu_has_xsave) 2892 if (cpu_has_xsave)
2893 memcpy(guest_xsave->region, 2893 memcpy(guest_xsave->region,
2894 &vcpu->arch.guest_fpu.state->xsave, 2894 &vcpu->arch.guest_fpu.state->xsave,
2895 xstate_size); 2895 xstate_size);
2896 else { 2896 else {
2897 memcpy(guest_xsave->region, 2897 memcpy(guest_xsave->region,
2898 &vcpu->arch.guest_fpu.state->fxsave, 2898 &vcpu->arch.guest_fpu.state->fxsave,
2899 sizeof(struct i387_fxsave_struct)); 2899 sizeof(struct i387_fxsave_struct));
2900 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 2900 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2901 XSTATE_FPSSE; 2901 XSTATE_FPSSE;
2902 } 2902 }
2903 } 2903 }
2904 2904
2905 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 2905 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2906 struct kvm_xsave *guest_xsave) 2906 struct kvm_xsave *guest_xsave)
2907 { 2907 {
2908 u64 xstate_bv = 2908 u64 xstate_bv =
2909 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 2909 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2910 2910
2911 if (cpu_has_xsave) 2911 if (cpu_has_xsave)
2912 memcpy(&vcpu->arch.guest_fpu.state->xsave, 2912 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2913 guest_xsave->region, xstate_size); 2913 guest_xsave->region, xstate_size);
2914 else { 2914 else {
2915 if (xstate_bv & ~XSTATE_FPSSE) 2915 if (xstate_bv & ~XSTATE_FPSSE)
2916 return -EINVAL; 2916 return -EINVAL;
2917 memcpy(&vcpu->arch.guest_fpu.state->fxsave, 2917 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2918 guest_xsave->region, sizeof(struct i387_fxsave_struct)); 2918 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2919 } 2919 }
2920 return 0; 2920 return 0;
2921 } 2921 }
2922 2922
2923 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 2923 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2924 struct kvm_xcrs *guest_xcrs) 2924 struct kvm_xcrs *guest_xcrs)
2925 { 2925 {
2926 if (!cpu_has_xsave) { 2926 if (!cpu_has_xsave) {
2927 guest_xcrs->nr_xcrs = 0; 2927 guest_xcrs->nr_xcrs = 0;
2928 return; 2928 return;
2929 } 2929 }
2930 2930
2931 guest_xcrs->nr_xcrs = 1; 2931 guest_xcrs->nr_xcrs = 1;
2932 guest_xcrs->flags = 0; 2932 guest_xcrs->flags = 0;
2933 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 2933 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2934 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 2934 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2935 } 2935 }
2936 2936
2937 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 2937 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2938 struct kvm_xcrs *guest_xcrs) 2938 struct kvm_xcrs *guest_xcrs)
2939 { 2939 {
2940 int i, r = 0; 2940 int i, r = 0;
2941 2941
2942 if (!cpu_has_xsave) 2942 if (!cpu_has_xsave)
2943 return -EINVAL; 2943 return -EINVAL;
2944 2944
2945 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 2945 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2946 return -EINVAL; 2946 return -EINVAL;
2947 2947
2948 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 2948 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2949 /* Only support XCR0 currently */ 2949 /* Only support XCR0 currently */
2950 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { 2950 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2951 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 2951 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2952 guest_xcrs->xcrs[0].value); 2952 guest_xcrs->xcrs[0].value);
2953 break; 2953 break;
2954 } 2954 }
2955 if (r) 2955 if (r)
2956 r = -EINVAL; 2956 r = -EINVAL;
2957 return r; 2957 return r;
2958 } 2958 }
2959 2959
2960 /* 2960 /*
2961 * kvm_set_guest_paused() indicates to the guest kernel that it has been 2961 * kvm_set_guest_paused() indicates to the guest kernel that it has been
2962 * stopped by the hypervisor. This function will be called from the host only. 2962 * stopped by the hypervisor. This function will be called from the host only.
2963 * EINVAL is returned when the host attempts to set the flag for a guest that 2963 * EINVAL is returned when the host attempts to set the flag for a guest that
2964 * does not support pv clocks. 2964 * does not support pv clocks.
2965 */ 2965 */
2966 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 2966 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2967 { 2967 {
2968 if (!vcpu->arch.time_page) 2968 if (!vcpu->arch.time_page)
2969 return -EINVAL; 2969 return -EINVAL;
2970 vcpu->arch.pvclock_set_guest_stopped_request = true; 2970 vcpu->arch.pvclock_set_guest_stopped_request = true;
2971 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2971 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2972 return 0; 2972 return 0;
2973 } 2973 }
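
kvm_set_guest_paused() only records a request here; the stopped indication is propagated to the guest's pvclock area on the next KVM_REQ_CLOCK_UPDATE. For reference, a minimal userspace sketch of driving the corresponding KVM_KVMCLOCK_CTRL vcpu ioctl (dispatched further down in kvm_arch_vcpu_ioctl); the fd name and error handling are illustrative only, not part of this commit:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Tell the guest that the stop it is about to observe was host-initiated. */
    static int mark_vcpu_paused(int vcpu_fd)
    {
            /* KVM_KVMCLOCK_CTRL takes no payload; the handler above returns
             * -EINVAL (seen as errno EINVAL here) when the guest has not
             * registered a pvclock time page. */
            return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
    }
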
2974 2974
2975 long kvm_arch_vcpu_ioctl(struct file *filp, 2975 long kvm_arch_vcpu_ioctl(struct file *filp,
2976 unsigned int ioctl, unsigned long arg) 2976 unsigned int ioctl, unsigned long arg)
2977 { 2977 {
2978 struct kvm_vcpu *vcpu = filp->private_data; 2978 struct kvm_vcpu *vcpu = filp->private_data;
2979 void __user *argp = (void __user *)arg; 2979 void __user *argp = (void __user *)arg;
2980 int r; 2980 int r;
2981 union { 2981 union {
2982 struct kvm_lapic_state *lapic; 2982 struct kvm_lapic_state *lapic;
2983 struct kvm_xsave *xsave; 2983 struct kvm_xsave *xsave;
2984 struct kvm_xcrs *xcrs; 2984 struct kvm_xcrs *xcrs;
2985 void *buffer; 2985 void *buffer;
2986 } u; 2986 } u;
2987 2987
2988 u.buffer = NULL; 2988 u.buffer = NULL;
2989 switch (ioctl) { 2989 switch (ioctl) {
2990 case KVM_GET_LAPIC: { 2990 case KVM_GET_LAPIC: {
2991 r = -EINVAL; 2991 r = -EINVAL;
2992 if (!vcpu->arch.apic) 2992 if (!vcpu->arch.apic)
2993 goto out; 2993 goto out;
2994 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2994 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2995 2995
2996 r = -ENOMEM; 2996 r = -ENOMEM;
2997 if (!u.lapic) 2997 if (!u.lapic)
2998 goto out; 2998 goto out;
2999 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 2999 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3000 if (r) 3000 if (r)
3001 goto out; 3001 goto out;
3002 r = -EFAULT; 3002 r = -EFAULT;
3003 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 3003 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3004 goto out; 3004 goto out;
3005 r = 0; 3005 r = 0;
3006 break; 3006 break;
3007 } 3007 }
3008 case KVM_SET_LAPIC: { 3008 case KVM_SET_LAPIC: {
3009 r = -EINVAL; 3009 r = -EINVAL;
3010 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
3011 goto out; 3011 goto out;
3012 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
3013 if (IS_ERR(u.lapic)) 3013 if (IS_ERR(u.lapic))
3014 return PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
3015 3015
3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3017 break; 3017 break;
3018 } 3018 }
3019 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
3020 struct kvm_interrupt irq; 3020 struct kvm_interrupt irq;
3021 3021
3022 r = -EFAULT; 3022 r = -EFAULT;
3023 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
3024 goto out; 3024 goto out;
3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3026 break; 3026 break;
3027 } 3027 }
3028 case KVM_NMI: { 3028 case KVM_NMI: {
3029 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
3030 break; 3030 break;
3031 } 3031 }
3032 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
3033 struct kvm_cpuid __user *cpuid_arg = argp; 3033 struct kvm_cpuid __user *cpuid_arg = argp;
3034 struct kvm_cpuid cpuid; 3034 struct kvm_cpuid cpuid;
3035 3035
3036 r = -EFAULT; 3036 r = -EFAULT;
3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3038 goto out; 3038 goto out;
3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3040 break; 3040 break;
3041 } 3041 }
3042 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
3043 struct kvm_cpuid2 __user *cpuid_arg = argp; 3043 struct kvm_cpuid2 __user *cpuid_arg = argp;
3044 struct kvm_cpuid2 cpuid; 3044 struct kvm_cpuid2 cpuid;
3045 3045
3046 r = -EFAULT; 3046 r = -EFAULT;
3047 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3047 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3048 goto out; 3048 goto out;
3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3050 cpuid_arg->entries); 3050 cpuid_arg->entries);
3051 break; 3051 break;
3052 } 3052 }
3053 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
3054 struct kvm_cpuid2 __user *cpuid_arg = argp; 3054 struct kvm_cpuid2 __user *cpuid_arg = argp;
3055 struct kvm_cpuid2 cpuid; 3055 struct kvm_cpuid2 cpuid;
3056 3056
3057 r = -EFAULT; 3057 r = -EFAULT;
3058 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3058 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3059 goto out; 3059 goto out;
3060 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 3060 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3061 cpuid_arg->entries); 3061 cpuid_arg->entries);
3062 if (r) 3062 if (r)
3063 goto out; 3063 goto out;
3064 r = -EFAULT; 3064 r = -EFAULT;
3065 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 3065 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3066 goto out; 3066 goto out;
3067 r = 0; 3067 r = 0;
3068 break; 3068 break;
3069 } 3069 }
3070 case KVM_GET_MSRS: 3070 case KVM_GET_MSRS:
3071 r = msr_io(vcpu, argp, kvm_get_msr, 1); 3071 r = msr_io(vcpu, argp, kvm_get_msr, 1);
3072 break; 3072 break;
3073 case KVM_SET_MSRS: 3073 case KVM_SET_MSRS:
3074 r = msr_io(vcpu, argp, do_set_msr, 0); 3074 r = msr_io(vcpu, argp, do_set_msr, 0);
3075 break; 3075 break;
3076 case KVM_TPR_ACCESS_REPORTING: { 3076 case KVM_TPR_ACCESS_REPORTING: {
3077 struct kvm_tpr_access_ctl tac; 3077 struct kvm_tpr_access_ctl tac;
3078 3078
3079 r = -EFAULT; 3079 r = -EFAULT;
3080 if (copy_from_user(&tac, argp, sizeof tac)) 3080 if (copy_from_user(&tac, argp, sizeof tac))
3081 goto out; 3081 goto out;
3082 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 3082 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3083 if (r) 3083 if (r)
3084 goto out; 3084 goto out;
3085 r = -EFAULT; 3085 r = -EFAULT;
3086 if (copy_to_user(argp, &tac, sizeof tac)) 3086 if (copy_to_user(argp, &tac, sizeof tac))
3087 goto out; 3087 goto out;
3088 r = 0; 3088 r = 0;
3089 break; 3089 break;
3090 }; 3090 };
3091 case KVM_SET_VAPIC_ADDR: { 3091 case KVM_SET_VAPIC_ADDR: {
3092 struct kvm_vapic_addr va; 3092 struct kvm_vapic_addr va;
3093 3093
3094 r = -EINVAL; 3094 r = -EINVAL;
3095 if (!irqchip_in_kernel(vcpu->kvm)) 3095 if (!irqchip_in_kernel(vcpu->kvm))
3096 goto out; 3096 goto out;
3097 r = -EFAULT; 3097 r = -EFAULT;
3098 if (copy_from_user(&va, argp, sizeof va)) 3098 if (copy_from_user(&va, argp, sizeof va))
3099 goto out; 3099 goto out;
3100 r = 0; 3100 r = 0;
3101 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 3101 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3102 break; 3102 break;
3103 } 3103 }
3104 case KVM_X86_SETUP_MCE: { 3104 case KVM_X86_SETUP_MCE: {
3105 u64 mcg_cap; 3105 u64 mcg_cap;
3106 3106
3107 r = -EFAULT; 3107 r = -EFAULT;
3108 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 3108 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3109 goto out; 3109 goto out;
3110 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 3110 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3111 break; 3111 break;
3112 } 3112 }
3113 case KVM_X86_SET_MCE: { 3113 case KVM_X86_SET_MCE: {
3114 struct kvm_x86_mce mce; 3114 struct kvm_x86_mce mce;
3115 3115
3116 r = -EFAULT; 3116 r = -EFAULT;
3117 if (copy_from_user(&mce, argp, sizeof mce)) 3117 if (copy_from_user(&mce, argp, sizeof mce))
3118 goto out; 3118 goto out;
3119 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 3119 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3120 break; 3120 break;
3121 } 3121 }
3122 case KVM_GET_VCPU_EVENTS: { 3122 case KVM_GET_VCPU_EVENTS: {
3123 struct kvm_vcpu_events events; 3123 struct kvm_vcpu_events events;
3124 3124
3125 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 3125 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3126 3126
3127 r = -EFAULT; 3127 r = -EFAULT;
3128 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 3128 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3129 break; 3129 break;
3130 r = 0; 3130 r = 0;
3131 break; 3131 break;
3132 } 3132 }
3133 case KVM_SET_VCPU_EVENTS: { 3133 case KVM_SET_VCPU_EVENTS: {
3134 struct kvm_vcpu_events events; 3134 struct kvm_vcpu_events events;
3135 3135
3136 r = -EFAULT; 3136 r = -EFAULT;
3137 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 3137 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3138 break; 3138 break;
3139 3139
3140 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 3140 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3141 break; 3141 break;
3142 } 3142 }
3143 case KVM_GET_DEBUGREGS: { 3143 case KVM_GET_DEBUGREGS: {
3144 struct kvm_debugregs dbgregs; 3144 struct kvm_debugregs dbgregs;
3145 3145
3146 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 3146 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3147 3147
3148 r = -EFAULT; 3148 r = -EFAULT;
3149 if (copy_to_user(argp, &dbgregs, 3149 if (copy_to_user(argp, &dbgregs,
3150 sizeof(struct kvm_debugregs))) 3150 sizeof(struct kvm_debugregs)))
3151 break; 3151 break;
3152 r = 0; 3152 r = 0;
3153 break; 3153 break;
3154 } 3154 }
3155 case KVM_SET_DEBUGREGS: { 3155 case KVM_SET_DEBUGREGS: {
3156 struct kvm_debugregs dbgregs; 3156 struct kvm_debugregs dbgregs;
3157 3157
3158 r = -EFAULT; 3158 r = -EFAULT;
3159 if (copy_from_user(&dbgregs, argp, 3159 if (copy_from_user(&dbgregs, argp,
3160 sizeof(struct kvm_debugregs))) 3160 sizeof(struct kvm_debugregs)))
3161 break; 3161 break;
3162 3162
3163 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 3163 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3164 break; 3164 break;
3165 } 3165 }
3166 case KVM_GET_XSAVE: { 3166 case KVM_GET_XSAVE: {
3167 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 3167 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3168 r = -ENOMEM; 3168 r = -ENOMEM;
3169 if (!u.xsave) 3169 if (!u.xsave)
3170 break; 3170 break;
3171 3171
3172 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 3172 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3173 3173
3174 r = -EFAULT; 3174 r = -EFAULT;
3175 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 3175 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3176 break; 3176 break;
3177 r = 0; 3177 r = 0;
3178 break; 3178 break;
3179 } 3179 }
3180 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
3181 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
3182 if (IS_ERR(u.xsave)) 3182 if (IS_ERR(u.xsave))
3183 return PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
3184 3184
3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3186 break; 3186 break;
3187 } 3187 }
3188 case KVM_GET_XCRS: { 3188 case KVM_GET_XCRS: {
3189 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 3189 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3190 r = -ENOMEM; 3190 r = -ENOMEM;
3191 if (!u.xcrs) 3191 if (!u.xcrs)
3192 break; 3192 break;
3193 3193
3194 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 3194 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3195 3195
3196 r = -EFAULT; 3196 r = -EFAULT;
3197 if (copy_to_user(argp, u.xcrs, 3197 if (copy_to_user(argp, u.xcrs,
3198 sizeof(struct kvm_xcrs))) 3198 sizeof(struct kvm_xcrs)))
3199 break; 3199 break;
3200 r = 0; 3200 r = 0;
3201 break; 3201 break;
3202 } 3202 }
3203 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3205 if (IS_ERR(u.xcrs)) 3205 if (IS_ERR(u.xcrs))
3206 return PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
3207 3207
3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3209 break; 3209 break;
3210 } 3210 }
3211 case KVM_SET_TSC_KHZ: { 3211 case KVM_SET_TSC_KHZ: {
3212 u32 user_tsc_khz; 3212 u32 user_tsc_khz;
3213 3213
3214 r = -EINVAL; 3214 r = -EINVAL;
3215 user_tsc_khz = (u32)arg; 3215 user_tsc_khz = (u32)arg;
3216 3216
3217 if (user_tsc_khz >= kvm_max_guest_tsc_khz) 3217 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3218 goto out; 3218 goto out;
3219 3219
3220 if (user_tsc_khz == 0) 3220 if (user_tsc_khz == 0)
3221 user_tsc_khz = tsc_khz; 3221 user_tsc_khz = tsc_khz;
3222 3222
3223 kvm_set_tsc_khz(vcpu, user_tsc_khz); 3223 kvm_set_tsc_khz(vcpu, user_tsc_khz);
3224 3224
3225 r = 0; 3225 r = 0;
3226 goto out; 3226 goto out;
3227 } 3227 }
3228 case KVM_GET_TSC_KHZ: { 3228 case KVM_GET_TSC_KHZ: {
3229 r = vcpu->arch.virtual_tsc_khz; 3229 r = vcpu->arch.virtual_tsc_khz;
3230 goto out; 3230 goto out;
3231 } 3231 }
3232 case KVM_KVMCLOCK_CTRL: { 3232 case KVM_KVMCLOCK_CTRL: {
3233 r = kvm_set_guest_paused(vcpu); 3233 r = kvm_set_guest_paused(vcpu);
3234 goto out; 3234 goto out;
3235 } 3235 }
3236 default: 3236 default:
3237 r = -EINVAL; 3237 r = -EINVAL;
3238 } 3238 }
3239 out: 3239 out:
3240 kfree(u.buffer); 3240 kfree(u.buffer);
3241 return r; 3241 return r;
3242 } 3242 }
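
As the KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ cases above show, the TSC frequency is carried directly in the ioctl argument and return value rather than through a struct. A minimal userspace sketch, with assumed fd names and error handling:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int copy_tsc_khz(int src_vcpu_fd, int dst_vcpu_fd)
    {
            int khz = ioctl(src_vcpu_fd, KVM_GET_TSC_KHZ, 0);  /* kHz on success */

            if (khz <= 0)
                    return -1;
            /* Passing 0 would mean "use the host tsc_khz", per the handler above. */
            return ioctl(dst_vcpu_fd, KVM_SET_TSC_KHZ, khz);
    }
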
3243 3243
3244 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 3244 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
3245 { 3245 {
3246 return VM_FAULT_SIGBUS; 3246 return VM_FAULT_SIGBUS;
3247 } 3247 }
3248 3248
3249 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 3249 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3250 { 3250 {
3251 int ret; 3251 int ret;
3252 3252
3253 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
3254 return -EINVAL; 3254 return -EINVAL;
3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3256 return ret; 3256 return ret;
3257 } 3257 }
3258 3258
3259 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 3259 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3260 u64 ident_addr) 3260 u64 ident_addr)
3261 { 3261 {
3262 kvm->arch.ept_identity_map_addr = ident_addr; 3262 kvm->arch.ept_identity_map_addr = ident_addr;
3263 return 0; 3263 return 0;
3264 } 3264 }
3265 3265
3266 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 3266 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3267 u32 kvm_nr_mmu_pages) 3267 u32 kvm_nr_mmu_pages)
3268 { 3268 {
3269 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 3269 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3270 return -EINVAL; 3270 return -EINVAL;
3271 3271
3272 mutex_lock(&kvm->slots_lock); 3272 mutex_lock(&kvm->slots_lock);
3273 spin_lock(&kvm->mmu_lock); 3273 spin_lock(&kvm->mmu_lock);
3274 3274
3275 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3275 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3276 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3276 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3277 3277
3278 spin_unlock(&kvm->mmu_lock); 3278 spin_unlock(&kvm->mmu_lock);
3279 mutex_unlock(&kvm->slots_lock); 3279 mutex_unlock(&kvm->slots_lock);
3280 return 0; 3280 return 0;
3281 } 3281 }
3282 3282
3283 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3283 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3284 { 3284 {
3285 return kvm->arch.n_max_mmu_pages; 3285 return kvm->arch.n_max_mmu_pages;
3286 } 3286 }
3287 3287
3288 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3288 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3289 { 3289 {
3290 int r; 3290 int r;
3291 3291
3292 r = 0; 3292 r = 0;
3293 switch (chip->chip_id) { 3293 switch (chip->chip_id) {
3294 case KVM_IRQCHIP_PIC_MASTER: 3294 case KVM_IRQCHIP_PIC_MASTER:
3295 memcpy(&chip->chip.pic, 3295 memcpy(&chip->chip.pic,
3296 &pic_irqchip(kvm)->pics[0], 3296 &pic_irqchip(kvm)->pics[0],
3297 sizeof(struct kvm_pic_state)); 3297 sizeof(struct kvm_pic_state));
3298 break; 3298 break;
3299 case KVM_IRQCHIP_PIC_SLAVE: 3299 case KVM_IRQCHIP_PIC_SLAVE:
3300 memcpy(&chip->chip.pic, 3300 memcpy(&chip->chip.pic,
3301 &pic_irqchip(kvm)->pics[1], 3301 &pic_irqchip(kvm)->pics[1],
3302 sizeof(struct kvm_pic_state)); 3302 sizeof(struct kvm_pic_state));
3303 break; 3303 break;
3304 case KVM_IRQCHIP_IOAPIC: 3304 case KVM_IRQCHIP_IOAPIC:
3305 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 3305 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3306 break; 3306 break;
3307 default: 3307 default:
3308 r = -EINVAL; 3308 r = -EINVAL;
3309 break; 3309 break;
3310 } 3310 }
3311 return r; 3311 return r;
3312 } 3312 }
3313 3313
3314 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3314 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3315 { 3315 {
3316 int r; 3316 int r;
3317 3317
3318 r = 0; 3318 r = 0;
3319 switch (chip->chip_id) { 3319 switch (chip->chip_id) {
3320 case KVM_IRQCHIP_PIC_MASTER: 3320 case KVM_IRQCHIP_PIC_MASTER:
3321 spin_lock(&pic_irqchip(kvm)->lock); 3321 spin_lock(&pic_irqchip(kvm)->lock);
3322 memcpy(&pic_irqchip(kvm)->pics[0], 3322 memcpy(&pic_irqchip(kvm)->pics[0],
3323 &chip->chip.pic, 3323 &chip->chip.pic,
3324 sizeof(struct kvm_pic_state)); 3324 sizeof(struct kvm_pic_state));
3325 spin_unlock(&pic_irqchip(kvm)->lock); 3325 spin_unlock(&pic_irqchip(kvm)->lock);
3326 break; 3326 break;
3327 case KVM_IRQCHIP_PIC_SLAVE: 3327 case KVM_IRQCHIP_PIC_SLAVE:
3328 spin_lock(&pic_irqchip(kvm)->lock); 3328 spin_lock(&pic_irqchip(kvm)->lock);
3329 memcpy(&pic_irqchip(kvm)->pics[1], 3329 memcpy(&pic_irqchip(kvm)->pics[1],
3330 &chip->chip.pic, 3330 &chip->chip.pic,
3331 sizeof(struct kvm_pic_state)); 3331 sizeof(struct kvm_pic_state));
3332 spin_unlock(&pic_irqchip(kvm)->lock); 3332 spin_unlock(&pic_irqchip(kvm)->lock);
3333 break; 3333 break;
3334 case KVM_IRQCHIP_IOAPIC: 3334 case KVM_IRQCHIP_IOAPIC:
3335 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3335 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3336 break; 3336 break;
3337 default: 3337 default:
3338 r = -EINVAL; 3338 r = -EINVAL;
3339 break; 3339 break;
3340 } 3340 }
3341 kvm_pic_update_irq(pic_irqchip(kvm)); 3341 kvm_pic_update_irq(pic_irqchip(kvm));
3342 return r; 3342 return r;
3343 } 3343 }
3344 3344
3345 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3345 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3346 { 3346 {
3347 int r = 0; 3347 int r = 0;
3348 3348
3349 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3349 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3350 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 3350 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3351 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3351 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3352 return r; 3352 return r;
3353 } 3353 }
3354 3354
3355 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3355 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3356 { 3356 {
3357 int r = 0; 3357 int r = 0;
3358 3358
3359 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3359 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3360 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 3360 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3361 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 3361 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3362 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3362 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3363 return r; 3363 return r;
3364 } 3364 }
3365 3365
3366 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3366 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3367 { 3367 {
3368 int r = 0; 3368 int r = 0;
3369 3369
3370 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3370 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3371 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 3371 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3372 sizeof(ps->channels)); 3372 sizeof(ps->channels));
3373 ps->flags = kvm->arch.vpit->pit_state.flags; 3373 ps->flags = kvm->arch.vpit->pit_state.flags;
3374 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3374 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3375 memset(&ps->reserved, 0, sizeof(ps->reserved)); 3375 memset(&ps->reserved, 0, sizeof(ps->reserved));
3376 return r; 3376 return r;
3377 } 3377 }
3378 3378
3379 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3379 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3380 { 3380 {
3381 int r = 0, start = 0; 3381 int r = 0, start = 0;
3382 u32 prev_legacy, cur_legacy; 3382 u32 prev_legacy, cur_legacy;
3383 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3383 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3384 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 3384 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
3385 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 3385 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3386 if (!prev_legacy && cur_legacy) 3386 if (!prev_legacy && cur_legacy)
3387 start = 1; 3387 start = 1;
3388 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 3388 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3389 sizeof(kvm->arch.vpit->pit_state.channels)); 3389 sizeof(kvm->arch.vpit->pit_state.channels));
3390 kvm->arch.vpit->pit_state.flags = ps->flags; 3390 kvm->arch.vpit->pit_state.flags = ps->flags;
3391 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 3391 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3392 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3392 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3393 return r; 3393 return r;
3394 } 3394 }
3395 3395
3396 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 3396 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3397 struct kvm_reinject_control *control) 3397 struct kvm_reinject_control *control)
3398 { 3398 {
3399 if (!kvm->arch.vpit) 3399 if (!kvm->arch.vpit)
3400 return -ENXIO; 3400 return -ENXIO;
3401 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3401 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3402 kvm->arch.vpit->pit_state.reinject = control->pit_reinject; 3402 kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3403 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3403 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3404 return 0; 3404 return 0;
3405 } 3405 }
3406 3406
3407 /** 3407 /**
3408 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 3408 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3409 * @kvm: kvm instance 3409 * @kvm: kvm instance
3410 * @log: slot id and address to which we copy the log 3410 * @log: slot id and address to which we copy the log
3411 * 3411 *
3412 * We need to keep it in mind that VCPU threads can write to the bitmap 3412 * We need to keep it in mind that VCPU threads can write to the bitmap
3413 * concurrently. So, to avoid losing data, we keep the following order for 3413 * concurrently. So, to avoid losing data, we keep the following order for
3414 * each bit: 3414 * each bit:
3415 * 3415 *
3416 * 1. Take a snapshot of the bit and clear it if needed. 3416 * 1. Take a snapshot of the bit and clear it if needed.
3417 * 2. Write protect the corresponding page. 3417 * 2. Write protect the corresponding page.
3418 * 3. Flush TLB's if needed. 3418 * 3. Flush TLB's if needed.
3419 * 4. Copy the snapshot to the userspace. 3419 * 4. Copy the snapshot to the userspace.
3420 * 3420 *
3421 * Between 2 and 3, the guest may write to the page using the remaining TLB 3421 * Between 2 and 3, the guest may write to the page using the remaining TLB
3422 * entry. This is not a problem because the page will be reported dirty at 3422 * entry. This is not a problem because the page will be reported dirty at
3423 * step 4 using the snapshot taken before and step 3 ensures that successive 3423 * step 4 using the snapshot taken before and step 3 ensures that successive
3424 * writes will be logged for the next call. 3424 * writes will be logged for the next call.
3425 */ 3425 */
3426 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 3426 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3427 { 3427 {
3428 int r; 3428 int r;
3429 struct kvm_memory_slot *memslot; 3429 struct kvm_memory_slot *memslot;
3430 unsigned long n, i; 3430 unsigned long n, i;
3431 unsigned long *dirty_bitmap; 3431 unsigned long *dirty_bitmap;
3432 unsigned long *dirty_bitmap_buffer; 3432 unsigned long *dirty_bitmap_buffer;
3433 bool is_dirty = false; 3433 bool is_dirty = false;
3434 3434
3435 mutex_lock(&kvm->slots_lock); 3435 mutex_lock(&kvm->slots_lock);
3436 3436
3437 r = -EINVAL; 3437 r = -EINVAL;
3438 if (log->slot >= KVM_USER_MEM_SLOTS) 3438 if (log->slot >= KVM_USER_MEM_SLOTS)
3439 goto out; 3439 goto out;
3440 3440
3441 memslot = id_to_memslot(kvm->memslots, log->slot); 3441 memslot = id_to_memslot(kvm->memslots, log->slot);
3442 3442
3443 dirty_bitmap = memslot->dirty_bitmap; 3443 dirty_bitmap = memslot->dirty_bitmap;
3444 r = -ENOENT; 3444 r = -ENOENT;
3445 if (!dirty_bitmap) 3445 if (!dirty_bitmap)
3446 goto out; 3446 goto out;
3447 3447
3448 n = kvm_dirty_bitmap_bytes(memslot); 3448 n = kvm_dirty_bitmap_bytes(memslot);
3449 3449
3450 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 3450 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3451 memset(dirty_bitmap_buffer, 0, n); 3451 memset(dirty_bitmap_buffer, 0, n);
3452 3452
3453 spin_lock(&kvm->mmu_lock); 3453 spin_lock(&kvm->mmu_lock);
3454 3454
3455 for (i = 0; i < n / sizeof(long); i++) { 3455 for (i = 0; i < n / sizeof(long); i++) {
3456 unsigned long mask; 3456 unsigned long mask;
3457 gfn_t offset; 3457 gfn_t offset;
3458 3458
3459 if (!dirty_bitmap[i]) 3459 if (!dirty_bitmap[i])
3460 continue; 3460 continue;
3461 3461
3462 is_dirty = true; 3462 is_dirty = true;
3463 3463
3464 mask = xchg(&dirty_bitmap[i], 0); 3464 mask = xchg(&dirty_bitmap[i], 0);
3465 dirty_bitmap_buffer[i] = mask; 3465 dirty_bitmap_buffer[i] = mask;
3466 3466
3467 offset = i * BITS_PER_LONG; 3467 offset = i * BITS_PER_LONG;
3468 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); 3468 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3469 } 3469 }
3470 if (is_dirty) 3470 if (is_dirty)
3471 kvm_flush_remote_tlbs(kvm); 3471 kvm_flush_remote_tlbs(kvm);
3472 3472
3473 spin_unlock(&kvm->mmu_lock); 3473 spin_unlock(&kvm->mmu_lock);
3474 3474
3475 r = -EFAULT; 3475 r = -EFAULT;
3476 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 3476 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3477 goto out; 3477 goto out;
3478 3478
3479 r = 0; 3479 r = 0;
3480 out: 3480 out:
3481 mutex_unlock(&kvm->slots_lock); 3481 mutex_unlock(&kvm->slots_lock);
3482 return r; 3482 return r;
3483 } 3483 }
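
On the userspace side, KVM_GET_DIRTY_LOG takes a struct kvm_dirty_log naming the slot plus a buffer large enough for one bit per page of that slot. A minimal sketch of fetching and walking one round of the bitmap produced by the handler above, assuming a 64-bit host and a caller that already knows the slot's page count (fd and helper names are illustrative):

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static long count_dirty_pages(int vm_fd, __u32 slot, __u64 npages)
    {
            /* Buffer size mirrors kvm_dirty_bitmap_bytes(): one bit per page,
             * rounded up to a multiple of sizeof(long) (64-bit host assumed). */
            size_t bytes = ((npages + 63) / 64) * 8;
            unsigned long *bitmap = calloc(1, bytes);
            struct kvm_dirty_log log = { .slot = slot };
            long dirty = 0;
            __u64 i;

            if (!bitmap)
                    return -1;
            log.dirty_bitmap = bitmap;

            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                    free(bitmap);
                    return -1;
            }
            for (i = 0; i < npages; i++)
                    if (bitmap[i / 64] & (1UL << (i % 64)))
                            dirty++;   /* page i was written since the last call */
            free(bitmap);
            return dirty;
    }
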
3484 3484
3485 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) 3485 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
3486 { 3486 {
3487 if (!irqchip_in_kernel(kvm)) 3487 if (!irqchip_in_kernel(kvm))
3488 return -ENXIO; 3488 return -ENXIO;
3489 3489
3490 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 3490 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3491 irq_event->irq, irq_event->level); 3491 irq_event->irq, irq_event->level);
3492 return 0; 3492 return 0;
3493 } 3493 }
3494 3494
3495 long kvm_arch_vm_ioctl(struct file *filp, 3495 long kvm_arch_vm_ioctl(struct file *filp,
3496 unsigned int ioctl, unsigned long arg) 3496 unsigned int ioctl, unsigned long arg)
3497 { 3497 {
3498 struct kvm *kvm = filp->private_data; 3498 struct kvm *kvm = filp->private_data;
3499 void __user *argp = (void __user *)arg; 3499 void __user *argp = (void __user *)arg;
3500 int r = -ENOTTY; 3500 int r = -ENOTTY;
3501 /* 3501 /*
3502 * This union makes it completely explicit to gcc-3.x 3502 * This union makes it completely explicit to gcc-3.x
3503 * that these two variables' stack usage should be 3503 * that these two variables' stack usage should be
3504 * combined, not added together. 3504 * combined, not added together.
3505 */ 3505 */
3506 union { 3506 union {
3507 struct kvm_pit_state ps; 3507 struct kvm_pit_state ps;
3508 struct kvm_pit_state2 ps2; 3508 struct kvm_pit_state2 ps2;
3509 struct kvm_pit_config pit_config; 3509 struct kvm_pit_config pit_config;
3510 } u; 3510 } u;
3511 3511
3512 switch (ioctl) { 3512 switch (ioctl) {
3513 case KVM_SET_TSS_ADDR: 3513 case KVM_SET_TSS_ADDR:
3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3515 break; 3515 break;
3516 case KVM_SET_IDENTITY_MAP_ADDR: { 3516 case KVM_SET_IDENTITY_MAP_ADDR: {
3517 u64 ident_addr; 3517 u64 ident_addr;
3518 3518
3519 r = -EFAULT; 3519 r = -EFAULT;
3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3521 goto out; 3521 goto out;
3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3523 break; 3523 break;
3524 } 3524 }
3525 case KVM_SET_NR_MMU_PAGES: 3525 case KVM_SET_NR_MMU_PAGES:
3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3527 break; 3527 break;
3528 case KVM_GET_NR_MMU_PAGES: 3528 case KVM_GET_NR_MMU_PAGES:
3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3530 break; 3530 break;
3531 case KVM_CREATE_IRQCHIP: { 3531 case KVM_CREATE_IRQCHIP: {
3532 struct kvm_pic *vpic; 3532 struct kvm_pic *vpic;
3533 3533
3534 mutex_lock(&kvm->lock); 3534 mutex_lock(&kvm->lock);
3535 r = -EEXIST; 3535 r = -EEXIST;
3536 if (kvm->arch.vpic) 3536 if (kvm->arch.vpic)
3537 goto create_irqchip_unlock; 3537 goto create_irqchip_unlock;
3538 r = -EINVAL; 3538 r = -EINVAL;
3539 if (atomic_read(&kvm->online_vcpus)) 3539 if (atomic_read(&kvm->online_vcpus))
3540 goto create_irqchip_unlock; 3540 goto create_irqchip_unlock;
3541 r = -ENOMEM; 3541 r = -ENOMEM;
3542 vpic = kvm_create_pic(kvm); 3542 vpic = kvm_create_pic(kvm);
3543 if (vpic) { 3543 if (vpic) {
3544 r = kvm_ioapic_init(kvm); 3544 r = kvm_ioapic_init(kvm);
3545 if (r) { 3545 if (r) {
3546 mutex_lock(&kvm->slots_lock); 3546 mutex_lock(&kvm->slots_lock);
3547 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3547 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3548 &vpic->dev_master); 3548 &vpic->dev_master);
3549 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3549 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3550 &vpic->dev_slave); 3550 &vpic->dev_slave);
3551 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3551 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3552 &vpic->dev_eclr); 3552 &vpic->dev_eclr);
3553 mutex_unlock(&kvm->slots_lock); 3553 mutex_unlock(&kvm->slots_lock);
3554 kfree(vpic); 3554 kfree(vpic);
3555 goto create_irqchip_unlock; 3555 goto create_irqchip_unlock;
3556 } 3556 }
3557 } else 3557 } else
3558 goto create_irqchip_unlock; 3558 goto create_irqchip_unlock;
3559 smp_wmb(); 3559 smp_wmb();
3560 kvm->arch.vpic = vpic; 3560 kvm->arch.vpic = vpic;
3561 smp_wmb(); 3561 smp_wmb();
3562 r = kvm_setup_default_irq_routing(kvm); 3562 r = kvm_setup_default_irq_routing(kvm);
3563 if (r) { 3563 if (r) {
3564 mutex_lock(&kvm->slots_lock); 3564 mutex_lock(&kvm->slots_lock);
3565 mutex_lock(&kvm->irq_lock); 3565 mutex_lock(&kvm->irq_lock);
3566 kvm_ioapic_destroy(kvm); 3566 kvm_ioapic_destroy(kvm);
3567 kvm_destroy_pic(kvm); 3567 kvm_destroy_pic(kvm);
3568 mutex_unlock(&kvm->irq_lock); 3568 mutex_unlock(&kvm->irq_lock);
3569 mutex_unlock(&kvm->slots_lock); 3569 mutex_unlock(&kvm->slots_lock);
3570 } 3570 }
3571 create_irqchip_unlock: 3571 create_irqchip_unlock:
3572 mutex_unlock(&kvm->lock); 3572 mutex_unlock(&kvm->lock);
3573 break; 3573 break;
3574 } 3574 }
3575 case KVM_CREATE_PIT: 3575 case KVM_CREATE_PIT:
3576 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 3576 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3577 goto create_pit; 3577 goto create_pit;
3578 case KVM_CREATE_PIT2: 3578 case KVM_CREATE_PIT2:
3579 r = -EFAULT; 3579 r = -EFAULT;
3580 if (copy_from_user(&u.pit_config, argp, 3580 if (copy_from_user(&u.pit_config, argp,
3581 sizeof(struct kvm_pit_config))) 3581 sizeof(struct kvm_pit_config)))
3582 goto out; 3582 goto out;
3583 create_pit: 3583 create_pit:
3584 mutex_lock(&kvm->slots_lock); 3584 mutex_lock(&kvm->slots_lock);
3585 r = -EEXIST; 3585 r = -EEXIST;
3586 if (kvm->arch.vpit) 3586 if (kvm->arch.vpit)
3587 goto create_pit_unlock; 3587 goto create_pit_unlock;
3588 r = -ENOMEM; 3588 r = -ENOMEM;
3589 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 3589 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3590 if (kvm->arch.vpit) 3590 if (kvm->arch.vpit)
3591 r = 0; 3591 r = 0;
3592 create_pit_unlock: 3592 create_pit_unlock:
3593 mutex_unlock(&kvm->slots_lock); 3593 mutex_unlock(&kvm->slots_lock);
3594 break; 3594 break;
3595 case KVM_GET_IRQCHIP: { 3595 case KVM_GET_IRQCHIP: {
3596 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3596 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3597 struct kvm_irqchip *chip; 3597 struct kvm_irqchip *chip;
3598 3598
3599 chip = memdup_user(argp, sizeof(*chip)); 3599 chip = memdup_user(argp, sizeof(*chip));
3600 if (IS_ERR(chip)) { 3600 if (IS_ERR(chip)) {
3601 r = PTR_ERR(chip); 3601 r = PTR_ERR(chip);
3602 goto out; 3602 goto out;
3603 } 3603 }
3604 3604
3605 r = -ENXIO; 3605 r = -ENXIO;
3606 if (!irqchip_in_kernel(kvm)) 3606 if (!irqchip_in_kernel(kvm))
3607 goto get_irqchip_out; 3607 goto get_irqchip_out;
3608 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3608 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3609 if (r) 3609 if (r)
3610 goto get_irqchip_out; 3610 goto get_irqchip_out;
3611 r = -EFAULT; 3611 r = -EFAULT;
3612 if (copy_to_user(argp, chip, sizeof *chip)) 3612 if (copy_to_user(argp, chip, sizeof *chip))
3613 goto get_irqchip_out; 3613 goto get_irqchip_out;
3614 r = 0; 3614 r = 0;
3615 get_irqchip_out: 3615 get_irqchip_out:
3616 kfree(chip); 3616 kfree(chip);
3617 break; 3617 break;
3618 } 3618 }
3619 case KVM_SET_IRQCHIP: { 3619 case KVM_SET_IRQCHIP: {
3620 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3620 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3621 struct kvm_irqchip *chip; 3621 struct kvm_irqchip *chip;
3622 3622
3623 chip = memdup_user(argp, sizeof(*chip)); 3623 chip = memdup_user(argp, sizeof(*chip));
3624 if (IS_ERR(chip)) { 3624 if (IS_ERR(chip)) {
3625 r = PTR_ERR(chip); 3625 r = PTR_ERR(chip);
3626 goto out; 3626 goto out;
3627 } 3627 }
3628 3628
3629 r = -ENXIO; 3629 r = -ENXIO;
3630 if (!irqchip_in_kernel(kvm)) 3630 if (!irqchip_in_kernel(kvm))
3631 goto set_irqchip_out; 3631 goto set_irqchip_out;
3632 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3632 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3633 if (r) 3633 if (r)
3634 goto set_irqchip_out; 3634 goto set_irqchip_out;
3635 r = 0; 3635 r = 0;
3636 set_irqchip_out: 3636 set_irqchip_out:
3637 kfree(chip); 3637 kfree(chip);
3638 break; 3638 break;
3639 } 3639 }
3640 case KVM_GET_PIT: { 3640 case KVM_GET_PIT: {
3641 r = -EFAULT; 3641 r = -EFAULT;
3642 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 3642 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3643 goto out; 3643 goto out;
3644 r = -ENXIO; 3644 r = -ENXIO;
3645 if (!kvm->arch.vpit) 3645 if (!kvm->arch.vpit)
3646 goto out; 3646 goto out;
3647 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 3647 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3648 if (r) 3648 if (r)
3649 goto out; 3649 goto out;
3650 r = -EFAULT; 3650 r = -EFAULT;
3651 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 3651 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3652 goto out; 3652 goto out;
3653 r = 0; 3653 r = 0;
3654 break; 3654 break;
3655 } 3655 }
3656 case KVM_SET_PIT: { 3656 case KVM_SET_PIT: {
3657 r = -EFAULT; 3657 r = -EFAULT;
3658 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 3658 if (copy_from_user(&u.ps, argp, sizeof u.ps))
3659 goto out; 3659 goto out;
3660 r = -ENXIO; 3660 r = -ENXIO;
3661 if (!kvm->arch.vpit) 3661 if (!kvm->arch.vpit)
3662 goto out; 3662 goto out;
3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3664 break; 3664 break;
3665 } 3665 }
3666 case KVM_GET_PIT2: { 3666 case KVM_GET_PIT2: {
3667 r = -ENXIO; 3667 r = -ENXIO;
3668 if (!kvm->arch.vpit) 3668 if (!kvm->arch.vpit)
3669 goto out; 3669 goto out;
3670 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 3670 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3671 if (r) 3671 if (r)
3672 goto out; 3672 goto out;
3673 r = -EFAULT; 3673 r = -EFAULT;
3674 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 3674 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3675 goto out; 3675 goto out;
3676 r = 0; 3676 r = 0;
3677 break; 3677 break;
3678 } 3678 }
3679 case KVM_SET_PIT2: { 3679 case KVM_SET_PIT2: {
3680 r = -EFAULT; 3680 r = -EFAULT;
3681 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 3681 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3682 goto out; 3682 goto out;
3683 r = -ENXIO; 3683 r = -ENXIO;
3684 if (!kvm->arch.vpit) 3684 if (!kvm->arch.vpit)
3685 goto out; 3685 goto out;
3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3687 break; 3687 break;
3688 } 3688 }
3689 case KVM_REINJECT_CONTROL: { 3689 case KVM_REINJECT_CONTROL: {
3690 struct kvm_reinject_control control; 3690 struct kvm_reinject_control control;
3691 r = -EFAULT; 3691 r = -EFAULT;
3692 if (copy_from_user(&control, argp, sizeof(control))) 3692 if (copy_from_user(&control, argp, sizeof(control)))
3693 goto out; 3693 goto out;
3694 r = kvm_vm_ioctl_reinject(kvm, &control); 3694 r = kvm_vm_ioctl_reinject(kvm, &control);
3695 break; 3695 break;
3696 } 3696 }
3697 case KVM_XEN_HVM_CONFIG: { 3697 case KVM_XEN_HVM_CONFIG: {
3698 r = -EFAULT; 3698 r = -EFAULT;
3699 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 3699 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3700 sizeof(struct kvm_xen_hvm_config))) 3700 sizeof(struct kvm_xen_hvm_config)))
3701 goto out; 3701 goto out;
3702 r = -EINVAL; 3702 r = -EINVAL;
3703 if (kvm->arch.xen_hvm_config.flags) 3703 if (kvm->arch.xen_hvm_config.flags)
3704 goto out; 3704 goto out;
3705 r = 0; 3705 r = 0;
3706 break; 3706 break;
3707 } 3707 }
3708 case KVM_SET_CLOCK: { 3708 case KVM_SET_CLOCK: {
3709 struct kvm_clock_data user_ns; 3709 struct kvm_clock_data user_ns;
3710 u64 now_ns; 3710 u64 now_ns;
3711 s64 delta; 3711 s64 delta;
3712 3712
3713 r = -EFAULT; 3713 r = -EFAULT;
3714 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 3714 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3715 goto out; 3715 goto out;
3716 3716
3717 r = -EINVAL; 3717 r = -EINVAL;
3718 if (user_ns.flags) 3718 if (user_ns.flags)
3719 goto out; 3719 goto out;
3720 3720
3721 r = 0; 3721 r = 0;
3722 local_irq_disable(); 3722 local_irq_disable();
3723 now_ns = get_kernel_ns(); 3723 now_ns = get_kernel_ns();
3724 delta = user_ns.clock - now_ns; 3724 delta = user_ns.clock - now_ns;
3725 local_irq_enable(); 3725 local_irq_enable();
3726 kvm->arch.kvmclock_offset = delta; 3726 kvm->arch.kvmclock_offset = delta;
3727 break; 3727 break;
3728 } 3728 }
3729 case KVM_GET_CLOCK: { 3729 case KVM_GET_CLOCK: {
3730 struct kvm_clock_data user_ns; 3730 struct kvm_clock_data user_ns;
3731 u64 now_ns; 3731 u64 now_ns;
3732 3732
3733 local_irq_disable(); 3733 local_irq_disable();
3734 now_ns = get_kernel_ns(); 3734 now_ns = get_kernel_ns();
3735 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3735 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3736 local_irq_enable(); 3736 local_irq_enable();
3737 user_ns.flags = 0; 3737 user_ns.flags = 0;
3738 memset(&user_ns.pad, 0, sizeof(user_ns.pad)); 3738 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3739 3739
3740 r = -EFAULT; 3740 r = -EFAULT;
3741 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3741 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3742 goto out; 3742 goto out;
3743 r = 0; 3743 r = 0;
3744 break; 3744 break;
3745 } 3745 }
3746 3746
3747 default: 3747 default:
3748 ; 3748 ;
3749 } 3749 }
3750 out: 3750 out:
3751 return r; 3751 return r;
3752 } 3752 }
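
The KVM_GET_CLOCK/KVM_SET_CLOCK cases above exchange a struct kvm_clock_data whose flags field must be zero. A minimal save/restore sketch, as a VMM might use around migration or suspend; fd names are assumptions, not part of this commit:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int save_clock(int vm_fd, struct kvm_clock_data *data)
    {
            return ioctl(vm_fd, KVM_GET_CLOCK, data);   /* fills .clock, .flags = 0 */
    }

    static int restore_clock(int vm_fd, const struct kvm_clock_data *data)
    {
            /* The handler above rejects any non-zero flags with -EINVAL. */
            return ioctl(vm_fd, KVM_SET_CLOCK, data);
    }
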
3753 3753
3754 static void kvm_init_msr_list(void) 3754 static void kvm_init_msr_list(void)
3755 { 3755 {
3756 u32 dummy[2]; 3756 u32 dummy[2];
3757 unsigned i, j; 3757 unsigned i, j;
3758 3758
3759 /* skip the first msrs in the list. KVM-specific */ 3759 /* skip the first msrs in the list. KVM-specific */
3760 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 3760 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3761 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 3761 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3762 continue; 3762 continue;
3763 if (j < i) 3763 if (j < i)
3764 msrs_to_save[j] = msrs_to_save[i]; 3764 msrs_to_save[j] = msrs_to_save[i];
3765 j++; 3765 j++;
3766 } 3766 }
3767 num_msrs_to_save = j; 3767 num_msrs_to_save = j;
3768 } 3768 }
3769 3769
3770 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3770 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3771 const void *v) 3771 const void *v)
3772 { 3772 {
3773 int handled = 0; 3773 int handled = 0;
3774 int n; 3774 int n;
3775 3775
3776 do { 3776 do {
3777 n = min(len, 8); 3777 n = min(len, 8);
3778 if (!(vcpu->arch.apic && 3778 if (!(vcpu->arch.apic &&
3779 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) 3779 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3780 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 3780 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3781 break; 3781 break;
3782 handled += n; 3782 handled += n;
3783 addr += n; 3783 addr += n;
3784 len -= n; 3784 len -= n;
3785 v += n; 3785 v += n;
3786 } while (len); 3786 } while (len);
3787 3787
3788 return handled; 3788 return handled;
3789 } 3789 }
3790 3790
3791 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3791 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3792 { 3792 {
3793 int handled = 0; 3793 int handled = 0;
3794 int n; 3794 int n;
3795 3795
3796 do { 3796 do {
3797 n = min(len, 8); 3797 n = min(len, 8);
3798 if (!(vcpu->arch.apic && 3798 if (!(vcpu->arch.apic &&
3799 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) 3799 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3800 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 3800 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3801 break; 3801 break;
3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); 3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3803 handled += n; 3803 handled += n;
3804 addr += n; 3804 addr += n;
3805 len -= n; 3805 len -= n;
3806 v += n; 3806 v += n;
3807 } while (len); 3807 } while (len);
3808 3808
3809 return handled; 3809 return handled;
3810 } 3810 }
3811 3811
3812 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3812 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3813 struct kvm_segment *var, int seg) 3813 struct kvm_segment *var, int seg)
3814 { 3814 {
3815 kvm_x86_ops->set_segment(vcpu, var, seg); 3815 kvm_x86_ops->set_segment(vcpu, var, seg);
3816 } 3816 }
3817 3817
3818 void kvm_get_segment(struct kvm_vcpu *vcpu, 3818 void kvm_get_segment(struct kvm_vcpu *vcpu,
3819 struct kvm_segment *var, int seg) 3819 struct kvm_segment *var, int seg)
3820 { 3820 {
3821 kvm_x86_ops->get_segment(vcpu, var, seg); 3821 kvm_x86_ops->get_segment(vcpu, var, seg);
3822 } 3822 }
3823 3823
3824 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3824 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3825 { 3825 {
3826 gpa_t t_gpa; 3826 gpa_t t_gpa;
3827 struct x86_exception exception; 3827 struct x86_exception exception;
3828 3828
3829 BUG_ON(!mmu_is_nested(vcpu)); 3829 BUG_ON(!mmu_is_nested(vcpu));
3830 3830
3831 /* NPT walks are always user-walks */ 3831 /* NPT walks are always user-walks */
3832 access |= PFERR_USER_MASK; 3832 access |= PFERR_USER_MASK;
3833 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); 3833 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3834 3834
3835 return t_gpa; 3835 return t_gpa;
3836 } 3836 }
3837 3837
3838 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 3838 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3839 struct x86_exception *exception) 3839 struct x86_exception *exception)
3840 { 3840 {
3841 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3841 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3842 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3842 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3843 } 3843 }
3844 3844
3845 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 3845 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3846 struct x86_exception *exception) 3846 struct x86_exception *exception)
3847 { 3847 {
3848 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3848 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3849 access |= PFERR_FETCH_MASK; 3849 access |= PFERR_FETCH_MASK;
3850 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3850 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3851 } 3851 }
3852 3852
3853 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, 3853 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3854 struct x86_exception *exception) 3854 struct x86_exception *exception)
3855 { 3855 {
3856 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3856 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3857 access |= PFERR_WRITE_MASK; 3857 access |= PFERR_WRITE_MASK;
3858 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3858 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3859 } 3859 }
3860 3860
3861 /* uses this to access any guest's mapped memory without checking CPL */ 3861 /* uses this to access any guest's mapped memory without checking CPL */
3862 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, 3862 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3863 struct x86_exception *exception) 3863 struct x86_exception *exception)
3864 { 3864 {
3865 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); 3865 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3866 } 3866 }
3867 3867
3868 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3868 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3869 struct kvm_vcpu *vcpu, u32 access, 3869 struct kvm_vcpu *vcpu, u32 access,
3870 struct x86_exception *exception) 3870 struct x86_exception *exception)
3871 { 3871 {
3872 void *data = val; 3872 void *data = val;
3873 int r = X86EMUL_CONTINUE; 3873 int r = X86EMUL_CONTINUE;
3874 3874
3875 while (bytes) { 3875 while (bytes) {
3876 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3876 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3877 exception); 3877 exception);
3878 unsigned offset = addr & (PAGE_SIZE-1); 3878 unsigned offset = addr & (PAGE_SIZE-1);
3879 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3879 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3880 int ret; 3880 int ret;
3881 3881
3882 if (gpa == UNMAPPED_GVA) 3882 if (gpa == UNMAPPED_GVA)
3883 return X86EMUL_PROPAGATE_FAULT; 3883 return X86EMUL_PROPAGATE_FAULT;
3884 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3884 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3885 if (ret < 0) { 3885 if (ret < 0) {
3886 r = X86EMUL_IO_NEEDED; 3886 r = X86EMUL_IO_NEEDED;
3887 goto out; 3887 goto out;
3888 } 3888 }
3889 3889
3890 bytes -= toread; 3890 bytes -= toread;
3891 data += toread; 3891 data += toread;
3892 addr += toread; 3892 addr += toread;
3893 } 3893 }
3894 out: 3894 out:
3895 return r; 3895 return r;
3896 } 3896 }
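
The loop above (and its twin in kvm_write_guest_virt_system below) translates and copies at most one page per iteration, splitting an access at page boundaries so each chunk gets its own gva_to_gpa walk. A standalone sketch of just that chunking arithmetic, with illustrative values:

    #include <stdio.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
            unsigned long addr = 0x1ff8;   /* read crosses a page boundary */
            unsigned int bytes = 16, chunk;

            while (bytes) {
                    unsigned int offset = addr & (PAGE_SIZE - 1);

                    chunk = bytes < PAGE_SIZE - offset ? bytes : PAGE_SIZE - offset;
                    printf("translate gva 0x%lx, copy %u bytes\n", addr, chunk);
                    bytes -= chunk;
                    addr += chunk;
            }
            return 0;   /* prints one 8-byte chunk per page */
    }
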
3897 3897
3898 /* used for instruction fetching */ 3898 /* used for instruction fetching */
3899 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, 3899 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3900 gva_t addr, void *val, unsigned int bytes, 3900 gva_t addr, void *val, unsigned int bytes,
3901 struct x86_exception *exception) 3901 struct x86_exception *exception)
3902 { 3902 {
3903 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3903 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3904 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3904 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3905 3905
3906 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3906 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3907 access | PFERR_FETCH_MASK, 3907 access | PFERR_FETCH_MASK,
3908 exception); 3908 exception);
3909 } 3909 }
3910 3910
3911 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 3911 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3912 gva_t addr, void *val, unsigned int bytes, 3912 gva_t addr, void *val, unsigned int bytes,
3913 struct x86_exception *exception) 3913 struct x86_exception *exception)
3914 { 3914 {
3915 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3915 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3916 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3916 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3917 3917
3918 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3918 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3919 exception); 3919 exception);
3920 } 3920 }
3921 EXPORT_SYMBOL_GPL(kvm_read_guest_virt); 3921 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3922 3922
3923 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3923 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3924 gva_t addr, void *val, unsigned int bytes, 3924 gva_t addr, void *val, unsigned int bytes,
3925 struct x86_exception *exception) 3925 struct x86_exception *exception)
3926 { 3926 {
3927 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3927 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3928 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3928 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3929 } 3929 }
3930 3930
3931 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3931 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3932 gva_t addr, void *val, 3932 gva_t addr, void *val,
3933 unsigned int bytes, 3933 unsigned int bytes,
3934 struct x86_exception *exception) 3934 struct x86_exception *exception)
3935 { 3935 {
3936 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3936 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3937 void *data = val; 3937 void *data = val;
3938 int r = X86EMUL_CONTINUE; 3938 int r = X86EMUL_CONTINUE;
3939 3939
3940 while (bytes) { 3940 while (bytes) {
3941 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3941 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3942 PFERR_WRITE_MASK, 3942 PFERR_WRITE_MASK,
3943 exception); 3943 exception);
3944 unsigned offset = addr & (PAGE_SIZE-1); 3944 unsigned offset = addr & (PAGE_SIZE-1);
3945 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3945 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3946 int ret; 3946 int ret;
3947 3947
3948 if (gpa == UNMAPPED_GVA) 3948 if (gpa == UNMAPPED_GVA)
3949 return X86EMUL_PROPAGATE_FAULT; 3949 return X86EMUL_PROPAGATE_FAULT;
3950 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3950 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3951 if (ret < 0) { 3951 if (ret < 0) {
3952 r = X86EMUL_IO_NEEDED; 3952 r = X86EMUL_IO_NEEDED;
3953 goto out; 3953 goto out;
3954 } 3954 }
3955 3955
3956 bytes -= towrite; 3956 bytes -= towrite;
3957 data += towrite; 3957 data += towrite;
3958 addr += towrite; 3958 addr += towrite;
3959 } 3959 }
3960 out: 3960 out:
3961 return r; 3961 return r;
3962 } 3962 }
3963 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); 3963 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
3964 3964
3965 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 3965 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
3966 gpa_t *gpa, struct x86_exception *exception, 3966 gpa_t *gpa, struct x86_exception *exception,
3967 bool write) 3967 bool write)
3968 { 3968 {
3969 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) 3969 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
3970 | (write ? PFERR_WRITE_MASK : 0); 3970 | (write ? PFERR_WRITE_MASK : 0);
3971 3971
3972 if (vcpu_match_mmio_gva(vcpu, gva) 3972 if (vcpu_match_mmio_gva(vcpu, gva)
3973 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { 3973 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
3974 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 3974 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
3975 (gva & (PAGE_SIZE - 1)); 3975 (gva & (PAGE_SIZE - 1));
3976 trace_vcpu_match_mmio(gva, *gpa, write, false); 3976 trace_vcpu_match_mmio(gva, *gpa, write, false);
3977 return 1; 3977 return 1;
3978 } 3978 }
3979 3979
3980 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3980 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3981 3981
3982 if (*gpa == UNMAPPED_GVA) 3982 if (*gpa == UNMAPPED_GVA)
3983 return -1; 3983 return -1;
3984 3984
3985 /* For APIC access vmexit */ 3985 /* For APIC access vmexit */
3986 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3986 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3987 return 1; 3987 return 1;
3988 3988
3989 if (vcpu_match_mmio_gpa(vcpu, *gpa)) { 3989 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
3990 trace_vcpu_match_mmio(gva, *gpa, write, true); 3990 trace_vcpu_match_mmio(gva, *gpa, write, true);
3991 return 1; 3991 return 1;
3992 } 3992 }
3993 3993
3994 return 0; 3994 return 0;
3995 } 3995 }
3996 3996
3997 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3997 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3998 const void *val, int bytes) 3998 const void *val, int bytes)
3999 { 3999 {
4000 int ret; 4000 int ret;
4001 4001
4002 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 4002 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4003 if (ret < 0) 4003 if (ret < 0)
4004 return 0; 4004 return 0;
4005 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 4005 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4006 return 1; 4006 return 1;
4007 } 4007 }
4008 4008
4009 struct read_write_emulator_ops { 4009 struct read_write_emulator_ops {
4010 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, 4010 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4011 int bytes); 4011 int bytes);
4012 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, 4012 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4013 void *val, int bytes); 4013 void *val, int bytes);
4014 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 4014 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4015 int bytes, void *val); 4015 int bytes, void *val);
4016 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 4016 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4017 void *val, int bytes); 4017 void *val, int bytes);
4018 bool write; 4018 bool write;
4019 }; 4019 };
4020 4020
4021 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 4021 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4022 { 4022 {
4023 if (vcpu->mmio_read_completed) { 4023 if (vcpu->mmio_read_completed) {
4024 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 4024 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4025 vcpu->mmio_fragments[0].gpa, *(u64 *)val); 4025 vcpu->mmio_fragments[0].gpa, *(u64 *)val);
4026 vcpu->mmio_read_completed = 0; 4026 vcpu->mmio_read_completed = 0;
4027 return 1; 4027 return 1;
4028 } 4028 }
4029 4029
4030 return 0; 4030 return 0;
4031 } 4031 }
4032 4032
4033 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4033 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4034 void *val, int bytes) 4034 void *val, int bytes)
4035 { 4035 {
4036 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); 4036 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4037 } 4037 }
4038 4038
4039 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4039 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4040 void *val, int bytes) 4040 void *val, int bytes)
4041 { 4041 {
4042 return emulator_write_phys(vcpu, gpa, val, bytes); 4042 return emulator_write_phys(vcpu, gpa, val, bytes);
4043 } 4043 }
4044 4044
4045 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) 4045 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4046 { 4046 {
4047 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 4047 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4048 return vcpu_mmio_write(vcpu, gpa, bytes, val); 4048 return vcpu_mmio_write(vcpu, gpa, bytes, val);
4049 } 4049 }
4050 4050
4051 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 4051 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4052 void *val, int bytes) 4052 void *val, int bytes)
4053 { 4053 {
4054 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 4054 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4055 return X86EMUL_IO_NEEDED; 4055 return X86EMUL_IO_NEEDED;
4056 } 4056 }
4057 4057
4058 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 4058 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4059 void *val, int bytes) 4059 void *val, int bytes)
4060 { 4060 {
4061 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; 4061 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
4062 4062
4063 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); 4063 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
4064 return X86EMUL_CONTINUE; 4064 return X86EMUL_CONTINUE;
4065 } 4065 }
4066 4066
4067 static const struct read_write_emulator_ops read_emultor = { 4067 static const struct read_write_emulator_ops read_emultor = {
4068 .read_write_prepare = read_prepare, 4068 .read_write_prepare = read_prepare,
4069 .read_write_emulate = read_emulate, 4069 .read_write_emulate = read_emulate,
4070 .read_write_mmio = vcpu_mmio_read, 4070 .read_write_mmio = vcpu_mmio_read,
4071 .read_write_exit_mmio = read_exit_mmio, 4071 .read_write_exit_mmio = read_exit_mmio,
4072 }; 4072 };
4073 4073
4074 static const struct read_write_emulator_ops write_emultor = { 4074 static const struct read_write_emulator_ops write_emultor = {
4075 .read_write_emulate = write_emulate, 4075 .read_write_emulate = write_emulate,
4076 .read_write_mmio = write_mmio, 4076 .read_write_mmio = write_mmio,
4077 .read_write_exit_mmio = write_exit_mmio, 4077 .read_write_exit_mmio = write_exit_mmio,
4078 .write = true, 4078 .write = true,
4079 }; 4079 };
4080 4080
4081 static int emulator_read_write_onepage(unsigned long addr, void *val, 4081 static int emulator_read_write_onepage(unsigned long addr, void *val,
4082 unsigned int bytes, 4082 unsigned int bytes,
4083 struct x86_exception *exception, 4083 struct x86_exception *exception,
4084 struct kvm_vcpu *vcpu, 4084 struct kvm_vcpu *vcpu,
4085 const struct read_write_emulator_ops *ops) 4085 const struct read_write_emulator_ops *ops)
4086 { 4086 {
4087 gpa_t gpa; 4087 gpa_t gpa;
4088 int handled, ret; 4088 int handled, ret;
4089 bool write = ops->write; 4089 bool write = ops->write;
4090 struct kvm_mmio_fragment *frag; 4090 struct kvm_mmio_fragment *frag;
4091 4091
4092 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4092 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4093 4093
4094 if (ret < 0) 4094 if (ret < 0)
4095 return X86EMUL_PROPAGATE_FAULT; 4095 return X86EMUL_PROPAGATE_FAULT;
4096 4096
4097 /* For APIC access vmexit */ 4097 /* For APIC access vmexit */
4098 if (ret) 4098 if (ret)
4099 goto mmio; 4099 goto mmio;
4100 4100
4101 if (ops->read_write_emulate(vcpu, gpa, val, bytes)) 4101 if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4102 return X86EMUL_CONTINUE; 4102 return X86EMUL_CONTINUE;
4103 4103
4104 mmio: 4104 mmio:
4105 /* 4105 /*
4106 * Is this MMIO handled locally? 4106 * Is this MMIO handled locally?
4107 */ 4107 */
4108 handled = ops->read_write_mmio(vcpu, gpa, bytes, val); 4108 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4109 if (handled == bytes) 4109 if (handled == bytes)
4110 return X86EMUL_CONTINUE; 4110 return X86EMUL_CONTINUE;
4111 4111
4112 gpa += handled; 4112 gpa += handled;
4113 bytes -= handled; 4113 bytes -= handled;
4114 val += handled; 4114 val += handled;
4115 4115
4116 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); 4116 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
4117 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; 4117 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
4118 frag->gpa = gpa; 4118 frag->gpa = gpa;
4119 frag->data = val; 4119 frag->data = val;
4120 frag->len = bytes; 4120 frag->len = bytes;
4121 return X86EMUL_CONTINUE; 4121 return X86EMUL_CONTINUE;
4122 } 4122 }
4123 4123
4124 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 4124 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4125 void *val, unsigned int bytes, 4125 void *val, unsigned int bytes,
4126 struct x86_exception *exception, 4126 struct x86_exception *exception,
4127 const struct read_write_emulator_ops *ops) 4127 const struct read_write_emulator_ops *ops)
4128 { 4128 {
4129 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4129 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4130 gpa_t gpa; 4130 gpa_t gpa;
4131 int rc; 4131 int rc;
4132 4132
4133 if (ops->read_write_prepare && 4133 if (ops->read_write_prepare &&
4134 ops->read_write_prepare(vcpu, val, bytes)) 4134 ops->read_write_prepare(vcpu, val, bytes))
4135 return X86EMUL_CONTINUE; 4135 return X86EMUL_CONTINUE;
4136 4136
4137 vcpu->mmio_nr_fragments = 0; 4137 vcpu->mmio_nr_fragments = 0;
4138 4138
4139 /* Crossing a page boundary? */ 4139 /* Crossing a page boundary? */
4140 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 4140 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4141 int now; 4141 int now;
4142 4142
4143 now = -addr & ~PAGE_MASK; 4143 now = -addr & ~PAGE_MASK;
4144 rc = emulator_read_write_onepage(addr, val, now, exception, 4144 rc = emulator_read_write_onepage(addr, val, now, exception,
4145 vcpu, ops); 4145 vcpu, ops);
4146 4146
4147 if (rc != X86EMUL_CONTINUE) 4147 if (rc != X86EMUL_CONTINUE)
4148 return rc; 4148 return rc;
4149 addr += now; 4149 addr += now;
4150 val += now; 4150 val += now;
4151 bytes -= now; 4151 bytes -= now;
4152 } 4152 }
4153 4153
4154 rc = emulator_read_write_onepage(addr, val, bytes, exception, 4154 rc = emulator_read_write_onepage(addr, val, bytes, exception,
4155 vcpu, ops); 4155 vcpu, ops);
4156 if (rc != X86EMUL_CONTINUE) 4156 if (rc != X86EMUL_CONTINUE)
4157 return rc; 4157 return rc;
4158 4158
4159 if (!vcpu->mmio_nr_fragments) 4159 if (!vcpu->mmio_nr_fragments)
4160 return rc; 4160 return rc;
4161 4161
4162 gpa = vcpu->mmio_fragments[0].gpa; 4162 gpa = vcpu->mmio_fragments[0].gpa;
4163 4163
4164 vcpu->mmio_needed = 1; 4164 vcpu->mmio_needed = 1;
4165 vcpu->mmio_cur_fragment = 0; 4165 vcpu->mmio_cur_fragment = 0;
4166 4166
4167 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); 4167 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
4168 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; 4168 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
4169 vcpu->run->exit_reason = KVM_EXIT_MMIO; 4169 vcpu->run->exit_reason = KVM_EXIT_MMIO;
4170 vcpu->run->mmio.phys_addr = gpa; 4170 vcpu->run->mmio.phys_addr = gpa;
4171 4171
4172 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 4172 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4173 } 4173 }
4174 4174
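The page-boundary split in emulator_read_write() above relies on the expression now = -addr & ~PAGE_MASK, which yields the number of bytes left before the next page boundary; each chunk is then handled by a separate emulator_read_write_onepage() call. A minimal user-space sketch of the same arithmetic, assuming a 4 KiB page (the DEMO_* names are illustrative, not kernel symbols):

#include <stdio.h>

#define DEMO_PAGE_SIZE 0x1000UL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x1ffe;   /* 2 bytes short of a page boundary */
	unsigned long bytes = 8;

	/* Same test as emulator_read_write(): does the access cross a page? */
	if (((addr + bytes - 1) ^ addr) & DEMO_PAGE_MASK) {
		unsigned long now = -addr & ~DEMO_PAGE_MASK;

		/* Prints "first chunk: 2 bytes, remainder: 6 bytes" */
		printf("first chunk: %lu bytes, remainder: %lu bytes\n",
		       now, bytes - now);
	}
	return 0;
}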
4175 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4175 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4176 unsigned long addr, 4176 unsigned long addr,
4177 void *val, 4177 void *val,
4178 unsigned int bytes, 4178 unsigned int bytes,
4179 struct x86_exception *exception) 4179 struct x86_exception *exception)
4180 { 4180 {
4181 return emulator_read_write(ctxt, addr, val, bytes, 4181 return emulator_read_write(ctxt, addr, val, bytes,
4182 exception, &read_emultor); 4182 exception, &read_emultor);
4183 } 4183 }
4184 4184
4185 int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 4185 int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4186 unsigned long addr, 4186 unsigned long addr,
4187 const void *val, 4187 const void *val,
4188 unsigned int bytes, 4188 unsigned int bytes,
4189 struct x86_exception *exception) 4189 struct x86_exception *exception)
4190 { 4190 {
4191 return emulator_read_write(ctxt, addr, (void *)val, bytes, 4191 return emulator_read_write(ctxt, addr, (void *)val, bytes,
4192 exception, &write_emultor); 4192 exception, &write_emultor);
4193 } 4193 }
4194 4194
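When the read/write emulation above cannot be completed in the kernel, emulator_read_write() records the first MMIO fragment in vcpu->run and exits to user space with KVM_EXIT_MMIO. Below is a hedged sketch of the user-space side, using only the documented struct kvm_run mmio fields; demo_device_read()/demo_device_write() are hypothetical placeholders for a real VMM's device routing, not part of the KVM API:

#include <stdint.h>
#include <string.h>
#include <linux/kvm.h>

/* Hypothetical device-model hooks -- stand-ins for a VMM's MMIO dispatch. */
void demo_device_write(uint64_t gpa, const void *data, uint32_t len)
{
	(void)gpa; (void)data; (void)len;
}

void demo_device_read(uint64_t gpa, void *data, uint32_t len)
{
	(void)gpa;
	memset(data, 0, len);   /* pretend the device returned zeroes */
}

void demo_handle_mmio_exit(struct kvm_run *run)
{
	/* Each exit carries at most one fragment of up to 8 bytes. */
	if (run->mmio.is_write)
		demo_device_write(run->mmio.phys_addr, run->mmio.data,
				  run->mmio.len);
	else
		demo_device_read(run->mmio.phys_addr, run->mmio.data,
				 run->mmio.len);
	/* On the next KVM_RUN, the kernel side (complete_emulated_mmio)
	 * moves on to the next fragment, if any. */
}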
4195 #define CMPXCHG_TYPE(t, ptr, old, new) \ 4195 #define CMPXCHG_TYPE(t, ptr, old, new) \
4196 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 4196 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
4197 4197
4198 #ifdef CONFIG_X86_64 4198 #ifdef CONFIG_X86_64
4199 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) 4199 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
4200 #else 4200 #else
4201 # define CMPXCHG64(ptr, old, new) \ 4201 # define CMPXCHG64(ptr, old, new) \
4202 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4202 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
4203 #endif 4203 #endif
4204 4204
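CMPXCHG_TYPE() reports success only when the value read back equals the expected old value. A rough user-space analogue of that contract, written with C11 atomics rather than the kernel's cmpxchg() (sketch only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

bool demo_cmpxchg_u32(_Atomic uint32_t *ptr, uint32_t old, uint32_t new)
{
	/* true iff *ptr still held 'old' and was atomically replaced by
	 * 'new', mirroring "cmpxchg(ptr, old, new) == old". */
	return atomic_compare_exchange_strong(ptr, &old, new);
}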
4205 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, 4205 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4206 unsigned long addr, 4206 unsigned long addr,
4207 const void *old, 4207 const void *old,
4208 const void *new, 4208 const void *new,
4209 unsigned int bytes, 4209 unsigned int bytes,
4210 struct x86_exception *exception) 4210 struct x86_exception *exception)
4211 { 4211 {
4212 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4212 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4213 gpa_t gpa; 4213 gpa_t gpa;
4214 struct page *page; 4214 struct page *page;
4215 char *kaddr; 4215 char *kaddr;
4216 bool exchanged; 4216 bool exchanged;
4217 4217
4218 /* a guest cmpxchg8b has to be emulated atomically */ 4218 /* a guest cmpxchg8b has to be emulated atomically */
4219 if (bytes > 8 || (bytes & (bytes - 1))) 4219 if (bytes > 8 || (bytes & (bytes - 1)))
4220 goto emul_write; 4220 goto emul_write;
4221 4221
4222 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 4222 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4223 4223
4224 if (gpa == UNMAPPED_GVA || 4224 if (gpa == UNMAPPED_GVA ||
4225 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 4225 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4226 goto emul_write; 4226 goto emul_write;
4227 4227
4228 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 4228 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4229 goto emul_write; 4229 goto emul_write;
4230 4230
4231 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4231 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4232 if (is_error_page(page)) 4232 if (is_error_page(page))
4233 goto emul_write; 4233 goto emul_write;
4234 4234
4235 kaddr = kmap_atomic(page); 4235 kaddr = kmap_atomic(page);
4236 kaddr += offset_in_page(gpa); 4236 kaddr += offset_in_page(gpa);
4237 switch (bytes) { 4237 switch (bytes) {
4238 case 1: 4238 case 1:
4239 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 4239 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4240 break; 4240 break;
4241 case 2: 4241 case 2:
4242 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); 4242 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4243 break; 4243 break;
4244 case 4: 4244 case 4:
4245 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); 4245 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4246 break; 4246 break;
4247 case 8: 4247 case 8:
4248 exchanged = CMPXCHG64(kaddr, old, new); 4248 exchanged = CMPXCHG64(kaddr, old, new);
4249 break; 4249 break;
4250 default: 4250 default:
4251 BUG(); 4251 BUG();
4252 } 4252 }
4253 kunmap_atomic(kaddr); 4253 kunmap_atomic(kaddr);
4254 kvm_release_page_dirty(page); 4254 kvm_release_page_dirty(page);
4255 4255
4256 if (!exchanged) 4256 if (!exchanged)
4257 return X86EMUL_CMPXCHG_FAILED; 4257 return X86EMUL_CMPXCHG_FAILED;
4258 4258
4259 kvm_mmu_pte_write(vcpu, gpa, new, bytes); 4259 kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4260 4260
4261 return X86EMUL_CONTINUE; 4261 return X86EMUL_CONTINUE;
4262 4262
4263 emul_write: 4263 emul_write:
4264 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4264 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4265 4265
4266 return emulator_write_emulated(ctxt, addr, new, bytes, exception); 4266 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4267 } 4267 }
4268 4268
4269 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4269 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4270 { 4270 {
4271 /* TODO: String I/O for in kernel device */ 4271 /* TODO: String I/O for in kernel device */
4272 int r; 4272 int r;
4273 4273
4274 if (vcpu->arch.pio.in) 4274 if (vcpu->arch.pio.in)
4275 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 4275 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4276 vcpu->arch.pio.size, pd); 4276 vcpu->arch.pio.size, pd);
4277 else 4277 else
4278 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 4278 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4279 vcpu->arch.pio.port, vcpu->arch.pio.size, 4279 vcpu->arch.pio.port, vcpu->arch.pio.size,
4280 pd); 4280 pd);
4281 return r; 4281 return r;
4282 } 4282 }
4283 4283
4284 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, 4284 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4285 unsigned short port, void *val, 4285 unsigned short port, void *val,
4286 unsigned int count, bool in) 4286 unsigned int count, bool in)
4287 { 4287 {
4288 trace_kvm_pio(!in, port, size, count); 4288 trace_kvm_pio(!in, port, size, count);
4289 4289
4290 vcpu->arch.pio.port = port; 4290 vcpu->arch.pio.port = port;
4291 vcpu->arch.pio.in = in; 4291 vcpu->arch.pio.in = in;
4292 vcpu->arch.pio.count = count; 4292 vcpu->arch.pio.count = count;
4293 vcpu->arch.pio.size = size; 4293 vcpu->arch.pio.size = size;
4294 4294
4295 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 4295 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4296 vcpu->arch.pio.count = 0; 4296 vcpu->arch.pio.count = 0;
4297 return 1; 4297 return 1;
4298 } 4298 }
4299 4299
4300 vcpu->run->exit_reason = KVM_EXIT_IO; 4300 vcpu->run->exit_reason = KVM_EXIT_IO;
4301 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 4301 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4302 vcpu->run->io.size = size; 4302 vcpu->run->io.size = size;
4303 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 4303 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4304 vcpu->run->io.count = count; 4304 vcpu->run->io.count = count;
4305 vcpu->run->io.port = port; 4305 vcpu->run->io.port = port;
4306 4306
4307 return 0; 4307 return 0;
4308 } 4308 }
4309 4309
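When no in-kernel device claims the port, emulator_pio_in_out() fills vcpu->run->io and returns 0, so the vcpu exits to user space with KVM_EXIT_IO; the data itself sits in the pio page mapped after kvm_run. A hedged user-space sketch of consuming that exit, using only documented kvm_run fields (the printf/memset bodies are placeholders for real port handling):

#include <stdio.h>
#include <string.h>
#include <linux/kvm.h>

void demo_handle_io_exit(struct kvm_run *run)
{
	/* The in/out data lives io.data_offset bytes past kvm_run itself. */
	unsigned char *data = (unsigned char *)run + run->io.data_offset;
	unsigned int i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT)
			printf("out: port 0x%x, %u bytes\n",
			       (unsigned int)run->io.port,
			       (unsigned int)run->io.size);
		else
			memset(data, 0xff, run->io.size); /* fake device read */
	}
}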
4310 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, 4310 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4311 int size, unsigned short port, void *val, 4311 int size, unsigned short port, void *val,
4312 unsigned int count) 4312 unsigned int count)
4313 { 4313 {
4314 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4314 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4315 int ret; 4315 int ret;
4316 4316
4317 if (vcpu->arch.pio.count) 4317 if (vcpu->arch.pio.count)
4318 goto data_avail; 4318 goto data_avail;
4319 4319
4320 ret = emulator_pio_in_out(vcpu, size, port, val, count, true); 4320 ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
4321 if (ret) { 4321 if (ret) {
4322 data_avail: 4322 data_avail:
4323 memcpy(val, vcpu->arch.pio_data, size * count); 4323 memcpy(val, vcpu->arch.pio_data, size * count);
4324 vcpu->arch.pio.count = 0; 4324 vcpu->arch.pio.count = 0;
4325 return 1; 4325 return 1;
4326 } 4326 }
4327 4327
4328 return 0; 4328 return 0;
4329 } 4329 }
4330 4330
4331 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, 4331 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4332 int size, unsigned short port, 4332 int size, unsigned short port,
4333 const void *val, unsigned int count) 4333 const void *val, unsigned int count)
4334 { 4334 {
4335 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4335 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4336 4336
4337 memcpy(vcpu->arch.pio_data, val, size * count); 4337 memcpy(vcpu->arch.pio_data, val, size * count);
4338 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); 4338 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4339 } 4339 }
4340 4340
4341 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 4341 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4342 { 4342 {
4343 return kvm_x86_ops->get_segment_base(vcpu, seg); 4343 return kvm_x86_ops->get_segment_base(vcpu, seg);
4344 } 4344 }
4345 4345
4346 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) 4346 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4347 { 4347 {
4348 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); 4348 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4349 } 4349 }
4350 4350
4351 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4351 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4352 { 4352 {
4353 if (!need_emulate_wbinvd(vcpu)) 4353 if (!need_emulate_wbinvd(vcpu))
4354 return X86EMUL_CONTINUE; 4354 return X86EMUL_CONTINUE;
4355 4355
4356 if (kvm_x86_ops->has_wbinvd_exit()) { 4356 if (kvm_x86_ops->has_wbinvd_exit()) {
4357 int cpu = get_cpu(); 4357 int cpu = get_cpu();
4358 4358
4359 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 4359 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4360 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4360 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4361 wbinvd_ipi, NULL, 1); 4361 wbinvd_ipi, NULL, 1);
4362 put_cpu(); 4362 put_cpu();
4363 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4363 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4364 } else 4364 } else
4365 wbinvd(); 4365 wbinvd();
4366 return X86EMUL_CONTINUE; 4366 return X86EMUL_CONTINUE;
4367 } 4367 }
4368 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4368 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4369 4369
4370 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) 4370 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4371 { 4371 {
4372 kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); 4372 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4373 } 4373 }
4374 4374
4375 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 4375 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4376 { 4376 {
4377 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); 4377 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4378 } 4378 }
4379 4379
4380 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 4380 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4381 { 4381 {
4382 4382
4383 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); 4383 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4384 } 4384 }
4385 4385
4386 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4386 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4387 { 4387 {
4388 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4388 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4389 } 4389 }
4390 4390
4391 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) 4391 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4392 { 4392 {
4393 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4393 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4394 unsigned long value; 4394 unsigned long value;
4395 4395
4396 switch (cr) { 4396 switch (cr) {
4397 case 0: 4397 case 0:
4398 value = kvm_read_cr0(vcpu); 4398 value = kvm_read_cr0(vcpu);
4399 break; 4399 break;
4400 case 2: 4400 case 2:
4401 value = vcpu->arch.cr2; 4401 value = vcpu->arch.cr2;
4402 break; 4402 break;
4403 case 3: 4403 case 3:
4404 value = kvm_read_cr3(vcpu); 4404 value = kvm_read_cr3(vcpu);
4405 break; 4405 break;
4406 case 4: 4406 case 4:
4407 value = kvm_read_cr4(vcpu); 4407 value = kvm_read_cr4(vcpu);
4408 break; 4408 break;
4409 case 8: 4409 case 8:
4410 value = kvm_get_cr8(vcpu); 4410 value = kvm_get_cr8(vcpu);
4411 break; 4411 break;
4412 default: 4412 default:
4413 kvm_err("%s: unexpected cr %u\n", __func__, cr); 4413 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4414 return 0; 4414 return 0;
4415 } 4415 }
4416 4416
4417 return value; 4417 return value;
4418 } 4418 }
4419 4419
4420 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) 4420 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4421 { 4421 {
4422 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4422 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4423 int res = 0; 4423 int res = 0;
4424 4424
4425 switch (cr) { 4425 switch (cr) {
4426 case 0: 4426 case 0:
4427 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 4427 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4428 break; 4428 break;
4429 case 2: 4429 case 2:
4430 vcpu->arch.cr2 = val; 4430 vcpu->arch.cr2 = val;
4431 break; 4431 break;
4432 case 3: 4432 case 3:
4433 res = kvm_set_cr3(vcpu, val); 4433 res = kvm_set_cr3(vcpu, val);
4434 break; 4434 break;
4435 case 4: 4435 case 4:
4436 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4436 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4437 break; 4437 break;
4438 case 8: 4438 case 8:
4439 res = kvm_set_cr8(vcpu, val); 4439 res = kvm_set_cr8(vcpu, val);
4440 break; 4440 break;
4441 default: 4441 default:
4442 kvm_err("%s: unexpected cr %u\n", __func__, cr); 4442 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4443 res = -1; 4443 res = -1;
4444 } 4444 }
4445 4445
4446 return res; 4446 return res;
4447 } 4447 }
4448 4448
4449 static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) 4449 static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4450 { 4450 {
4451 kvm_set_rflags(emul_to_vcpu(ctxt), val); 4451 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4452 } 4452 }
4453 4453
4454 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4454 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4455 { 4455 {
4456 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4456 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4457 } 4457 }
4458 4458
4459 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4459 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4460 { 4460 {
4461 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); 4461 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4462 } 4462 }
4463 4463
4464 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4464 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4465 { 4465 {
4466 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); 4466 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4467 } 4467 }
4468 4468
4469 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4469 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4470 { 4470 {
4471 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); 4471 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4472 } 4472 }
4473 4473
4474 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4474 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4475 { 4475 {
4476 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); 4476 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4477 } 4477 }
4478 4478
4479 static unsigned long emulator_get_cached_segment_base( 4479 static unsigned long emulator_get_cached_segment_base(
4480 struct x86_emulate_ctxt *ctxt, int seg) 4480 struct x86_emulate_ctxt *ctxt, int seg)
4481 { 4481 {
4482 return get_segment_base(emul_to_vcpu(ctxt), seg); 4482 return get_segment_base(emul_to_vcpu(ctxt), seg);
4483 } 4483 }
4484 4484
4485 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, 4485 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4486 struct desc_struct *desc, u32 *base3, 4486 struct desc_struct *desc, u32 *base3,
4487 int seg) 4487 int seg)
4488 { 4488 {
4489 struct kvm_segment var; 4489 struct kvm_segment var;
4490 4490
4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4492 *selector = var.selector; 4492 *selector = var.selector;
4493 4493
4494 if (var.unusable) 4494 if (var.unusable)
4495 return false; 4495 return false;
4496 4496
4497 if (var.g) 4497 if (var.g)
4498 var.limit >>= 12; 4498 var.limit >>= 12;
4499 set_desc_limit(desc, var.limit); 4499 set_desc_limit(desc, var.limit);
4500 set_desc_base(desc, (unsigned long)var.base); 4500 set_desc_base(desc, (unsigned long)var.base);
4501 #ifdef CONFIG_X86_64 4501 #ifdef CONFIG_X86_64
4502 if (base3) 4502 if (base3)
4503 *base3 = var.base >> 32; 4503 *base3 = var.base >> 32;
4504 #endif 4504 #endif
4505 desc->type = var.type; 4505 desc->type = var.type;
4506 desc->s = var.s; 4506 desc->s = var.s;
4507 desc->dpl = var.dpl; 4507 desc->dpl = var.dpl;
4508 desc->p = var.present; 4508 desc->p = var.present;
4509 desc->avl = var.avl; 4509 desc->avl = var.avl;
4510 desc->l = var.l; 4510 desc->l = var.l;
4511 desc->d = var.db; 4511 desc->d = var.db;
4512 desc->g = var.g; 4512 desc->g = var.g;
4513 4513
4514 return true; 4514 return true;
4515 } 4515 }
4516 4516
4517 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, 4517 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4518 struct desc_struct *desc, u32 base3, 4518 struct desc_struct *desc, u32 base3,
4519 int seg) 4519 int seg)
4520 { 4520 {
4521 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4521 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4522 struct kvm_segment var; 4522 struct kvm_segment var;
4523 4523
4524 var.selector = selector; 4524 var.selector = selector;
4525 var.base = get_desc_base(desc); 4525 var.base = get_desc_base(desc);
4526 #ifdef CONFIG_X86_64 4526 #ifdef CONFIG_X86_64
4527 var.base |= ((u64)base3) << 32; 4527 var.base |= ((u64)base3) << 32;
4528 #endif 4528 #endif
4529 var.limit = get_desc_limit(desc); 4529 var.limit = get_desc_limit(desc);
4530 if (desc->g) 4530 if (desc->g)
4531 var.limit = (var.limit << 12) | 0xfff; 4531 var.limit = (var.limit << 12) | 0xfff;
4532 var.type = desc->type; 4532 var.type = desc->type;
4533 var.present = desc->p; 4533 var.present = desc->p;
4534 var.dpl = desc->dpl; 4534 var.dpl = desc->dpl;
4535 var.db = desc->d; 4535 var.db = desc->d;
4536 var.s = desc->s; 4536 var.s = desc->s;
4537 var.l = desc->l; 4537 var.l = desc->l;
4538 var.g = desc->g; 4538 var.g = desc->g;
4539 var.avl = desc->avl; 4539 var.avl = desc->avl;
4540 var.present = desc->p; 4540 var.present = desc->p;
4541 var.unusable = !var.present; 4541 var.unusable = !var.present;
4542 var.padding = 0; 4542 var.padding = 0;
4543 4543
4544 kvm_set_segment(vcpu, &var, seg); 4544 kvm_set_segment(vcpu, &var, seg);
4545 return; 4545 return;
4546 } 4546 }
4547 4547
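The granularity handling above scales the 20-bit descriptor limit by 4 KiB: with desc->g set, a stored limit of 0xfffff expands to 0xffffffff, and emulator_get_segment() reverses the conversion with its >>= 12. A tiny worked sketch of the same arithmetic (plain C, illustration only):

#include <assert.h>

int main(void)
{
	unsigned int desc_limit = 0xfffff;                /* 20-bit field, G=1 */
	unsigned int byte_limit = (desc_limit << 12) | 0xfff;

	assert(byte_limit == 0xffffffff);                 /* full 4 GiB limit */
	assert((byte_limit >> 12) == desc_limit);         /* get_segment() path */
	return 0;
}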
4548 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 4548 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4549 u32 msr_index, u64 *pdata) 4549 u32 msr_index, u64 *pdata)
4550 { 4550 {
4551 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); 4551 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4552 } 4552 }
4553 4553
4554 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4554 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4555 u32 msr_index, u64 data) 4555 u32 msr_index, u64 data)
4556 { 4556 {
4557 struct msr_data msr; 4557 struct msr_data msr;
4558 4558
4559 msr.data = data; 4559 msr.data = data;
4560 msr.index = msr_index; 4560 msr.index = msr_index;
4561 msr.host_initiated = false; 4561 msr.host_initiated = false;
4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr); 4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4563 } 4563 }
4564 4564
4565 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4565 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4566 u32 pmc, u64 *pdata) 4566 u32 pmc, u64 *pdata)
4567 { 4567 {
4568 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); 4568 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4569 } 4569 }
4570 4570
4571 static void emulator_halt(struct x86_emulate_ctxt *ctxt) 4571 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4572 { 4572 {
4573 emul_to_vcpu(ctxt)->arch.halt_request = 1; 4573 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4574 } 4574 }
4575 4575
4576 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) 4576 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4577 { 4577 {
4578 preempt_disable(); 4578 preempt_disable();
4579 kvm_load_guest_fpu(emul_to_vcpu(ctxt)); 4579 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4580 /* 4580 /*
4581 * CR0.TS may reference the host fpu state, not the guest fpu state, 4581 * CR0.TS may reference the host fpu state, not the guest fpu state,
4582 * so it may be clear at this point. 4582 * so it may be clear at this point.
4583 */ 4583 */
4584 clts(); 4584 clts();
4585 } 4585 }
4586 4586
4587 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) 4587 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4588 { 4588 {
4589 preempt_enable(); 4589 preempt_enable();
4590 } 4590 }
4591 4591
4592 static int emulator_intercept(struct x86_emulate_ctxt *ctxt, 4592 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4593 struct x86_instruction_info *info, 4593 struct x86_instruction_info *info,
4594 enum x86_intercept_stage stage) 4594 enum x86_intercept_stage stage)
4595 { 4595 {
4596 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4596 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4597 } 4597 }
4598 4598
4599 static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4599 static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4600 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4600 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4601 { 4601 {
4602 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 4602 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4603 } 4603 }
4604 4604
4605 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) 4605 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4606 { 4606 {
4607 return kvm_register_read(emul_to_vcpu(ctxt), reg); 4607 return kvm_register_read(emul_to_vcpu(ctxt), reg);
4608 } 4608 }
4609 4609
4610 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) 4610 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4611 { 4611 {
4612 kvm_register_write(emul_to_vcpu(ctxt), reg, val); 4612 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4613 } 4613 }
4614 4614
4615 static const struct x86_emulate_ops emulate_ops = { 4615 static const struct x86_emulate_ops emulate_ops = {
4616 .read_gpr = emulator_read_gpr, 4616 .read_gpr = emulator_read_gpr,
4617 .write_gpr = emulator_write_gpr, 4617 .write_gpr = emulator_write_gpr,
4618 .read_std = kvm_read_guest_virt_system, 4618 .read_std = kvm_read_guest_virt_system,
4619 .write_std = kvm_write_guest_virt_system, 4619 .write_std = kvm_write_guest_virt_system,
4620 .fetch = kvm_fetch_guest_virt, 4620 .fetch = kvm_fetch_guest_virt,
4621 .read_emulated = emulator_read_emulated, 4621 .read_emulated = emulator_read_emulated,
4622 .write_emulated = emulator_write_emulated, 4622 .write_emulated = emulator_write_emulated,
4623 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4623 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4624 .invlpg = emulator_invlpg, 4624 .invlpg = emulator_invlpg,
4625 .pio_in_emulated = emulator_pio_in_emulated, 4625 .pio_in_emulated = emulator_pio_in_emulated,
4626 .pio_out_emulated = emulator_pio_out_emulated, 4626 .pio_out_emulated = emulator_pio_out_emulated,
4627 .get_segment = emulator_get_segment, 4627 .get_segment = emulator_get_segment,
4628 .set_segment = emulator_set_segment, 4628 .set_segment = emulator_set_segment,
4629 .get_cached_segment_base = emulator_get_cached_segment_base, 4629 .get_cached_segment_base = emulator_get_cached_segment_base,
4630 .get_gdt = emulator_get_gdt, 4630 .get_gdt = emulator_get_gdt,
4631 .get_idt = emulator_get_idt, 4631 .get_idt = emulator_get_idt,
4632 .set_gdt = emulator_set_gdt, 4632 .set_gdt = emulator_set_gdt,
4633 .set_idt = emulator_set_idt, 4633 .set_idt = emulator_set_idt,
4634 .get_cr = emulator_get_cr, 4634 .get_cr = emulator_get_cr,
4635 .set_cr = emulator_set_cr, 4635 .set_cr = emulator_set_cr,
4636 .set_rflags = emulator_set_rflags, 4636 .set_rflags = emulator_set_rflags,
4637 .cpl = emulator_get_cpl, 4637 .cpl = emulator_get_cpl,
4638 .get_dr = emulator_get_dr, 4638 .get_dr = emulator_get_dr,
4639 .set_dr = emulator_set_dr, 4639 .set_dr = emulator_set_dr,
4640 .set_msr = emulator_set_msr, 4640 .set_msr = emulator_set_msr,
4641 .get_msr = emulator_get_msr, 4641 .get_msr = emulator_get_msr,
4642 .read_pmc = emulator_read_pmc, 4642 .read_pmc = emulator_read_pmc,
4643 .halt = emulator_halt, 4643 .halt = emulator_halt,
4644 .wbinvd = emulator_wbinvd, 4644 .wbinvd = emulator_wbinvd,
4645 .fix_hypercall = emulator_fix_hypercall, 4645 .fix_hypercall = emulator_fix_hypercall,
4646 .get_fpu = emulator_get_fpu, 4646 .get_fpu = emulator_get_fpu,
4647 .put_fpu = emulator_put_fpu, 4647 .put_fpu = emulator_put_fpu,
4648 .intercept = emulator_intercept, 4648 .intercept = emulator_intercept,
4649 .get_cpuid = emulator_get_cpuid, 4649 .get_cpuid = emulator_get_cpuid,
4650 }; 4650 };
4651 4651
4652 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4652 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4653 { 4653 {
4654 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4654 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
4655 /* 4655 /*
4656 * an sti; sti; sequence only disables interrupts for the first 4656 * an sti; sti; sequence only disables interrupts for the first
4657 * instruction. So, if the last instruction, be it emulated or 4657 * instruction. So, if the last instruction, be it emulated or
4658 * not, left the system with the INT_STI flag enabled, it 4658 * not, left the system with the INT_STI flag enabled, it
4659 * means that the last instruction is an sti. We should not 4659 * means that the last instruction is an sti. We should not
4660 * leave the flag on in this case. The same goes for mov ss. 4660 * leave the flag on in this case. The same goes for mov ss.
4661 */ 4661 */
4662 if (!(int_shadow & mask)) 4662 if (!(int_shadow & mask))
4663 kvm_x86_ops->set_interrupt_shadow(vcpu, mask); 4663 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
4664 } 4664 }
4665 4665
4666 static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4666 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4667 { 4667 {
4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4669 if (ctxt->exception.vector == PF_VECTOR) 4669 if (ctxt->exception.vector == PF_VECTOR)
4670 kvm_propagate_fault(vcpu, &ctxt->exception); 4670 kvm_propagate_fault(vcpu, &ctxt->exception);
4671 else if (ctxt->exception.error_code_valid) 4671 else if (ctxt->exception.error_code_valid)
4672 kvm_queue_exception_e(vcpu, ctxt->exception.vector, 4672 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4673 ctxt->exception.error_code); 4673 ctxt->exception.error_code);
4674 else 4674 else
4675 kvm_queue_exception(vcpu, ctxt->exception.vector); 4675 kvm_queue_exception(vcpu, ctxt->exception.vector);
4676 } 4676 }
4677 4677
4678 static void init_decode_cache(struct x86_emulate_ctxt *ctxt) 4678 static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4679 { 4679 {
4680 memset(&ctxt->twobyte, 0, 4680 memset(&ctxt->twobyte, 0,
4681 (void *)&ctxt->_regs - (void *)&ctxt->twobyte); 4681 (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
4682 4682
4683 ctxt->fetch.start = 0; 4683 ctxt->fetch.start = 0;
4684 ctxt->fetch.end = 0; 4684 ctxt->fetch.end = 0;
4685 ctxt->io_read.pos = 0; 4685 ctxt->io_read.pos = 0;
4686 ctxt->io_read.end = 0; 4686 ctxt->io_read.end = 0;
4687 ctxt->mem_read.pos = 0; 4687 ctxt->mem_read.pos = 0;
4688 ctxt->mem_read.end = 0; 4688 ctxt->mem_read.end = 0;
4689 } 4689 }
4690 4690
4691 static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4691 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4692 { 4692 {
4693 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4693 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4694 int cs_db, cs_l; 4694 int cs_db, cs_l;
4695 4695
4696 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4696 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4697 4697
4698 ctxt->eflags = kvm_get_rflags(vcpu); 4698 ctxt->eflags = kvm_get_rflags(vcpu);
4699 ctxt->eip = kvm_rip_read(vcpu); 4699 ctxt->eip = kvm_rip_read(vcpu);
4700 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4700 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4701 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : 4701 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4702 cs_l ? X86EMUL_MODE_PROT64 : 4702 cs_l ? X86EMUL_MODE_PROT64 :
4703 cs_db ? X86EMUL_MODE_PROT32 : 4703 cs_db ? X86EMUL_MODE_PROT32 :
4704 X86EMUL_MODE_PROT16; 4704 X86EMUL_MODE_PROT16;
4705 ctxt->guest_mode = is_guest_mode(vcpu); 4705 ctxt->guest_mode = is_guest_mode(vcpu);
4706 4706
4707 init_decode_cache(ctxt); 4707 init_decode_cache(ctxt);
4708 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4708 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4709 } 4709 }
4710 4710
4711 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 4711 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4712 { 4712 {
4713 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4713 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4714 int ret; 4714 int ret;
4715 4715
4716 init_emulate_ctxt(vcpu); 4716 init_emulate_ctxt(vcpu);
4717 4717
4718 ctxt->op_bytes = 2; 4718 ctxt->op_bytes = 2;
4719 ctxt->ad_bytes = 2; 4719 ctxt->ad_bytes = 2;
4720 ctxt->_eip = ctxt->eip + inc_eip; 4720 ctxt->_eip = ctxt->eip + inc_eip;
4721 ret = emulate_int_real(ctxt, irq); 4721 ret = emulate_int_real(ctxt, irq);
4722 4722
4723 if (ret != X86EMUL_CONTINUE) 4723 if (ret != X86EMUL_CONTINUE)
4724 return EMULATE_FAIL; 4724 return EMULATE_FAIL;
4725 4725
4726 ctxt->eip = ctxt->_eip; 4726 ctxt->eip = ctxt->_eip;
4727 kvm_rip_write(vcpu, ctxt->eip); 4727 kvm_rip_write(vcpu, ctxt->eip);
4728 kvm_set_rflags(vcpu, ctxt->eflags); 4728 kvm_set_rflags(vcpu, ctxt->eflags);
4729 4729
4730 if (irq == NMI_VECTOR) 4730 if (irq == NMI_VECTOR)
4731 vcpu->arch.nmi_pending = 0; 4731 vcpu->arch.nmi_pending = 0;
4732 else 4732 else
4733 vcpu->arch.interrupt.pending = false; 4733 vcpu->arch.interrupt.pending = false;
4734 4734
4735 return EMULATE_DONE; 4735 return EMULATE_DONE;
4736 } 4736 }
4737 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); 4737 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4738 4738
4739 static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4739 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4740 { 4740 {
4741 int r = EMULATE_DONE; 4741 int r = EMULATE_DONE;
4742 4742
4743 ++vcpu->stat.insn_emulation_fail; 4743 ++vcpu->stat.insn_emulation_fail;
4744 trace_kvm_emulate_insn_failed(vcpu); 4744 trace_kvm_emulate_insn_failed(vcpu);
4745 if (!is_guest_mode(vcpu)) { 4745 if (!is_guest_mode(vcpu)) {
4746 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4746 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4747 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4747 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4748 vcpu->run->internal.ndata = 0; 4748 vcpu->run->internal.ndata = 0;
4749 r = EMULATE_FAIL; 4749 r = EMULATE_FAIL;
4750 } 4750 }
4751 kvm_queue_exception(vcpu, UD_VECTOR); 4751 kvm_queue_exception(vcpu, UD_VECTOR);
4752 4752
4753 return r; 4753 return r;
4754 } 4754 }
4755 4755
4756 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4756 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4757 { 4757 {
4758 gpa_t gpa; 4758 gpa_t gpa;
4759 pfn_t pfn; 4759 pfn_t pfn;
4760 4760
4761 if (tdp_enabled) 4761 if (tdp_enabled)
4762 return false; 4762 return false;
4763 4763
4764 /* 4764 /*
4765 * If emulation was due to an access to a shadowed page table 4765 * If emulation was due to an access to a shadowed page table
4766 * and it failed, try to unshadow the page and re-enter the 4766 * and it failed, try to unshadow the page and re-enter the
4767 * guest to let the CPU execute the instruction. 4767 * guest to let the CPU execute the instruction.
4768 */ 4768 */
4769 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 4769 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4770 return true; 4770 return true;
4771 4771
4772 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); 4772 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
4773 4773
4774 if (gpa == UNMAPPED_GVA) 4774 if (gpa == UNMAPPED_GVA)
4775 return true; /* let cpu generate fault */ 4775 return true; /* let cpu generate fault */
4776 4776
4777 /* 4777 /*
4778 * Do not retry the unhandleable instruction if it faults on the 4778 * Do not retry the unhandleable instruction if it faults on the
4779 * read-only host memory; otherwise it will go into an infinite loop: 4779 * read-only host memory; otherwise it will go into an infinite loop:
4780 * retry instruction -> write #PF -> emulation fail -> retry 4780 * retry instruction -> write #PF -> emulation fail -> retry
4781 * instruction -> ... 4781 * instruction -> ...
4782 */ 4782 */
4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4784 if (!is_error_noslot_pfn(pfn)) { 4784 if (!is_error_noslot_pfn(pfn)) {
4785 kvm_release_pfn_clean(pfn); 4785 kvm_release_pfn_clean(pfn);
4786 return true; 4786 return true;
4787 } 4787 }
4788 4788
4789 return false; 4789 return false;
4790 } 4790 }
4791 4791
4792 static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 4792 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4793 unsigned long cr2, int emulation_type) 4793 unsigned long cr2, int emulation_type)
4794 { 4794 {
4795 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4795 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4796 unsigned long last_retry_eip, last_retry_addr, gpa = cr2; 4796 unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
4797 4797
4798 last_retry_eip = vcpu->arch.last_retry_eip; 4798 last_retry_eip = vcpu->arch.last_retry_eip;
4799 last_retry_addr = vcpu->arch.last_retry_addr; 4799 last_retry_addr = vcpu->arch.last_retry_addr;
4800 4800
4801 /* 4801 /*
4802 * If the emulation is caused by #PF and it is non-page_table 4802 * If the emulation is caused by #PF and it is non-page_table
4803 * writing instruction, it means the VM-EXIT is caused by the shadow 4803 * writing instruction, it means the VM-EXIT is caused by the shadow
4804 * page being write-protected; we can zap the shadow page and retry this 4804 * page being write-protected; we can zap the shadow page and retry this
4805 * instruction directly. 4805 * instruction directly.
4806 * 4806 *
4807 * Note: if the guest uses a non-page-table modifying instruction 4807 * Note: if the guest uses a non-page-table modifying instruction
4808 * on the PDE that points to the instruction, then we will unmap 4808 * on the PDE that points to the instruction, then we will unmap
4809 * the instruction and go to an infinite loop. So, we cache the 4809 * the instruction and go to an infinite loop. So, we cache the
4810 * last retried eip and the last fault address; if we meet the eip 4810 * last retried eip and the last fault address; if we meet the eip
4811 * and the address again, we can break out of the potential infinite 4811 * and the address again, we can break out of the potential infinite
4812 * loop. 4812 * loop.
4813 */ 4813 */
4814 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; 4814 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
4815 4815
4816 if (!(emulation_type & EMULTYPE_RETRY)) 4816 if (!(emulation_type & EMULTYPE_RETRY))
4817 return false; 4817 return false;
4818 4818
4819 if (x86_page_table_writing_insn(ctxt)) 4819 if (x86_page_table_writing_insn(ctxt))
4820 return false; 4820 return false;
4821 4821
4822 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) 4822 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
4823 return false; 4823 return false;
4824 4824
4825 vcpu->arch.last_retry_eip = ctxt->eip; 4825 vcpu->arch.last_retry_eip = ctxt->eip;
4826 vcpu->arch.last_retry_addr = cr2; 4826 vcpu->arch.last_retry_addr = cr2;
4827 4827
4828 if (!vcpu->arch.mmu.direct_map) 4828 if (!vcpu->arch.mmu.direct_map)
4829 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 4829 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4830 4830
4831 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4831 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4832 4832
4833 return true; 4833 return true;
4834 } 4834 }
4835 4835
4836 static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4836 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4837 static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4837 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4838 4838
4839 int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4839 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4840 unsigned long cr2, 4840 unsigned long cr2,
4841 int emulation_type, 4841 int emulation_type,
4842 void *insn, 4842 void *insn,
4843 int insn_len) 4843 int insn_len)
4844 { 4844 {
4845 int r; 4845 int r;
4846 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4846 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4847 bool writeback = true; 4847 bool writeback = true;
4848 4848
4849 kvm_clear_exception_queue(vcpu); 4849 kvm_clear_exception_queue(vcpu);
4850 4850
4851 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4851 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4852 init_emulate_ctxt(vcpu); 4852 init_emulate_ctxt(vcpu);
4853 ctxt->interruptibility = 0; 4853 ctxt->interruptibility = 0;
4854 ctxt->have_exception = false; 4854 ctxt->have_exception = false;
4855 ctxt->perm_ok = false; 4855 ctxt->perm_ok = false;
4856 4856
4857 ctxt->only_vendor_specific_insn 4857 ctxt->only_vendor_specific_insn
4858 = emulation_type & EMULTYPE_TRAP_UD; 4858 = emulation_type & EMULTYPE_TRAP_UD;
4859 4859
4860 r = x86_decode_insn(ctxt, insn, insn_len); 4860 r = x86_decode_insn(ctxt, insn, insn_len);
4861 4861
4862 trace_kvm_emulate_insn_start(vcpu); 4862 trace_kvm_emulate_insn_start(vcpu);
4863 ++vcpu->stat.insn_emulation; 4863 ++vcpu->stat.insn_emulation;
4864 if (r != EMULATION_OK) { 4864 if (r != EMULATION_OK) {
4865 if (emulation_type & EMULTYPE_TRAP_UD) 4865 if (emulation_type & EMULTYPE_TRAP_UD)
4866 return EMULATE_FAIL; 4866 return EMULATE_FAIL;
4867 if (reexecute_instruction(vcpu, cr2)) 4867 if (reexecute_instruction(vcpu, cr2))
4868 return EMULATE_DONE; 4868 return EMULATE_DONE;
4869 if (emulation_type & EMULTYPE_SKIP) 4869 if (emulation_type & EMULTYPE_SKIP)
4870 return EMULATE_FAIL; 4870 return EMULATE_FAIL;
4871 return handle_emulation_failure(vcpu); 4871 return handle_emulation_failure(vcpu);
4872 } 4872 }
4873 } 4873 }
4874 4874
4875 if (emulation_type & EMULTYPE_SKIP) { 4875 if (emulation_type & EMULTYPE_SKIP) {
4876 kvm_rip_write(vcpu, ctxt->_eip); 4876 kvm_rip_write(vcpu, ctxt->_eip);
4877 return EMULATE_DONE; 4877 return EMULATE_DONE;
4878 } 4878 }
4879 4879
4880 if (retry_instruction(ctxt, cr2, emulation_type)) 4880 if (retry_instruction(ctxt, cr2, emulation_type))
4881 return EMULATE_DONE; 4881 return EMULATE_DONE;
4882 4882
4883 /* this is needed for the vmware backdoor interface to work since it 4883 /* this is needed for the vmware backdoor interface to work since it
4884 changes register values during the IO operation */ 4884 changes register values during the IO operation */
4885 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4885 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4886 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4886 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4887 emulator_invalidate_register_cache(ctxt); 4887 emulator_invalidate_register_cache(ctxt);
4888 } 4888 }
4889 4889
4890 restart: 4890 restart:
4891 r = x86_emulate_insn(ctxt); 4891 r = x86_emulate_insn(ctxt);
4892 4892
4893 if (r == EMULATION_INTERCEPTED) 4893 if (r == EMULATION_INTERCEPTED)
4894 return EMULATE_DONE; 4894 return EMULATE_DONE;
4895 4895
4896 if (r == EMULATION_FAILED) { 4896 if (r == EMULATION_FAILED) {
4897 if (reexecute_instruction(vcpu, cr2)) 4897 if (reexecute_instruction(vcpu, cr2))
4898 return EMULATE_DONE; 4898 return EMULATE_DONE;
4899 4899
4900 return handle_emulation_failure(vcpu); 4900 return handle_emulation_failure(vcpu);
4901 } 4901 }
4902 4902
4903 if (ctxt->have_exception) { 4903 if (ctxt->have_exception) {
4904 inject_emulated_exception(vcpu); 4904 inject_emulated_exception(vcpu);
4905 r = EMULATE_DONE; 4905 r = EMULATE_DONE;
4906 } else if (vcpu->arch.pio.count) { 4906 } else if (vcpu->arch.pio.count) {
4907 if (!vcpu->arch.pio.in) 4907 if (!vcpu->arch.pio.in)
4908 vcpu->arch.pio.count = 0; 4908 vcpu->arch.pio.count = 0;
4909 else { 4909 else {
4910 writeback = false; 4910 writeback = false;
4911 vcpu->arch.complete_userspace_io = complete_emulated_pio; 4911 vcpu->arch.complete_userspace_io = complete_emulated_pio;
4912 } 4912 }
4913 r = EMULATE_DO_MMIO; 4913 r = EMULATE_DO_MMIO;
4914 } else if (vcpu->mmio_needed) { 4914 } else if (vcpu->mmio_needed) {
4915 if (!vcpu->mmio_is_write) 4915 if (!vcpu->mmio_is_write)
4916 writeback = false; 4916 writeback = false;
4917 r = EMULATE_DO_MMIO; 4917 r = EMULATE_DO_MMIO;
4918 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 4918 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
4919 } else if (r == EMULATION_RESTART) 4919 } else if (r == EMULATION_RESTART)
4920 goto restart; 4920 goto restart;
4921 else 4921 else
4922 r = EMULATE_DONE; 4922 r = EMULATE_DONE;
4923 4923
4924 if (writeback) { 4924 if (writeback) {
4925 toggle_interruptibility(vcpu, ctxt->interruptibility); 4925 toggle_interruptibility(vcpu, ctxt->interruptibility);
4926 kvm_set_rflags(vcpu, ctxt->eflags); 4926 kvm_set_rflags(vcpu, ctxt->eflags);
4927 kvm_make_request(KVM_REQ_EVENT, vcpu); 4927 kvm_make_request(KVM_REQ_EVENT, vcpu);
4928 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4928 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4929 kvm_rip_write(vcpu, ctxt->eip); 4929 kvm_rip_write(vcpu, ctxt->eip);
4930 } else 4930 } else
4931 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 4931 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4932 4932
4933 return r; 4933 return r;
4934 } 4934 }
4935 EXPORT_SYMBOL_GPL(x86_emulate_instruction); 4935 EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4936 4936
4937 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4937 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4938 { 4938 {
4939 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4939 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4940 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, 4940 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4941 size, port, &val, 1); 4941 size, port, &val, 1);
4942 /* do not return to emulator after return from userspace */ 4942 /* do not return to emulator after return from userspace */
4943 vcpu->arch.pio.count = 0; 4943 vcpu->arch.pio.count = 0;
4944 return ret; 4944 return ret;
4945 } 4945 }
4946 EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4946 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4947 4947
4948 static void tsc_bad(void *info) 4948 static void tsc_bad(void *info)
4949 { 4949 {
4950 __this_cpu_write(cpu_tsc_khz, 0); 4950 __this_cpu_write(cpu_tsc_khz, 0);
4951 } 4951 }
4952 4952
4953 static void tsc_khz_changed(void *data) 4953 static void tsc_khz_changed(void *data)
4954 { 4954 {
4955 struct cpufreq_freqs *freq = data; 4955 struct cpufreq_freqs *freq = data;
4956 unsigned long khz = 0; 4956 unsigned long khz = 0;
4957 4957
4958 if (data) 4958 if (data)
4959 khz = freq->new; 4959 khz = freq->new;
4960 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4960 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4961 khz = cpufreq_quick_get(raw_smp_processor_id()); 4961 khz = cpufreq_quick_get(raw_smp_processor_id());
4962 if (!khz) 4962 if (!khz)
4963 khz = tsc_khz; 4963 khz = tsc_khz;
4964 __this_cpu_write(cpu_tsc_khz, khz); 4964 __this_cpu_write(cpu_tsc_khz, khz);
4965 } 4965 }
4966 4966
4967 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4967 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
4968 void *data) 4968 void *data)
4969 { 4969 {
4970 struct cpufreq_freqs *freq = data; 4970 struct cpufreq_freqs *freq = data;
4971 struct kvm *kvm; 4971 struct kvm *kvm;
4972 struct kvm_vcpu *vcpu; 4972 struct kvm_vcpu *vcpu;
4973 int i, send_ipi = 0; 4973 int i, send_ipi = 0;
4974 4974
4975 /* 4975 /*
4976 * We allow guests to temporarily run on slowing clocks, 4976 * We allow guests to temporarily run on slowing clocks,
4977 * provided we notify them after, or to run on accelerating 4977 * provided we notify them after, or to run on accelerating
4978 * clocks, provided we notify them before. Thus time never 4978 * clocks, provided we notify them before. Thus time never
4979 * goes backwards. 4979 * goes backwards.
4980 * 4980 *
4981 * However, we have a problem. We can't atomically update 4981 * However, we have a problem. We can't atomically update
4982 * the frequency of a given CPU from this function; it is 4982 * the frequency of a given CPU from this function; it is
4983 * merely a notifier, which can be called from any CPU. 4983 * merely a notifier, which can be called from any CPU.
4984 * Changing the TSC frequency at arbitrary points in time 4984 * Changing the TSC frequency at arbitrary points in time
4985 * requires a recomputation of local variables related to 4985 * requires a recomputation of local variables related to
4986 * the TSC for each VCPU. We must flag these local variables 4986 * the TSC for each VCPU. We must flag these local variables
4987 * to be updated and be sure the update takes place with the 4987 * to be updated and be sure the update takes place with the
4988 * new frequency before any guests proceed. 4988 * new frequency before any guests proceed.
4989 * 4989 *
4990 * Unfortunately, the combination of hotplug CPU and frequency 4990 * Unfortunately, the combination of hotplug CPU and frequency
4991 * change creates an intractable locking scenario; the order 4991 * change creates an intractable locking scenario; the order
4992 * of when these callouts happen is undefined with respect to 4992 * of when these callouts happen is undefined with respect to
4993 * CPU hotplug, and they can race with each other. As such, 4993 * CPU hotplug, and they can race with each other. As such,
4994 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is 4994 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4995 * undefined; you can actually have a CPU frequency change take 4995 * undefined; you can actually have a CPU frequency change take
4996 * place in between the computation of X and the setting of the 4996 * place in between the computation of X and the setting of the
4997 * variable. To protect against this problem, all updates of 4997 * variable. To protect against this problem, all updates of
4998 * the per_cpu tsc_khz variable are done in an interrupt 4998 * the per_cpu tsc_khz variable are done in an interrupt
4999 * protected IPI, and all callers wishing to update the value 4999 * protected IPI, and all callers wishing to update the value
5000 * must wait for a synchronous IPI to complete (which is trivial 5000 * must wait for a synchronous IPI to complete (which is trivial
5001 * if the caller is on the CPU already). This establishes the 5001 * if the caller is on the CPU already). This establishes the
5002 * necessary total order on variable updates. 5002 * necessary total order on variable updates.
5003 * 5003 *
5004 * Note that because a guest time update may take place 5004 * Note that because a guest time update may take place
5005 * anytime after the setting of the VCPU's request bit, the 5005 * anytime after the setting of the VCPU's request bit, the
5006 * correct TSC value must be set before the request. However, 5006 * correct TSC value must be set before the request. However,
5007 * to ensure the update actually makes it to any guest which 5007 * to ensure the update actually makes it to any guest which
5008 * starts running in hardware virtualization between the set 5008 * starts running in hardware virtualization between the set
5009 * and the acquisition of the spinlock, we must also ping the 5009 * and the acquisition of the spinlock, we must also ping the
5010 * CPU after setting the request bit. 5010 * CPU after setting the request bit.
5011 * 5011 *
5012 */ 5012 */
5013 5013
5014 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 5014 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
5015 return 0; 5015 return 0;
5016 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 5016 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
5017 return 0; 5017 return 0;
5018 5018
5019 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 5019 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5020 5020
5021 raw_spin_lock(&kvm_lock); 5021 raw_spin_lock(&kvm_lock);
5022 list_for_each_entry(kvm, &vm_list, vm_list) { 5022 list_for_each_entry(kvm, &vm_list, vm_list) {
5023 kvm_for_each_vcpu(i, vcpu, kvm) { 5023 kvm_for_each_vcpu(i, vcpu, kvm) {
5024 if (vcpu->cpu != freq->cpu) 5024 if (vcpu->cpu != freq->cpu)
5025 continue; 5025 continue;
5026 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5026 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5027 if (vcpu->cpu != smp_processor_id()) 5027 if (vcpu->cpu != smp_processor_id())
5028 send_ipi = 1; 5028 send_ipi = 1;
5029 } 5029 }
5030 } 5030 }
5031 raw_spin_unlock(&kvm_lock); 5031 raw_spin_unlock(&kvm_lock);
5032 5032
5033 if (freq->old < freq->new && send_ipi) { 5033 if (freq->old < freq->new && send_ipi) {
5034 /* 5034 /*
5035 * We upscale the frequency. We must make sure the 5035 * We upscale the frequency. We must make sure the
5036 * guest doesn't see old kvmclock values while running 5036 * guest doesn't see old kvmclock values while running
5037 * with the new frequency; otherwise we risk that the 5037 * with the new frequency; otherwise we risk that the
5038 * guest sees time go backwards. 5038 * guest sees time go backwards.
5039 * 5039 *
5040 * In case we update the frequency for another cpu 5040 * In case we update the frequency for another cpu
5041 * (which might be in guest context) send an interrupt 5041 * (which might be in guest context) send an interrupt
5042 * to kick the cpu out of guest context. Next time 5042 * to kick the cpu out of guest context. Next time
5043 * guest context is entered kvmclock will be updated, 5043 * guest context is entered kvmclock will be updated,
5044 * so the guest will not see stale values. 5044 * so the guest will not see stale values.
5045 */ 5045 */
5046 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 5046 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5047 } 5047 }
5048 return 0; 5048 return 0;
5049 } 5049 }
5050 5050
5051 static struct notifier_block kvmclock_cpufreq_notifier_block = { 5051 static struct notifier_block kvmclock_cpufreq_notifier_block = {
5052 .notifier_call = kvmclock_cpufreq_notifier 5052 .notifier_call = kvmclock_cpufreq_notifier
5053 }; 5053 };
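Taken together with the long comment above, the two early returns in kvmclock_cpufreq_notifier() mean the per-cpu khz value is refreshed before a frequency increase (at PRECHANGE) and after a decrease (at POSTCHANGE), so guest time can never run ahead. The standalone sketch below restates just that decision; the enum, the should_update() helper and the frequency values are made up for illustration and are not part of the kernel sources.

/* Hypothetical illustration of when kvmclock_cpufreq_notifier() above acts:
 * before a frequency increase and after a decrease, so the guest-visible
 * clock can never run ahead of the hardware. Not kernel code. */
#include <stdio.h>
#include <stdbool.h>

enum phase { PRECHANGE, POSTCHANGE };   /* stand-ins for CPUFREQ_* values */

static bool should_update(enum phase val, unsigned long old_khz,
                          unsigned long new_khz)
{
        if (val == PRECHANGE && old_khz > new_khz)      /* slowing down: act at POSTCHANGE */
                return false;
        if (val == POSTCHANGE && old_khz < new_khz)     /* speeding up: already acted at PRECHANGE */
                return false;
        return true;
}

int main(void)
{
        printf("2GHz->3GHz PRECHANGE:  %d\n", should_update(PRECHANGE, 2000000, 3000000));
        printf("2GHz->3GHz POSTCHANGE: %d\n", should_update(POSTCHANGE, 2000000, 3000000));
        printf("3GHz->2GHz PRECHANGE:  %d\n", should_update(PRECHANGE, 3000000, 2000000));
        printf("3GHz->2GHz POSTCHANGE: %d\n", should_update(POSTCHANGE, 3000000, 2000000));
        return 0;
}

Compiled with a plain C compiler this prints 1/0/0/1, i.e. the update fires exactly once per transition, on the side that matches the "notify before acceleration, after deceleration" rule spelled out in the comment.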
5054 5054
5055 static int kvmclock_cpu_notifier(struct notifier_block *nfb, 5055 static int kvmclock_cpu_notifier(struct notifier_block *nfb,
5056 unsigned long action, void *hcpu) 5056 unsigned long action, void *hcpu)
5057 { 5057 {
5058 unsigned int cpu = (unsigned long)hcpu; 5058 unsigned int cpu = (unsigned long)hcpu;
5059 5059
5060 switch (action) { 5060 switch (action) {
5061 case CPU_ONLINE: 5061 case CPU_ONLINE:
5062 case CPU_DOWN_FAILED: 5062 case CPU_DOWN_FAILED:
5063 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 5063 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5064 break; 5064 break;
5065 case CPU_DOWN_PREPARE: 5065 case CPU_DOWN_PREPARE:
5066 smp_call_function_single(cpu, tsc_bad, NULL, 1); 5066 smp_call_function_single(cpu, tsc_bad, NULL, 1);
5067 break; 5067 break;
5068 } 5068 }
5069 return NOTIFY_OK; 5069 return NOTIFY_OK;
5070 } 5070 }
5071 5071
5072 static struct notifier_block kvmclock_cpu_notifier_block = { 5072 static struct notifier_block kvmclock_cpu_notifier_block = {
5073 .notifier_call = kvmclock_cpu_notifier, 5073 .notifier_call = kvmclock_cpu_notifier,
5074 .priority = -INT_MAX 5074 .priority = -INT_MAX
5075 }; 5075 };
5076 5076
5077 static void kvm_timer_init(void) 5077 static void kvm_timer_init(void)
5078 { 5078 {
5079 int cpu; 5079 int cpu;
5080 5080
5081 max_tsc_khz = tsc_khz; 5081 max_tsc_khz = tsc_khz;
5082 register_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5082 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5083 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5083 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5084 #ifdef CONFIG_CPU_FREQ 5084 #ifdef CONFIG_CPU_FREQ
5085 struct cpufreq_policy policy; 5085 struct cpufreq_policy policy;
5086 memset(&policy, 0, sizeof(policy)); 5086 memset(&policy, 0, sizeof(policy));
5087 cpu = get_cpu(); 5087 cpu = get_cpu();
5088 cpufreq_get_policy(&policy, cpu); 5088 cpufreq_get_policy(&policy, cpu);
5089 if (policy.cpuinfo.max_freq) 5089 if (policy.cpuinfo.max_freq)
5090 max_tsc_khz = policy.cpuinfo.max_freq; 5090 max_tsc_khz = policy.cpuinfo.max_freq;
5091 put_cpu(); 5091 put_cpu();
5092 #endif 5092 #endif
5093 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 5093 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5094 CPUFREQ_TRANSITION_NOTIFIER); 5094 CPUFREQ_TRANSITION_NOTIFIER);
5095 } 5095 }
5096 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); 5096 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5097 for_each_online_cpu(cpu) 5097 for_each_online_cpu(cpu)
5098 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 5098 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5099 } 5099 }
5100 5100
5101 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 5101 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5102 5102
5103 int kvm_is_in_guest(void) 5103 int kvm_is_in_guest(void)
5104 { 5104 {
5105 return __this_cpu_read(current_vcpu) != NULL; 5105 return __this_cpu_read(current_vcpu) != NULL;
5106 } 5106 }
5107 5107
5108 static int kvm_is_user_mode(void) 5108 static int kvm_is_user_mode(void)
5109 { 5109 {
5110 int user_mode = 3; 5110 int user_mode = 3;
5111 5111
5112 if (__this_cpu_read(current_vcpu)) 5112 if (__this_cpu_read(current_vcpu))
5113 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); 5113 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5114 5114
5115 return user_mode != 0; 5115 return user_mode != 0;
5116 } 5116 }
5117 5117
5118 static unsigned long kvm_get_guest_ip(void) 5118 static unsigned long kvm_get_guest_ip(void)
5119 { 5119 {
5120 unsigned long ip = 0; 5120 unsigned long ip = 0;
5121 5121
5122 if (__this_cpu_read(current_vcpu)) 5122 if (__this_cpu_read(current_vcpu))
5123 ip = kvm_rip_read(__this_cpu_read(current_vcpu)); 5123 ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5124 5124
5125 return ip; 5125 return ip;
5126 } 5126 }
5127 5127
5128 static struct perf_guest_info_callbacks kvm_guest_cbs = { 5128 static struct perf_guest_info_callbacks kvm_guest_cbs = {
5129 .is_in_guest = kvm_is_in_guest, 5129 .is_in_guest = kvm_is_in_guest,
5130 .is_user_mode = kvm_is_user_mode, 5130 .is_user_mode = kvm_is_user_mode,
5131 .get_guest_ip = kvm_get_guest_ip, 5131 .get_guest_ip = kvm_get_guest_ip,
5132 }; 5132 };
5133 5133
5134 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 5134 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5135 { 5135 {
5136 __this_cpu_write(current_vcpu, vcpu); 5136 __this_cpu_write(current_vcpu, vcpu);
5137 } 5137 }
5138 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 5138 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5139 5139
5140 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 5140 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5141 { 5141 {
5142 __this_cpu_write(current_vcpu, NULL); 5142 __this_cpu_write(current_vcpu, NULL);
5143 } 5143 }
5144 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 5144 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5145 5145
5146 static void kvm_set_mmio_spte_mask(void) 5146 static void kvm_set_mmio_spte_mask(void)
5147 { 5147 {
5148 u64 mask; 5148 u64 mask;
5149 int maxphyaddr = boot_cpu_data.x86_phys_bits; 5149 int maxphyaddr = boot_cpu_data.x86_phys_bits;
5150 5150
5151 /* 5151 /*
5152 * Set the reserved bits and the present bit of a paging-structure 5152 * Set the reserved bits and the present bit of a paging-structure
5153 * entry to generate a page fault with PFER.RSV = 1. 5153 * entry to generate a page fault with PFER.RSV = 1.
5154 */ 5154 */
5155 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; 5155 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5156 mask |= 1ull; 5156 mask |= 1ull;
5157 5157
5158 #ifdef CONFIG_X86_64 5158 #ifdef CONFIG_X86_64
5159 /* 5159 /*
5160 * If reserved bit is not supported, clear the present bit to disable 5160 * If reserved bit is not supported, clear the present bit to disable
5161 * mmio page fault. 5161 * mmio page fault.
5162 */ 5162 */
5163 if (maxphyaddr == 52) 5163 if (maxphyaddr == 52)
5164 mask &= ~1ull; 5164 mask &= ~1ull;
5165 #endif 5165 #endif
5166 5166
5167 kvm_mmu_set_mmio_spte_mask(mask); 5167 kvm_mmu_set_mmio_spte_mask(mask);
5168 } 5168 }
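The mask arithmetic in kvm_set_mmio_spte_mask() is easier to see with concrete numbers: it sets bits [62:maxphyaddr] plus the present bit, and drops the present bit again when maxphyaddr is 52 and no reserved bits remain. Below is a small userspace sketch, with assumed MAXPHYADDR values chosen only for illustration (and with the CONFIG_X86_64-only special case applied unconditionally).

/* Hypothetical userspace illustration of the reserved-bit mask computed by
 * kvm_set_mmio_spte_mask() above; not part of the kernel sources. */
#include <stdio.h>
#include <stdint.h>

static uint64_t mmio_spte_mask(int maxphyaddr)
{
        /* Set bits [62:maxphyaddr], i.e. the reserved physical-address bits
         * of a PTE, plus the present bit (bit 0). */
        uint64_t mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;

        mask |= 1ull;
        if (maxphyaddr == 52)   /* no reserved bits left: drop the present bit */
                mask &= ~1ull;
        return mask;
}

int main(void)
{
        int widths[] = { 36, 40, 46, 52 };      /* example MAXPHYADDR values */

        for (unsigned i = 0; i < sizeof(widths) / sizeof(widths[0]); i++)
                printf("maxphyaddr=%d -> mask=0x%016llx\n", widths[i],
                       (unsigned long long)mmio_spte_mask(widths[i]));
        return 0;
}

For maxphyaddr = 40, for example, this yields 0x7fffff0000000001: bits 40-62 plus the present bit, exactly the pattern that makes a marked SPTE fault with the reserved bit set.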
5169 5169
5170 #ifdef CONFIG_X86_64 5170 #ifdef CONFIG_X86_64
5171 static void pvclock_gtod_update_fn(struct work_struct *work) 5171 static void pvclock_gtod_update_fn(struct work_struct *work)
5172 { 5172 {
5173 struct kvm *kvm; 5173 struct kvm *kvm;
5174 5174
5175 struct kvm_vcpu *vcpu; 5175 struct kvm_vcpu *vcpu;
5176 int i; 5176 int i;
5177 5177
5178 raw_spin_lock(&kvm_lock); 5178 raw_spin_lock(&kvm_lock);
5179 list_for_each_entry(kvm, &vm_list, vm_list) 5179 list_for_each_entry(kvm, &vm_list, vm_list)
5180 kvm_for_each_vcpu(i, vcpu, kvm) 5180 kvm_for_each_vcpu(i, vcpu, kvm)
5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); 5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5182 atomic_set(&kvm_guest_has_master_clock, 0); 5182 atomic_set(&kvm_guest_has_master_clock, 0);
5183 raw_spin_unlock(&kvm_lock); 5183 raw_spin_unlock(&kvm_lock);
5184 } 5184 }
5185 5185
5186 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 5186 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5187 5187
5188 /* 5188 /*
5189 * Notification about pvclock gtod data update. 5189 * Notification about pvclock gtod data update.
5190 */ 5190 */
5191 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, 5191 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5192 void *priv) 5192 void *priv)
5193 { 5193 {
5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5195 struct timekeeper *tk = priv; 5195 struct timekeeper *tk = priv;
5196 5196
5197 update_pvclock_gtod(tk); 5197 update_pvclock_gtod(tk);
5198 5198
5199 /* Disable the master clock if the host does not trust, 5199 /* Disable the master clock if the host does not trust,
5200 * or does not use, the TSC clocksource. 5200 * or does not use, the TSC clocksource.
5201 */ 5201 */
5202 if (gtod->clock.vclock_mode != VCLOCK_TSC && 5202 if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5203 atomic_read(&kvm_guest_has_master_clock) != 0) 5203 atomic_read(&kvm_guest_has_master_clock) != 0)
5204 queue_work(system_long_wq, &pvclock_gtod_work); 5204 queue_work(system_long_wq, &pvclock_gtod_work);
5205 5205
5206 return 0; 5206 return 0;
5207 } 5207 }
5208 5208
5209 static struct notifier_block pvclock_gtod_notifier = { 5209 static struct notifier_block pvclock_gtod_notifier = {
5210 .notifier_call = pvclock_gtod_notify, 5210 .notifier_call = pvclock_gtod_notify,
5211 }; 5211 };
5212 #endif 5212 #endif
5213 5213
5214 int kvm_arch_init(void *opaque) 5214 int kvm_arch_init(void *opaque)
5215 { 5215 {
5216 int r; 5216 int r;
5217 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5217 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
5218 5218
5219 if (kvm_x86_ops) { 5219 if (kvm_x86_ops) {
5220 printk(KERN_ERR "kvm: already loaded the other module\n"); 5220 printk(KERN_ERR "kvm: already loaded the other module\n");
5221 r = -EEXIST; 5221 r = -EEXIST;
5222 goto out; 5222 goto out;
5223 } 5223 }
5224 5224
5225 if (!ops->cpu_has_kvm_support()) { 5225 if (!ops->cpu_has_kvm_support()) {
5226 printk(KERN_ERR "kvm: no hardware support\n"); 5226 printk(KERN_ERR "kvm: no hardware support\n");
5227 r = -EOPNOTSUPP; 5227 r = -EOPNOTSUPP;
5228 goto out; 5228 goto out;
5229 } 5229 }
5230 if (ops->disabled_by_bios()) { 5230 if (ops->disabled_by_bios()) {
5231 printk(KERN_ERR "kvm: disabled by bios\n"); 5231 printk(KERN_ERR "kvm: disabled by bios\n");
5232 r = -EOPNOTSUPP; 5232 r = -EOPNOTSUPP;
5233 goto out; 5233 goto out;
5234 } 5234 }
5235 5235
5236 r = kvm_mmu_module_init(); 5236 r = kvm_mmu_module_init();
5237 if (r) 5237 if (r)
5238 goto out; 5238 goto out;
5239 5239
5240 kvm_set_mmio_spte_mask(); 5240 kvm_set_mmio_spte_mask();
5241 kvm_init_msr_list(); 5241 kvm_init_msr_list();
5242 5242
5243 kvm_x86_ops = ops; 5243 kvm_x86_ops = ops;
5244 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5244 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5245 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5245 PT_DIRTY_MASK, PT64_NX_MASK, 0);
5246 5246
5247 kvm_timer_init(); 5247 kvm_timer_init();
5248 5248
5249 perf_register_guest_info_callbacks(&kvm_guest_cbs); 5249 perf_register_guest_info_callbacks(&kvm_guest_cbs);
5250 5250
5251 if (cpu_has_xsave) 5251 if (cpu_has_xsave)
5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5253 5253
5254 kvm_lapic_init(); 5254 kvm_lapic_init();
5255 #ifdef CONFIG_X86_64 5255 #ifdef CONFIG_X86_64
5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); 5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5257 #endif 5257 #endif
5258 5258
5259 return 0; 5259 return 0;
5260 5260
5261 out: 5261 out:
5262 return r; 5262 return r;
5263 } 5263 }
5264 5264
5265 void kvm_arch_exit(void) 5265 void kvm_arch_exit(void)
5266 { 5266 {
5267 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 5267 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5268 5268
5269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 5269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5271 CPUFREQ_TRANSITION_NOTIFIER); 5271 CPUFREQ_TRANSITION_NOTIFIER);
5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5273 #ifdef CONFIG_X86_64 5273 #ifdef CONFIG_X86_64
5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); 5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5275 #endif 5275 #endif
5276 kvm_x86_ops = NULL; 5276 kvm_x86_ops = NULL;
5277 kvm_mmu_module_exit(); 5277 kvm_mmu_module_exit();
5278 } 5278 }
5279 5279
5280 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 5280 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5281 { 5281 {
5282 ++vcpu->stat.halt_exits; 5282 ++vcpu->stat.halt_exits;
5283 if (irqchip_in_kernel(vcpu->kvm)) { 5283 if (irqchip_in_kernel(vcpu->kvm)) {
5284 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 5284 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5285 return 1; 5285 return 1;
5286 } else { 5286 } else {
5287 vcpu->run->exit_reason = KVM_EXIT_HLT; 5287 vcpu->run->exit_reason = KVM_EXIT_HLT;
5288 return 0; 5288 return 0;
5289 } 5289 }
5290 } 5290 }
5291 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 5291 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5292 5292
5293 int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 5293 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5294 { 5294 {
5295 u64 param, ingpa, outgpa, ret; 5295 u64 param, ingpa, outgpa, ret;
5296 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; 5296 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5297 bool fast, longmode; 5297 bool fast, longmode;
5298 int cs_db, cs_l; 5298 int cs_db, cs_l;
5299 5299
5300 /* 5300 /*
5301 * A hypercall generates #UD from non-zero CPL or real mode, 5301 * A hypercall generates #UD from non-zero CPL or real mode,
5302 * per the Hyper-V spec. 5302 * per the Hyper-V spec.
5303 */ 5303 */
5304 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { 5304 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5305 kvm_queue_exception(vcpu, UD_VECTOR); 5305 kvm_queue_exception(vcpu, UD_VECTOR);
5306 return 0; 5306 return 0;
5307 } 5307 }
5308 5308
5309 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 5309 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5310 longmode = is_long_mode(vcpu) && cs_l == 1; 5310 longmode = is_long_mode(vcpu) && cs_l == 1;
5311 5311
5312 if (!longmode) { 5312 if (!longmode) {
5313 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | 5313 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5314 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); 5314 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5315 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | 5315 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5316 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); 5316 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5317 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | 5317 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5318 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); 5318 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5319 } 5319 }
5320 #ifdef CONFIG_X86_64 5320 #ifdef CONFIG_X86_64
5321 else { 5321 else {
5322 param = kvm_register_read(vcpu, VCPU_REGS_RCX); 5322 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5323 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); 5323 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5324 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); 5324 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5325 } 5325 }
5326 #endif 5326 #endif
5327 5327
5328 code = param & 0xffff; 5328 code = param & 0xffff;
5329 fast = (param >> 16) & 0x1; 5329 fast = (param >> 16) & 0x1;
5330 rep_cnt = (param >> 32) & 0xfff; 5330 rep_cnt = (param >> 32) & 0xfff;
5331 rep_idx = (param >> 48) & 0xfff; 5331 rep_idx = (param >> 48) & 0xfff;
5332 5332
5333 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); 5333 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5334 5334
5335 switch (code) { 5335 switch (code) {
5336 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: 5336 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5337 kvm_vcpu_on_spin(vcpu); 5337 kvm_vcpu_on_spin(vcpu);
5338 break; 5338 break;
5339 default: 5339 default:
5340 res = HV_STATUS_INVALID_HYPERCALL_CODE; 5340 res = HV_STATUS_INVALID_HYPERCALL_CODE;
5341 break; 5341 break;
5342 } 5342 }
5343 5343
5344 ret = res | (((u64)rep_done & 0xfff) << 32); 5344 ret = res | (((u64)rep_done & 0xfff) << 32);
5345 if (longmode) { 5345 if (longmode) {
5346 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 5346 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5347 } else { 5347 } else {
5348 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); 5348 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5349 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); 5349 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
5350 } 5350 }
5351 5351
5352 return 1; 5352 return 1;
5353 } 5353 }
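The field extraction near the top of kvm_hv_hypercall() follows the Hyper-V hypercall input layout: the call code sits in bits 15:0, the fast-call flag in bit 16, the rep count in bits 43:32 and the rep start index in bits 59:48. A minimal sketch of that decoding, using an arbitrary example value rather than anything a real guest would pass:

/* Hypothetical illustration of the Hyper-V hypercall input decoding done in
 * kvm_hv_hypercall() above; the sample 'param' value is made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t param   = 0x0012003400010008ull;       /* arbitrary example */
        uint16_t code    = param & 0xffff;              /* bits  15:0  */
        int      fast    = (param >> 16) & 0x1;         /* bit   16    */
        uint16_t rep_cnt = (param >> 32) & 0xfff;       /* bits 43:32  */
        uint16_t rep_idx = (param >> 48) & 0xfff;       /* bits 59:48  */

        printf("code=%u fast=%d rep_cnt=%u rep_idx=%u\n",
               code, fast, rep_cnt, rep_idx);
        return 0;
}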
5354 5354
5355 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5355 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5356 { 5356 {
5357 unsigned long nr, a0, a1, a2, a3, ret; 5357 unsigned long nr, a0, a1, a2, a3, ret;
5358 int r = 1; 5358 int r = 1;
5359 5359
5360 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 5360 if (kvm_hv_hypercall_enabled(vcpu->kvm))
5361 return kvm_hv_hypercall(vcpu); 5361 return kvm_hv_hypercall(vcpu);
5362 5362
5363 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 5363 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5364 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 5364 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5365 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 5365 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5366 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 5366 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5367 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 5367 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5368 5368
5369 trace_kvm_hypercall(nr, a0, a1, a2, a3); 5369 trace_kvm_hypercall(nr, a0, a1, a2, a3);
5370 5370
5371 if (!is_long_mode(vcpu)) { 5371 if (!is_long_mode(vcpu)) {
5372 nr &= 0xFFFFFFFF; 5372 nr &= 0xFFFFFFFF;
5373 a0 &= 0xFFFFFFFF; 5373 a0 &= 0xFFFFFFFF;
5374 a1 &= 0xFFFFFFFF; 5374 a1 &= 0xFFFFFFFF;
5375 a2 &= 0xFFFFFFFF; 5375 a2 &= 0xFFFFFFFF;
5376 a3 &= 0xFFFFFFFF; 5376 a3 &= 0xFFFFFFFF;
5377 } 5377 }
5378 5378
5379 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 5379 if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5380 ret = -KVM_EPERM; 5380 ret = -KVM_EPERM;
5381 goto out; 5381 goto out;
5382 } 5382 }
5383 5383
5384 switch (nr) { 5384 switch (nr) {
5385 case KVM_HC_VAPIC_POLL_IRQ: 5385 case KVM_HC_VAPIC_POLL_IRQ:
5386 ret = 0; 5386 ret = 0;
5387 break; 5387 break;
5388 default: 5388 default:
5389 ret = -KVM_ENOSYS; 5389 ret = -KVM_ENOSYS;
5390 break; 5390 break;
5391 } 5391 }
5392 out: 5392 out:
5393 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 5393 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5394 ++vcpu->stat.hypercalls; 5394 ++vcpu->stat.hypercalls;
5395 return r; 5395 return r;
5396 } 5396 }
5397 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5397 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5398 5398
5399 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5399 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5400 { 5400 {
5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5402 char instruction[3]; 5402 char instruction[3];
5403 unsigned long rip = kvm_rip_read(vcpu); 5403 unsigned long rip = kvm_rip_read(vcpu);
5404 5404
5405 /* 5405 /*
5406 * Blow out the MMU so that no other VCPU has an active mapping, 5406 * Blow out the MMU so that no other VCPU has an active mapping,
5407 * ensuring that the updated hypercall appears atomically across all 5407 * ensuring that the updated hypercall appears atomically across all
5408 * VCPUs. 5408 * VCPUs.
5409 */ 5409 */
5410 kvm_mmu_zap_all(vcpu->kvm); 5410 kvm_mmu_zap_all(vcpu->kvm);
5411 5411
5412 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5412 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5413 5413
5414 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 5414 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5415 } 5415 }
5416 5416
5417 /* 5417 /*
5418 * Check if userspace requested an interrupt window, and that the 5418 * Check if userspace requested an interrupt window, and that the
5419 * interrupt window is open. 5419 * interrupt window is open.
5420 * 5420 *
5421 * No need to exit to userspace if we already have an interrupt queued. 5421 * No need to exit to userspace if we already have an interrupt queued.
5422 */ 5422 */
5423 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 5423 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5424 { 5424 {
5425 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 5425 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5426 vcpu->run->request_interrupt_window && 5426 vcpu->run->request_interrupt_window &&
5427 kvm_arch_interrupt_allowed(vcpu)); 5427 kvm_arch_interrupt_allowed(vcpu));
5428 } 5428 }
5429 5429
5430 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 5430 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5431 { 5431 {
5432 struct kvm_run *kvm_run = vcpu->run; 5432 struct kvm_run *kvm_run = vcpu->run;
5433 5433
5434 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 5434 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5435 kvm_run->cr8 = kvm_get_cr8(vcpu); 5435 kvm_run->cr8 = kvm_get_cr8(vcpu);
5436 kvm_run->apic_base = kvm_get_apic_base(vcpu); 5436 kvm_run->apic_base = kvm_get_apic_base(vcpu);
5437 if (irqchip_in_kernel(vcpu->kvm)) 5437 if (irqchip_in_kernel(vcpu->kvm))
5438 kvm_run->ready_for_interrupt_injection = 1; 5438 kvm_run->ready_for_interrupt_injection = 1;
5439 else 5439 else
5440 kvm_run->ready_for_interrupt_injection = 5440 kvm_run->ready_for_interrupt_injection =
5441 kvm_arch_interrupt_allowed(vcpu) && 5441 kvm_arch_interrupt_allowed(vcpu) &&
5442 !kvm_cpu_has_interrupt(vcpu) && 5442 !kvm_cpu_has_interrupt(vcpu) &&
5443 !kvm_event_needs_reinjection(vcpu); 5443 !kvm_event_needs_reinjection(vcpu);
5444 } 5444 }
5445 5445
5446 static int vapic_enter(struct kvm_vcpu *vcpu) 5446 static int vapic_enter(struct kvm_vcpu *vcpu)
5447 { 5447 {
5448 struct kvm_lapic *apic = vcpu->arch.apic; 5448 struct kvm_lapic *apic = vcpu->arch.apic;
5449 struct page *page; 5449 struct page *page;
5450 5450
5451 if (!apic || !apic->vapic_addr) 5451 if (!apic || !apic->vapic_addr)
5452 return 0; 5452 return 0;
5453 5453
5454 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5454 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
5455 if (is_error_page(page)) 5455 if (is_error_page(page))
5456 return -EFAULT; 5456 return -EFAULT;
5457 5457
5458 vcpu->arch.apic->vapic_page = page; 5458 vcpu->arch.apic->vapic_page = page;
5459 return 0; 5459 return 0;
5460 } 5460 }
5461 5461
5462 static void vapic_exit(struct kvm_vcpu *vcpu) 5462 static void vapic_exit(struct kvm_vcpu *vcpu)
5463 { 5463 {
5464 struct kvm_lapic *apic = vcpu->arch.apic; 5464 struct kvm_lapic *apic = vcpu->arch.apic;
5465 int idx; 5465 int idx;
5466 5466
5467 if (!apic || !apic->vapic_addr) 5467 if (!apic || !apic->vapic_addr)
5468 return; 5468 return;
5469 5469
5470 idx = srcu_read_lock(&vcpu->kvm->srcu); 5470 idx = srcu_read_lock(&vcpu->kvm->srcu);
5471 kvm_release_page_dirty(apic->vapic_page); 5471 kvm_release_page_dirty(apic->vapic_page);
5472 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5472 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
5473 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5473 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5474 } 5474 }
5475 5475
5476 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 5476 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5477 { 5477 {
5478 int max_irr, tpr; 5478 int max_irr, tpr;
5479 5479
5480 if (!kvm_x86_ops->update_cr8_intercept) 5480 if (!kvm_x86_ops->update_cr8_intercept)
5481 return; 5481 return;
5482 5482
5483 if (!vcpu->arch.apic) 5483 if (!vcpu->arch.apic)
5484 return; 5484 return;
5485 5485
5486 if (!vcpu->arch.apic->vapic_addr) 5486 if (!vcpu->arch.apic->vapic_addr)
5487 max_irr = kvm_lapic_find_highest_irr(vcpu); 5487 max_irr = kvm_lapic_find_highest_irr(vcpu);
5488 else 5488 else
5489 max_irr = -1; 5489 max_irr = -1;
5490 5490
5491 if (max_irr != -1) 5491 if (max_irr != -1)
5492 max_irr >>= 4; 5492 max_irr >>= 4;
5493 5493
5494 tpr = kvm_lapic_get_cr8(vcpu); 5494 tpr = kvm_lapic_get_cr8(vcpu);
5495 5495
5496 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 5496 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
5497 } 5497 }
5498 5498
5499 static void inject_pending_event(struct kvm_vcpu *vcpu) 5499 static void inject_pending_event(struct kvm_vcpu *vcpu)
5500 { 5500 {
5501 /* try to reinject previous events if any */ 5501 /* try to reinject previous events if any */
5502 if (vcpu->arch.exception.pending) { 5502 if (vcpu->arch.exception.pending) {
5503 trace_kvm_inj_exception(vcpu->arch.exception.nr, 5503 trace_kvm_inj_exception(vcpu->arch.exception.nr,
5504 vcpu->arch.exception.has_error_code, 5504 vcpu->arch.exception.has_error_code,
5505 vcpu->arch.exception.error_code); 5505 vcpu->arch.exception.error_code);
5506 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 5506 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
5507 vcpu->arch.exception.has_error_code, 5507 vcpu->arch.exception.has_error_code,
5508 vcpu->arch.exception.error_code, 5508 vcpu->arch.exception.error_code,
5509 vcpu->arch.exception.reinject); 5509 vcpu->arch.exception.reinject);
5510 return; 5510 return;
5511 } 5511 }
5512 5512
5513 if (vcpu->arch.nmi_injected) { 5513 if (vcpu->arch.nmi_injected) {
5514 kvm_x86_ops->set_nmi(vcpu); 5514 kvm_x86_ops->set_nmi(vcpu);
5515 return; 5515 return;
5516 } 5516 }
5517 5517
5518 if (vcpu->arch.interrupt.pending) { 5518 if (vcpu->arch.interrupt.pending) {
5519 kvm_x86_ops->set_irq(vcpu); 5519 kvm_x86_ops->set_irq(vcpu);
5520 return; 5520 return;
5521 } 5521 }
5522 5522
5523 /* try to inject new event if pending */ 5523 /* try to inject new event if pending */
5524 if (vcpu->arch.nmi_pending) { 5524 if (vcpu->arch.nmi_pending) {
5525 if (kvm_x86_ops->nmi_allowed(vcpu)) { 5525 if (kvm_x86_ops->nmi_allowed(vcpu)) {
5526 --vcpu->arch.nmi_pending; 5526 --vcpu->arch.nmi_pending;
5527 vcpu->arch.nmi_injected = true; 5527 vcpu->arch.nmi_injected = true;
5528 kvm_x86_ops->set_nmi(vcpu); 5528 kvm_x86_ops->set_nmi(vcpu);
5529 } 5529 }
5530 } else if (kvm_cpu_has_interrupt(vcpu)) { 5530 } else if (kvm_cpu_has_interrupt(vcpu)) {
5531 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5531 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5532 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5532 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5533 false); 5533 false);
5534 kvm_x86_ops->set_irq(vcpu); 5534 kvm_x86_ops->set_irq(vcpu);
5535 } 5535 }
5536 } 5536 }
5537 } 5537 }
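inject_pending_event() encodes a strict priority: re-injection of an event that was already being delivered (exception, then NMI, then interrupt) always precedes injecting anything new, and a newly pending NMI is preferred over an external interrupt. The compact restatement below is hypothetical; the struct and the next_event() helper are simplified stand-ins, not the real kvm_vcpu layout.

/* Hypothetical restatement of the priority used by inject_pending_event()
 * above; not kernel code. */
#include <stdio.h>
#include <stdbool.h>

struct pending {                        /* assumed, simplified vcpu state */
        bool exception;                 /* vcpu->arch.exception.pending   */
        bool nmi_injected;              /* re-inject an interrupted NMI   */
        bool irq_injected;              /* vcpu->arch.interrupt.pending   */
        unsigned nmi_pending;           /* new NMIs queued                */
        bool irq_ready;                 /* kvm_cpu_has_interrupt()        */
};

static const char *next_event(const struct pending *p)
{
        if (p->exception)       return "re-inject exception";
        if (p->nmi_injected)    return "re-inject NMI";
        if (p->irq_injected)    return "re-inject IRQ";
        if (p->nmi_pending)     return "inject new NMI (if allowed)";
        if (p->irq_ready)       return "inject new IRQ (if allowed)";
        return "nothing to inject";
}

int main(void)
{
        struct pending p = { .nmi_pending = 1, .irq_ready = true };

        printf("%s\n", next_event(&p));  /* the NMI wins over the IRQ */
        return 0;
}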
5538 5538
5539 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) 5539 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
5540 { 5540 {
5541 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && 5541 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
5542 !vcpu->guest_xcr0_loaded) { 5542 !vcpu->guest_xcr0_loaded) {
5543 /* kvm_set_xcr() also depends on this */ 5543 /* kvm_set_xcr() also depends on this */
5544 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 5544 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
5545 vcpu->guest_xcr0_loaded = 1; 5545 vcpu->guest_xcr0_loaded = 1;
5546 } 5546 }
5547 } 5547 }
5548 5548
5549 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) 5549 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5550 { 5550 {
5551 if (vcpu->guest_xcr0_loaded) { 5551 if (vcpu->guest_xcr0_loaded) {
5552 if (vcpu->arch.xcr0 != host_xcr0) 5552 if (vcpu->arch.xcr0 != host_xcr0)
5553 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 5553 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
5554 vcpu->guest_xcr0_loaded = 0; 5554 vcpu->guest_xcr0_loaded = 0;
5555 } 5555 }
5556 } 5556 }
5557 5557
5558 static void process_nmi(struct kvm_vcpu *vcpu) 5558 static void process_nmi(struct kvm_vcpu *vcpu)
5559 { 5559 {
5560 unsigned limit = 2; 5560 unsigned limit = 2;
5561 5561
5562 /* 5562 /*
5563 * x86 is limited to one NMI running, and one NMI pending after it. 5563 * x86 is limited to one NMI running, and one NMI pending after it.
5564 * If an NMI is already in progress, limit further NMIs to just one. 5564 * If an NMI is already in progress, limit further NMIs to just one.
5565 * Otherwise, allow two (and we'll inject the first one immediately). 5565 * Otherwise, allow two (and we'll inject the first one immediately).
5566 */ 5566 */
5567 if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) 5567 if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
5568 limit = 1; 5568 limit = 1;
5569 5569
5570 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); 5570 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
5571 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); 5571 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
5572 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 kvm_make_request(KVM_REQ_EVENT, vcpu);
5573 } 5573 }
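process_nmi() drains the atomically queued NMIs into nmi_pending and clamps the result to two, or to one if an NMI is already being handled, matching the comment above. A tiny userspace model of that clamping, with made-up inputs:

/* Hypothetical model of the NMI collapsing done by process_nmi() above:
 * queued NMIs move to 'pending', capped at 2 (or 1 while one is running). */
#include <stdio.h>

static unsigned process_nmi(unsigned pending, unsigned queued, int nmi_in_progress)
{
        unsigned limit = nmi_in_progress ? 1 : 2;

        pending += queued;                              /* drain the queue */
        return pending < limit ? pending : limit;       /* min(pending, limit) */
}

int main(void)
{
        printf("%u\n", process_nmi(0, 5, 0));   /* 5 queued, idle        -> 2 */
        printf("%u\n", process_nmi(0, 5, 1));   /* 5 queued, NMI running -> 1 */
        printf("%u\n", process_nmi(1, 0, 0));   /* nothing new           -> 1 */
        return 0;
}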
5574 5574
5575 static void kvm_gen_update_masterclock(struct kvm *kvm) 5575 static void kvm_gen_update_masterclock(struct kvm *kvm)
5576 { 5576 {
5577 #ifdef CONFIG_X86_64 5577 #ifdef CONFIG_X86_64
5578 int i; 5578 int i;
5579 struct kvm_vcpu *vcpu; 5579 struct kvm_vcpu *vcpu;
5580 struct kvm_arch *ka = &kvm->arch; 5580 struct kvm_arch *ka = &kvm->arch;
5581 5581
5582 spin_lock(&ka->pvclock_gtod_sync_lock); 5582 spin_lock(&ka->pvclock_gtod_sync_lock);
5583 kvm_make_mclock_inprogress_request(kvm); 5583 kvm_make_mclock_inprogress_request(kvm);
5584 /* no guest entries from this point */ 5584 /* no guest entries from this point */
5585 pvclock_update_vm_gtod_copy(kvm); 5585 pvclock_update_vm_gtod_copy(kvm);
5586 5586
5587 kvm_for_each_vcpu(i, vcpu, kvm) 5587 kvm_for_each_vcpu(i, vcpu, kvm)
5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5589 5589
5590 /* guest entries allowed */ 5590 /* guest entries allowed */
5591 kvm_for_each_vcpu(i, vcpu, kvm) 5591 kvm_for_each_vcpu(i, vcpu, kvm)
5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); 5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5593 5593
5594 spin_unlock(&ka->pvclock_gtod_sync_lock); 5594 spin_unlock(&ka->pvclock_gtod_sync_lock);
5595 #endif 5595 #endif
5596 } 5596 }
5597 5597
5598 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5598 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5599 { 5599 {
5600 int r; 5600 int r;
5601 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5601 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5602 vcpu->run->request_interrupt_window; 5602 vcpu->run->request_interrupt_window;
5603 bool req_immediate_exit = 0; 5603 bool req_immediate_exit = 0;
5604 5604
5605 if (vcpu->requests) { 5605 if (vcpu->requests) {
5606 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 5606 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
5607 kvm_mmu_unload(vcpu); 5607 kvm_mmu_unload(vcpu);
5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5609 __kvm_migrate_timers(vcpu); 5609 __kvm_migrate_timers(vcpu);
5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5611 kvm_gen_update_masterclock(vcpu->kvm); 5611 kvm_gen_update_masterclock(vcpu->kvm);
5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5613 r = kvm_guest_time_update(vcpu); 5613 r = kvm_guest_time_update(vcpu);
5614 if (unlikely(r)) 5614 if (unlikely(r))
5615 goto out; 5615 goto out;
5616 } 5616 }
5617 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5617 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
5618 kvm_mmu_sync_roots(vcpu); 5618 kvm_mmu_sync_roots(vcpu);
5619 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5619 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
5620 kvm_x86_ops->tlb_flush(vcpu); 5620 kvm_x86_ops->tlb_flush(vcpu);
5621 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 5621 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
5622 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 5622 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
5623 r = 0; 5623 r = 0;
5624 goto out; 5624 goto out;
5625 } 5625 }
5626 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 5626 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
5627 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5627 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5628 r = 0; 5628 r = 0;
5629 goto out; 5629 goto out;
5630 } 5630 }
5631 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { 5631 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
5632 vcpu->fpu_active = 0; 5632 vcpu->fpu_active = 0;
5633 kvm_x86_ops->fpu_deactivate(vcpu); 5633 kvm_x86_ops->fpu_deactivate(vcpu);
5634 } 5634 }
5635 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 5635 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5636 /* Page is swapped out. Do synthetic halt */ 5636 /* Page is swapped out. Do synthetic halt */
5637 vcpu->arch.apf.halted = true; 5637 vcpu->arch.apf.halted = true;
5638 r = 1; 5638 r = 1;
5639 goto out; 5639 goto out;
5640 } 5640 }
5641 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) 5641 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5642 record_steal_time(vcpu); 5642 record_steal_time(vcpu);
5643 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 5643 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5644 process_nmi(vcpu); 5644 process_nmi(vcpu);
5645 req_immediate_exit = 5645 req_immediate_exit =
5646 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 5646 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
5647 if (kvm_check_request(KVM_REQ_PMU, vcpu)) 5647 if (kvm_check_request(KVM_REQ_PMU, vcpu))
5648 kvm_handle_pmu_event(vcpu); 5648 kvm_handle_pmu_event(vcpu);
5649 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5649 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5650 kvm_deliver_pmi(vcpu); 5650 kvm_deliver_pmi(vcpu);
5651 } 5651 }
5652 5652
5653 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5653 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5654 inject_pending_event(vcpu); 5654 inject_pending_event(vcpu);
5655 5655
5656 /* enable NMI/IRQ window open exits if needed */ 5656 /* enable NMI/IRQ window open exits if needed */
5657 if (vcpu->arch.nmi_pending) 5657 if (vcpu->arch.nmi_pending)
5658 kvm_x86_ops->enable_nmi_window(vcpu); 5658 kvm_x86_ops->enable_nmi_window(vcpu);
5659 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5659 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5660 kvm_x86_ops->enable_irq_window(vcpu); 5660 kvm_x86_ops->enable_irq_window(vcpu);
5661 5661
5662 if (kvm_lapic_enabled(vcpu)) { 5662 if (kvm_lapic_enabled(vcpu)) {
5663 update_cr8_intercept(vcpu); 5663 update_cr8_intercept(vcpu);
5664 kvm_lapic_sync_to_vapic(vcpu); 5664 kvm_lapic_sync_to_vapic(vcpu);
5665 } 5665 }
5666 } 5666 }
5667 5667
5668 r = kvm_mmu_reload(vcpu); 5668 r = kvm_mmu_reload(vcpu);
5669 if (unlikely(r)) { 5669 if (unlikely(r)) {
5670 goto cancel_injection; 5670 goto cancel_injection;
5671 } 5671 }
5672 5672
5673 preempt_disable(); 5673 preempt_disable();
5674 5674
5675 kvm_x86_ops->prepare_guest_switch(vcpu); 5675 kvm_x86_ops->prepare_guest_switch(vcpu);
5676 if (vcpu->fpu_active) 5676 if (vcpu->fpu_active)
5677 kvm_load_guest_fpu(vcpu); 5677 kvm_load_guest_fpu(vcpu);
5678 kvm_load_guest_xcr0(vcpu); 5678 kvm_load_guest_xcr0(vcpu);
5679 5679
5680 vcpu->mode = IN_GUEST_MODE; 5680 vcpu->mode = IN_GUEST_MODE;
5681 5681
5682 /* We should set ->mode before checking ->requests; 5682 /* We should set ->mode before checking ->requests;
5683 * see the comment in make_all_cpus_request. 5683 * see the comment in make_all_cpus_request.
5684 */ 5684 */
5685 smp_mb(); 5685 smp_mb();
5686 5686
5687 local_irq_disable(); 5687 local_irq_disable();
5688 5688
5689 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests 5689 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5690 || need_resched() || signal_pending(current)) { 5690 || need_resched() || signal_pending(current)) {
5691 vcpu->mode = OUTSIDE_GUEST_MODE; 5691 vcpu->mode = OUTSIDE_GUEST_MODE;
5692 smp_wmb(); 5692 smp_wmb();
5693 local_irq_enable(); 5693 local_irq_enable();
5694 preempt_enable(); 5694 preempt_enable();
5695 r = 1; 5695 r = 1;
5696 goto cancel_injection; 5696 goto cancel_injection;
5697 } 5697 }
5698 5698
5699 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5699 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5700 5700
5701 if (req_immediate_exit) 5701 if (req_immediate_exit)
5702 smp_send_reschedule(vcpu->cpu); 5702 smp_send_reschedule(vcpu->cpu);
5703 5703
5704 kvm_guest_enter(); 5704 kvm_guest_enter();
5705 5705
5706 if (unlikely(vcpu->arch.switch_db_regs)) { 5706 if (unlikely(vcpu->arch.switch_db_regs)) {
5707 set_debugreg(0, 7); 5707 set_debugreg(0, 7);
5708 set_debugreg(vcpu->arch.eff_db[0], 0); 5708 set_debugreg(vcpu->arch.eff_db[0], 0);
5709 set_debugreg(vcpu->arch.eff_db[1], 1); 5709 set_debugreg(vcpu->arch.eff_db[1], 1);
5710 set_debugreg(vcpu->arch.eff_db[2], 2); 5710 set_debugreg(vcpu->arch.eff_db[2], 2);
5711 set_debugreg(vcpu->arch.eff_db[3], 3); 5711 set_debugreg(vcpu->arch.eff_db[3], 3);
5712 } 5712 }
5713 5713
5714 trace_kvm_entry(vcpu->vcpu_id); 5714 trace_kvm_entry(vcpu->vcpu_id);
5715 kvm_x86_ops->run(vcpu); 5715 kvm_x86_ops->run(vcpu);
5716 5716
5717 /* 5717 /*
5718 * If the guest has used debug registers, at least dr7 5718 * If the guest has used debug registers, at least dr7
5719 * will be disabled while returning to the host. 5719 * will be disabled while returning to the host.
5720 * If we don't have active breakpoints in the host, we don't 5720 * If we don't have active breakpoints in the host, we don't
5721 * care about the messed up debug address registers. But if 5721 * care about the messed up debug address registers. But if
5722 * we have some of them active, restore the old state. 5722 * we have some of them active, restore the old state.
5723 */ 5723 */
5724 if (hw_breakpoint_active()) 5724 if (hw_breakpoint_active())
5725 hw_breakpoint_restore(); 5725 hw_breakpoint_restore();
5726 5726
5727 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 5727 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
5728 native_read_tsc()); 5728 native_read_tsc());
5729 5729
5730 vcpu->mode = OUTSIDE_GUEST_MODE; 5730 vcpu->mode = OUTSIDE_GUEST_MODE;
5731 smp_wmb(); 5731 smp_wmb();
5732 local_irq_enable(); 5732 local_irq_enable();
5733 5733
5734 ++vcpu->stat.exits; 5734 ++vcpu->stat.exits;
5735 5735
5736 /* 5736 /*
5737 * We must have an instruction between local_irq_enable() and 5737 * We must have an instruction between local_irq_enable() and
5738 * kvm_guest_exit(), so the timer interrupt isn't delayed by 5738 * kvm_guest_exit(), so the timer interrupt isn't delayed by
5739 * the interrupt shadow. The stat.exits increment will do nicely. 5739 * the interrupt shadow. The stat.exits increment will do nicely.
5740 * But we need to prevent reordering, hence this barrier(): 5740 * But we need to prevent reordering, hence this barrier():
5741 */ 5741 */
5742 barrier(); 5742 barrier();
5743 5743
5744 kvm_guest_exit(); 5744 kvm_guest_exit();
5745 5745
5746 preempt_enable(); 5746 preempt_enable();
5747 5747
5748 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5748 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5749 5749
5750 /* 5750 /*
5751 * Profile KVM exit RIPs: 5751 * Profile KVM exit RIPs:
5752 */ 5752 */
5753 if (unlikely(prof_on == KVM_PROFILING)) { 5753 if (unlikely(prof_on == KVM_PROFILING)) {
5754 unsigned long rip = kvm_rip_read(vcpu); 5754 unsigned long rip = kvm_rip_read(vcpu);
5755 profile_hit(KVM_PROFILING, (void *)rip); 5755 profile_hit(KVM_PROFILING, (void *)rip);
5756 } 5756 }
5757 5757
5758 if (unlikely(vcpu->arch.tsc_always_catchup)) 5758 if (unlikely(vcpu->arch.tsc_always_catchup))
5759 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5759 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5760 5760
5761 if (vcpu->arch.apic_attention) 5761 if (vcpu->arch.apic_attention)
5762 kvm_lapic_sync_from_vapic(vcpu); 5762 kvm_lapic_sync_from_vapic(vcpu);
5763 5763
5764 r = kvm_x86_ops->handle_exit(vcpu); 5764 r = kvm_x86_ops->handle_exit(vcpu);
5765 return r; 5765 return r;
5766 5766
5767 cancel_injection: 5767 cancel_injection:
5768 kvm_x86_ops->cancel_injection(vcpu); 5768 kvm_x86_ops->cancel_injection(vcpu);
5769 if (unlikely(vcpu->arch.apic_attention)) 5769 if (unlikely(vcpu->arch.apic_attention))
5770 kvm_lapic_sync_from_vapic(vcpu); 5770 kvm_lapic_sync_from_vapic(vcpu);
5771 out: 5771 out:
5772 return r; 5772 return r;
5773 } 5773 }
5774 5774
5775 5775
5776 static int __vcpu_run(struct kvm_vcpu *vcpu) 5776 static int __vcpu_run(struct kvm_vcpu *vcpu)
5777 { 5777 {
5778 int r; 5778 int r;
5779 struct kvm *kvm = vcpu->kvm; 5779 struct kvm *kvm = vcpu->kvm;
5780 5780
5781 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 5781 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
5782 pr_debug("vcpu %d received sipi with vector # %x\n", 5782 pr_debug("vcpu %d received sipi with vector # %x\n",
5783 vcpu->vcpu_id, vcpu->arch.sipi_vector); 5783 vcpu->vcpu_id, vcpu->arch.sipi_vector);
5784 kvm_lapic_reset(vcpu); 5784 kvm_lapic_reset(vcpu);
5785 r = kvm_vcpu_reset(vcpu); 5785 r = kvm_vcpu_reset(vcpu);
5786 if (r) 5786 if (r)
5787 return r; 5787 return r;
5788 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5788 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5789 } 5789 }
5790 5790
5791 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5791 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5792 r = vapic_enter(vcpu); 5792 r = vapic_enter(vcpu);
5793 if (r) { 5793 if (r) {
5794 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5794 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5795 return r; 5795 return r;
5796 } 5796 }
5797 5797
5798 r = 1; 5798 r = 1;
5799 while (r > 0) { 5799 while (r > 0) {
5800 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 5800 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5801 !vcpu->arch.apf.halted) 5801 !vcpu->arch.apf.halted)
5802 r = vcpu_enter_guest(vcpu); 5802 r = vcpu_enter_guest(vcpu);
5803 else { 5803 else {
5804 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5804 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5805 kvm_vcpu_block(vcpu); 5805 kvm_vcpu_block(vcpu);
5806 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5806 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5807 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 5807 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
5808 { 5808 {
5809 switch(vcpu->arch.mp_state) { 5809 switch(vcpu->arch.mp_state) {
5810 case KVM_MP_STATE_HALTED: 5810 case KVM_MP_STATE_HALTED:
5811 vcpu->arch.mp_state = 5811 vcpu->arch.mp_state =
5812 KVM_MP_STATE_RUNNABLE; 5812 KVM_MP_STATE_RUNNABLE;
5813 case KVM_MP_STATE_RUNNABLE: 5813 case KVM_MP_STATE_RUNNABLE:
5814 vcpu->arch.apf.halted = false; 5814 vcpu->arch.apf.halted = false;
5815 break; 5815 break;
5816 case KVM_MP_STATE_SIPI_RECEIVED: 5816 case KVM_MP_STATE_SIPI_RECEIVED:
5817 default: 5817 default:
5818 r = -EINTR; 5818 r = -EINTR;
5819 break; 5819 break;
5820 } 5820 }
5821 } 5821 }
5822 } 5822 }
5823 5823
5824 if (r <= 0) 5824 if (r <= 0)
5825 break; 5825 break;
5826 5826
5827 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 5827 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
5828 if (kvm_cpu_has_pending_timer(vcpu)) 5828 if (kvm_cpu_has_pending_timer(vcpu))
5829 kvm_inject_pending_timer_irqs(vcpu); 5829 kvm_inject_pending_timer_irqs(vcpu);
5830 5830
5831 if (dm_request_for_irq_injection(vcpu)) { 5831 if (dm_request_for_irq_injection(vcpu)) {
5832 r = -EINTR; 5832 r = -EINTR;
5833 vcpu->run->exit_reason = KVM_EXIT_INTR; 5833 vcpu->run->exit_reason = KVM_EXIT_INTR;
5834 ++vcpu->stat.request_irq_exits; 5834 ++vcpu->stat.request_irq_exits;
5835 } 5835 }
5836 5836
5837 kvm_check_async_pf_completion(vcpu); 5837 kvm_check_async_pf_completion(vcpu);
5838 5838
5839 if (signal_pending(current)) { 5839 if (signal_pending(current)) {
5840 r = -EINTR; 5840 r = -EINTR;
5841 vcpu->run->exit_reason = KVM_EXIT_INTR; 5841 vcpu->run->exit_reason = KVM_EXIT_INTR;
5842 ++vcpu->stat.signal_exits; 5842 ++vcpu->stat.signal_exits;
5843 } 5843 }
5844 if (need_resched()) { 5844 if (need_resched()) {
5845 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5845 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5846 kvm_resched(vcpu); 5846 kvm_resched(vcpu);
5847 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5847 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5848 } 5848 }
5849 } 5849 }
5850 5850
5851 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5851 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5852 5852
5853 vapic_exit(vcpu); 5853 vapic_exit(vcpu);
5854 5854
5855 return r; 5855 return r;
5856 } 5856 }
5857 5857
5858 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) 5858 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
5859 { 5859 {
5860 int r; 5860 int r;
5861 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5861 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5862 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5862 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5863 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5863 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5864 if (r != EMULATE_DONE) 5864 if (r != EMULATE_DONE)
5865 return 0; 5865 return 0;
5866 return 1; 5866 return 1;
5867 } 5867 }
5868 5868
5869 static int complete_emulated_pio(struct kvm_vcpu *vcpu) 5869 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
5870 { 5870 {
5871 BUG_ON(!vcpu->arch.pio.count); 5871 BUG_ON(!vcpu->arch.pio.count);
5872 5872
5873 return complete_emulated_io(vcpu); 5873 return complete_emulated_io(vcpu);
5874 } 5874 }
5875 5875
5876 /* 5876 /*
5877 * Implements the following, as a state machine: 5877 * Implements the following, as a state machine:
5878 * 5878 *
5879 * read: 5879 * read:
5880 * for each fragment 5880 * for each fragment
5881 * for each mmio piece in the fragment 5881 * for each mmio piece in the fragment
5882 * write gpa, len 5882 * write gpa, len
5883 * exit 5883 * exit
5884 * copy data 5884 * copy data
5885 * execute insn 5885 * execute insn
5886 * 5886 *
5887 * write: 5887 * write:
5888 * for each fragment 5888 * for each fragment
5889 * for each mmio piece in the fragment 5889 * for each mmio piece in the fragment
5890 * write gpa, len 5890 * write gpa, len
5891 * copy data 5891 * copy data
5892 * exit 5892 * exit
5893 */ 5893 */
5894 static int complete_emulated_mmio(struct kvm_vcpu *vcpu) 5894 static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5895 { 5895 {
5896 struct kvm_run *run = vcpu->run; 5896 struct kvm_run *run = vcpu->run;
5897 struct kvm_mmio_fragment *frag; 5897 struct kvm_mmio_fragment *frag;
5898 unsigned len; 5898 unsigned len;
5899 5899
5900 BUG_ON(!vcpu->mmio_needed); 5900 BUG_ON(!vcpu->mmio_needed);
5901 5901
5902 /* Complete previous fragment */ 5902 /* Complete previous fragment */
5903 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; 5903 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
5904 len = min(8u, frag->len); 5904 len = min(8u, frag->len);
5905 if (!vcpu->mmio_is_write) 5905 if (!vcpu->mmio_is_write)
5906 memcpy(frag->data, run->mmio.data, len); 5906 memcpy(frag->data, run->mmio.data, len);
5907 5907
5908 if (frag->len <= 8) { 5908 if (frag->len <= 8) {
5909 /* Switch to the next fragment. */ 5909 /* Switch to the next fragment. */
5910 frag++; 5910 frag++;
5911 vcpu->mmio_cur_fragment++; 5911 vcpu->mmio_cur_fragment++;
5912 } else { 5912 } else {
5913 /* Go forward to the next mmio piece. */ 5913 /* Go forward to the next mmio piece. */
5914 frag->data += len; 5914 frag->data += len;
5915 frag->gpa += len; 5915 frag->gpa += len;
5916 frag->len -= len; 5916 frag->len -= len;
5917 } 5917 }
5918 5918
5919 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 5919 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5920 vcpu->mmio_needed = 0; 5920 vcpu->mmio_needed = 0;
5921 if (vcpu->mmio_is_write) 5921 if (vcpu->mmio_is_write)
5922 return 1; 5922 return 1;
5923 vcpu->mmio_read_completed = 1; 5923 vcpu->mmio_read_completed = 1;
5924 return complete_emulated_io(vcpu); 5924 return complete_emulated_io(vcpu);
5925 } 5925 }
5926 5926
5927 run->exit_reason = KVM_EXIT_MMIO; 5927 run->exit_reason = KVM_EXIT_MMIO;
5928 run->mmio.phys_addr = frag->gpa; 5928 run->mmio.phys_addr = frag->gpa;
5929 if (vcpu->mmio_is_write) 5929 if (vcpu->mmio_is_write)
5930 memcpy(run->mmio.data, frag->data, min(8u, frag->len)); 5930 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
5931 run->mmio.len = min(8u, frag->len); 5931 run->mmio.len = min(8u, frag->len);
5932 run->mmio.is_write = vcpu->mmio_is_write; 5932 run->mmio.is_write = vcpu->mmio_is_write;
5933 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5933 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5934 return 0; 5934 return 0;
5935 } 5935 }
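The state machine documented above boils down to this: each exit to userspace carries at most 8 bytes, and a fragment longer than that is advanced in place until it is exhausted. The sketch below walks a single oversized fragment through that loop; the struct and the 20-byte example access are simplifications, not the real kvm_mmio_fragment handling across multiple fragments.

/* Hypothetical walk-through of the fragment splitting performed by
 * complete_emulated_mmio() above; not kernel code. */
#include <stdio.h>

struct frag {                           /* simplified kvm_mmio_fragment */
        unsigned long long gpa;
        unsigned len;
};

int main(void)
{
        struct frag f = { .gpa = 0xfed00000, .len = 20 };  /* example 20-byte access */
        int exits = 0;

        while (f.len) {
                unsigned piece = f.len < 8 ? f.len : 8;    /* min(8, frag->len) */

                printf("exit %d: gpa=0x%llx len=%u\n", ++exits, f.gpa, piece);
                f.gpa += piece;
                f.len -= piece;
        }
        return 0;       /* 20 bytes -> three exits of 8, 8 and 4 bytes */
}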
5936 5936
5937 5937
5938 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5938 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5939 { 5939 {
5940 int r; 5940 int r;
5941 sigset_t sigsaved; 5941 sigset_t sigsaved;
5942 5942
5943 if (!tsk_used_math(current) && init_fpu(current)) 5943 if (!tsk_used_math(current) && init_fpu(current))
5944 return -ENOMEM; 5944 return -ENOMEM;
5945 5945
5946 if (vcpu->sigset_active) 5946 if (vcpu->sigset_active)
5947 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5947 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5948 5948
5949 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 5949 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
5950 kvm_vcpu_block(vcpu); 5950 kvm_vcpu_block(vcpu);
5951 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 5951 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
5952 r = -EAGAIN; 5952 r = -EAGAIN;
5953 goto out; 5953 goto out;
5954 } 5954 }
5955 5955
5956 /* re-sync apic's tpr */ 5956 /* re-sync apic's tpr */
5957 if (!irqchip_in_kernel(vcpu->kvm)) { 5957 if (!irqchip_in_kernel(vcpu->kvm)) {
5958 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 5958 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5959 r = -EINVAL; 5959 r = -EINVAL;
5960 goto out; 5960 goto out;
5961 } 5961 }
5962 } 5962 }
5963 5963
5964 if (unlikely(vcpu->arch.complete_userspace_io)) { 5964 if (unlikely(vcpu->arch.complete_userspace_io)) {
5965 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; 5965 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
5966 vcpu->arch.complete_userspace_io = NULL; 5966 vcpu->arch.complete_userspace_io = NULL;
5967 r = cui(vcpu); 5967 r = cui(vcpu);
5968 if (r <= 0) 5968 if (r <= 0)
5969 goto out; 5969 goto out;
5970 } else 5970 } else
5971 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 5971 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
5972 5972
5973 r = __vcpu_run(vcpu); 5973 r = __vcpu_run(vcpu);
5974 5974
5975 out: 5975 out:
5976 post_kvm_run_save(vcpu); 5976 post_kvm_run_save(vcpu);
5977 if (vcpu->sigset_active) 5977 if (vcpu->sigset_active)
5978 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 5978 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
5979 5979
5980 return r; 5980 return r;
5981 } 5981 }
5982 5982
5983 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5983 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5984 { 5984 {
5985 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { 5985 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5986 /* 5986 /*
5987 * We are here if userspace calls get_regs() in the middle of 5987 * We are here if userspace calls get_regs() in the middle of
5988 * instruction emulation. Register state needs to be copied 5988 * instruction emulation. Register state needs to be copied
5989 * back from emulation context to vcpu. Userspace shouldn't do 5989 * back from emulation context to vcpu. Userspace shouldn't do
5990 * that usually, but some badly designed PV devices (vmware 5990 * that usually, but some badly designed PV devices (vmware
5991 * backdoor interface) need this to work 5991 * backdoor interface) need this to work
5992 */ 5992 */
5993 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); 5993 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
5994 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5994 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5995 } 5995 }
5996 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5996 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5997 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5997 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5998 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5998 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5999 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 5999 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
6000 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 6000 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
6001 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 6001 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
6002 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 6002 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6003 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 6003 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
6004 #ifdef CONFIG_X86_64 6004 #ifdef CONFIG_X86_64
6005 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 6005 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
6006 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 6006 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
6007 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 6007 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
6008 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 6008 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
6009 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 6009 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
6010 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 6010 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
6011 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 6011 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
6012 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 6012 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
6013 #endif 6013 #endif
6014 6014
6015 regs->rip = kvm_rip_read(vcpu); 6015 regs->rip = kvm_rip_read(vcpu);
6016 regs->rflags = kvm_get_rflags(vcpu); 6016 regs->rflags = kvm_get_rflags(vcpu);
6017 6017
6018 return 0; 6018 return 0;
6019 } 6019 }
6020 6020
6021 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 6021 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6022 { 6022 {
6023 vcpu->arch.emulate_regs_need_sync_from_vcpu = true; 6023 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
6024 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 6024 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6025 6025
6026 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 6026 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
6027 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 6027 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
6028 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 6028 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
6029 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 6029 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
6030 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 6030 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
6031 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 6031 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
6032 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 6032 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
6033 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 6033 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
6034 #ifdef CONFIG_X86_64 6034 #ifdef CONFIG_X86_64
6035 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 6035 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
6036 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 6036 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
6037 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 6037 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
6038 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 6038 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
6039 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 6039 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
6040 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 6040 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
6041 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 6041 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
6042 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 6042 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
6043 #endif 6043 #endif
6044 6044
6045 kvm_rip_write(vcpu, regs->rip); 6045 kvm_rip_write(vcpu, regs->rip);
6046 kvm_set_rflags(vcpu, regs->rflags); 6046 kvm_set_rflags(vcpu, regs->rflags);
6047 6047
6048 vcpu->arch.exception.pending = false; 6048 vcpu->arch.exception.pending = false;
6049 6049
6050 kvm_make_request(KVM_REQ_EVENT, vcpu); 6050 kvm_make_request(KVM_REQ_EVENT, vcpu);
6051 6051
6052 return 0; 6052 return 0;
6053 } 6053 }
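
These two handlers sit behind the KVM_GET_REGS and KVM_SET_REGS vcpu ioctls. As a small illustration (not part of this commit), userspace can round-trip the general-purpose registers through struct kvm_regs as sketched below; vcpu_fd is assumed to be a vcpu file descriptor obtained from KVM_CREATE_VCPU.

/* Illustrative only: read the guest GPRs, advance RIP by one byte
 * (e.g. to step over an int3 that userspace planted), and write them back. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int skip_breakpoint_byte(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		return -1;
	regs.rip += 1;
	return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}

As the set_regs handler above shows, KVM_SET_REGS also drops any pending exception and queues a KVM_REQ_EVENT re-check, so injected state is re-evaluated on the next guest entry.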
6054 6054
6055 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 6055 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
6056 { 6056 {
6057 struct kvm_segment cs; 6057 struct kvm_segment cs;
6058 6058
6059 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 6059 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6060 *db = cs.db; 6060 *db = cs.db;
6061 *l = cs.l; 6061 *l = cs.l;
6062 } 6062 }
6063 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 6063 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
6064 6064
6065 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 6065 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
6066 struct kvm_sregs *sregs) 6066 struct kvm_sregs *sregs)
6067 { 6067 {
6068 struct desc_ptr dt; 6068 struct desc_ptr dt;
6069 6069
6070 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 6070 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6071 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 6071 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6072 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 6072 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6073 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 6073 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6074 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 6074 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6075 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 6075 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6076 6076
6077 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 6077 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6078 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 6078 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6079 6079
6080 kvm_x86_ops->get_idt(vcpu, &dt); 6080 kvm_x86_ops->get_idt(vcpu, &dt);
6081 sregs->idt.limit = dt.size; 6081 sregs->idt.limit = dt.size;
6082 sregs->idt.base = dt.address; 6082 sregs->idt.base = dt.address;
6083 kvm_x86_ops->get_gdt(vcpu, &dt); 6083 kvm_x86_ops->get_gdt(vcpu, &dt);
6084 sregs->gdt.limit = dt.size; 6084 sregs->gdt.limit = dt.size;
6085 sregs->gdt.base = dt.address; 6085 sregs->gdt.base = dt.address;
6086 6086
6087 sregs->cr0 = kvm_read_cr0(vcpu); 6087 sregs->cr0 = kvm_read_cr0(vcpu);
6088 sregs->cr2 = vcpu->arch.cr2; 6088 sregs->cr2 = vcpu->arch.cr2;
6089 sregs->cr3 = kvm_read_cr3(vcpu); 6089 sregs->cr3 = kvm_read_cr3(vcpu);
6090 sregs->cr4 = kvm_read_cr4(vcpu); 6090 sregs->cr4 = kvm_read_cr4(vcpu);
6091 sregs->cr8 = kvm_get_cr8(vcpu); 6091 sregs->cr8 = kvm_get_cr8(vcpu);
6092 sregs->efer = vcpu->arch.efer; 6092 sregs->efer = vcpu->arch.efer;
6093 sregs->apic_base = kvm_get_apic_base(vcpu); 6093 sregs->apic_base = kvm_get_apic_base(vcpu);
6094 6094
6095 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 6095 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
6096 6096
6097 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 6097 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
6098 set_bit(vcpu->arch.interrupt.nr, 6098 set_bit(vcpu->arch.interrupt.nr,
6099 (unsigned long *)sregs->interrupt_bitmap); 6099 (unsigned long *)sregs->interrupt_bitmap);
6100 6100
6101 return 0; 6101 return 0;
6102 } 6102 }
6103 6103
6104 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 6104 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6105 struct kvm_mp_state *mp_state) 6105 struct kvm_mp_state *mp_state)
6106 { 6106 {
6107 mp_state->mp_state = vcpu->arch.mp_state; 6107 mp_state->mp_state = vcpu->arch.mp_state;
6108 return 0; 6108 return 0;
6109 } 6109 }
6110 6110
6111 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 6111 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6112 struct kvm_mp_state *mp_state) 6112 struct kvm_mp_state *mp_state)
6113 { 6113 {
6114 vcpu->arch.mp_state = mp_state->mp_state; 6114 vcpu->arch.mp_state = mp_state->mp_state;
6115 kvm_make_request(KVM_REQ_EVENT, vcpu); 6115 kvm_make_request(KVM_REQ_EVENT, vcpu);
6116 return 0; 6116 return 0;
6117 } 6117 }
6118 6118
6119 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, 6119 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
6120 int reason, bool has_error_code, u32 error_code) 6120 int reason, bool has_error_code, u32 error_code)
6121 { 6121 {
6122 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 6122 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6123 int ret; 6123 int ret;
6124 6124
6125 init_emulate_ctxt(vcpu); 6125 init_emulate_ctxt(vcpu);
6126 6126
6127 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, 6127 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
6128 has_error_code, error_code); 6128 has_error_code, error_code);
6129 6129
6130 if (ret) 6130 if (ret)
6131 return EMULATE_FAIL; 6131 return EMULATE_FAIL;
6132 6132
6133 kvm_rip_write(vcpu, ctxt->eip); 6133 kvm_rip_write(vcpu, ctxt->eip);
6134 kvm_set_rflags(vcpu, ctxt->eflags); 6134 kvm_set_rflags(vcpu, ctxt->eflags);
6135 kvm_make_request(KVM_REQ_EVENT, vcpu); 6135 kvm_make_request(KVM_REQ_EVENT, vcpu);
6136 return EMULATE_DONE; 6136 return EMULATE_DONE;
6137 } 6137 }
6138 EXPORT_SYMBOL_GPL(kvm_task_switch); 6138 EXPORT_SYMBOL_GPL(kvm_task_switch);
6139 6139
6140 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 6140 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6141 struct kvm_sregs *sregs) 6141 struct kvm_sregs *sregs)
6142 { 6142 {
6143 int mmu_reset_needed = 0; 6143 int mmu_reset_needed = 0;
6144 int pending_vec, max_bits, idx; 6144 int pending_vec, max_bits, idx;
6145 struct desc_ptr dt; 6145 struct desc_ptr dt;
6146 6146
6147 if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) 6147 if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
6148 return -EINVAL; 6148 return -EINVAL;
6149 6149
6150 dt.size = sregs->idt.limit; 6150 dt.size = sregs->idt.limit;
6151 dt.address = sregs->idt.base; 6151 dt.address = sregs->idt.base;
6152 kvm_x86_ops->set_idt(vcpu, &dt); 6152 kvm_x86_ops->set_idt(vcpu, &dt);
6153 dt.size = sregs->gdt.limit; 6153 dt.size = sregs->gdt.limit;
6154 dt.address = sregs->gdt.base; 6154 dt.address = sregs->gdt.base;
6155 kvm_x86_ops->set_gdt(vcpu, &dt); 6155 kvm_x86_ops->set_gdt(vcpu, &dt);
6156 6156
6157 vcpu->arch.cr2 = sregs->cr2; 6157 vcpu->arch.cr2 = sregs->cr2;
6158 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 6158 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6159 vcpu->arch.cr3 = sregs->cr3; 6159 vcpu->arch.cr3 = sregs->cr3;
6160 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 6160 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6161 6161
6162 kvm_set_cr8(vcpu, sregs->cr8); 6162 kvm_set_cr8(vcpu, sregs->cr8);
6163 6163
6164 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 6164 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6165 kvm_x86_ops->set_efer(vcpu, sregs->efer); 6165 kvm_x86_ops->set_efer(vcpu, sregs->efer);
6166 kvm_set_apic_base(vcpu, sregs->apic_base); 6166 kvm_set_apic_base(vcpu, sregs->apic_base);
6167 6167
6168 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 6168 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6169 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 6169 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6170 vcpu->arch.cr0 = sregs->cr0; 6170 vcpu->arch.cr0 = sregs->cr0;
6171 6171
6172 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 6172 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6173 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 6173 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6174 if (sregs->cr4 & X86_CR4_OSXSAVE) 6174 if (sregs->cr4 & X86_CR4_OSXSAVE)
6175 kvm_update_cpuid(vcpu); 6175 kvm_update_cpuid(vcpu);
6176 6176
6177 idx = srcu_read_lock(&vcpu->kvm->srcu); 6177 idx = srcu_read_lock(&vcpu->kvm->srcu);
6178 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 6178 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6179 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 6179 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6180 mmu_reset_needed = 1; 6180 mmu_reset_needed = 1;
6181 } 6181 }
6182 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6182 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6183 6183
6184 if (mmu_reset_needed) 6184 if (mmu_reset_needed)
6185 kvm_mmu_reset_context(vcpu); 6185 kvm_mmu_reset_context(vcpu);
6186 6186
6187 max_bits = KVM_NR_INTERRUPTS; 6187 max_bits = KVM_NR_INTERRUPTS;
6188 pending_vec = find_first_bit( 6188 pending_vec = find_first_bit(
6189 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 6189 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
6190 if (pending_vec < max_bits) { 6190 if (pending_vec < max_bits) {
6191 kvm_queue_interrupt(vcpu, pending_vec, false); 6191 kvm_queue_interrupt(vcpu, pending_vec, false);
6192 pr_debug("Set back pending irq %d\n", pending_vec); 6192 pr_debug("Set back pending irq %d\n", pending_vec);
6193 } 6193 }
6194 6194
6195 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 6195 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6196 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 6196 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6197 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 6197 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6198 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 6198 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6199 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 6199 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6200 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 6200 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6201 6201
6202 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 6202 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6203 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 6203 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6204 6204
6205 update_cr8_intercept(vcpu); 6205 update_cr8_intercept(vcpu);
6206 6206
6207 /* Older userspace won't unhalt the vcpu on reset. */ 6207 /* Older userspace won't unhalt the vcpu on reset. */
6208 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 6208 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6209 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 6209 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6210 !is_protmode(vcpu)) 6210 !is_protmode(vcpu))
6211 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6211 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6212 6212
6213 kvm_make_request(KVM_REQ_EVENT, vcpu); 6213 kvm_make_request(KVM_REQ_EVENT, vcpu);
6214 6214
6215 return 0; 6215 return 0;
6216 } 6216 }
6217 6217
6218 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 6218 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6219 struct kvm_guest_debug *dbg) 6219 struct kvm_guest_debug *dbg)
6220 { 6220 {
6221 unsigned long rflags; 6221 unsigned long rflags;
6222 int i, r; 6222 int i, r;
6223 6223
6224 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 6224 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6225 r = -EBUSY; 6225 r = -EBUSY;
6226 if (vcpu->arch.exception.pending) 6226 if (vcpu->arch.exception.pending)
6227 goto out; 6227 goto out;
6228 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 6228 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6229 kvm_queue_exception(vcpu, DB_VECTOR); 6229 kvm_queue_exception(vcpu, DB_VECTOR);
6230 else 6230 else
6231 kvm_queue_exception(vcpu, BP_VECTOR); 6231 kvm_queue_exception(vcpu, BP_VECTOR);
6232 } 6232 }
6233 6233
6234 /* 6234 /*
6235 * Read rflags as long as potentially injected trace flags are still 6235 * Read rflags as long as potentially injected trace flags are still
6236 * filtered out. 6236 * filtered out.
6237 */ 6237 */
6238 rflags = kvm_get_rflags(vcpu); 6238 rflags = kvm_get_rflags(vcpu);
6239 6239
6240 vcpu->guest_debug = dbg->control; 6240 vcpu->guest_debug = dbg->control;
6241 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 6241 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6242 vcpu->guest_debug = 0; 6242 vcpu->guest_debug = 0;
6243 6243
6244 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 6244 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6245 for (i = 0; i < KVM_NR_DB_REGS; ++i) 6245 for (i = 0; i < KVM_NR_DB_REGS; ++i)
6246 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 6246 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6247 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; 6247 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
6248 } else { 6248 } else {
6249 for (i = 0; i < KVM_NR_DB_REGS; i++) 6249 for (i = 0; i < KVM_NR_DB_REGS; i++)
6250 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 6250 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6251 } 6251 }
6252 kvm_update_dr7(vcpu); 6252 kvm_update_dr7(vcpu);
6253 6253
6254 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6254 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6255 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 6255 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6256 get_segment_base(vcpu, VCPU_SREG_CS); 6256 get_segment_base(vcpu, VCPU_SREG_CS);
6257 6257
6258 /* 6258 /*
6259 * Trigger an rflags update that will inject or remove the trace 6259 * Trigger an rflags update that will inject or remove the trace
6260 * flags. 6260 * flags.
6261 */ 6261 */
6262 kvm_set_rflags(vcpu, rflags); 6262 kvm_set_rflags(vcpu, rflags);
6263 6263
6264 kvm_x86_ops->update_db_bp_intercept(vcpu); 6264 kvm_x86_ops->update_db_bp_intercept(vcpu);
6265 6265
6266 r = 0; 6266 r = 0;
6267 6267
6268 out: 6268 out:
6269 6269
6270 return r; 6270 return r;
6271 } 6271 }
6272 6272
6273 /* 6273 /*
6274 * Translate a guest virtual address to a guest physical address. 6274 * Translate a guest virtual address to a guest physical address.
6275 */ 6275 */
6276 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 6276 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6277 struct kvm_translation *tr) 6277 struct kvm_translation *tr)
6278 { 6278 {
6279 unsigned long vaddr = tr->linear_address; 6279 unsigned long vaddr = tr->linear_address;
6280 gpa_t gpa; 6280 gpa_t gpa;
6281 int idx; 6281 int idx;
6282 6282
6283 idx = srcu_read_lock(&vcpu->kvm->srcu); 6283 idx = srcu_read_lock(&vcpu->kvm->srcu);
6284 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 6284 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6285 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6285 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6286 tr->physical_address = gpa; 6286 tr->physical_address = gpa;
6287 tr->valid = gpa != UNMAPPED_GVA; 6287 tr->valid = gpa != UNMAPPED_GVA;
6288 tr->writeable = 1; 6288 tr->writeable = 1;
6289 tr->usermode = 0; 6289 tr->usermode = 0;
6290 6290
6291 return 0; 6291 return 0;
6292 } 6292 }
6293 6293
6294 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 6294 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6295 { 6295 {
6296 struct i387_fxsave_struct *fxsave = 6296 struct i387_fxsave_struct *fxsave =
6297 &vcpu->arch.guest_fpu.state->fxsave; 6297 &vcpu->arch.guest_fpu.state->fxsave;
6298 6298
6299 memcpy(fpu->fpr, fxsave->st_space, 128); 6299 memcpy(fpu->fpr, fxsave->st_space, 128);
6300 fpu->fcw = fxsave->cwd; 6300 fpu->fcw = fxsave->cwd;
6301 fpu->fsw = fxsave->swd; 6301 fpu->fsw = fxsave->swd;
6302 fpu->ftwx = fxsave->twd; 6302 fpu->ftwx = fxsave->twd;
6303 fpu->last_opcode = fxsave->fop; 6303 fpu->last_opcode = fxsave->fop;
6304 fpu->last_ip = fxsave->rip; 6304 fpu->last_ip = fxsave->rip;
6305 fpu->last_dp = fxsave->rdp; 6305 fpu->last_dp = fxsave->rdp;
6306 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 6306 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6307 6307
6308 return 0; 6308 return 0;
6309 } 6309 }
6310 6310
6311 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 6311 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6312 { 6312 {
6313 struct i387_fxsave_struct *fxsave = 6313 struct i387_fxsave_struct *fxsave =
6314 &vcpu->arch.guest_fpu.state->fxsave; 6314 &vcpu->arch.guest_fpu.state->fxsave;
6315 6315
6316 memcpy(fxsave->st_space, fpu->fpr, 128); 6316 memcpy(fxsave->st_space, fpu->fpr, 128);
6317 fxsave->cwd = fpu->fcw; 6317 fxsave->cwd = fpu->fcw;
6318 fxsave->swd = fpu->fsw; 6318 fxsave->swd = fpu->fsw;
6319 fxsave->twd = fpu->ftwx; 6319 fxsave->twd = fpu->ftwx;
6320 fxsave->fop = fpu->last_opcode; 6320 fxsave->fop = fpu->last_opcode;
6321 fxsave->rip = fpu->last_ip; 6321 fxsave->rip = fpu->last_ip;
6322 fxsave->rdp = fpu->last_dp; 6322 fxsave->rdp = fpu->last_dp;
6323 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 6323 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6324 6324
6325 return 0; 6325 return 0;
6326 } 6326 }
6327 6327
6328 int fx_init(struct kvm_vcpu *vcpu) 6328 int fx_init(struct kvm_vcpu *vcpu)
6329 { 6329 {
6330 int err; 6330 int err;
6331 6331
6332 err = fpu_alloc(&vcpu->arch.guest_fpu); 6332 err = fpu_alloc(&vcpu->arch.guest_fpu);
6333 if (err) 6333 if (err)
6334 return err; 6334 return err;
6335 6335
6336 fpu_finit(&vcpu->arch.guest_fpu); 6336 fpu_finit(&vcpu->arch.guest_fpu);
6337 6337
6338 /* 6338 /*
6339 * Ensure guest xcr0 is valid for loading 6339 * Ensure guest xcr0 is valid for loading
6340 */ 6340 */
6341 vcpu->arch.xcr0 = XSTATE_FP; 6341 vcpu->arch.xcr0 = XSTATE_FP;
6342 6342
6343 vcpu->arch.cr0 |= X86_CR0_ET; 6343 vcpu->arch.cr0 |= X86_CR0_ET;
6344 6344
6345 return 0; 6345 return 0;
6346 } 6346 }
6347 EXPORT_SYMBOL_GPL(fx_init); 6347 EXPORT_SYMBOL_GPL(fx_init);
6348 6348
6349 static void fx_free(struct kvm_vcpu *vcpu) 6349 static void fx_free(struct kvm_vcpu *vcpu)
6350 { 6350 {
6351 fpu_free(&vcpu->arch.guest_fpu); 6351 fpu_free(&vcpu->arch.guest_fpu);
6352 } 6352 }
6353 6353
6354 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 6354 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6355 { 6355 {
6356 if (vcpu->guest_fpu_loaded) 6356 if (vcpu->guest_fpu_loaded)
6357 return; 6357 return;
6358 6358
6359 /* 6359 /*
6360 * Restore all possible states in the guest, 6360 * Restore all possible states in the guest,
6361 * and assume host would use all available bits. 6361 * and assume host would use all available bits.
6362 * Guest xcr0 would be loaded later. 6362 * Guest xcr0 would be loaded later.
6363 */ 6363 */
6364 kvm_put_guest_xcr0(vcpu); 6364 kvm_put_guest_xcr0(vcpu);
6365 vcpu->guest_fpu_loaded = 1; 6365 vcpu->guest_fpu_loaded = 1;
6366 __kernel_fpu_begin(); 6366 __kernel_fpu_begin();
6367 fpu_restore_checking(&vcpu->arch.guest_fpu); 6367 fpu_restore_checking(&vcpu->arch.guest_fpu);
6368 trace_kvm_fpu(1); 6368 trace_kvm_fpu(1);
6369 } 6369 }
6370 6370
6371 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 6371 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6372 { 6372 {
6373 kvm_put_guest_xcr0(vcpu); 6373 kvm_put_guest_xcr0(vcpu);
6374 6374
6375 if (!vcpu->guest_fpu_loaded) 6375 if (!vcpu->guest_fpu_loaded)
6376 return; 6376 return;
6377 6377
6378 vcpu->guest_fpu_loaded = 0; 6378 vcpu->guest_fpu_loaded = 0;
6379 fpu_save_init(&vcpu->arch.guest_fpu); 6379 fpu_save_init(&vcpu->arch.guest_fpu);
6380 __kernel_fpu_end(); 6380 __kernel_fpu_end();
6381 ++vcpu->stat.fpu_reload; 6381 ++vcpu->stat.fpu_reload;
6382 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 6382 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
6383 trace_kvm_fpu(0); 6383 trace_kvm_fpu(0);
6384 } 6384 }
6385 6385
6386 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 6386 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
6387 { 6387 {
6388 kvmclock_reset(vcpu); 6388 kvmclock_reset(vcpu);
6389 6389
6390 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6390 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6391 fx_free(vcpu); 6391 fx_free(vcpu);
6392 kvm_x86_ops->vcpu_free(vcpu); 6392 kvm_x86_ops->vcpu_free(vcpu);
6393 } 6393 }
6394 6394
6395 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 6395 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
6396 unsigned int id) 6396 unsigned int id)
6397 { 6397 {
6398 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) 6398 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6399 printk_once(KERN_WARNING 6399 printk_once(KERN_WARNING
6400 "kvm: SMP vm created on host with unstable TSC; " 6400 "kvm: SMP vm created on host with unstable TSC; "
6401 "guest TSC will not be reliable\n"); 6401 "guest TSC will not be reliable\n");
6402 return kvm_x86_ops->vcpu_create(kvm, id); 6402 return kvm_x86_ops->vcpu_create(kvm, id);
6403 } 6403 }
6404 6404
6405 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 6405 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6406 { 6406 {
6407 int r; 6407 int r;
6408 6408
6409 vcpu->arch.mtrr_state.have_fixed = 1; 6409 vcpu->arch.mtrr_state.have_fixed = 1;
6410 r = vcpu_load(vcpu); 6410 r = vcpu_load(vcpu);
6411 if (r) 6411 if (r)
6412 return r; 6412 return r;
6413 r = kvm_vcpu_reset(vcpu); 6413 r = kvm_vcpu_reset(vcpu);
6414 if (r == 0) 6414 if (r == 0)
6415 r = kvm_mmu_setup(vcpu); 6415 r = kvm_mmu_setup(vcpu);
6416 vcpu_put(vcpu); 6416 vcpu_put(vcpu);
6417 6417
6418 return r; 6418 return r;
6419 } 6419 }
6420 6420
6421 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 6421 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6422 { 6422 {
6423 int r; 6423 int r;
6424 struct msr_data msr; 6424 struct msr_data msr;
6425 6425
6426 r = vcpu_load(vcpu); 6426 r = vcpu_load(vcpu);
6427 if (r) 6427 if (r)
6428 return r; 6428 return r;
6429 msr.data = 0x0; 6429 msr.data = 0x0;
6430 msr.index = MSR_IA32_TSC; 6430 msr.index = MSR_IA32_TSC;
6431 msr.host_initiated = true; 6431 msr.host_initiated = true;
6432 kvm_write_tsc(vcpu, &msr); 6432 kvm_write_tsc(vcpu, &msr);
6433 vcpu_put(vcpu); 6433 vcpu_put(vcpu);
6434 6434
6435 return r; 6435 return r;
6436 } 6436 }
6437 6437
6438 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6438 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6439 { 6439 {
6440 int r; 6440 int r;
6441 vcpu->arch.apf.msr_val = 0; 6441 vcpu->arch.apf.msr_val = 0;
6442 6442
6443 r = vcpu_load(vcpu); 6443 r = vcpu_load(vcpu);
6444 BUG_ON(r); 6444 BUG_ON(r);
6445 kvm_mmu_unload(vcpu); 6445 kvm_mmu_unload(vcpu);
6446 vcpu_put(vcpu); 6446 vcpu_put(vcpu);
6447 6447
6448 fx_free(vcpu); 6448 fx_free(vcpu);
6449 kvm_x86_ops->vcpu_free(vcpu); 6449 kvm_x86_ops->vcpu_free(vcpu);
6450 } 6450 }
6451 6451
6452 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) 6452 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6453 { 6453 {
6454 atomic_set(&vcpu->arch.nmi_queued, 0); 6454 atomic_set(&vcpu->arch.nmi_queued, 0);
6455 vcpu->arch.nmi_pending = 0; 6455 vcpu->arch.nmi_pending = 0;
6456 vcpu->arch.nmi_injected = false; 6456 vcpu->arch.nmi_injected = false;
6457 6457
6458 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6458 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6459 vcpu->arch.dr6 = DR6_FIXED_1; 6459 vcpu->arch.dr6 = DR6_FIXED_1;
6460 vcpu->arch.dr7 = DR7_FIXED_1; 6460 vcpu->arch.dr7 = DR7_FIXED_1;
6461 kvm_update_dr7(vcpu); 6461 kvm_update_dr7(vcpu);
6462 6462
6463 kvm_make_request(KVM_REQ_EVENT, vcpu); 6463 kvm_make_request(KVM_REQ_EVENT, vcpu);
6464 vcpu->arch.apf.msr_val = 0; 6464 vcpu->arch.apf.msr_val = 0;
6465 vcpu->arch.st.msr_val = 0; 6465 vcpu->arch.st.msr_val = 0;
6466 6466
6467 kvmclock_reset(vcpu); 6467 kvmclock_reset(vcpu);
6468 6468
6469 kvm_clear_async_pf_completion_queue(vcpu); 6469 kvm_clear_async_pf_completion_queue(vcpu);
6470 kvm_async_pf_hash_reset(vcpu); 6470 kvm_async_pf_hash_reset(vcpu);
6471 vcpu->arch.apf.halted = false; 6471 vcpu->arch.apf.halted = false;
6472 6472
6473 kvm_pmu_reset(vcpu); 6473 kvm_pmu_reset(vcpu);
6474 6474
6475 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 6475 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
6476 vcpu->arch.regs_avail = ~0; 6476 vcpu->arch.regs_avail = ~0;
6477 vcpu->arch.regs_dirty = ~0; 6477 vcpu->arch.regs_dirty = ~0;
6478 6478
6479 return kvm_x86_ops->vcpu_reset(vcpu); 6479 return kvm_x86_ops->vcpu_reset(vcpu);
6480 } 6480 }
6481 6481
6482 int kvm_arch_hardware_enable(void *garbage) 6482 int kvm_arch_hardware_enable(void *garbage)
6483 { 6483 {
6484 struct kvm *kvm; 6484 struct kvm *kvm;
6485 struct kvm_vcpu *vcpu; 6485 struct kvm_vcpu *vcpu;
6486 int i; 6486 int i;
6487 int ret; 6487 int ret;
6488 u64 local_tsc; 6488 u64 local_tsc;
6489 u64 max_tsc = 0; 6489 u64 max_tsc = 0;
6490 bool stable, backwards_tsc = false; 6490 bool stable, backwards_tsc = false;
6491 6491
6492 kvm_shared_msr_cpu_online(); 6492 kvm_shared_msr_cpu_online();
6493 ret = kvm_x86_ops->hardware_enable(garbage); 6493 ret = kvm_x86_ops->hardware_enable(garbage);
6494 if (ret != 0) 6494 if (ret != 0)
6495 return ret; 6495 return ret;
6496 6496
6497 local_tsc = native_read_tsc(); 6497 local_tsc = native_read_tsc();
6498 stable = !check_tsc_unstable(); 6498 stable = !check_tsc_unstable();
6499 list_for_each_entry(kvm, &vm_list, vm_list) { 6499 list_for_each_entry(kvm, &vm_list, vm_list) {
6500 kvm_for_each_vcpu(i, vcpu, kvm) { 6500 kvm_for_each_vcpu(i, vcpu, kvm) {
6501 if (!stable && vcpu->cpu == smp_processor_id()) 6501 if (!stable && vcpu->cpu == smp_processor_id())
6502 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 6502 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6503 if (stable && vcpu->arch.last_host_tsc > local_tsc) { 6503 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6504 backwards_tsc = true; 6504 backwards_tsc = true;
6505 if (vcpu->arch.last_host_tsc > max_tsc) 6505 if (vcpu->arch.last_host_tsc > max_tsc)
6506 max_tsc = vcpu->arch.last_host_tsc; 6506 max_tsc = vcpu->arch.last_host_tsc;
6507 } 6507 }
6508 } 6508 }
6509 } 6509 }
6510 6510
6511 /* 6511 /*
6512 * Sometimes, even reliable TSCs go backwards. This happens on 6512 * Sometimes, even reliable TSCs go backwards. This happens on
6513 * platforms that reset TSC during suspend or hibernate actions, but 6513 * platforms that reset TSC during suspend or hibernate actions, but
6514 * maintain synchronization. We must compensate. Fortunately, we can 6514 * maintain synchronization. We must compensate. Fortunately, we can
6515 * detect that condition here, which happens early in CPU bringup, 6515 * detect that condition here, which happens early in CPU bringup,
6516 * before any KVM threads can be running. Unfortunately, we can't 6516 * before any KVM threads can be running. Unfortunately, we can't
6517 * bring the TSCs fully up to date with real time, as we aren't yet far 6517 * bring the TSCs fully up to date with real time, as we aren't yet far
6518 * enough into CPU bringup that we know how much real time has actually 6518 * enough into CPU bringup that we know how much real time has actually
6519 * elapsed; our helper function, get_kernel_ns() will be using boot 6519 * elapsed; our helper function, get_kernel_ns() will be using boot
6520 * variables that haven't been updated yet. 6520 * variables that haven't been updated yet.
6521 * 6521 *
6522 * So we simply find the maximum observed TSC above, then record the 6522 * So we simply find the maximum observed TSC above, then record the
6523 * adjustment to TSC in each VCPU. When the VCPU later gets loaded, 6523 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
6524 * the adjustment will be applied. Note that we accumulate 6524 * the adjustment will be applied. Note that we accumulate
6525 * adjustments, in case multiple suspend cycles happen before some VCPU 6525 * adjustments, in case multiple suspend cycles happen before some VCPU
6526 * gets a chance to run again. In the event that no KVM threads get a 6526 * gets a chance to run again. In the event that no KVM threads get a
6527 * chance to run, we will miss the entire elapsed period, as we'll have 6527 * chance to run, we will miss the entire elapsed period, as we'll have
6528 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may 6528 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6529 * lose cycle time. This isn't too big a deal, since the loss will be 6529 * lose cycle time. This isn't too big a deal, since the loss will be
6530 * uniform across all VCPUs (not to mention the scenario is extremely 6530 * uniform across all VCPUs (not to mention the scenario is extremely
6531 * unlikely). It is possible that a second hibernate recovery happens 6531 * unlikely). It is possible that a second hibernate recovery happens
6532 * much faster than a first, causing the observed TSC here to be 6532 * much faster than a first, causing the observed TSC here to be
6533 * smaller; this would require additional padding adjustment, which is 6533 * smaller; this would require additional padding adjustment, which is
6534 * why we set last_host_tsc to the local tsc observed here. 6534 * why we set last_host_tsc to the local tsc observed here.
6535 * 6535 *
6536 * N.B. - this code below runs only on platforms with reliable TSC, 6536 * N.B. - this code below runs only on platforms with reliable TSC,
6537 * as that is the only way backwards_tsc is set above. Also note 6537 * as that is the only way backwards_tsc is set above. Also note
6538 * that this runs for ALL vcpus, which is not a bug; all VCPUs should 6538 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6539 * have the same delta_cyc adjustment applied if backwards_tsc 6539 * have the same delta_cyc adjustment applied if backwards_tsc
6540 * is detected. Note further, this adjustment is only done once, 6540 * is detected. Note further, this adjustment is only done once,
6541 * as we reset last_host_tsc on all VCPUs to stop this from being 6541 * as we reset last_host_tsc on all VCPUs to stop this from being
6542 * called multiple times (one for each physical CPU bringup). 6542 * called multiple times (one for each physical CPU bringup).
6543 * 6543 *
6544 * Platforms with unreliable TSCs don't have to deal with this, they 6544 * Platforms with unreliable TSCs don't have to deal with this, they
6545 * will be compensated by the logic in vcpu_load, which sets the TSC to 6545 * will be compensated by the logic in vcpu_load, which sets the TSC to
6546 * catchup mode. This will catch up all VCPUs to real time, but cannot 6546 * catchup mode. This will catch up all VCPUs to real time, but cannot
6547 * guarantee that they stay in perfect synchronization. 6547 * guarantee that they stay in perfect synchronization.
6548 */ 6548 */
6549 if (backwards_tsc) { 6549 if (backwards_tsc) {
6550 u64 delta_cyc = max_tsc - local_tsc; 6550 u64 delta_cyc = max_tsc - local_tsc;
6551 list_for_each_entry(kvm, &vm_list, vm_list) { 6551 list_for_each_entry(kvm, &vm_list, vm_list) {
6552 kvm_for_each_vcpu(i, vcpu, kvm) { 6552 kvm_for_each_vcpu(i, vcpu, kvm) {
6553 vcpu->arch.tsc_offset_adjustment += delta_cyc; 6553 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6554 vcpu->arch.last_host_tsc = local_tsc; 6554 vcpu->arch.last_host_tsc = local_tsc;
6555 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, 6555 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
6556 &vcpu->requests); 6556 &vcpu->requests);
6557 } 6557 }
6558 6558
6559 /* 6559 /*
6560 * We have to disable TSC offset matching.. if you were 6560 * We have to disable TSC offset matching.. if you were
6561 * booting a VM while issuing an S4 host suspend.... 6561 * booting a VM while issuing an S4 host suspend....
6562 * you may have some problem. Solving this issue is 6562 * you may have some problem. Solving this issue is
6563 * left as an exercise to the reader. 6563 * left as an exercise to the reader.
6564 */ 6564 */
6565 kvm->arch.last_tsc_nsec = 0; 6565 kvm->arch.last_tsc_nsec = 0;
6566 kvm->arch.last_tsc_write = 0; 6566 kvm->arch.last_tsc_write = 0;
6567 } 6567 }
6568 6568
6569 } 6569 }
6570 return 0; 6570 return 0;
6571 } 6571 }
6572 6572
6573 void kvm_arch_hardware_disable(void *garbage) 6573 void kvm_arch_hardware_disable(void *garbage)
6574 { 6574 {
6575 kvm_x86_ops->hardware_disable(garbage); 6575 kvm_x86_ops->hardware_disable(garbage);
6576 drop_user_return_notifiers(garbage); 6576 drop_user_return_notifiers(garbage);
6577 } 6577 }
6578 6578
6579 int kvm_arch_hardware_setup(void) 6579 int kvm_arch_hardware_setup(void)
6580 { 6580 {
6581 return kvm_x86_ops->hardware_setup(); 6581 return kvm_x86_ops->hardware_setup();
6582 } 6582 }
6583 6583
6584 void kvm_arch_hardware_unsetup(void) 6584 void kvm_arch_hardware_unsetup(void)
6585 { 6585 {
6586 kvm_x86_ops->hardware_unsetup(); 6586 kvm_x86_ops->hardware_unsetup();
6587 } 6587 }
6588 6588
6589 void kvm_arch_check_processor_compat(void *rtn) 6589 void kvm_arch_check_processor_compat(void *rtn)
6590 { 6590 {
6591 kvm_x86_ops->check_processor_compatibility(rtn); 6591 kvm_x86_ops->check_processor_compatibility(rtn);
6592 } 6592 }
6593 6593
6594 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) 6594 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6595 { 6595 {
6596 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 6596 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6597 } 6597 }
6598 6598
6599 struct static_key kvm_no_apic_vcpu __read_mostly; 6599 struct static_key kvm_no_apic_vcpu __read_mostly;
6600 6600
6601 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6601 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6602 { 6602 {
6603 struct page *page; 6603 struct page *page;
6604 struct kvm *kvm; 6604 struct kvm *kvm;
6605 int r; 6605 int r;
6606 6606
6607 BUG_ON(vcpu->kvm == NULL); 6607 BUG_ON(vcpu->kvm == NULL);
6608 kvm = vcpu->kvm; 6608 kvm = vcpu->kvm;
6609 6609
6610 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6610 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6611 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6611 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6612 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6612 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6613 else 6613 else
6614 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 6614 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
6615 6615
6616 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 6616 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
6617 if (!page) { 6617 if (!page) {
6618 r = -ENOMEM; 6618 r = -ENOMEM;
6619 goto fail; 6619 goto fail;
6620 } 6620 }
6621 vcpu->arch.pio_data = page_address(page); 6621 vcpu->arch.pio_data = page_address(page);
6622 6622
6623 kvm_set_tsc_khz(vcpu, max_tsc_khz); 6623 kvm_set_tsc_khz(vcpu, max_tsc_khz);
6624 6624
6625 r = kvm_mmu_create(vcpu); 6625 r = kvm_mmu_create(vcpu);
6626 if (r < 0) 6626 if (r < 0)
6627 goto fail_free_pio_data; 6627 goto fail_free_pio_data;
6628 6628
6629 if (irqchip_in_kernel(kvm)) { 6629 if (irqchip_in_kernel(kvm)) {
6630 r = kvm_create_lapic(vcpu); 6630 r = kvm_create_lapic(vcpu);
6631 if (r < 0) 6631 if (r < 0)
6632 goto fail_mmu_destroy; 6632 goto fail_mmu_destroy;
6633 } else 6633 } else
6634 static_key_slow_inc(&kvm_no_apic_vcpu); 6634 static_key_slow_inc(&kvm_no_apic_vcpu);
6635 6635
6636 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 6636 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6637 GFP_KERNEL); 6637 GFP_KERNEL);
6638 if (!vcpu->arch.mce_banks) { 6638 if (!vcpu->arch.mce_banks) {
6639 r = -ENOMEM; 6639 r = -ENOMEM;
6640 goto fail_free_lapic; 6640 goto fail_free_lapic;
6641 } 6641 }
6642 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 6642 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
6643 6643
6644 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6644 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
6645 goto fail_free_mce_banks; 6645 goto fail_free_mce_banks;
6646 6646
6647 r = fx_init(vcpu); 6647 r = fx_init(vcpu);
6648 if (r) 6648 if (r)
6649 goto fail_free_wbinvd_dirty_mask; 6649 goto fail_free_wbinvd_dirty_mask;
6650 6650
6651 vcpu->arch.ia32_tsc_adjust_msr = 0x0; 6651 vcpu->arch.ia32_tsc_adjust_msr = 0x0;
6652 kvm_async_pf_hash_reset(vcpu); 6652 kvm_async_pf_hash_reset(vcpu);
6653 kvm_pmu_init(vcpu); 6653 kvm_pmu_init(vcpu);
6654 6654
6655 return 0; 6655 return 0;
6656 fail_free_wbinvd_dirty_mask: 6656 fail_free_wbinvd_dirty_mask:
6657 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6657 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6658 fail_free_mce_banks: 6658 fail_free_mce_banks:
6659 kfree(vcpu->arch.mce_banks); 6659 kfree(vcpu->arch.mce_banks);
6660 fail_free_lapic: 6660 fail_free_lapic:
6661 kvm_free_lapic(vcpu); 6661 kvm_free_lapic(vcpu);
6662 fail_mmu_destroy: 6662 fail_mmu_destroy:
6663 kvm_mmu_destroy(vcpu); 6663 kvm_mmu_destroy(vcpu);
6664 fail_free_pio_data: 6664 fail_free_pio_data:
6665 free_page((unsigned long)vcpu->arch.pio_data); 6665 free_page((unsigned long)vcpu->arch.pio_data);
6666 fail: 6666 fail:
6667 return r; 6667 return r;
6668 } 6668 }
6669 6669
6670 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 6670 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6671 { 6671 {
6672 int idx; 6672 int idx;
6673 6673
6674 kvm_pmu_destroy(vcpu); 6674 kvm_pmu_destroy(vcpu);
6675 kfree(vcpu->arch.mce_banks); 6675 kfree(vcpu->arch.mce_banks);
6676 kvm_free_lapic(vcpu); 6676 kvm_free_lapic(vcpu);
6677 idx = srcu_read_lock(&vcpu->kvm->srcu); 6677 idx = srcu_read_lock(&vcpu->kvm->srcu);
6678 kvm_mmu_destroy(vcpu); 6678 kvm_mmu_destroy(vcpu);
6679 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6679 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6680 free_page((unsigned long)vcpu->arch.pio_data); 6680 free_page((unsigned long)vcpu->arch.pio_data);
6681 if (!irqchip_in_kernel(vcpu->kvm)) 6681 if (!irqchip_in_kernel(vcpu->kvm))
6682 static_key_slow_dec(&kvm_no_apic_vcpu); 6682 static_key_slow_dec(&kvm_no_apic_vcpu);
6683 } 6683 }
6684 6684
6685 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 6685 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6686 { 6686 {
6687 if (type) 6687 if (type)
6688 return -EINVAL; 6688 return -EINVAL;
6689 6689
6690 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6690 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6691 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6691 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6692 6692
6693 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6693 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6694 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6694 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6695 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ 6695 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
6696 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 6696 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
6697 &kvm->arch.irq_sources_bitmap); 6697 &kvm->arch.irq_sources_bitmap);
6698 6698
6699 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6699 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6700 mutex_init(&kvm->arch.apic_map_lock); 6700 mutex_init(&kvm->arch.apic_map_lock);
6701 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); 6701 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
6702 6702
6703 pvclock_update_vm_gtod_copy(kvm); 6703 pvclock_update_vm_gtod_copy(kvm);
6704 6704
6705 return 0; 6705 return 0;
6706 } 6706 }
6707 6707
6708 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6708 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
6709 { 6709 {
6710 int r; 6710 int r;
6711 r = vcpu_load(vcpu); 6711 r = vcpu_load(vcpu);
6712 BUG_ON(r); 6712 BUG_ON(r);
6713 kvm_mmu_unload(vcpu); 6713 kvm_mmu_unload(vcpu);
6714 vcpu_put(vcpu); 6714 vcpu_put(vcpu);
6715 } 6715 }
6716 6716
6717 static void kvm_free_vcpus(struct kvm *kvm) 6717 static void kvm_free_vcpus(struct kvm *kvm)
6718 { 6718 {
6719 unsigned int i; 6719 unsigned int i;
6720 struct kvm_vcpu *vcpu; 6720 struct kvm_vcpu *vcpu;
6721 6721
6722 /* 6722 /*
6723 * Unpin any mmu pages first. 6723 * Unpin any mmu pages first.
6724 */ 6724 */
6725 kvm_for_each_vcpu(i, vcpu, kvm) { 6725 kvm_for_each_vcpu(i, vcpu, kvm) {
6726 kvm_clear_async_pf_completion_queue(vcpu); 6726 kvm_clear_async_pf_completion_queue(vcpu);
6727 kvm_unload_vcpu_mmu(vcpu); 6727 kvm_unload_vcpu_mmu(vcpu);
6728 } 6728 }
6729 kvm_for_each_vcpu(i, vcpu, kvm) 6729 kvm_for_each_vcpu(i, vcpu, kvm)
6730 kvm_arch_vcpu_free(vcpu); 6730 kvm_arch_vcpu_free(vcpu);
6731 6731
6732 mutex_lock(&kvm->lock); 6732 mutex_lock(&kvm->lock);
6733 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 6733 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
6734 kvm->vcpus[i] = NULL; 6734 kvm->vcpus[i] = NULL;
6735 6735
6736 atomic_set(&kvm->online_vcpus, 0); 6736 atomic_set(&kvm->online_vcpus, 0);
6737 mutex_unlock(&kvm->lock); 6737 mutex_unlock(&kvm->lock);
6738 } 6738 }
6739 6739
6740 void kvm_arch_sync_events(struct kvm *kvm) 6740 void kvm_arch_sync_events(struct kvm *kvm)
6741 { 6741 {
6742 kvm_free_all_assigned_devices(kvm); 6742 kvm_free_all_assigned_devices(kvm);
6743 kvm_free_pit(kvm); 6743 kvm_free_pit(kvm);
6744 } 6744 }
6745 6745
6746 void kvm_arch_destroy_vm(struct kvm *kvm) 6746 void kvm_arch_destroy_vm(struct kvm *kvm)
6747 { 6747 {
6748 kvm_iommu_unmap_guest(kvm); 6748 kvm_iommu_unmap_guest(kvm);
6749 kfree(kvm->arch.vpic); 6749 kfree(kvm->arch.vpic);
6750 kfree(kvm->arch.vioapic); 6750 kfree(kvm->arch.vioapic);
6751 kvm_free_vcpus(kvm); 6751 kvm_free_vcpus(kvm);
6752 if (kvm->arch.apic_access_page) 6752 if (kvm->arch.apic_access_page)
6753 put_page(kvm->arch.apic_access_page); 6753 put_page(kvm->arch.apic_access_page);
6754 if (kvm->arch.ept_identity_pagetable) 6754 if (kvm->arch.ept_identity_pagetable)
6755 put_page(kvm->arch.ept_identity_pagetable); 6755 put_page(kvm->arch.ept_identity_pagetable);
6756 kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 6756 kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
6757 } 6757 }
6758 6758
6759 void kvm_arch_free_memslot(struct kvm_memory_slot *free, 6759 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6760 struct kvm_memory_slot *dont) 6760 struct kvm_memory_slot *dont)
6761 { 6761 {
6762 int i; 6762 int i;
6763 6763
6764 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6764 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6765 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { 6765 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
6766 kvm_kvfree(free->arch.rmap[i]); 6766 kvm_kvfree(free->arch.rmap[i]);
6767 free->arch.rmap[i] = NULL; 6767 free->arch.rmap[i] = NULL;
6768 } 6768 }
6769 if (i == 0) 6769 if (i == 0)
6770 continue; 6770 continue;
6771 6771
6772 if (!dont || free->arch.lpage_info[i - 1] != 6772 if (!dont || free->arch.lpage_info[i - 1] !=
6773 dont->arch.lpage_info[i - 1]) { 6773 dont->arch.lpage_info[i - 1]) {
6774 kvm_kvfree(free->arch.lpage_info[i - 1]); 6774 kvm_kvfree(free->arch.lpage_info[i - 1]);
6775 free->arch.lpage_info[i - 1] = NULL; 6775 free->arch.lpage_info[i - 1] = NULL;
6776 } 6776 }
6777 } 6777 }
6778 } 6778 }
6779 6779
6780 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 6780 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6781 { 6781 {
6782 int i; 6782 int i;
6783 6783
6784 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6784 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6785 unsigned long ugfn; 6785 unsigned long ugfn;
6786 int lpages; 6786 int lpages;
6787 int level = i + 1; 6787 int level = i + 1;
6788 6788
6789 lpages = gfn_to_index(slot->base_gfn + npages - 1, 6789 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6790 slot->base_gfn, level) + 1; 6790 slot->base_gfn, level) + 1;
6791 6791
6792 slot->arch.rmap[i] = 6792 slot->arch.rmap[i] =
6793 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); 6793 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
6794 if (!slot->arch.rmap[i]) 6794 if (!slot->arch.rmap[i])
6795 goto out_free; 6795 goto out_free;
6796 if (i == 0) 6796 if (i == 0)
6797 continue; 6797 continue;
6798 6798
6799 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * 6799 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
6800 sizeof(*slot->arch.lpage_info[i - 1])); 6800 sizeof(*slot->arch.lpage_info[i - 1]));
6801 if (!slot->arch.lpage_info[i - 1]) 6801 if (!slot->arch.lpage_info[i - 1])
6802 goto out_free; 6802 goto out_free;
6803 6803
6804 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 6804 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6805 slot->arch.lpage_info[i - 1][0].write_count = 1; 6805 slot->arch.lpage_info[i - 1][0].write_count = 1;
6806 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 6806 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6807 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; 6807 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
6808 ugfn = slot->userspace_addr >> PAGE_SHIFT; 6808 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6809 /* 6809 /*
6810 * If the gfn and userspace address are not aligned wrt each 6810 * If the gfn and userspace address are not aligned wrt each
6811 * other, or if explicitly asked to, disable large page 6811 * other, or if explicitly asked to, disable large page
6812 * support for this slot 6812 * support for this slot
6813 */ 6813 */
6814 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 6814 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
6815 !kvm_largepages_enabled()) { 6815 !kvm_largepages_enabled()) {
6816 unsigned long j; 6816 unsigned long j;
6817 6817
6818 for (j = 0; j < lpages; ++j) 6818 for (j = 0; j < lpages; ++j)
6819 slot->arch.lpage_info[i - 1][j].write_count = 1; 6819 slot->arch.lpage_info[i - 1][j].write_count = 1;
6820 } 6820 }
6821 } 6821 }
6822 6822
6823 return 0; 6823 return 0;
6824 6824
6825 out_free: 6825 out_free:
6826 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6826 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6827 kvm_kvfree(slot->arch.rmap[i]); 6827 kvm_kvfree(slot->arch.rmap[i]);
6828 slot->arch.rmap[i] = NULL; 6828 slot->arch.rmap[i] = NULL;
6829 if (i == 0) 6829 if (i == 0)
6830 continue; 6830 continue;
6831 6831
6832 kvm_kvfree(slot->arch.lpage_info[i - 1]); 6832 kvm_kvfree(slot->arch.lpage_info[i - 1]);
6833 slot->arch.lpage_info[i - 1] = NULL; 6833 slot->arch.lpage_info[i - 1] = NULL;
6834 } 6834 }
6835 return -ENOMEM; 6835 return -ENOMEM;
6836 } 6836 }
6837 6837
6838 int kvm_arch_prepare_memory_region(struct kvm *kvm, 6838 int kvm_arch_prepare_memory_region(struct kvm *kvm,
6839 struct kvm_memory_slot *memslot, 6839 struct kvm_memory_slot *memslot,
6840 struct kvm_memory_slot old, 6840 struct kvm_memory_slot old,
6841 struct kvm_userspace_memory_region *mem, 6841 struct kvm_userspace_memory_region *mem,
6842 bool user_alloc) 6842 bool user_alloc)
6843 { 6843 {
6844 int npages = memslot->npages; 6844 int npages = memslot->npages;
6845 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; 6845 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6846 6846
6847 /* Prevent internal slot pages from being moved by fork()/COW. */ 6847 /* Prevent internal slot pages from being moved by fork()/COW. */
6848 if (memslot->id >= KVM_USER_MEM_SLOTS) 6848 if (memslot->id >= KVM_USER_MEM_SLOTS)
6849 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6849 map_flags = MAP_SHARED | MAP_ANONYMOUS;
6850 6850
6851 /*To keep backward compatibility with older userspace, 6851 /*To keep backward compatibility with older userspace,
6852 *x86 needs to handle !user_alloc case. 6852 *x86 needs to handle !user_alloc case.
6853 */ 6853 */
6854 if (!user_alloc) { 6854 if (!user_alloc) {
6855 if (npages && !old.npages) { 6855 if (npages && !old.npages) {
6856 unsigned long userspace_addr; 6856 unsigned long userspace_addr;
6857 6857
6858 userspace_addr = vm_mmap(NULL, 0, 6858 userspace_addr = vm_mmap(NULL, 0,
6859 npages * PAGE_SIZE, 6859 npages * PAGE_SIZE,
6860 PROT_READ | PROT_WRITE, 6860 PROT_READ | PROT_WRITE,
6861 map_flags, 6861 map_flags,
6862 0); 6862 0);
6863 6863
6864 if (IS_ERR((void *)userspace_addr)) 6864 if (IS_ERR((void *)userspace_addr))
6865 return PTR_ERR((void *)userspace_addr); 6865 return PTR_ERR((void *)userspace_addr);
6866 6866
6867 memslot->userspace_addr = userspace_addr; 6867 memslot->userspace_addr = userspace_addr;
6868 } 6868 }
6869 } 6869 }
6870 6870
6871 6871
6872 return 0; 6872 return 0;
6873 } 6873 }
6874 6874
6875 void kvm_arch_commit_memory_region(struct kvm *kvm, 6875 void kvm_arch_commit_memory_region(struct kvm *kvm,
6876 struct kvm_userspace_memory_region *mem, 6876 struct kvm_userspace_memory_region *mem,
6877 struct kvm_memory_slot old, 6877 struct kvm_memory_slot old,
6878 bool user_alloc) 6878 bool user_alloc)
6879 { 6879 {
6880 6880
6881 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6881 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6882 6882
6883 if (!user_alloc && !old.user_alloc && old.npages && !npages) { 6883 if (!user_alloc && !old.user_alloc && old.npages && !npages) {
6884 int ret; 6884 int ret;
6885 6885
6886 ret = vm_munmap(old.userspace_addr, 6886 ret = vm_munmap(old.userspace_addr,
6887 old.npages * PAGE_SIZE); 6887 old.npages * PAGE_SIZE);
6888 if (ret < 0) 6888 if (ret < 0)
6889 printk(KERN_WARNING 6889 printk(KERN_WARNING
6890 "kvm_vm_ioctl_set_memory_region: " 6890 "kvm_vm_ioctl_set_memory_region: "
6891 "failed to munmap memory\n"); 6891 "failed to munmap memory\n");
6892 } 6892 }
6893 6893
6894 if (!kvm->arch.n_requested_mmu_pages) 6894 if (!kvm->arch.n_requested_mmu_pages)
6895 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 6895 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6896 6896
6897 spin_lock(&kvm->mmu_lock); 6897 spin_lock(&kvm->mmu_lock);
6898 if (nr_mmu_pages) 6898 if (nr_mmu_pages)
6899 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6899 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6900 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6900 /*
6901 * Write protect all pages for dirty logging.
6902 * Existing largepage mappings are destroyed here and new ones will
6903 * not be created until the end of the logging.
6904 */
6905 if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
6906 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6901 spin_unlock(&kvm->mmu_lock); 6907 spin_unlock(&kvm->mmu_lock);
6902 /* 6908 /*
6903 * If memory slot is created, or moved, we need to clear all 6909 * If memory slot is created, or moved, we need to clear all
6904 * mmio sptes. 6910 * mmio sptes.
6905 */ 6911 */
6906 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { 6912 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
6907 kvm_mmu_zap_all(kvm); 6913 kvm_mmu_zap_all(kvm);
6908 kvm_reload_remote_mmus(kvm); 6914 kvm_reload_remote_mmus(kvm);
6909 } 6915 }
6910 } 6916 }
6911 6917
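The hunk above is the heart of this commit: kvm_mmu_slot_remove_write_access() now runs only when the slot still has pages and dirty logging was requested, so a deleted slot no longer triggers a pointless walk. Below is a minimal user-space sketch of that decision; the types and helper are simplified stand-ins, not the kernel structures, and a 4 KiB page size is assumed.

/* Hedged sketch of the new condition in kvm_arch_commit_memory_region(). */
#include <stdio.h>

#define FAKE_PAGE_SHIFT 12                    /* 4 KiB pages assumed */
#define KVM_MEM_LOG_DIRTY_PAGES (1u << 0)     /* dirty-logging flag, bit 0 */

struct fake_region {
    unsigned long memory_size;   /* bytes; 0 means the slot is being deleted */
    unsigned int flags;
};

/* Stand-in for kvm_mmu_slot_remove_write_access(). */
static void write_protect_slot(int slot)
{
    printf("write-protecting slot %d for dirty logging\n", slot);
}

static void commit_region(int slot, const struct fake_region *mem)
{
    unsigned long npages = mem->memory_size >> FAKE_PAGE_SHIFT;

    /* The commit's point: a deleted slot (npages == 0), or one without
     * KVM_MEM_LOG_DIRTY_PAGES, no longer triggers the write-protect walk. */
    if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
        write_protect_slot(slot);
}

int main(void)
{
    struct fake_region deleted = { .memory_size = 0,       .flags = KVM_MEM_LOG_DIRTY_PAGES };
    struct fake_region logged  = { .memory_size = 1 << 20, .flags = KVM_MEM_LOG_DIRTY_PAGES };

    commit_region(3, &deleted);  /* prints nothing: deletion skips the walk */
    commit_region(4, &logged);   /* prints: write-protecting slot 4 ... */
    return 0;
}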
6912 void kvm_arch_flush_shadow_all(struct kvm *kvm) 6918 void kvm_arch_flush_shadow_all(struct kvm *kvm)
6913 { 6919 {
6914 kvm_mmu_zap_all(kvm); 6920 kvm_mmu_zap_all(kvm);
6915 kvm_reload_remote_mmus(kvm); 6921 kvm_reload_remote_mmus(kvm);
6916 } 6922 }
6917 6923
6918 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 6924 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
6919 struct kvm_memory_slot *slot) 6925 struct kvm_memory_slot *slot)
6920 { 6926 {
6921 kvm_arch_flush_shadow_all(kvm); 6927 kvm_arch_flush_shadow_all(kvm);
6922 } 6928 }
6923 6929
6924 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6930 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6925 { 6931 {
6926 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6932 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6927 !vcpu->arch.apf.halted) 6933 !vcpu->arch.apf.halted)
6928 || !list_empty_careful(&vcpu->async_pf.done) 6934 || !list_empty_careful(&vcpu->async_pf.done)
6929 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6935 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6930 || atomic_read(&vcpu->arch.nmi_queued) || 6936 || atomic_read(&vcpu->arch.nmi_queued) ||
6931 (kvm_arch_interrupt_allowed(vcpu) && 6937 (kvm_arch_interrupt_allowed(vcpu) &&
6932 kvm_cpu_has_interrupt(vcpu)); 6938 kvm_cpu_has_interrupt(vcpu));
6933 } 6939 }
6934 6940
6935 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 6941 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
6936 { 6942 {
6937 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; 6943 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
6938 } 6944 }
6939 6945
6940 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6946 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
6941 { 6947 {
6942 return kvm_x86_ops->interrupt_allowed(vcpu); 6948 return kvm_x86_ops->interrupt_allowed(vcpu);
6943 } 6949 }
6944 6950
6945 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) 6951 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
6946 { 6952 {
6947 unsigned long current_rip = kvm_rip_read(vcpu) + 6953 unsigned long current_rip = kvm_rip_read(vcpu) +
6948 get_segment_base(vcpu, VCPU_SREG_CS); 6954 get_segment_base(vcpu, VCPU_SREG_CS);
6949 6955
6950 return current_rip == linear_rip; 6956 return current_rip == linear_rip;
6951 } 6957 }
6952 EXPORT_SYMBOL_GPL(kvm_is_linear_rip); 6958 EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
6953 6959
6954 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 6960 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
6955 { 6961 {
6956 unsigned long rflags; 6962 unsigned long rflags;
6957 6963
6958 rflags = kvm_x86_ops->get_rflags(vcpu); 6964 rflags = kvm_x86_ops->get_rflags(vcpu);
6959 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6965 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6960 rflags &= ~X86_EFLAGS_TF; 6966 rflags &= ~X86_EFLAGS_TF;
6961 return rflags; 6967 return rflags;
6962 } 6968 }
6963 EXPORT_SYMBOL_GPL(kvm_get_rflags); 6969 EXPORT_SYMBOL_GPL(kvm_get_rflags);
6964 6970
6965 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 6971 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6966 { 6972 {
6967 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 6973 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
6968 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6974 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
6969 rflags |= X86_EFLAGS_TF; 6975 rflags |= X86_EFLAGS_TF;
6970 kvm_x86_ops->set_rflags(vcpu, rflags); 6976 kvm_x86_ops->set_rflags(vcpu, rflags);
6971 kvm_make_request(KVM_REQ_EVENT, vcpu); 6977 kvm_make_request(KVM_REQ_EVENT, vcpu);
6972 } 6978 }
6973 EXPORT_SYMBOL_GPL(kvm_set_rflags); 6979 EXPORT_SYMBOL_GPL(kvm_set_rflags);
6974 6980
6975 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) 6981 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6976 { 6982 {
6977 int r; 6983 int r;
6978 6984
6979 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || 6985 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6980 is_error_page(work->page)) 6986 is_error_page(work->page))
6981 return; 6987 return;
6982 6988
6983 r = kvm_mmu_reload(vcpu); 6989 r = kvm_mmu_reload(vcpu);
6984 if (unlikely(r)) 6990 if (unlikely(r))
6985 return; 6991 return;
6986 6992
6987 if (!vcpu->arch.mmu.direct_map && 6993 if (!vcpu->arch.mmu.direct_map &&
6988 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) 6994 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6989 return; 6995 return;
6990 6996
6991 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); 6997 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6992 } 6998 }
6993 6999
6994 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) 7000 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6995 { 7001 {
6996 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); 7002 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6997 } 7003 }
6998 7004
6999 static inline u32 kvm_async_pf_next_probe(u32 key) 7005 static inline u32 kvm_async_pf_next_probe(u32 key)
7000 { 7006 {
7001 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); 7007 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
7002 } 7008 }
7003 7009
7004 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7010 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7005 { 7011 {
7006 u32 key = kvm_async_pf_hash_fn(gfn); 7012 u32 key = kvm_async_pf_hash_fn(gfn);
7007 7013
7008 while (vcpu->arch.apf.gfns[key] != ~0) 7014 while (vcpu->arch.apf.gfns[key] != ~0)
7009 key = kvm_async_pf_next_probe(key); 7015 key = kvm_async_pf_next_probe(key);
7010 7016
7011 vcpu->arch.apf.gfns[key] = gfn; 7017 vcpu->arch.apf.gfns[key] = gfn;
7012 } 7018 }
7013 7019
7014 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) 7020 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
7015 { 7021 {
7016 int i; 7022 int i;
7017 u32 key = kvm_async_pf_hash_fn(gfn); 7023 u32 key = kvm_async_pf_hash_fn(gfn);
7018 7024
7019 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && 7025 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
7020 (vcpu->arch.apf.gfns[key] != gfn && 7026 (vcpu->arch.apf.gfns[key] != gfn &&
7021 vcpu->arch.apf.gfns[key] != ~0); i++) 7027 vcpu->arch.apf.gfns[key] != ~0); i++)
7022 key = kvm_async_pf_next_probe(key); 7028 key = kvm_async_pf_next_probe(key);
7023 7029
7024 return key; 7030 return key;
7025 } 7031 }
7026 7032
7027 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7033 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7028 { 7034 {
7029 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; 7035 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
7030 } 7036 }
7031 7037
7032 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7038 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7033 { 7039 {
7034 u32 i, j, k; 7040 u32 i, j, k;
7035 7041
7036 i = j = kvm_async_pf_gfn_slot(vcpu, gfn); 7042 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
7037 while (true) { 7043 while (true) {
7038 vcpu->arch.apf.gfns[i] = ~0; 7044 vcpu->arch.apf.gfns[i] = ~0;
7039 do { 7045 do {
7040 j = kvm_async_pf_next_probe(j); 7046 j = kvm_async_pf_next_probe(j);
7041 if (vcpu->arch.apf.gfns[j] == ~0) 7047 if (vcpu->arch.apf.gfns[j] == ~0)
7042 return; 7048 return;
7043 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); 7049 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
7044 /* 7050 /*
7045 * k lies cyclically in ]i,j] 7051 * k lies cyclically in ]i,j]
7046 * | i.k.j | 7052 * | i.k.j |
7047 * |....j i.k.| or |.k..j i...| 7053 * |....j i.k.| or |.k..j i...|
7048 */ 7054 */
7049 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); 7055 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
7050 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; 7056 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
7051 i = j; 7057 i = j;
7052 } 7058 }
7053 } 7059 }
7054 7060
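The deletion loop above is a textbook open-addressing backward shift: the entry at probe position j may only be moved into the hole at i if its home slot k does not lie cyclically in (i, j], and the inner do-while keeps probing while that move is not allowed. A small stand-alone sketch of just the cyclic-interval test, matching the ASCII diagrams in the comment:

/* Sketch of the "k lies cyclically in ]i,j]" test used when deleting an
 * entry from the open-addressed async-PF gfn table above. */
#include <assert.h>
#include <stdbool.h>

/* True iff k falls in the half-open cyclic interval (i, j]. */
static bool in_cyclic_range(unsigned int i, unsigned int k, unsigned int j)
{
    return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
}

int main(void)
{
    assert(in_cyclic_range(1, 3, 5));    /* | i.k.j |      ordinary interval */
    assert(in_cyclic_range(6, 7, 2));    /* |....j i.k.|   interval wraps    */
    assert(!in_cyclic_range(4, 4, 6));   /* k == i is excluded               */
    assert(in_cyclic_range(4, 6, 6));    /* k == j is included               */
    return 0;
}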
7055 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) 7061 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
7056 { 7062 {
7057 7063
7058 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, 7064 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
7059 sizeof(val)); 7065 sizeof(val));
7060 } 7066 }
7061 7067
7062 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 7068 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
7063 struct kvm_async_pf *work) 7069 struct kvm_async_pf *work)
7064 { 7070 {
7065 struct x86_exception fault; 7071 struct x86_exception fault;
7066 7072
7067 trace_kvm_async_pf_not_present(work->arch.token, work->gva); 7073 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
7068 kvm_add_async_pf_gfn(vcpu, work->arch.gfn); 7074 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7069 7075
7070 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || 7076 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
7071 (vcpu->arch.apf.send_user_only && 7077 (vcpu->arch.apf.send_user_only &&
7072 kvm_x86_ops->get_cpl(vcpu) == 0)) 7078 kvm_x86_ops->get_cpl(vcpu) == 0))
7073 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 7079 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7074 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { 7080 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
7075 fault.vector = PF_VECTOR; 7081 fault.vector = PF_VECTOR;
7076 fault.error_code_valid = true; 7082 fault.error_code_valid = true;
7077 fault.error_code = 0; 7083 fault.error_code = 0;
7078 fault.nested_page_fault = false; 7084 fault.nested_page_fault = false;
7079 fault.address = work->arch.token; 7085 fault.address = work->arch.token;
7080 kvm_inject_page_fault(vcpu, &fault); 7086 kvm_inject_page_fault(vcpu, &fault);
7081 } 7087 }
7082 } 7088 }
7083 7089
7084 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, 7090 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7085 struct kvm_async_pf *work) 7091 struct kvm_async_pf *work)
7086 { 7092 {
7087 struct x86_exception fault; 7093 struct x86_exception fault;
7088 7094
7089 trace_kvm_async_pf_ready(work->arch.token, work->gva); 7095 trace_kvm_async_pf_ready(work->arch.token, work->gva);
7090 if (is_error_page(work->page)) 7096 if (is_error_page(work->page))
7091 work->arch.token = ~0; /* broadcast wakeup */ 7097 work->arch.token = ~0; /* broadcast wakeup */
7092 else 7098 else
7093 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 7099 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
7094 7100
7095 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && 7101 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
7096 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { 7102 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
7097 fault.vector = PF_VECTOR; 7103 fault.vector = PF_VECTOR;
7098 fault.error_code_valid = true; 7104 fault.error_code_valid = true;
7099 fault.error_code = 0; 7105 fault.error_code = 0;
7100 fault.nested_page_fault = false; 7106 fault.nested_page_fault = false;
7101 fault.address = work->arch.token; 7107 fault.address = work->arch.token;
7102 kvm_inject_page_fault(vcpu, &fault); 7108 kvm_inject_page_fault(vcpu, &fault);
7103 } 7109 }
7104 vcpu->arch.apf.halted = false; 7110 vcpu->arch.apf.halted = false;
7105 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 7111 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7106 } 7112 }
7107 7113
7108 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) 7114 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7109 { 7115 {
7110 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) 7116 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
7111 return true; 7117 return true;
7112 else 7118 else
7113 return !kvm_event_needs_reinjection(vcpu) && 7119 return !kvm_event_needs_reinjection(vcpu) &&
7114 kvm_x86_ops->interrupt_allowed(vcpu); 7120 kvm_x86_ops->interrupt_allowed(vcpu);
7115 } 7121 }
7116 7122
7117 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 7123 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7118 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 7124 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7119 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 7125 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
7120 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 7126 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
7121 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 7127 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
7122 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 7128 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
7123 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 7129 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
7124 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 7130 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
7125 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 7131 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7126 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 7132 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7127 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 7133 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7128 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7134 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7129 7135
(second changed file: virt/kvm/kvm_main.c, whose diff follows from its line 1)
1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * 13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory. 15 * the COPYING file in the top-level directory.
16 * 16 *
17 */ 17 */
18 18
19 #include "iodev.h" 19 #include "iodev.h"
20 20
21 #include <linux/kvm_host.h> 21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h> 22 #include <linux/kvm.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/errno.h> 24 #include <linux/errno.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/miscdevice.h> 27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h> 28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h> 29 #include <linux/reboot.h>
30 #include <linux/debugfs.h> 30 #include <linux/debugfs.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/file.h> 32 #include <linux/file.h>
33 #include <linux/syscore_ops.h> 33 #include <linux/syscore_ops.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <linux/cpumask.h> 36 #include <linux/cpumask.h>
37 #include <linux/smp.h> 37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h> 38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h> 39 #include <linux/profile.h>
40 #include <linux/kvm_para.h> 40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h> 41 #include <linux/pagemap.h>
42 #include <linux/mman.h> 42 #include <linux/mman.h>
43 #include <linux/swap.h> 43 #include <linux/swap.h>
44 #include <linux/bitops.h> 44 #include <linux/bitops.h>
45 #include <linux/spinlock.h> 45 #include <linux/spinlock.h>
46 #include <linux/compat.h> 46 #include <linux/compat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/hugetlb.h> 48 #include <linux/hugetlb.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/sort.h> 50 #include <linux/sort.h>
51 #include <linux/bsearch.h> 51 #include <linux/bsearch.h>
52 52
53 #include <asm/processor.h> 53 #include <asm/processor.h>
54 #include <asm/io.h> 54 #include <asm/io.h>
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/pgtable.h> 56 #include <asm/pgtable.h>
57 57
58 #include "coalesced_mmio.h" 58 #include "coalesced_mmio.h"
59 #include "async_pf.h" 59 #include "async_pf.h"
60 60
61 #define CREATE_TRACE_POINTS 61 #define CREATE_TRACE_POINTS
62 #include <trace/events/kvm.h> 62 #include <trace/events/kvm.h>
63 63
64 MODULE_AUTHOR("Qumranet"); 64 MODULE_AUTHOR("Qumranet");
65 MODULE_LICENSE("GPL"); 65 MODULE_LICENSE("GPL");
66 66
67 /* 67 /*
68 * Ordering of locks: 68 * Ordering of locks:
69 * 69 *
70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
71 */ 71 */
72 72
73 DEFINE_RAW_SPINLOCK(kvm_lock); 73 DEFINE_RAW_SPINLOCK(kvm_lock);
74 LIST_HEAD(vm_list); 74 LIST_HEAD(vm_list);
75 75
76 static cpumask_var_t cpus_hardware_enabled; 76 static cpumask_var_t cpus_hardware_enabled;
77 static int kvm_usage_count = 0; 77 static int kvm_usage_count = 0;
78 static atomic_t hardware_enable_failed; 78 static atomic_t hardware_enable_failed;
79 79
80 struct kmem_cache *kvm_vcpu_cache; 80 struct kmem_cache *kvm_vcpu_cache;
81 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 81 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
82 82
83 static __read_mostly struct preempt_ops kvm_preempt_ops; 83 static __read_mostly struct preempt_ops kvm_preempt_ops;
84 84
85 struct dentry *kvm_debugfs_dir; 85 struct dentry *kvm_debugfs_dir;
86 86
87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
88 unsigned long arg); 88 unsigned long arg);
89 #ifdef CONFIG_COMPAT 89 #ifdef CONFIG_COMPAT
90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
91 unsigned long arg); 91 unsigned long arg);
92 #endif 92 #endif
93 static int hardware_enable_all(void); 93 static int hardware_enable_all(void);
94 static void hardware_disable_all(void); 94 static void hardware_disable_all(void);
95 95
96 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 96 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
97 97
98 bool kvm_rebooting; 98 bool kvm_rebooting;
99 EXPORT_SYMBOL_GPL(kvm_rebooting); 99 EXPORT_SYMBOL_GPL(kvm_rebooting);
100 100
101 static bool largepages_enabled = true; 101 static bool largepages_enabled = true;
102 102
103 bool kvm_is_mmio_pfn(pfn_t pfn) 103 bool kvm_is_mmio_pfn(pfn_t pfn)
104 { 104 {
105 if (pfn_valid(pfn)) { 105 if (pfn_valid(pfn)) {
106 int reserved; 106 int reserved;
107 struct page *tail = pfn_to_page(pfn); 107 struct page *tail = pfn_to_page(pfn);
108 struct page *head = compound_trans_head(tail); 108 struct page *head = compound_trans_head(tail);
109 reserved = PageReserved(head); 109 reserved = PageReserved(head);
110 if (head != tail) { 110 if (head != tail) {
111 /* 111 /*
112 * "head" is not a dangling pointer 112 * "head" is not a dangling pointer
113 * (compound_trans_head takes care of that) 113 * (compound_trans_head takes care of that)
114 * but the hugepage may have been splitted 114 * but the hugepage may have been splitted
115 * from under us (and we may not hold a 115 * from under us (and we may not hold a
116 * reference count on the head page so it can 116 * reference count on the head page so it can
117 * be reused before we run PageReferenced), so 117 * be reused before we run PageReferenced), so
118 * we've to check PageTail before returning 118 * we've to check PageTail before returning
119 * what we just read. 119 * what we just read.
120 */ 120 */
121 smp_rmb(); 121 smp_rmb();
122 if (PageTail(tail)) 122 if (PageTail(tail))
123 return reserved; 123 return reserved;
124 } 124 }
125 return PageReserved(tail); 125 return PageReserved(tail);
126 } 126 }
127 127
128 return true; 128 return true;
129 } 129 }
130 130
131 /* 131 /*
132 * Switches to specified vcpu, until a matching vcpu_put() 132 * Switches to specified vcpu, until a matching vcpu_put()
133 */ 133 */
134 int vcpu_load(struct kvm_vcpu *vcpu) 134 int vcpu_load(struct kvm_vcpu *vcpu)
135 { 135 {
136 int cpu; 136 int cpu;
137 137
138 if (mutex_lock_killable(&vcpu->mutex)) 138 if (mutex_lock_killable(&vcpu->mutex))
139 return -EINTR; 139 return -EINTR;
140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
141 /* The thread running this VCPU changed. */ 141 /* The thread running this VCPU changed. */
142 struct pid *oldpid = vcpu->pid; 142 struct pid *oldpid = vcpu->pid;
143 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 143 struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
144 rcu_assign_pointer(vcpu->pid, newpid); 144 rcu_assign_pointer(vcpu->pid, newpid);
145 synchronize_rcu(); 145 synchronize_rcu();
146 put_pid(oldpid); 146 put_pid(oldpid);
147 } 147 }
148 cpu = get_cpu(); 148 cpu = get_cpu();
149 preempt_notifier_register(&vcpu->preempt_notifier); 149 preempt_notifier_register(&vcpu->preempt_notifier);
150 kvm_arch_vcpu_load(vcpu, cpu); 150 kvm_arch_vcpu_load(vcpu, cpu);
151 put_cpu(); 151 put_cpu();
152 return 0; 152 return 0;
153 } 153 }
154 154
155 void vcpu_put(struct kvm_vcpu *vcpu) 155 void vcpu_put(struct kvm_vcpu *vcpu)
156 { 156 {
157 preempt_disable(); 157 preempt_disable();
158 kvm_arch_vcpu_put(vcpu); 158 kvm_arch_vcpu_put(vcpu);
159 preempt_notifier_unregister(&vcpu->preempt_notifier); 159 preempt_notifier_unregister(&vcpu->preempt_notifier);
160 preempt_enable(); 160 preempt_enable();
161 mutex_unlock(&vcpu->mutex); 161 mutex_unlock(&vcpu->mutex);
162 } 162 }
163 163
164 static void ack_flush(void *_completed) 164 static void ack_flush(void *_completed)
165 { 165 {
166 } 166 }
167 167
168 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 168 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
169 { 169 {
170 int i, cpu, me; 170 int i, cpu, me;
171 cpumask_var_t cpus; 171 cpumask_var_t cpus;
172 bool called = true; 172 bool called = true;
173 struct kvm_vcpu *vcpu; 173 struct kvm_vcpu *vcpu;
174 174
175 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 175 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
176 176
177 me = get_cpu(); 177 me = get_cpu();
178 kvm_for_each_vcpu(i, vcpu, kvm) { 178 kvm_for_each_vcpu(i, vcpu, kvm) {
179 kvm_make_request(req, vcpu); 179 kvm_make_request(req, vcpu);
180 cpu = vcpu->cpu; 180 cpu = vcpu->cpu;
181 181
182 /* Set ->requests bit before we read ->mode */ 182 /* Set ->requests bit before we read ->mode */
183 smp_mb(); 183 smp_mb();
184 184
185 if (cpus != NULL && cpu != -1 && cpu != me && 185 if (cpus != NULL && cpu != -1 && cpu != me &&
186 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 186 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
187 cpumask_set_cpu(cpu, cpus); 187 cpumask_set_cpu(cpu, cpus);
188 } 188 }
189 if (unlikely(cpus == NULL)) 189 if (unlikely(cpus == NULL))
190 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 190 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
191 else if (!cpumask_empty(cpus)) 191 else if (!cpumask_empty(cpus))
192 smp_call_function_many(cpus, ack_flush, NULL, 1); 192 smp_call_function_many(cpus, ack_flush, NULL, 1);
193 else 193 else
194 called = false; 194 called = false;
195 put_cpu(); 195 put_cpu();
196 free_cpumask_var(cpus); 196 free_cpumask_var(cpus);
197 return called; 197 return called;
198 } 198 }
199 199
200 void kvm_flush_remote_tlbs(struct kvm *kvm) 200 void kvm_flush_remote_tlbs(struct kvm *kvm)
201 { 201 {
202 long dirty_count = kvm->tlbs_dirty; 202 long dirty_count = kvm->tlbs_dirty;
203 203
204 smp_mb(); 204 smp_mb();
205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
206 ++kvm->stat.remote_tlb_flush; 206 ++kvm->stat.remote_tlb_flush;
207 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 207 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
208 } 208 }
209 209
210 void kvm_reload_remote_mmus(struct kvm *kvm) 210 void kvm_reload_remote_mmus(struct kvm *kvm)
211 { 211 {
212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
213 } 213 }
214 214
215 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 215 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
216 { 216 {
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218 } 218 }
219 219
220 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 220 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221 { 221 {
222 struct page *page; 222 struct page *page;
223 int r; 223 int r;
224 224
225 mutex_init(&vcpu->mutex); 225 mutex_init(&vcpu->mutex);
226 vcpu->cpu = -1; 226 vcpu->cpu = -1;
227 vcpu->kvm = kvm; 227 vcpu->kvm = kvm;
228 vcpu->vcpu_id = id; 228 vcpu->vcpu_id = id;
229 vcpu->pid = NULL; 229 vcpu->pid = NULL;
230 init_waitqueue_head(&vcpu->wq); 230 init_waitqueue_head(&vcpu->wq);
231 kvm_async_pf_vcpu_init(vcpu); 231 kvm_async_pf_vcpu_init(vcpu);
232 232
233 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 233 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
234 if (!page) { 234 if (!page) {
235 r = -ENOMEM; 235 r = -ENOMEM;
236 goto fail; 236 goto fail;
237 } 237 }
238 vcpu->run = page_address(page); 238 vcpu->run = page_address(page);
239 239
240 kvm_vcpu_set_in_spin_loop(vcpu, false); 240 kvm_vcpu_set_in_spin_loop(vcpu, false);
241 kvm_vcpu_set_dy_eligible(vcpu, false); 241 kvm_vcpu_set_dy_eligible(vcpu, false);
242 242
243 r = kvm_arch_vcpu_init(vcpu); 243 r = kvm_arch_vcpu_init(vcpu);
244 if (r < 0) 244 if (r < 0)
245 goto fail_free_run; 245 goto fail_free_run;
246 return 0; 246 return 0;
247 247
248 fail_free_run: 248 fail_free_run:
249 free_page((unsigned long)vcpu->run); 249 free_page((unsigned long)vcpu->run);
250 fail: 250 fail:
251 return r; 251 return r;
252 } 252 }
253 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 253 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
254 254
255 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 255 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
256 { 256 {
257 put_pid(vcpu->pid); 257 put_pid(vcpu->pid);
258 kvm_arch_vcpu_uninit(vcpu); 258 kvm_arch_vcpu_uninit(vcpu);
259 free_page((unsigned long)vcpu->run); 259 free_page((unsigned long)vcpu->run);
260 } 260 }
261 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 261 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
262 262
263 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 263 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
264 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 264 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
265 { 265 {
266 return container_of(mn, struct kvm, mmu_notifier); 266 return container_of(mn, struct kvm, mmu_notifier);
267 } 267 }
268 268
269 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 269 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
270 struct mm_struct *mm, 270 struct mm_struct *mm,
271 unsigned long address) 271 unsigned long address)
272 { 272 {
273 struct kvm *kvm = mmu_notifier_to_kvm(mn); 273 struct kvm *kvm = mmu_notifier_to_kvm(mn);
274 int need_tlb_flush, idx; 274 int need_tlb_flush, idx;
275 275
276 /* 276 /*
277 * When ->invalidate_page runs, the linux pte has been zapped 277 * When ->invalidate_page runs, the linux pte has been zapped
278 * already but the page is still allocated until 278 * already but the page is still allocated until
279 * ->invalidate_page returns. So if we increase the sequence 279 * ->invalidate_page returns. So if we increase the sequence
280 * here the kvm page fault will notice if the spte can't be 280 * here the kvm page fault will notice if the spte can't be
281 * established because the page is going to be freed. If 281 * established because the page is going to be freed. If
282 * instead the kvm page fault establishes the spte before 282 * instead the kvm page fault establishes the spte before
283 * ->invalidate_page runs, kvm_unmap_hva will release it 283 * ->invalidate_page runs, kvm_unmap_hva will release it
284 * before returning. 284 * before returning.
285 * 285 *
286 * The sequence increase only need to be seen at spin_unlock 286 * The sequence increase only need to be seen at spin_unlock
287 * time, and not at spin_lock time. 287 * time, and not at spin_lock time.
288 * 288 *
289 * Increasing the sequence after the spin_unlock would be 289 * Increasing the sequence after the spin_unlock would be
290 * unsafe because the kvm page fault could then establish the 290 * unsafe because the kvm page fault could then establish the
291 * pte after kvm_unmap_hva returned, without noticing the page 291 * pte after kvm_unmap_hva returned, without noticing the page
292 * is going to be freed. 292 * is going to be freed.
293 */ 293 */
294 idx = srcu_read_lock(&kvm->srcu); 294 idx = srcu_read_lock(&kvm->srcu);
295 spin_lock(&kvm->mmu_lock); 295 spin_lock(&kvm->mmu_lock);
296 296
297 kvm->mmu_notifier_seq++; 297 kvm->mmu_notifier_seq++;
298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
299 /* we've to flush the tlb before the pages can be freed */ 299 /* we've to flush the tlb before the pages can be freed */
300 if (need_tlb_flush) 300 if (need_tlb_flush)
301 kvm_flush_remote_tlbs(kvm); 301 kvm_flush_remote_tlbs(kvm);
302 302
303 spin_unlock(&kvm->mmu_lock); 303 spin_unlock(&kvm->mmu_lock);
304 srcu_read_unlock(&kvm->srcu, idx); 304 srcu_read_unlock(&kvm->srcu, idx);
305 } 305 }
306 306
307 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 307 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
308 struct mm_struct *mm, 308 struct mm_struct *mm,
309 unsigned long address, 309 unsigned long address,
310 pte_t pte) 310 pte_t pte)
311 { 311 {
312 struct kvm *kvm = mmu_notifier_to_kvm(mn); 312 struct kvm *kvm = mmu_notifier_to_kvm(mn);
313 int idx; 313 int idx;
314 314
315 idx = srcu_read_lock(&kvm->srcu); 315 idx = srcu_read_lock(&kvm->srcu);
316 spin_lock(&kvm->mmu_lock); 316 spin_lock(&kvm->mmu_lock);
317 kvm->mmu_notifier_seq++; 317 kvm->mmu_notifier_seq++;
318 kvm_set_spte_hva(kvm, address, pte); 318 kvm_set_spte_hva(kvm, address, pte);
319 spin_unlock(&kvm->mmu_lock); 319 spin_unlock(&kvm->mmu_lock);
320 srcu_read_unlock(&kvm->srcu, idx); 320 srcu_read_unlock(&kvm->srcu, idx);
321 } 321 }
322 322
323 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 323 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
324 struct mm_struct *mm, 324 struct mm_struct *mm,
325 unsigned long start, 325 unsigned long start,
326 unsigned long end) 326 unsigned long end)
327 { 327 {
328 struct kvm *kvm = mmu_notifier_to_kvm(mn); 328 struct kvm *kvm = mmu_notifier_to_kvm(mn);
329 int need_tlb_flush = 0, idx; 329 int need_tlb_flush = 0, idx;
330 330
331 idx = srcu_read_lock(&kvm->srcu); 331 idx = srcu_read_lock(&kvm->srcu);
332 spin_lock(&kvm->mmu_lock); 332 spin_lock(&kvm->mmu_lock);
333 /* 333 /*
334 * The count increase must become visible at unlock time as no 334 * The count increase must become visible at unlock time as no
335 * spte can be established without taking the mmu_lock and 335 * spte can be established without taking the mmu_lock and
336 * count is also read inside the mmu_lock critical section. 336 * count is also read inside the mmu_lock critical section.
337 */ 337 */
338 kvm->mmu_notifier_count++; 338 kvm->mmu_notifier_count++;
339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
340 need_tlb_flush |= kvm->tlbs_dirty; 340 need_tlb_flush |= kvm->tlbs_dirty;
341 /* we've to flush the tlb before the pages can be freed */ 341 /* we've to flush the tlb before the pages can be freed */
342 if (need_tlb_flush) 342 if (need_tlb_flush)
343 kvm_flush_remote_tlbs(kvm); 343 kvm_flush_remote_tlbs(kvm);
344 344
345 spin_unlock(&kvm->mmu_lock); 345 spin_unlock(&kvm->mmu_lock);
346 srcu_read_unlock(&kvm->srcu, idx); 346 srcu_read_unlock(&kvm->srcu, idx);
347 } 347 }
348 348
349 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 349 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
350 struct mm_struct *mm, 350 struct mm_struct *mm,
351 unsigned long start, 351 unsigned long start,
352 unsigned long end) 352 unsigned long end)
353 { 353 {
354 struct kvm *kvm = mmu_notifier_to_kvm(mn); 354 struct kvm *kvm = mmu_notifier_to_kvm(mn);
355 355
356 spin_lock(&kvm->mmu_lock); 356 spin_lock(&kvm->mmu_lock);
357 /* 357 /*
358 * This sequence increase will notify the kvm page fault that 358 * This sequence increase will notify the kvm page fault that
359 * the page that is going to be mapped in the spte could have 359 * the page that is going to be mapped in the spte could have
360 * been freed. 360 * been freed.
361 */ 361 */
362 kvm->mmu_notifier_seq++; 362 kvm->mmu_notifier_seq++;
363 smp_wmb(); 363 smp_wmb();
364 /* 364 /*
365 * The above sequence increase must be visible before the 365 * The above sequence increase must be visible before the
366 * below count decrease, which is ensured by the smp_wmb above 366 * below count decrease, which is ensured by the smp_wmb above
367 * in conjunction with the smp_rmb in mmu_notifier_retry(). 367 * in conjunction with the smp_rmb in mmu_notifier_retry().
368 */ 368 */
369 kvm->mmu_notifier_count--; 369 kvm->mmu_notifier_count--;
370 spin_unlock(&kvm->mmu_lock); 370 spin_unlock(&kvm->mmu_lock);
371 371
372 BUG_ON(kvm->mmu_notifier_count < 0); 372 BUG_ON(kvm->mmu_notifier_count < 0);
373 } 373 }
374 374
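The seq/count comments above and in invalidate_range_start() describe the retry protocol the page-fault path relies on via mmu_notifier_retry(). Below is a simplified, single-threaded user-space sketch of that protocol; the structure and helpers are stand-ins for illustration, not the kernel API.

/* Hedged sketch of the seq/count retry protocol described above. */
#include <stdbool.h>
#include <stdio.h>

struct fake_mmu_state {
    unsigned long notifier_seq;   /* bumped by invalidate_page / range_end */
    long notifier_count;          /* > 0 while a range invalidation is in flight */
};

/* Fault side: sample the sequence before translating the host address... */
static unsigned long fault_begin(const struct fake_mmu_state *s)
{
    return s->notifier_seq;
}

/* ...and retry if an invalidation finished meanwhile (seq moved) or is
 * still in progress (count non-zero). */
static bool fault_must_retry(const struct fake_mmu_state *s, unsigned long seq)
{
    return s->notifier_count > 0 || s->notifier_seq != seq;
}

int main(void)
{
    struct fake_mmu_state s = { 0, 0 };
    unsigned long seq = fault_begin(&s);

    /* A concurrent invalidate_range_start()/..._end() pair, serialized here: */
    s.notifier_count++;   /* range_start: block new sptes */
    s.notifier_seq++;     /* range_end: bump seq first... */
    s.notifier_count--;   /* ...then drop the count */

    printf("retry needed: %s\n", fault_must_retry(&s, seq) ? "yes" : "no");
    return 0;
}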
375 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 375 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
376 struct mm_struct *mm, 376 struct mm_struct *mm,
377 unsigned long address) 377 unsigned long address)
378 { 378 {
379 struct kvm *kvm = mmu_notifier_to_kvm(mn); 379 struct kvm *kvm = mmu_notifier_to_kvm(mn);
380 int young, idx; 380 int young, idx;
381 381
382 idx = srcu_read_lock(&kvm->srcu); 382 idx = srcu_read_lock(&kvm->srcu);
383 spin_lock(&kvm->mmu_lock); 383 spin_lock(&kvm->mmu_lock);
384 384
385 young = kvm_age_hva(kvm, address); 385 young = kvm_age_hva(kvm, address);
386 if (young) 386 if (young)
387 kvm_flush_remote_tlbs(kvm); 387 kvm_flush_remote_tlbs(kvm);
388 388
389 spin_unlock(&kvm->mmu_lock); 389 spin_unlock(&kvm->mmu_lock);
390 srcu_read_unlock(&kvm->srcu, idx); 390 srcu_read_unlock(&kvm->srcu, idx);
391 391
392 return young; 392 return young;
393 } 393 }
394 394
395 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 395 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
396 struct mm_struct *mm, 396 struct mm_struct *mm,
397 unsigned long address) 397 unsigned long address)
398 { 398 {
399 struct kvm *kvm = mmu_notifier_to_kvm(mn); 399 struct kvm *kvm = mmu_notifier_to_kvm(mn);
400 int young, idx; 400 int young, idx;
401 401
402 idx = srcu_read_lock(&kvm->srcu); 402 idx = srcu_read_lock(&kvm->srcu);
403 spin_lock(&kvm->mmu_lock); 403 spin_lock(&kvm->mmu_lock);
404 young = kvm_test_age_hva(kvm, address); 404 young = kvm_test_age_hva(kvm, address);
405 spin_unlock(&kvm->mmu_lock); 405 spin_unlock(&kvm->mmu_lock);
406 srcu_read_unlock(&kvm->srcu, idx); 406 srcu_read_unlock(&kvm->srcu, idx);
407 407
408 return young; 408 return young;
409 } 409 }
410 410
411 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 411 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
412 struct mm_struct *mm) 412 struct mm_struct *mm)
413 { 413 {
414 struct kvm *kvm = mmu_notifier_to_kvm(mn); 414 struct kvm *kvm = mmu_notifier_to_kvm(mn);
415 int idx; 415 int idx;
416 416
417 idx = srcu_read_lock(&kvm->srcu); 417 idx = srcu_read_lock(&kvm->srcu);
418 kvm_arch_flush_shadow_all(kvm); 418 kvm_arch_flush_shadow_all(kvm);
419 srcu_read_unlock(&kvm->srcu, idx); 419 srcu_read_unlock(&kvm->srcu, idx);
420 } 420 }
421 421
422 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 422 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
423 .invalidate_page = kvm_mmu_notifier_invalidate_page, 423 .invalidate_page = kvm_mmu_notifier_invalidate_page,
424 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 424 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
425 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 425 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
426 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 426 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
427 .test_young = kvm_mmu_notifier_test_young, 427 .test_young = kvm_mmu_notifier_test_young,
428 .change_pte = kvm_mmu_notifier_change_pte, 428 .change_pte = kvm_mmu_notifier_change_pte,
429 .release = kvm_mmu_notifier_release, 429 .release = kvm_mmu_notifier_release,
430 }; 430 };
431 431
432 static int kvm_init_mmu_notifier(struct kvm *kvm) 432 static int kvm_init_mmu_notifier(struct kvm *kvm)
433 { 433 {
434 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 434 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
435 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 435 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
436 } 436 }
437 437
438 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 438 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
439 439
440 static int kvm_init_mmu_notifier(struct kvm *kvm) 440 static int kvm_init_mmu_notifier(struct kvm *kvm)
441 { 441 {
442 return 0; 442 return 0;
443 } 443 }
444 444
445 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 445 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
446 446
447 static void kvm_init_memslots_id(struct kvm *kvm) 447 static void kvm_init_memslots_id(struct kvm *kvm)
448 { 448 {
449 int i; 449 int i;
450 struct kvm_memslots *slots = kvm->memslots; 450 struct kvm_memslots *slots = kvm->memslots;
451 451
452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
453 slots->id_to_index[i] = slots->memslots[i].id = i; 453 slots->id_to_index[i] = slots->memslots[i].id = i;
454 } 454 }
455 455
456 static struct kvm *kvm_create_vm(unsigned long type) 456 static struct kvm *kvm_create_vm(unsigned long type)
457 { 457 {
458 int r, i; 458 int r, i;
459 struct kvm *kvm = kvm_arch_alloc_vm(); 459 struct kvm *kvm = kvm_arch_alloc_vm();
460 460
461 if (!kvm) 461 if (!kvm)
462 return ERR_PTR(-ENOMEM); 462 return ERR_PTR(-ENOMEM);
463 463
464 r = kvm_arch_init_vm(kvm, type); 464 r = kvm_arch_init_vm(kvm, type);
465 if (r) 465 if (r)
466 goto out_err_nodisable; 466 goto out_err_nodisable;
467 467
468 r = hardware_enable_all(); 468 r = hardware_enable_all();
469 if (r) 469 if (r)
470 goto out_err_nodisable; 470 goto out_err_nodisable;
471 471
472 #ifdef CONFIG_HAVE_KVM_IRQCHIP 472 #ifdef CONFIG_HAVE_KVM_IRQCHIP
473 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 473 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
474 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 474 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
475 #endif 475 #endif
476 476
477 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 477 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
478 478
479 r = -ENOMEM; 479 r = -ENOMEM;
480 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 480 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
481 if (!kvm->memslots) 481 if (!kvm->memslots)
482 goto out_err_nosrcu; 482 goto out_err_nosrcu;
483 kvm_init_memslots_id(kvm); 483 kvm_init_memslots_id(kvm);
484 if (init_srcu_struct(&kvm->srcu)) 484 if (init_srcu_struct(&kvm->srcu))
485 goto out_err_nosrcu; 485 goto out_err_nosrcu;
486 for (i = 0; i < KVM_NR_BUSES; i++) { 486 for (i = 0; i < KVM_NR_BUSES; i++) {
487 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 487 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
488 GFP_KERNEL); 488 GFP_KERNEL);
489 if (!kvm->buses[i]) 489 if (!kvm->buses[i])
490 goto out_err; 490 goto out_err;
491 } 491 }
492 492
493 spin_lock_init(&kvm->mmu_lock); 493 spin_lock_init(&kvm->mmu_lock);
494 kvm->mm = current->mm; 494 kvm->mm = current->mm;
495 atomic_inc(&kvm->mm->mm_count); 495 atomic_inc(&kvm->mm->mm_count);
496 kvm_eventfd_init(kvm); 496 kvm_eventfd_init(kvm);
497 mutex_init(&kvm->lock); 497 mutex_init(&kvm->lock);
498 mutex_init(&kvm->irq_lock); 498 mutex_init(&kvm->irq_lock);
499 mutex_init(&kvm->slots_lock); 499 mutex_init(&kvm->slots_lock);
500 atomic_set(&kvm->users_count, 1); 500 atomic_set(&kvm->users_count, 1);
501 501
502 r = kvm_init_mmu_notifier(kvm); 502 r = kvm_init_mmu_notifier(kvm);
503 if (r) 503 if (r)
504 goto out_err; 504 goto out_err;
505 505
506 raw_spin_lock(&kvm_lock); 506 raw_spin_lock(&kvm_lock);
507 list_add(&kvm->vm_list, &vm_list); 507 list_add(&kvm->vm_list, &vm_list);
508 raw_spin_unlock(&kvm_lock); 508 raw_spin_unlock(&kvm_lock);
509 509
510 return kvm; 510 return kvm;
511 511
512 out_err: 512 out_err:
513 cleanup_srcu_struct(&kvm->srcu); 513 cleanup_srcu_struct(&kvm->srcu);
514 out_err_nosrcu: 514 out_err_nosrcu:
515 hardware_disable_all(); 515 hardware_disable_all();
516 out_err_nodisable: 516 out_err_nodisable:
517 for (i = 0; i < KVM_NR_BUSES; i++) 517 for (i = 0; i < KVM_NR_BUSES; i++)
518 kfree(kvm->buses[i]); 518 kfree(kvm->buses[i]);
519 kfree(kvm->memslots); 519 kfree(kvm->memslots);
520 kvm_arch_free_vm(kvm); 520 kvm_arch_free_vm(kvm);
521 return ERR_PTR(r); 521 return ERR_PTR(r);
522 } 522 }
523 523
524 /* 524 /*
525 * Avoid using vmalloc for a small buffer. 525 * Avoid using vmalloc for a small buffer.
526 * Should not be used when the size is statically known. 526 * Should not be used when the size is statically known.
527 */ 527 */
528 void *kvm_kvzalloc(unsigned long size) 528 void *kvm_kvzalloc(unsigned long size)
529 { 529 {
530 if (size > PAGE_SIZE) 530 if (size > PAGE_SIZE)
531 return vzalloc(size); 531 return vzalloc(size);
532 else 532 else
533 return kzalloc(size, GFP_KERNEL); 533 return kzalloc(size, GFP_KERNEL);
534 } 534 }
535 535
536 void kvm_kvfree(const void *addr) 536 void kvm_kvfree(const void *addr)
537 { 537 {
538 if (is_vmalloc_addr(addr)) 538 if (is_vmalloc_addr(addr))
539 vfree(addr); 539 vfree(addr);
540 else 540 else
541 kfree(addr); 541 kfree(addr);
542 } 542 }
543 543
544 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 544 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
545 { 545 {
546 if (!memslot->dirty_bitmap) 546 if (!memslot->dirty_bitmap)
547 return; 547 return;
548 548
549 kvm_kvfree(memslot->dirty_bitmap); 549 kvm_kvfree(memslot->dirty_bitmap);
550 memslot->dirty_bitmap = NULL; 550 memslot->dirty_bitmap = NULL;
551 } 551 }
552 552
553 /* 553 /*
554 * Free any memory in @free but not in @dont. 554 * Free any memory in @free but not in @dont.
555 */ 555 */
556 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 556 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
557 struct kvm_memory_slot *dont) 557 struct kvm_memory_slot *dont)
558 { 558 {
559 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 559 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
560 kvm_destroy_dirty_bitmap(free); 560 kvm_destroy_dirty_bitmap(free);
561 561
562 kvm_arch_free_memslot(free, dont); 562 kvm_arch_free_memslot(free, dont);
563 563
564 free->npages = 0; 564 free->npages = 0;
565 } 565 }
566 566
567 void kvm_free_physmem(struct kvm *kvm) 567 void kvm_free_physmem(struct kvm *kvm)
568 { 568 {
569 struct kvm_memslots *slots = kvm->memslots; 569 struct kvm_memslots *slots = kvm->memslots;
570 struct kvm_memory_slot *memslot; 570 struct kvm_memory_slot *memslot;
571 571
572 kvm_for_each_memslot(memslot, slots) 572 kvm_for_each_memslot(memslot, slots)
573 kvm_free_physmem_slot(memslot, NULL); 573 kvm_free_physmem_slot(memslot, NULL);
574 574
575 kfree(kvm->memslots); 575 kfree(kvm->memslots);
576 } 576 }
577 577
578 static void kvm_destroy_vm(struct kvm *kvm) 578 static void kvm_destroy_vm(struct kvm *kvm)
579 { 579 {
580 int i; 580 int i;
581 struct mm_struct *mm = kvm->mm; 581 struct mm_struct *mm = kvm->mm;
582 582
583 kvm_arch_sync_events(kvm); 583 kvm_arch_sync_events(kvm);
584 raw_spin_lock(&kvm_lock); 584 raw_spin_lock(&kvm_lock);
585 list_del(&kvm->vm_list); 585 list_del(&kvm->vm_list);
586 raw_spin_unlock(&kvm_lock); 586 raw_spin_unlock(&kvm_lock);
587 kvm_free_irq_routing(kvm); 587 kvm_free_irq_routing(kvm);
588 for (i = 0; i < KVM_NR_BUSES; i++) 588 for (i = 0; i < KVM_NR_BUSES; i++)
589 kvm_io_bus_destroy(kvm->buses[i]); 589 kvm_io_bus_destroy(kvm->buses[i]);
590 kvm_coalesced_mmio_free(kvm); 590 kvm_coalesced_mmio_free(kvm);
591 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 591 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
592 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 592 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
593 #else 593 #else
594 kvm_arch_flush_shadow_all(kvm); 594 kvm_arch_flush_shadow_all(kvm);
595 #endif 595 #endif
596 kvm_arch_destroy_vm(kvm); 596 kvm_arch_destroy_vm(kvm);
597 kvm_free_physmem(kvm); 597 kvm_free_physmem(kvm);
598 cleanup_srcu_struct(&kvm->srcu); 598 cleanup_srcu_struct(&kvm->srcu);
599 kvm_arch_free_vm(kvm); 599 kvm_arch_free_vm(kvm);
600 hardware_disable_all(); 600 hardware_disable_all();
601 mmdrop(mm); 601 mmdrop(mm);
602 } 602 }
603 603
604 void kvm_get_kvm(struct kvm *kvm) 604 void kvm_get_kvm(struct kvm *kvm)
605 { 605 {
606 atomic_inc(&kvm->users_count); 606 atomic_inc(&kvm->users_count);
607 } 607 }
608 EXPORT_SYMBOL_GPL(kvm_get_kvm); 608 EXPORT_SYMBOL_GPL(kvm_get_kvm);
609 609
610 void kvm_put_kvm(struct kvm *kvm) 610 void kvm_put_kvm(struct kvm *kvm)
611 { 611 {
612 if (atomic_dec_and_test(&kvm->users_count)) 612 if (atomic_dec_and_test(&kvm->users_count))
613 kvm_destroy_vm(kvm); 613 kvm_destroy_vm(kvm);
614 } 614 }
615 EXPORT_SYMBOL_GPL(kvm_put_kvm); 615 EXPORT_SYMBOL_GPL(kvm_put_kvm);
616 616
617 617
618 static int kvm_vm_release(struct inode *inode, struct file *filp) 618 static int kvm_vm_release(struct inode *inode, struct file *filp)
619 { 619 {
620 struct kvm *kvm = filp->private_data; 620 struct kvm *kvm = filp->private_data;
621 621
622 kvm_irqfd_release(kvm); 622 kvm_irqfd_release(kvm);
623 623
624 kvm_put_kvm(kvm); 624 kvm_put_kvm(kvm);
625 return 0; 625 return 0;
626 } 626 }
627 627
628 /* 628 /*
629 * Allocation size is twice as large as the actual dirty bitmap size. 629 * Allocation size is twice as large as the actual dirty bitmap size.
630 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 630 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
631 */ 631 */
632 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 632 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
633 { 633 {
634 #ifndef CONFIG_S390 634 #ifndef CONFIG_S390
635 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 635 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
636 636
637 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 637 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
638 if (!memslot->dirty_bitmap) 638 if (!memslot->dirty_bitmap)
639 return -ENOMEM; 639 return -ENOMEM;
640 640
641 #endif /* !CONFIG_S390 */ 641 #endif /* !CONFIG_S390 */
642 return 0; 642 return 0;
643 } 643 }
644 644
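On the doubling noted above: one half holds the live dirty bits and, at least in x86's kvm_vm_ioctl_get_dirty_log(), the second half appears to serve as a scratch snapshot while bits are fetched and cleared under mmu_lock; that reading is an inference from that function, not something stated here. A user-space sketch of the sizing arithmetic only:

/* Sketch of the sizing rule: one bit per page, rounded up to whole
 * unsigned longs, with the allocation doubled as the comment above notes. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
    unsigned long longs = (npages + BITS_PER_LONG - 1) / BITS_PER_LONG;
    return longs * sizeof(unsigned long);
}

int main(void)
{
    unsigned long npages = 1000;                  /* arbitrary example slot */
    unsigned long bytes = dirty_bitmap_bytes(npages);

    printf("bitmap: %lu bytes, allocation: %lu bytes\n", bytes, 2 * bytes);
    return 0;
}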
645 static int cmp_memslot(const void *slot1, const void *slot2) 645 static int cmp_memslot(const void *slot1, const void *slot2)
646 { 646 {
647 struct kvm_memory_slot *s1, *s2; 647 struct kvm_memory_slot *s1, *s2;
648 648
649 s1 = (struct kvm_memory_slot *)slot1; 649 s1 = (struct kvm_memory_slot *)slot1;
650 s2 = (struct kvm_memory_slot *)slot2; 650 s2 = (struct kvm_memory_slot *)slot2;
651 651
652 if (s1->npages < s2->npages) 652 if (s1->npages < s2->npages)
653 return 1; 653 return 1;
654 if (s1->npages > s2->npages) 654 if (s1->npages > s2->npages)
655 return -1; 655 return -1;
656 656
657 return 0; 657 return 0;
658 } 658 }
659 659
660 /* 660 /*
661 * Sort the memslots base on its size, so the larger slots 661 * Sort the memslots base on its size, so the larger slots
662 * will get better fit. 662 * will get better fit.
663 */ 663 */
664 static void sort_memslots(struct kvm_memslots *slots) 664 static void sort_memslots(struct kvm_memslots *slots)
665 { 665 {
666 int i; 666 int i;
667 667
668 sort(slots->memslots, KVM_MEM_SLOTS_NUM, 668 sort(slots->memslots, KVM_MEM_SLOTS_NUM,
669 sizeof(struct kvm_memory_slot), cmp_memslot, NULL); 669 sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
670 670
671 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 671 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
672 slots->id_to_index[slots->memslots[i].id] = i; 672 slots->id_to_index[slots->memslots[i].id] = i;
673 } 673 }
674 674
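The sort above keeps the largest slots first so linear lookups over the slot array tend to hit the big RAM slot early. A user-space sketch of the same idea with qsort() and a toy id-to-index map; the structures are simplified and not the kernel layout.

/* Hedged sketch of the sort_memslots() idea: order by size, largest first,
 * then rebuild the id -> array index map. */
#include <stdio.h>
#include <stdlib.h>

struct toy_slot {
    int id;
    unsigned long npages;
};

/* Descending by npages, mirroring cmp_memslot() above. */
static int cmp_toy_slot(const void *a, const void *b)
{
    const struct toy_slot *s1 = a, *s2 = b;

    if (s1->npages < s2->npages)
        return 1;
    if (s1->npages > s2->npages)
        return -1;
    return 0;
}

int main(void)
{
    struct toy_slot slots[] = {
        { .id = 0, .npages = 256 },
        { .id = 1, .npages = 1 << 20 },   /* the big RAM slot */
        { .id = 2, .npages = 16 },
    };
    int id_to_index[3];
    size_t n = sizeof(slots) / sizeof(slots[0]);

    qsort(slots, n, sizeof(slots[0]), cmp_toy_slot);

    /* Rebuild the id -> array index map, as sort_memslots() does. */
    for (size_t i = 0; i < n; i++)
        id_to_index[slots[i].id] = (int)i;

    for (size_t i = 0; i < n; i++)
        printf("index %zu: slot id %d (%lu pages)\n", i, slots[i].id, slots[i].npages);
    printf("id 1 now lives at index %d\n", id_to_index[1]);
    return 0;
}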
675 void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new, 675 void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
676 u64 last_generation) 676 u64 last_generation)
677 { 677 {
678 if (new) { 678 if (new) {
679 int id = new->id; 679 int id = new->id;
680 struct kvm_memory_slot *old = id_to_memslot(slots, id); 680 struct kvm_memory_slot *old = id_to_memslot(slots, id);
681 unsigned long npages = old->npages; 681 unsigned long npages = old->npages;
682 682
683 *old = *new; 683 *old = *new;
684 if (new->npages != npages) 684 if (new->npages != npages)
685 sort_memslots(slots); 685 sort_memslots(slots);
686 } 686 }
687 687
688 slots->generation = last_generation + 1; 688 slots->generation = last_generation + 1;
689 } 689 }
690 690
691 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 691 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
692 { 692 {
693 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 693 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
694 694
695 #ifdef KVM_CAP_READONLY_MEM 695 #ifdef KVM_CAP_READONLY_MEM
696 valid_flags |= KVM_MEM_READONLY; 696 valid_flags |= KVM_MEM_READONLY;
697 #endif 697 #endif
698 698
699 if (mem->flags & ~valid_flags) 699 if (mem->flags & ~valid_flags)
700 return -EINVAL; 700 return -EINVAL;
701 701
702 return 0; 702 return 0;
703 } 703 }
704 704
705 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 705 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
706 struct kvm_memslots *slots, struct kvm_memory_slot *new) 706 struct kvm_memslots *slots, struct kvm_memory_slot *new)
707 { 707 {
708 struct kvm_memslots *old_memslots = kvm->memslots; 708 struct kvm_memslots *old_memslots = kvm->memslots;
709 709
710 update_memslots(slots, new, kvm->memslots->generation); 710 update_memslots(slots, new, kvm->memslots->generation);
711 rcu_assign_pointer(kvm->memslots, slots); 711 rcu_assign_pointer(kvm->memslots, slots);
712 synchronize_srcu_expedited(&kvm->srcu); 712 synchronize_srcu_expedited(&kvm->srcu);
713 return old_memslots; 713 return old_memslots;
714 } 714 }
715 715
716 /* 716 /*
717 * Allocate some memory and give it an address in the guest physical address 717 * Allocate some memory and give it an address in the guest physical address
718 * space. 718 * space.
719 * 719 *
720 * Discontiguous memory is allowed, mostly for framebuffers. 720 * Discontiguous memory is allowed, mostly for framebuffers.
721 * 721 *
722 * Must be called holding mmap_sem for write. 722 * Must be called holding mmap_sem for write.
723 */ 723 */
724 int __kvm_set_memory_region(struct kvm *kvm, 724 int __kvm_set_memory_region(struct kvm *kvm,
725 struct kvm_userspace_memory_region *mem, 725 struct kvm_userspace_memory_region *mem,
726 bool user_alloc) 726 bool user_alloc)
727 { 727 {
728 int r; 728 int r;
729 gfn_t base_gfn; 729 gfn_t base_gfn;
730 unsigned long npages; 730 unsigned long npages;
731 struct kvm_memory_slot *memslot, *slot; 731 struct kvm_memory_slot *memslot, *slot;
732 struct kvm_memory_slot old, new; 732 struct kvm_memory_slot old, new;
733 struct kvm_memslots *slots = NULL, *old_memslots; 733 struct kvm_memslots *slots = NULL, *old_memslots;
734 734
735 r = check_memory_region_flags(mem); 735 r = check_memory_region_flags(mem);
736 if (r) 736 if (r)
737 goto out; 737 goto out;
738 738
739 r = -EINVAL; 739 r = -EINVAL;
740 /* General sanity checks */ 740 /* General sanity checks */
741 if (mem->memory_size & (PAGE_SIZE - 1)) 741 if (mem->memory_size & (PAGE_SIZE - 1))
742 goto out; 742 goto out;
743 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 743 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
744 goto out; 744 goto out;
745 /* We can read the guest memory with __xxx_user() later on. */ 745 /* We can read the guest memory with __xxx_user() later on. */
746 if (user_alloc && 746 if (user_alloc &&
747 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 747 ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
748 !access_ok(VERIFY_WRITE, 748 !access_ok(VERIFY_WRITE,
749 (void __user *)(unsigned long)mem->userspace_addr, 749 (void __user *)(unsigned long)mem->userspace_addr,
750 mem->memory_size))) 750 mem->memory_size)))
751 goto out; 751 goto out;
752 if (mem->slot >= KVM_MEM_SLOTS_NUM) 752 if (mem->slot >= KVM_MEM_SLOTS_NUM)
753 goto out; 753 goto out;
754 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 754 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
755 goto out; 755 goto out;
756 756
757 memslot = id_to_memslot(kvm->memslots, mem->slot); 757 memslot = id_to_memslot(kvm->memslots, mem->slot);
758 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 758 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
759 npages = mem->memory_size >> PAGE_SHIFT; 759 npages = mem->memory_size >> PAGE_SHIFT;
760 760
761 r = -EINVAL; 761 r = -EINVAL;
762 if (npages > KVM_MEM_MAX_NR_PAGES) 762 if (npages > KVM_MEM_MAX_NR_PAGES)
763 goto out; 763 goto out;
764 764
765 if (!npages) 765 if (!npages)
766 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 766 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
767 767
768 new = old = *memslot; 768 new = old = *memslot;
769 769
770 new.id = mem->slot; 770 new.id = mem->slot;
771 new.base_gfn = base_gfn; 771 new.base_gfn = base_gfn;
772 new.npages = npages; 772 new.npages = npages;
773 new.flags = mem->flags; 773 new.flags = mem->flags;
774 774
775 /* 775 /*
776 * Disallow changing a memory slot's size or changing anything about 776 * Disallow changing a memory slot's size or changing anything about
777 * zero sized slots that doesn't involve making them non-zero. 777 * zero sized slots that doesn't involve making them non-zero.
778 */ 778 */
779 r = -EINVAL; 779 r = -EINVAL;
780 if (npages && old.npages && npages != old.npages) 780 if (npages && old.npages && npages != old.npages)
781 goto out_free; 781 goto out_free;
782 if (!npages && !old.npages) 782 if (!npages && !old.npages)
783 goto out_free; 783 goto out_free;
784 784
785 /* Check for overlaps */ 785 /* Check for overlaps */
786 r = -EEXIST; 786 r = -EEXIST;
787 kvm_for_each_memslot(slot, kvm->memslots) { 787 kvm_for_each_memslot(slot, kvm->memslots) {
788 if (slot->id >= KVM_USER_MEM_SLOTS || slot == memslot) 788 if (slot->id >= KVM_USER_MEM_SLOTS || slot == memslot)
789 continue; 789 continue;
790 if (!((base_gfn + npages <= slot->base_gfn) || 790 if (!((base_gfn + npages <= slot->base_gfn) ||
791 (base_gfn >= slot->base_gfn + slot->npages))) 791 (base_gfn >= slot->base_gfn + slot->npages)))
792 goto out_free; 792 goto out_free;
793 } 793 }
794 794
795 /* Free page dirty bitmap if unneeded */ 795 /* Free page dirty bitmap if unneeded */
796 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 796 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
797 new.dirty_bitmap = NULL; 797 new.dirty_bitmap = NULL;
798 798
799 r = -ENOMEM; 799 r = -ENOMEM;
800 800
801 /* 801 /*
802 * Allocate if a slot is being created. If modifying a slot, 802 * Allocate if a slot is being created. If modifying a slot,
803 * the userspace_addr cannot change. 803 * the userspace_addr cannot change.
804 */ 804 */
805 if (!old.npages) { 805 if (!old.npages) {
806 new.user_alloc = user_alloc; 806 new.user_alloc = user_alloc;
807 new.userspace_addr = mem->userspace_addr; 807 new.userspace_addr = mem->userspace_addr;
808 808
809 if (kvm_arch_create_memslot(&new, npages)) 809 if (kvm_arch_create_memslot(&new, npages))
810 goto out_free; 810 goto out_free;
811 } else if (npages && mem->userspace_addr != old.userspace_addr) { 811 } else if (npages && mem->userspace_addr != old.userspace_addr) {
812 r = -EINVAL; 812 r = -EINVAL;
813 goto out_free; 813 goto out_free;
814 } 814 }
815 815
816 /* Allocate page dirty bitmap if needed */ 816 /* Allocate page dirty bitmap if needed */
817 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 817 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
818 if (kvm_create_dirty_bitmap(&new) < 0) 818 if (kvm_create_dirty_bitmap(&new) < 0)
819 goto out_free; 819 goto out_free;
820 /* destroy any largepage mappings for dirty tracking */
821 } 820 }
822 821
823 if (!npages || base_gfn != old.base_gfn) { 822 if (!npages || base_gfn != old.base_gfn) {
824 struct kvm_memory_slot *slot; 823 struct kvm_memory_slot *slot;
825 824
826 r = -ENOMEM; 825 r = -ENOMEM;
827 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 826 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
828 GFP_KERNEL); 827 GFP_KERNEL);
829 if (!slots) 828 if (!slots)
830 goto out_free; 829 goto out_free;
831 slot = id_to_memslot(slots, mem->slot); 830 slot = id_to_memslot(slots, mem->slot);
832 slot->flags |= KVM_MEMSLOT_INVALID; 831 slot->flags |= KVM_MEMSLOT_INVALID;
833 832
834 old_memslots = install_new_memslots(kvm, slots, NULL); 833 old_memslots = install_new_memslots(kvm, slots, NULL);
835 834
836 /* slot was deleted or moved, clear iommu mapping */ 835 /* slot was deleted or moved, clear iommu mapping */
837 kvm_iommu_unmap_pages(kvm, &old); 836 kvm_iommu_unmap_pages(kvm, &old);
838 /* From this point no new shadow pages pointing to a deleted, 837 /* From this point no new shadow pages pointing to a deleted,
839 * or moved, memslot will be created. 838 * or moved, memslot will be created.
840 * 839 *
841 * validation of sp->gfn happens in: 840 * validation of sp->gfn happens in:
842 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 841 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
843 * - kvm_is_visible_gfn (mmu_check_roots) 842 * - kvm_is_visible_gfn (mmu_check_roots)
844 */ 843 */
845 kvm_arch_flush_shadow_memslot(kvm, slot); 844 kvm_arch_flush_shadow_memslot(kvm, slot);
846 slots = old_memslots; 845 slots = old_memslots;
847 } 846 }
848 847
849 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 848 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
850 if (r) 849 if (r)
851 goto out_slots; 850 goto out_slots;
852 851
853 r = -ENOMEM; 852 r = -ENOMEM;
854 /* 853 /*
855 * We can re-use the old_memslots from above, the only difference 854 * We can re-use the old_memslots from above, the only difference
856 * from the currently installed memslots is the invalid flag. This 855 * from the currently installed memslots is the invalid flag. This
857 * will get overwritten by update_memslots anyway. 856 * will get overwritten by update_memslots anyway.
858 */ 857 */
859 if (!slots) { 858 if (!slots) {
860 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 859 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
861 GFP_KERNEL); 860 GFP_KERNEL);
862 if (!slots) 861 if (!slots)
863 goto out_free; 862 goto out_free;
864 } 863 }
865 864
866 /* map new memory slot into the iommu */ 865 /* map new memory slot into the iommu */
867 if (npages) { 866 if (npages) {
868 r = kvm_iommu_map_pages(kvm, &new); 867 r = kvm_iommu_map_pages(kvm, &new);
869 if (r) 868 if (r)
870 goto out_slots; 869 goto out_slots;
871 } 870 }
872 871
873 /* actual memory is freed via old in kvm_free_physmem_slot below */ 872 /* actual memory is freed via old in kvm_free_physmem_slot below */
874 if (!npages) { 873 if (!npages) {
875 new.dirty_bitmap = NULL; 874 new.dirty_bitmap = NULL;
876 memset(&new.arch, 0, sizeof(new.arch)); 875 memset(&new.arch, 0, sizeof(new.arch));
877 } 876 }
878 877
879 old_memslots = install_new_memslots(kvm, slots, &new); 878 old_memslots = install_new_memslots(kvm, slots, &new);
880 879
881 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 880 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
882 881
883 kvm_free_physmem_slot(&old, &new); 882 kvm_free_physmem_slot(&old, &new);
884 kfree(old_memslots); 883 kfree(old_memslots);
885 884
886 return 0; 885 return 0;
887 886
888 out_slots: 887 out_slots:
889 kfree(slots); 888 kfree(slots);
890 out_free: 889 out_free:
891 kvm_free_physmem_slot(&new, &old); 890 kvm_free_physmem_slot(&new, &old);
892 out: 891 out:
893 return r; 892 return r;
894 893
895 } 894 }
896 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 895 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
897 896
898 int kvm_set_memory_region(struct kvm *kvm, 897 int kvm_set_memory_region(struct kvm *kvm,
899 struct kvm_userspace_memory_region *mem, 898 struct kvm_userspace_memory_region *mem,
900 bool user_alloc) 899 bool user_alloc)
901 { 900 {
902 int r; 901 int r;
903 902
904 mutex_lock(&kvm->slots_lock); 903 mutex_lock(&kvm->slots_lock);
905 r = __kvm_set_memory_region(kvm, mem, user_alloc); 904 r = __kvm_set_memory_region(kvm, mem, user_alloc);
906 mutex_unlock(&kvm->slots_lock); 905 mutex_unlock(&kvm->slots_lock);
907 return r; 906 return r;
908 } 907 }
909 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 908 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
910 909
911 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 910 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
912 struct 911 struct
913 kvm_userspace_memory_region *mem, 912 kvm_userspace_memory_region *mem,
914 bool user_alloc) 913 bool user_alloc)
915 { 914 {
916 if (mem->slot >= KVM_USER_MEM_SLOTS) 915 if (mem->slot >= KVM_USER_MEM_SLOTS)
917 return -EINVAL; 916 return -EINVAL;
918 return kvm_set_memory_region(kvm, mem, user_alloc); 917 return kvm_set_memory_region(kvm, mem, user_alloc);
919 } 918 }
920 919
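The checks above are what a userspace caller of the KVM_SET_USER_MEMORY_REGION ioctl runs into: slot number, page alignment, overlap, and the dirty-logging flag. As a minimal, hedged userspace sketch (not part of this commit; vm_fd is assumed to come from KVM_CREATE_VM and error handling is trimmed), registering one anonymous 2 MiB region with dirty logging enabled looks like this:

/* Register slot 0 with dirty logging; deleting it later is the
 * memory_size == 0 path handled above. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>

static int set_slot(int vm_fd)
{
        size_t size = 2 << 20;
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot            = 0,
                .flags           = KVM_MEM_LOG_DIRTY_PAGES,
                .guest_phys_addr = 0x100000,       /* page aligned */
                .memory_size     = size,           /* page aligned */
                .userspace_addr  = (uintptr_t)mem, /* page aligned */
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
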
921 int kvm_get_dirty_log(struct kvm *kvm, 920 int kvm_get_dirty_log(struct kvm *kvm,
922 struct kvm_dirty_log *log, int *is_dirty) 921 struct kvm_dirty_log *log, int *is_dirty)
923 { 922 {
924 struct kvm_memory_slot *memslot; 923 struct kvm_memory_slot *memslot;
925 int r, i; 924 int r, i;
926 unsigned long n; 925 unsigned long n;
927 unsigned long any = 0; 926 unsigned long any = 0;
928 927
929 r = -EINVAL; 928 r = -EINVAL;
930 if (log->slot >= KVM_USER_MEM_SLOTS) 929 if (log->slot >= KVM_USER_MEM_SLOTS)
931 goto out; 930 goto out;
932 931
933 memslot = id_to_memslot(kvm->memslots, log->slot); 932 memslot = id_to_memslot(kvm->memslots, log->slot);
934 r = -ENOENT; 933 r = -ENOENT;
935 if (!memslot->dirty_bitmap) 934 if (!memslot->dirty_bitmap)
936 goto out; 935 goto out;
937 936
938 n = kvm_dirty_bitmap_bytes(memslot); 937 n = kvm_dirty_bitmap_bytes(memslot);
939 938
940 for (i = 0; !any && i < n/sizeof(long); ++i) 939 for (i = 0; !any && i < n/sizeof(long); ++i)
941 any = memslot->dirty_bitmap[i]; 940 any = memslot->dirty_bitmap[i];
942 941
943 r = -EFAULT; 942 r = -EFAULT;
944 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 943 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
945 goto out; 944 goto out;
946 945
947 if (any) 946 if (any)
948 *is_dirty = 1; 947 *is_dirty = 1;
949 948
950 r = 0; 949 r = 0;
951 out: 950 out:
952 return r; 951 return r;
953 } 952 }
954 953
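kvm_get_dirty_log() above is the generic helper behind the KVM_GET_DIRTY_LOG ioctl: the caller supplies a buffer of at least kvm_dirty_bitmap_bytes() bytes, one bit per page of the slot, and KVM sets bits with set_bit_le() (see mark_page_dirty_in_slot() later in this file), so the scan in the following hedged userspace sketch assumes a little-endian host; error handling is trimmed:

/* Fetch and print the dirty pages of slot 0, which covers npages pages. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_dirty(int vm_fd, uint64_t npages)
{
        size_t words = (npages + 63) / 64;      /* round up to whole u64s */
        uint64_t *bitmap = calloc(words, sizeof(*bitmap));
        struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };

        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) == 0)
                for (uint64_t i = 0; i < npages; i++)
                        if (bitmap[i / 64] & (1ULL << (i % 64)))
                                printf("page %llu of the slot is dirty\n",
                                       (unsigned long long)i);
        free(bitmap);
}
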
955 bool kvm_largepages_enabled(void) 954 bool kvm_largepages_enabled(void)
956 { 955 {
957 return largepages_enabled; 956 return largepages_enabled;
958 } 957 }
959 958
960 void kvm_disable_largepages(void) 959 void kvm_disable_largepages(void)
961 { 960 {
962 largepages_enabled = false; 961 largepages_enabled = false;
963 } 962 }
964 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 963 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
965 964
966 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 965 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
967 { 966 {
968 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 967 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
969 } 968 }
970 EXPORT_SYMBOL_GPL(gfn_to_memslot); 969 EXPORT_SYMBOL_GPL(gfn_to_memslot);
971 970
972 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 971 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
973 { 972 {
974 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 973 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
975 974
976 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 975 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
977 memslot->flags & KVM_MEMSLOT_INVALID) 976 memslot->flags & KVM_MEMSLOT_INVALID)
978 return 0; 977 return 0;
979 978
980 return 1; 979 return 1;
981 } 980 }
982 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 981 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
983 982
984 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 983 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
985 { 984 {
986 struct vm_area_struct *vma; 985 struct vm_area_struct *vma;
987 unsigned long addr, size; 986 unsigned long addr, size;
988 987
989 size = PAGE_SIZE; 988 size = PAGE_SIZE;
990 989
991 addr = gfn_to_hva(kvm, gfn); 990 addr = gfn_to_hva(kvm, gfn);
992 if (kvm_is_error_hva(addr)) 991 if (kvm_is_error_hva(addr))
993 return PAGE_SIZE; 992 return PAGE_SIZE;
994 993
995 down_read(&current->mm->mmap_sem); 994 down_read(&current->mm->mmap_sem);
996 vma = find_vma(current->mm, addr); 995 vma = find_vma(current->mm, addr);
997 if (!vma) 996 if (!vma)
998 goto out; 997 goto out;
999 998
1000 size = vma_kernel_pagesize(vma); 999 size = vma_kernel_pagesize(vma);
1001 1000
1002 out: 1001 out:
1003 up_read(&current->mm->mmap_sem); 1002 up_read(&current->mm->mmap_sem);
1004 1003
1005 return size; 1004 return size;
1006 } 1005 }
1007 1006
1008 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1007 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1009 { 1008 {
1010 return slot->flags & KVM_MEM_READONLY; 1009 return slot->flags & KVM_MEM_READONLY;
1011 } 1010 }
1012 1011
1013 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1012 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1014 gfn_t *nr_pages, bool write) 1013 gfn_t *nr_pages, bool write)
1015 { 1014 {
1016 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1015 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1017 return KVM_HVA_ERR_BAD; 1016 return KVM_HVA_ERR_BAD;
1018 1017
1019 if (memslot_is_readonly(slot) && write) 1018 if (memslot_is_readonly(slot) && write)
1020 return KVM_HVA_ERR_RO_BAD; 1019 return KVM_HVA_ERR_RO_BAD;
1021 1020
1022 if (nr_pages) 1021 if (nr_pages)
1023 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1022 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1024 1023
1025 return __gfn_to_hva_memslot(slot, gfn); 1024 return __gfn_to_hva_memslot(slot, gfn);
1026 } 1025 }
1027 1026
1028 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1027 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1029 gfn_t *nr_pages) 1028 gfn_t *nr_pages)
1030 { 1029 {
1031 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1030 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1032 } 1031 }
1033 1032
1034 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1033 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1035 gfn_t gfn) 1034 gfn_t gfn)
1036 { 1035 {
1037 return gfn_to_hva_many(slot, gfn, NULL); 1036 return gfn_to_hva_many(slot, gfn, NULL);
1038 } 1037 }
1039 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1038 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1040 1039
1041 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1040 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1042 { 1041 {
1043 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1042 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1044 } 1043 }
1045 EXPORT_SYMBOL_GPL(gfn_to_hva); 1044 EXPORT_SYMBOL_GPL(gfn_to_hva);
1046 1045
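The translation behind gfn_to_hva() and gfn_to_hva_many() is plain offset arithmetic within the memslot; __gfn_to_hva_memslot() (a helper defined in kvm_host.h, not shown in this hunk) boils down to the following sketch:

/* gfn -> hva within one slot: offset the slot's userspace mapping by the
 * page distance from the slot's first gfn. */
static unsigned long example_gfn_to_hva(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
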
1047 /* 1046 /*
1048 * The hva returned by this function is only allowed to be read. 1047 * The hva returned by this function is only allowed to be read.
1049 * It should pair with kvm_read_hva() or kvm_read_hva_atomic(). 1048 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1050 */ 1049 */
1051 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) 1050 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1052 { 1051 {
1053 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); 1052 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
1054 } 1053 }
1055 1054
1056 static int kvm_read_hva(void *data, void __user *hva, int len) 1055 static int kvm_read_hva(void *data, void __user *hva, int len)
1057 { 1056 {
1058 return __copy_from_user(data, hva, len); 1057 return __copy_from_user(data, hva, len);
1059 } 1058 }
1060 1059
1061 static int kvm_read_hva_atomic(void *data, void __user *hva, int len) 1060 static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1062 { 1061 {
1063 return __copy_from_user_inatomic(data, hva, len); 1062 return __copy_from_user_inatomic(data, hva, len);
1064 } 1063 }
1065 1064
1066 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1065 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1067 unsigned long start, int write, struct page **page) 1066 unsigned long start, int write, struct page **page)
1068 { 1067 {
1069 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1068 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1070 1069
1071 if (write) 1070 if (write)
1072 flags |= FOLL_WRITE; 1071 flags |= FOLL_WRITE;
1073 1072
1074 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1073 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1075 } 1074 }
1076 1075
1077 static inline int check_user_page_hwpoison(unsigned long addr) 1076 static inline int check_user_page_hwpoison(unsigned long addr)
1078 { 1077 {
1079 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1078 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1080 1079
1081 rc = __get_user_pages(current, current->mm, addr, 1, 1080 rc = __get_user_pages(current, current->mm, addr, 1,
1082 flags, NULL, NULL, NULL); 1081 flags, NULL, NULL, NULL);
1083 return rc == -EHWPOISON; 1082 return rc == -EHWPOISON;
1084 } 1083 }
1085 1084
1086 /* 1085 /*
1087 * The atomic path to get the writable pfn which will be stored in @pfn, 1086 * The atomic path to get the writable pfn which will be stored in @pfn,
1088 * true indicates success, otherwise false is returned. 1087 * true indicates success, otherwise false is returned.
1089 */ 1088 */
1090 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1089 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1091 bool write_fault, bool *writable, pfn_t *pfn) 1090 bool write_fault, bool *writable, pfn_t *pfn)
1092 { 1091 {
1093 struct page *page[1]; 1092 struct page *page[1];
1094 int npages; 1093 int npages;
1095 1094
1096 if (!(async || atomic)) 1095 if (!(async || atomic))
1097 return false; 1096 return false;
1098 1097
1099 /* 1098 /*
1100 * Fast pin a writable pfn only if it is a write fault request 1099 * Fast pin a writable pfn only if it is a write fault request
1101 * or the caller allows to map a writable pfn for a read fault 1100 * or the caller allows to map a writable pfn for a read fault
1102 * request. 1101 * request.
1103 */ 1102 */
1104 if (!(write_fault || writable)) 1103 if (!(write_fault || writable))
1105 return false; 1104 return false;
1106 1105
1107 npages = __get_user_pages_fast(addr, 1, 1, page); 1106 npages = __get_user_pages_fast(addr, 1, 1, page);
1108 if (npages == 1) { 1107 if (npages == 1) {
1109 *pfn = page_to_pfn(page[0]); 1108 *pfn = page_to_pfn(page[0]);
1110 1109
1111 if (writable) 1110 if (writable)
1112 *writable = true; 1111 *writable = true;
1113 return true; 1112 return true;
1114 } 1113 }
1115 1114
1116 return false; 1115 return false;
1117 } 1116 }
1118 1117
1119 /* 1118 /*
1120 * The slow path to get the pfn of the specified host virtual address, 1119 * The slow path to get the pfn of the specified host virtual address,
1121 * 1 indicates success, -errno is returned if error is detected. 1120 * 1 indicates success, -errno is returned if error is detected.
1122 */ 1121 */
1123 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1122 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1124 bool *writable, pfn_t *pfn) 1123 bool *writable, pfn_t *pfn)
1125 { 1124 {
1126 struct page *page[1]; 1125 struct page *page[1];
1127 int npages = 0; 1126 int npages = 0;
1128 1127
1129 might_sleep(); 1128 might_sleep();
1130 1129
1131 if (writable) 1130 if (writable)
1132 *writable = write_fault; 1131 *writable = write_fault;
1133 1132
1134 if (async) { 1133 if (async) {
1135 down_read(&current->mm->mmap_sem); 1134 down_read(&current->mm->mmap_sem);
1136 npages = get_user_page_nowait(current, current->mm, 1135 npages = get_user_page_nowait(current, current->mm,
1137 addr, write_fault, page); 1136 addr, write_fault, page);
1138 up_read(&current->mm->mmap_sem); 1137 up_read(&current->mm->mmap_sem);
1139 } else 1138 } else
1140 npages = get_user_pages_fast(addr, 1, write_fault, 1139 npages = get_user_pages_fast(addr, 1, write_fault,
1141 page); 1140 page);
1142 if (npages != 1) 1141 if (npages != 1)
1143 return npages; 1142 return npages;
1144 1143
1145 /* map read fault as writable if possible */ 1144 /* map read fault as writable if possible */
1146 if (unlikely(!write_fault) && writable) { 1145 if (unlikely(!write_fault) && writable) {
1147 struct page *wpage[1]; 1146 struct page *wpage[1];
1148 1147
1149 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1148 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1150 if (npages == 1) { 1149 if (npages == 1) {
1151 *writable = true; 1150 *writable = true;
1152 put_page(page[0]); 1151 put_page(page[0]);
1153 page[0] = wpage[0]; 1152 page[0] = wpage[0];
1154 } 1153 }
1155 1154
1156 npages = 1; 1155 npages = 1;
1157 } 1156 }
1158 *pfn = page_to_pfn(page[0]); 1157 *pfn = page_to_pfn(page[0]);
1159 return npages; 1158 return npages;
1160 } 1159 }
1161 1160
1162 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1161 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1163 { 1162 {
1164 if (unlikely(!(vma->vm_flags & VM_READ))) 1163 if (unlikely(!(vma->vm_flags & VM_READ)))
1165 return false; 1164 return false;
1166 1165
1167 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1166 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1168 return false; 1167 return false;
1169 1168
1170 return true; 1169 return true;
1171 } 1170 }
1172 1171
1173 /* 1172 /*
1174 * Pin guest page in memory and return its pfn. 1173 * Pin guest page in memory and return its pfn.
1175 * @addr: host virtual address which maps memory to the guest 1174 * @addr: host virtual address which maps memory to the guest
1176 * @atomic: whether this function can sleep 1175 * @atomic: whether this function can sleep
1177 * @async: whether this function needs to wait for IO to complete if the 1176 * @async: whether this function needs to wait for IO to complete if the
1178 * host page is not in memory 1177 * host page is not in memory
1179 * @write_fault: whether we should get a writable host page 1178 * @write_fault: whether we should get a writable host page
1180 * @writable: whether mapping a writable host page is allowed for !@write_fault 1179 * @writable: whether mapping a writable host page is allowed for !@write_fault
1181 * 1180 *
1182 * The function will map a writable host page for these two cases: 1181 * The function will map a writable host page for these two cases:
1183 * 1): @write_fault = true 1182 * 1): @write_fault = true
1184 * 2): @write_fault = false && @writable, @writable will tell the caller 1183 * 2): @write_fault = false && @writable, @writable will tell the caller
1185 * whether the mapping is writable. 1184 * whether the mapping is writable.
1186 */ 1185 */
1187 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1186 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1188 bool write_fault, bool *writable) 1187 bool write_fault, bool *writable)
1189 { 1188 {
1190 struct vm_area_struct *vma; 1189 struct vm_area_struct *vma;
1191 pfn_t pfn = 0; 1190 pfn_t pfn = 0;
1192 int npages; 1191 int npages;
1193 1192
1194 /* we can do it either atomically or asynchronously, not both */ 1193 /* we can do it either atomically or asynchronously, not both */
1195 BUG_ON(atomic && async); 1194 BUG_ON(atomic && async);
1196 1195
1197 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1196 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1198 return pfn; 1197 return pfn;
1199 1198
1200 if (atomic) 1199 if (atomic)
1201 return KVM_PFN_ERR_FAULT; 1200 return KVM_PFN_ERR_FAULT;
1202 1201
1203 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1202 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1204 if (npages == 1) 1203 if (npages == 1)
1205 return pfn; 1204 return pfn;
1206 1205
1207 down_read(&current->mm->mmap_sem); 1206 down_read(&current->mm->mmap_sem);
1208 if (npages == -EHWPOISON || 1207 if (npages == -EHWPOISON ||
1209 (!async && check_user_page_hwpoison(addr))) { 1208 (!async && check_user_page_hwpoison(addr))) {
1210 pfn = KVM_PFN_ERR_HWPOISON; 1209 pfn = KVM_PFN_ERR_HWPOISON;
1211 goto exit; 1210 goto exit;
1212 } 1211 }
1213 1212
1214 vma = find_vma_intersection(current->mm, addr, addr + 1); 1213 vma = find_vma_intersection(current->mm, addr, addr + 1);
1215 1214
1216 if (vma == NULL) 1215 if (vma == NULL)
1217 pfn = KVM_PFN_ERR_FAULT; 1216 pfn = KVM_PFN_ERR_FAULT;
1218 else if ((vma->vm_flags & VM_PFNMAP)) { 1217 else if ((vma->vm_flags & VM_PFNMAP)) {
1219 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1218 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1220 vma->vm_pgoff; 1219 vma->vm_pgoff;
1221 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1220 BUG_ON(!kvm_is_mmio_pfn(pfn));
1222 } else { 1221 } else {
1223 if (async && vma_is_valid(vma, write_fault)) 1222 if (async && vma_is_valid(vma, write_fault))
1224 *async = true; 1223 *async = true;
1225 pfn = KVM_PFN_ERR_FAULT; 1224 pfn = KVM_PFN_ERR_FAULT;
1226 } 1225 }
1227 exit: 1226 exit:
1228 up_read(&current->mm->mmap_sem); 1227 up_read(&current->mm->mmap_sem);
1229 return pfn; 1228 return pfn;
1230 } 1229 }
1231 1230
1232 static pfn_t 1231 static pfn_t
1233 __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1232 __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1234 bool *async, bool write_fault, bool *writable) 1233 bool *async, bool write_fault, bool *writable)
1235 { 1234 {
1236 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1235 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1237 1236
1238 if (addr == KVM_HVA_ERR_RO_BAD) 1237 if (addr == KVM_HVA_ERR_RO_BAD)
1239 return KVM_PFN_ERR_RO_FAULT; 1238 return KVM_PFN_ERR_RO_FAULT;
1240 1239
1241 if (kvm_is_error_hva(addr)) 1240 if (kvm_is_error_hva(addr))
1242 return KVM_PFN_NOSLOT; 1241 return KVM_PFN_NOSLOT;
1243 1242
1244 /* Do not map writable pfn in the readonly memslot. */ 1243 /* Do not map writable pfn in the readonly memslot. */
1245 if (writable && memslot_is_readonly(slot)) { 1244 if (writable && memslot_is_readonly(slot)) {
1246 *writable = false; 1245 *writable = false;
1247 writable = NULL; 1246 writable = NULL;
1248 } 1247 }
1249 1248
1250 return hva_to_pfn(addr, atomic, async, write_fault, 1249 return hva_to_pfn(addr, atomic, async, write_fault,
1251 writable); 1250 writable);
1252 } 1251 }
1253 1252
1254 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1253 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1255 bool write_fault, bool *writable) 1254 bool write_fault, bool *writable)
1256 { 1255 {
1257 struct kvm_memory_slot *slot; 1256 struct kvm_memory_slot *slot;
1258 1257
1259 if (async) 1258 if (async)
1260 *async = false; 1259 *async = false;
1261 1260
1262 slot = gfn_to_memslot(kvm, gfn); 1261 slot = gfn_to_memslot(kvm, gfn);
1263 1262
1264 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, 1263 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1265 writable); 1264 writable);
1266 } 1265 }
1267 1266
1268 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1267 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1269 { 1268 {
1270 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); 1269 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1271 } 1270 }
1272 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1271 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1273 1272
1274 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 1273 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1275 bool write_fault, bool *writable) 1274 bool write_fault, bool *writable)
1276 { 1275 {
1277 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); 1276 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1278 } 1277 }
1279 EXPORT_SYMBOL_GPL(gfn_to_pfn_async); 1278 EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1280 1279
1281 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1280 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1282 { 1281 {
1283 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); 1282 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1284 } 1283 }
1285 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1284 EXPORT_SYMBOL_GPL(gfn_to_pfn);
1286 1285
1287 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1286 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1288 bool *writable) 1287 bool *writable)
1289 { 1288 {
1290 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); 1289 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1291 } 1290 }
1292 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1291 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1293 1292
1294 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1293 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1295 { 1294 {
1296 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1295 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1297 } 1296 }
1298 1297
1299 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1298 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1300 { 1299 {
1301 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1300 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1302 } 1301 }
1303 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1302 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1304 1303
1305 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1304 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1306 int nr_pages) 1305 int nr_pages)
1307 { 1306 {
1308 unsigned long addr; 1307 unsigned long addr;
1309 gfn_t entry; 1308 gfn_t entry;
1310 1309
1311 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); 1310 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1312 if (kvm_is_error_hva(addr)) 1311 if (kvm_is_error_hva(addr))
1313 return -1; 1312 return -1;
1314 1313
1315 if (entry < nr_pages) 1314 if (entry < nr_pages)
1316 return 0; 1315 return 0;
1317 1316
1318 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1317 return __get_user_pages_fast(addr, nr_pages, 1, pages);
1319 } 1318 }
1320 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1319 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1321 1320
1322 static struct page *kvm_pfn_to_page(pfn_t pfn) 1321 static struct page *kvm_pfn_to_page(pfn_t pfn)
1323 { 1322 {
1324 if (is_error_noslot_pfn(pfn)) 1323 if (is_error_noslot_pfn(pfn))
1325 return KVM_ERR_PTR_BAD_PAGE; 1324 return KVM_ERR_PTR_BAD_PAGE;
1326 1325
1327 if (kvm_is_mmio_pfn(pfn)) { 1326 if (kvm_is_mmio_pfn(pfn)) {
1328 WARN_ON(1); 1327 WARN_ON(1);
1329 return KVM_ERR_PTR_BAD_PAGE; 1328 return KVM_ERR_PTR_BAD_PAGE;
1330 } 1329 }
1331 1330
1332 return pfn_to_page(pfn); 1331 return pfn_to_page(pfn);
1333 } 1332 }
1334 1333
1335 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1334 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1336 { 1335 {
1337 pfn_t pfn; 1336 pfn_t pfn;
1338 1337
1339 pfn = gfn_to_pfn(kvm, gfn); 1338 pfn = gfn_to_pfn(kvm, gfn);
1340 1339
1341 return kvm_pfn_to_page(pfn); 1340 return kvm_pfn_to_page(pfn);
1342 } 1341 }
1343 1342
1344 EXPORT_SYMBOL_GPL(gfn_to_page); 1343 EXPORT_SYMBOL_GPL(gfn_to_page);
1345 1344
1346 void kvm_release_page_clean(struct page *page) 1345 void kvm_release_page_clean(struct page *page)
1347 { 1346 {
1348 WARN_ON(is_error_page(page)); 1347 WARN_ON(is_error_page(page));
1349 1348
1350 kvm_release_pfn_clean(page_to_pfn(page)); 1349 kvm_release_pfn_clean(page_to_pfn(page));
1351 } 1350 }
1352 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1351 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1353 1352
1354 void kvm_release_pfn_clean(pfn_t pfn) 1353 void kvm_release_pfn_clean(pfn_t pfn)
1355 { 1354 {
1356 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1355 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
1357 put_page(pfn_to_page(pfn)); 1356 put_page(pfn_to_page(pfn));
1358 } 1357 }
1359 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1358 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1360 1359
1361 void kvm_release_page_dirty(struct page *page) 1360 void kvm_release_page_dirty(struct page *page)
1362 { 1361 {
1363 WARN_ON(is_error_page(page)); 1362 WARN_ON(is_error_page(page));
1364 1363
1365 kvm_release_pfn_dirty(page_to_pfn(page)); 1364 kvm_release_pfn_dirty(page_to_pfn(page));
1366 } 1365 }
1367 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1366 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1368 1367
1369 void kvm_release_pfn_dirty(pfn_t pfn) 1368 void kvm_release_pfn_dirty(pfn_t pfn)
1370 { 1369 {
1371 kvm_set_pfn_dirty(pfn); 1370 kvm_set_pfn_dirty(pfn);
1372 kvm_release_pfn_clean(pfn); 1371 kvm_release_pfn_clean(pfn);
1373 } 1372 }
1374 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1373 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1375 1374
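The release helpers above pair with the gfn_to_pfn() family: every successful lookup takes a reference on the page, so the caller must drop it, marking the page dirty only if it actually wrote. A hypothetical caller (sketch only, names invented):

/* Pin a guest page, use it, and release it clean or dirty. */
static int touch_guest_page(struct kvm *kvm, gfn_t gfn, bool wrote)
{
        pfn_t pfn = gfn_to_pfn(kvm, gfn);

        if (is_error_noslot_pfn(pfn))
                return -EFAULT;
        /* ... access the page here, e.g. via pfn_to_page()/kmap() ... */
        if (wrote)
                kvm_release_pfn_dirty(pfn);
        else
                kvm_release_pfn_clean(pfn);
        return 0;
}
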
1376 void kvm_set_page_dirty(struct page *page) 1375 void kvm_set_page_dirty(struct page *page)
1377 { 1376 {
1378 kvm_set_pfn_dirty(page_to_pfn(page)); 1377 kvm_set_pfn_dirty(page_to_pfn(page));
1379 } 1378 }
1380 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1379 EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1381 1380
1382 void kvm_set_pfn_dirty(pfn_t pfn) 1381 void kvm_set_pfn_dirty(pfn_t pfn)
1383 { 1382 {
1384 if (!kvm_is_mmio_pfn(pfn)) { 1383 if (!kvm_is_mmio_pfn(pfn)) {
1385 struct page *page = pfn_to_page(pfn); 1384 struct page *page = pfn_to_page(pfn);
1386 if (!PageReserved(page)) 1385 if (!PageReserved(page))
1387 SetPageDirty(page); 1386 SetPageDirty(page);
1388 } 1387 }
1389 } 1388 }
1390 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1389 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1391 1390
1392 void kvm_set_pfn_accessed(pfn_t pfn) 1391 void kvm_set_pfn_accessed(pfn_t pfn)
1393 { 1392 {
1394 if (!kvm_is_mmio_pfn(pfn)) 1393 if (!kvm_is_mmio_pfn(pfn))
1395 mark_page_accessed(pfn_to_page(pfn)); 1394 mark_page_accessed(pfn_to_page(pfn));
1396 } 1395 }
1397 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1396 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1398 1397
1399 void kvm_get_pfn(pfn_t pfn) 1398 void kvm_get_pfn(pfn_t pfn)
1400 { 1399 {
1401 if (!kvm_is_mmio_pfn(pfn)) 1400 if (!kvm_is_mmio_pfn(pfn))
1402 get_page(pfn_to_page(pfn)); 1401 get_page(pfn_to_page(pfn));
1403 } 1402 }
1404 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1403 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1405 1404
1406 static int next_segment(unsigned long len, int offset) 1405 static int next_segment(unsigned long len, int offset)
1407 { 1406 {
1408 if (len > PAGE_SIZE - offset) 1407 if (len > PAGE_SIZE - offset)
1409 return PAGE_SIZE - offset; 1408 return PAGE_SIZE - offset;
1410 else 1409 else
1411 return len; 1410 return len;
1412 } 1411 }
1413 1412
1414 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1413 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1415 int len) 1414 int len)
1416 { 1415 {
1417 int r; 1416 int r;
1418 unsigned long addr; 1417 unsigned long addr;
1419 1418
1420 addr = gfn_to_hva_read(kvm, gfn); 1419 addr = gfn_to_hva_read(kvm, gfn);
1421 if (kvm_is_error_hva(addr)) 1420 if (kvm_is_error_hva(addr))
1422 return -EFAULT; 1421 return -EFAULT;
1423 r = kvm_read_hva(data, (void __user *)addr + offset, len); 1422 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1424 if (r) 1423 if (r)
1425 return -EFAULT; 1424 return -EFAULT;
1426 return 0; 1425 return 0;
1427 } 1426 }
1428 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1427 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1429 1428
1430 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1429 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1431 { 1430 {
1432 gfn_t gfn = gpa >> PAGE_SHIFT; 1431 gfn_t gfn = gpa >> PAGE_SHIFT;
1433 int seg; 1432 int seg;
1434 int offset = offset_in_page(gpa); 1433 int offset = offset_in_page(gpa);
1435 int ret; 1434 int ret;
1436 1435
1437 while ((seg = next_segment(len, offset)) != 0) { 1436 while ((seg = next_segment(len, offset)) != 0) {
1438 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1437 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1439 if (ret < 0) 1438 if (ret < 0)
1440 return ret; 1439 return ret;
1441 offset = 0; 1440 offset = 0;
1442 len -= seg; 1441 len -= seg;
1443 data += seg; 1442 data += seg;
1444 ++gfn; 1443 ++gfn;
1445 } 1444 }
1446 return 0; 1445 return 0;
1447 } 1446 }
1448 EXPORT_SYMBOL_GPL(kvm_read_guest); 1447 EXPORT_SYMBOL_GPL(kvm_read_guest);
1449 1448
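Worked example of the chunking above, assuming 4 KiB pages: a 5000-byte read starting 3000 bytes into a page is split by next_segment() into a 1096-byte segment that finishes the first page, then a 3904-byte segment at offset 0 of the following gfn, after which next_segment() returns 0 and the loop ends.
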
1450 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1449 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1451 unsigned long len) 1450 unsigned long len)
1452 { 1451 {
1453 int r; 1452 int r;
1454 unsigned long addr; 1453 unsigned long addr;
1455 gfn_t gfn = gpa >> PAGE_SHIFT; 1454 gfn_t gfn = gpa >> PAGE_SHIFT;
1456 int offset = offset_in_page(gpa); 1455 int offset = offset_in_page(gpa);
1457 1456
1458 addr = gfn_to_hva_read(kvm, gfn); 1457 addr = gfn_to_hva_read(kvm, gfn);
1459 if (kvm_is_error_hva(addr)) 1458 if (kvm_is_error_hva(addr))
1460 return -EFAULT; 1459 return -EFAULT;
1461 pagefault_disable(); 1460 pagefault_disable();
1462 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); 1461 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1463 pagefault_enable(); 1462 pagefault_enable();
1464 if (r) 1463 if (r)
1465 return -EFAULT; 1464 return -EFAULT;
1466 return 0; 1465 return 0;
1467 } 1466 }
1468 EXPORT_SYMBOL(kvm_read_guest_atomic); 1467 EXPORT_SYMBOL(kvm_read_guest_atomic);
1469 1468
1470 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1469 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1471 int offset, int len) 1470 int offset, int len)
1472 { 1471 {
1473 int r; 1472 int r;
1474 unsigned long addr; 1473 unsigned long addr;
1475 1474
1476 addr = gfn_to_hva(kvm, gfn); 1475 addr = gfn_to_hva(kvm, gfn);
1477 if (kvm_is_error_hva(addr)) 1476 if (kvm_is_error_hva(addr))
1478 return -EFAULT; 1477 return -EFAULT;
1479 r = __copy_to_user((void __user *)addr + offset, data, len); 1478 r = __copy_to_user((void __user *)addr + offset, data, len);
1480 if (r) 1479 if (r)
1481 return -EFAULT; 1480 return -EFAULT;
1482 mark_page_dirty(kvm, gfn); 1481 mark_page_dirty(kvm, gfn);
1483 return 0; 1482 return 0;
1484 } 1483 }
1485 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1484 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1486 1485
1487 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1486 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1488 unsigned long len) 1487 unsigned long len)
1489 { 1488 {
1490 gfn_t gfn = gpa >> PAGE_SHIFT; 1489 gfn_t gfn = gpa >> PAGE_SHIFT;
1491 int seg; 1490 int seg;
1492 int offset = offset_in_page(gpa); 1491 int offset = offset_in_page(gpa);
1493 int ret; 1492 int ret;
1494 1493
1495 while ((seg = next_segment(len, offset)) != 0) { 1494 while ((seg = next_segment(len, offset)) != 0) {
1496 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1495 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1497 if (ret < 0) 1496 if (ret < 0)
1498 return ret; 1497 return ret;
1499 offset = 0; 1498 offset = 0;
1500 len -= seg; 1499 len -= seg;
1501 data += seg; 1500 data += seg;
1502 ++gfn; 1501 ++gfn;
1503 } 1502 }
1504 return 0; 1503 return 0;
1505 } 1504 }
1506 1505
1507 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1506 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1508 gpa_t gpa) 1507 gpa_t gpa)
1509 { 1508 {
1510 struct kvm_memslots *slots = kvm_memslots(kvm); 1509 struct kvm_memslots *slots = kvm_memslots(kvm);
1511 int offset = offset_in_page(gpa); 1510 int offset = offset_in_page(gpa);
1512 gfn_t gfn = gpa >> PAGE_SHIFT; 1511 gfn_t gfn = gpa >> PAGE_SHIFT;
1513 1512
1514 ghc->gpa = gpa; 1513 ghc->gpa = gpa;
1515 ghc->generation = slots->generation; 1514 ghc->generation = slots->generation;
1516 ghc->memslot = gfn_to_memslot(kvm, gfn); 1515 ghc->memslot = gfn_to_memslot(kvm, gfn);
1517 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1516 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1518 if (!kvm_is_error_hva(ghc->hva)) 1517 if (!kvm_is_error_hva(ghc->hva))
1519 ghc->hva += offset; 1518 ghc->hva += offset;
1520 else 1519 else
1521 return -EFAULT; 1520 return -EFAULT;
1522 1521
1523 return 0; 1522 return 0;
1524 } 1523 }
1525 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1524 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1526 1525
1527 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1526 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1528 void *data, unsigned long len) 1527 void *data, unsigned long len)
1529 { 1528 {
1530 struct kvm_memslots *slots = kvm_memslots(kvm); 1529 struct kvm_memslots *slots = kvm_memslots(kvm);
1531 int r; 1530 int r;
1532 1531
1533 if (slots->generation != ghc->generation) 1532 if (slots->generation != ghc->generation)
1534 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1533 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1535 1534
1536 if (kvm_is_error_hva(ghc->hva)) 1535 if (kvm_is_error_hva(ghc->hva))
1537 return -EFAULT; 1536 return -EFAULT;
1538 1537
1539 r = __copy_to_user((void __user *)ghc->hva, data, len); 1538 r = __copy_to_user((void __user *)ghc->hva, data, len);
1540 if (r) 1539 if (r)
1541 return -EFAULT; 1540 return -EFAULT;
1542 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1541 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1543 1542
1544 return 0; 1543 return 0;
1545 } 1544 }
1546 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1545 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1547 1546
1548 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1547 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1549 void *data, unsigned long len) 1548 void *data, unsigned long len)
1550 { 1549 {
1551 struct kvm_memslots *slots = kvm_memslots(kvm); 1550 struct kvm_memslots *slots = kvm_memslots(kvm);
1552 int r; 1551 int r;
1553 1552
1554 if (slots->generation != ghc->generation) 1553 if (slots->generation != ghc->generation)
1555 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1554 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1556 1555
1557 if (kvm_is_error_hva(ghc->hva)) 1556 if (kvm_is_error_hva(ghc->hva))
1558 return -EFAULT; 1557 return -EFAULT;
1559 1558
1560 r = __copy_from_user(data, (void __user *)ghc->hva, len); 1559 r = __copy_from_user(data, (void __user *)ghc->hva, len);
1561 if (r) 1560 if (r)
1562 return -EFAULT; 1561 return -EFAULT;
1563 1562
1564 return 0; 1563 return 0;
1565 } 1564 }
1566 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 1565 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
1567 1566
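The *_cached variants above let a caller resolve the gpa once and skip gfn_to_memslot() on the hot path; the generation check re-initializes the cache whenever the memslot layout has changed. A hypothetical in-kernel caller (names invented for illustration) would use them like this sketch:

/* Publish an 8-byte status word to the guest without re-resolving the
 * memslot on every update. */
static struct gfn_to_hva_cache status_ghc;

static int status_init(struct kvm *kvm, gpa_t gpa)
{
        return kvm_gfn_to_hva_cache_init(kvm, &status_ghc, gpa);
}

static int status_update(struct kvm *kvm, u64 val)
{
        return kvm_write_guest_cached(kvm, &status_ghc, &val, sizeof(val));
}
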
1568 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1567 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1569 { 1568 {
1570 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, 1569 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1571 offset, len); 1570 offset, len);
1572 } 1571 }
1573 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1572 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1574 1573
1575 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1574 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1576 { 1575 {
1577 gfn_t gfn = gpa >> PAGE_SHIFT; 1576 gfn_t gfn = gpa >> PAGE_SHIFT;
1578 int seg; 1577 int seg;
1579 int offset = offset_in_page(gpa); 1578 int offset = offset_in_page(gpa);
1580 int ret; 1579 int ret;
1581 1580
1582 while ((seg = next_segment(len, offset)) != 0) { 1581 while ((seg = next_segment(len, offset)) != 0) {
1583 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1582 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1584 if (ret < 0) 1583 if (ret < 0)
1585 return ret; 1584 return ret;
1586 offset = 0; 1585 offset = 0;
1587 len -= seg; 1586 len -= seg;
1588 ++gfn; 1587 ++gfn;
1589 } 1588 }
1590 return 0; 1589 return 0;
1591 } 1590 }
1592 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1591 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1593 1592
1594 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, 1593 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1595 gfn_t gfn) 1594 gfn_t gfn)
1596 { 1595 {
1597 if (memslot && memslot->dirty_bitmap) { 1596 if (memslot && memslot->dirty_bitmap) {
1598 unsigned long rel_gfn = gfn - memslot->base_gfn; 1597 unsigned long rel_gfn = gfn - memslot->base_gfn;
1599 1598
1600 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1599 set_bit_le(rel_gfn, memslot->dirty_bitmap);
1601 } 1600 }
1602 } 1601 }
1603 1602
1604 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1603 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1605 { 1604 {
1606 struct kvm_memory_slot *memslot; 1605 struct kvm_memory_slot *memslot;
1607 1606
1608 memslot = gfn_to_memslot(kvm, gfn); 1607 memslot = gfn_to_memslot(kvm, gfn);
1609 mark_page_dirty_in_slot(kvm, memslot, gfn); 1608 mark_page_dirty_in_slot(kvm, memslot, gfn);
1610 } 1609 }
1611 1610
1612 /* 1611 /*
1613 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1612 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1614 */ 1613 */
1615 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1614 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1616 { 1615 {
1617 DEFINE_WAIT(wait); 1616 DEFINE_WAIT(wait);
1618 1617
1619 for (;;) { 1618 for (;;) {
1620 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1619 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1621 1620
1622 if (kvm_arch_vcpu_runnable(vcpu)) { 1621 if (kvm_arch_vcpu_runnable(vcpu)) {
1623 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1622 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1624 break; 1623 break;
1625 } 1624 }
1626 if (kvm_cpu_has_pending_timer(vcpu)) 1625 if (kvm_cpu_has_pending_timer(vcpu))
1627 break; 1626 break;
1628 if (signal_pending(current)) 1627 if (signal_pending(current))
1629 break; 1628 break;
1630 1629
1631 schedule(); 1630 schedule();
1632 } 1631 }
1633 1632
1634 finish_wait(&vcpu->wq, &wait); 1633 finish_wait(&vcpu->wq, &wait);
1635 } 1634 }
1636 1635
1637 #ifndef CONFIG_S390 1636 #ifndef CONFIG_S390
1638 /* 1637 /*
1639 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 1638 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
1640 */ 1639 */
1641 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 1640 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
1642 { 1641 {
1643 int me; 1642 int me;
1644 int cpu = vcpu->cpu; 1643 int cpu = vcpu->cpu;
1645 wait_queue_head_t *wqp; 1644 wait_queue_head_t *wqp;
1646 1645
1647 wqp = kvm_arch_vcpu_wq(vcpu); 1646 wqp = kvm_arch_vcpu_wq(vcpu);
1648 if (waitqueue_active(wqp)) { 1647 if (waitqueue_active(wqp)) {
1649 wake_up_interruptible(wqp); 1648 wake_up_interruptible(wqp);
1650 ++vcpu->stat.halt_wakeup; 1649 ++vcpu->stat.halt_wakeup;
1651 } 1650 }
1652 1651
1653 me = get_cpu(); 1652 me = get_cpu();
1654 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 1653 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
1655 if (kvm_arch_vcpu_should_kick(vcpu)) 1654 if (kvm_arch_vcpu_should_kick(vcpu))
1656 smp_send_reschedule(cpu); 1655 smp_send_reschedule(cpu);
1657 put_cpu(); 1656 put_cpu();
1658 } 1657 }
1659 #endif /* !CONFIG_S390 */ 1658 #endif /* !CONFIG_S390 */
1660 1659
1661 void kvm_resched(struct kvm_vcpu *vcpu) 1660 void kvm_resched(struct kvm_vcpu *vcpu)
1662 { 1661 {
1663 if (!need_resched()) 1662 if (!need_resched())
1664 return; 1663 return;
1665 cond_resched(); 1664 cond_resched();
1666 } 1665 }
1667 EXPORT_SYMBOL_GPL(kvm_resched); 1666 EXPORT_SYMBOL_GPL(kvm_resched);
1668 1667
1669 bool kvm_vcpu_yield_to(struct kvm_vcpu *target) 1668 bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1670 { 1669 {
1671 struct pid *pid; 1670 struct pid *pid;
1672 struct task_struct *task = NULL; 1671 struct task_struct *task = NULL;
1673 1672
1674 rcu_read_lock(); 1673 rcu_read_lock();
1675 pid = rcu_dereference(target->pid); 1674 pid = rcu_dereference(target->pid);
1676 if (pid) 1675 if (pid)
1677 task = get_pid_task(target->pid, PIDTYPE_PID); 1676 task = get_pid_task(target->pid, PIDTYPE_PID);
1678 rcu_read_unlock(); 1677 rcu_read_unlock();
1679 if (!task) 1678 if (!task)
1680 return false; 1679 return false;
1681 if (task->flags & PF_VCPU) { 1680 if (task->flags & PF_VCPU) {
1682 put_task_struct(task); 1681 put_task_struct(task);
1683 return false; 1682 return false;
1684 } 1683 }
1685 if (yield_to(task, 1)) { 1684 if (yield_to(task, 1)) {
1686 put_task_struct(task); 1685 put_task_struct(task);
1687 return true; 1686 return true;
1688 } 1687 }
1689 put_task_struct(task); 1688 put_task_struct(task);
1690 return false; 1689 return false;
1691 } 1690 }
1692 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1691 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1693 1692
1694 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 1693 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1695 /* 1694 /*
1696 * Helper that checks whether a VCPU is eligible for directed yield. 1695 * Helper that checks whether a VCPU is eligible for directed yield.
1697 * The most eligible candidate to yield to is decided by the following heuristics: 1696 * The most eligible candidate to yield to is decided by the following heuristics:
1698 * 1697 *
1699 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 1698 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1700 * (preempted lock holder), indicated by @in_spin_loop. 1699 * (preempted lock holder), indicated by @in_spin_loop.
1701 * Set at the beginning and cleared at the end of the interception/PLE handler. 1700 * Set at the beginning and cleared at the end of the interception/PLE handler.
1702 * 1701 *
1703 * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a 1702 * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a
1704 * chance last time (it has mostly become eligible now since we have probably 1703 * chance last time (it has mostly become eligible now since we have probably
1705 * yielded to the lock holder in the last iteration. This is done by toggling 1704 * yielded to the lock holder in the last iteration. This is done by toggling
1706 * @dy_eligible each time a VCPU is checked for eligibility.) 1705 * @dy_eligible each time a VCPU is checked for eligibility.)
1707 * 1706 *
1708 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 1707 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1709 * to preempted lock-holder could result in wrong VCPU selection and CPU 1708 * to preempted lock-holder could result in wrong VCPU selection and CPU
1710 * burning. Giving priority for a potential lock-holder increases lock 1709 * burning. Giving priority for a potential lock-holder increases lock
1711 * progress. 1710 * progress.
1712 * 1711 *
1713 * Since algorithm is based on heuristics, accessing another VCPU data without 1712 * Since algorithm is based on heuristics, accessing another VCPU data without
1714 * locking does not harm. It may result in trying to yield to same VCPU, fail 1713 * locking does not harm. It may result in trying to yield to same VCPU, fail
1715 * and continue with next VCPU and so on. 1714 * and continue with next VCPU and so on.
1716 */ 1715 */
1717 bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 1716 bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1718 { 1717 {
1719 bool eligible; 1718 bool eligible;
1720 1719
1721 eligible = !vcpu->spin_loop.in_spin_loop || 1720 eligible = !vcpu->spin_loop.in_spin_loop ||
1722 (vcpu->spin_loop.in_spin_loop && 1721 (vcpu->spin_loop.in_spin_loop &&
1723 vcpu->spin_loop.dy_eligible); 1722 vcpu->spin_loop.dy_eligible);
1724 1723
1725 if (vcpu->spin_loop.in_spin_loop) 1724 if (vcpu->spin_loop.in_spin_loop)
1726 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 1725 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1727 1726
1728 return eligible; 1727 return eligible;
1729 } 1728 }
1730 #endif 1729 #endif
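Concretely: a VCPU that is not spinning itself (a potential preempted lock holder) is always eligible, while a spinning VCPU alternates between ineligible and eligible on successive checks because dy_eligible is toggled each time, so it is skipped at most every other round.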
1731 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1730 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1732 { 1731 {
1733 struct kvm *kvm = me->kvm; 1732 struct kvm *kvm = me->kvm;
1734 struct kvm_vcpu *vcpu; 1733 struct kvm_vcpu *vcpu;
1735 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1734 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1736 int yielded = 0; 1735 int yielded = 0;
1737 int pass; 1736 int pass;
1738 int i; 1737 int i;
1739 1738
1740 kvm_vcpu_set_in_spin_loop(me, true); 1739 kvm_vcpu_set_in_spin_loop(me, true);
1741 /* 1740 /*
1742 * We boost the priority of a VCPU that is runnable but not 1741 * We boost the priority of a VCPU that is runnable but not
1743 * currently running, because it got preempted by something 1742 * currently running, because it got preempted by something
1744 * else and called schedule in __vcpu_run. Hopefully that 1743 * else and called schedule in __vcpu_run. Hopefully that
1745 * VCPU is holding the lock that we need and will release it. 1744 * VCPU is holding the lock that we need and will release it.
1746 * We approximate round-robin by starting at the last boosted VCPU. 1745 * We approximate round-robin by starting at the last boosted VCPU.
1747 */ 1746 */
1748 for (pass = 0; pass < 2 && !yielded; pass++) { 1747 for (pass = 0; pass < 2 && !yielded; pass++) {
1749 kvm_for_each_vcpu(i, vcpu, kvm) { 1748 kvm_for_each_vcpu(i, vcpu, kvm) {
1750 if (!pass && i <= last_boosted_vcpu) { 1749 if (!pass && i <= last_boosted_vcpu) {
1751 i = last_boosted_vcpu; 1750 i = last_boosted_vcpu;
1752 continue; 1751 continue;
1753 } else if (pass && i > last_boosted_vcpu) 1752 } else if (pass && i > last_boosted_vcpu)
1754 break; 1753 break;
1755 if (vcpu == me) 1754 if (vcpu == me)
1756 continue; 1755 continue;
1757 if (waitqueue_active(&vcpu->wq)) 1756 if (waitqueue_active(&vcpu->wq))
1758 continue; 1757 continue;
1759 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1758 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1760 continue; 1759 continue;
1761 if (kvm_vcpu_yield_to(vcpu)) { 1760 if (kvm_vcpu_yield_to(vcpu)) {
1762 kvm->last_boosted_vcpu = i; 1761 kvm->last_boosted_vcpu = i;
1763 yielded = 1; 1762 yielded = 1;
1764 break; 1763 break;
1765 } 1764 }
1766 } 1765 }
1767 } 1766 }
1768 kvm_vcpu_set_in_spin_loop(me, false); 1767 kvm_vcpu_set_in_spin_loop(me, false);
1769 1768
1770 /* Ensure vcpu is not eligible during next spinloop */ 1769 /* Ensure vcpu is not eligible during next spinloop */
1771 kvm_vcpu_set_dy_eligible(me, false); 1770 kvm_vcpu_set_dy_eligible(me, false);
1772 } 1771 }
1773 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1772 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1774 1773
1775 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1774 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1776 { 1775 {
1777 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1776 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1778 struct page *page; 1777 struct page *page;
1779 1778
1780 if (vmf->pgoff == 0) 1779 if (vmf->pgoff == 0)
1781 page = virt_to_page(vcpu->run); 1780 page = virt_to_page(vcpu->run);
1782 #ifdef CONFIG_X86 1781 #ifdef CONFIG_X86
1783 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1782 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1784 page = virt_to_page(vcpu->arch.pio_data); 1783 page = virt_to_page(vcpu->arch.pio_data);
1785 #endif 1784 #endif
1786 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1785 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1787 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1786 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1788 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1787 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1789 #endif 1788 #endif
1790 else 1789 else
1791 return kvm_arch_vcpu_fault(vcpu, vmf); 1790 return kvm_arch_vcpu_fault(vcpu, vmf);
1792 get_page(page); 1791 get_page(page);
1793 vmf->page = page; 1792 vmf->page = page;
1794 return 0; 1793 return 0;
1795 } 1794 }
1796 1795
1797 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1796 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1798 .fault = kvm_vcpu_fault, 1797 .fault = kvm_vcpu_fault,
1799 }; 1798 };
1800 1799
1801 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1800 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1802 { 1801 {
1803 vma->vm_ops = &kvm_vcpu_vm_ops; 1802 vma->vm_ops = &kvm_vcpu_vm_ops;
1804 return 0; 1803 return 0;
1805 } 1804 }
1806 1805
1807 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1806 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1808 { 1807 {
1809 struct kvm_vcpu *vcpu = filp->private_data; 1808 struct kvm_vcpu *vcpu = filp->private_data;
1810 1809
1811 kvm_put_kvm(vcpu->kvm); 1810 kvm_put_kvm(vcpu->kvm);
1812 return 0; 1811 return 0;
1813 } 1812 }
1814 1813
1815 static struct file_operations kvm_vcpu_fops = { 1814 static struct file_operations kvm_vcpu_fops = {
1816 .release = kvm_vcpu_release, 1815 .release = kvm_vcpu_release,
1817 .unlocked_ioctl = kvm_vcpu_ioctl, 1816 .unlocked_ioctl = kvm_vcpu_ioctl,
1818 #ifdef CONFIG_COMPAT 1817 #ifdef CONFIG_COMPAT
1819 .compat_ioctl = kvm_vcpu_compat_ioctl, 1818 .compat_ioctl = kvm_vcpu_compat_ioctl,
1820 #endif 1819 #endif
1821 .mmap = kvm_vcpu_mmap, 1820 .mmap = kvm_vcpu_mmap,
1822 .llseek = noop_llseek, 1821 .llseek = noop_llseek,
1823 }; 1822 };
1824 1823
1825 /* 1824 /*
1826 * Allocates an inode for the vcpu. 1825 * Allocates an inode for the vcpu.
1827 */ 1826 */
1828 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1827 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1829 { 1828 {
1830 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1829 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1831 } 1830 }
1832 1831
1833 /* 1832 /*
1834 * Creates some virtual cpus. Good luck creating more than one. 1833 * Creates some virtual cpus. Good luck creating more than one.
1835 */ 1834 */
1836 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1835 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1837 { 1836 {
1838 int r; 1837 int r;
1839 struct kvm_vcpu *vcpu, *v; 1838 struct kvm_vcpu *vcpu, *v;
1840 1839
1841 vcpu = kvm_arch_vcpu_create(kvm, id); 1840 vcpu = kvm_arch_vcpu_create(kvm, id);
1842 if (IS_ERR(vcpu)) 1841 if (IS_ERR(vcpu))
1843 return PTR_ERR(vcpu); 1842 return PTR_ERR(vcpu);
1844 1843
1845 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1844 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1846 1845
1847 r = kvm_arch_vcpu_setup(vcpu); 1846 r = kvm_arch_vcpu_setup(vcpu);
1848 if (r) 1847 if (r)
1849 goto vcpu_destroy; 1848 goto vcpu_destroy;
1850 1849
1851 mutex_lock(&kvm->lock); 1850 mutex_lock(&kvm->lock);
1852 if (!kvm_vcpu_compatible(vcpu)) { 1851 if (!kvm_vcpu_compatible(vcpu)) {
1853 r = -EINVAL; 1852 r = -EINVAL;
1854 goto unlock_vcpu_destroy; 1853 goto unlock_vcpu_destroy;
1855 } 1854 }
1856 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1855 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1857 r = -EINVAL; 1856 r = -EINVAL;
1858 goto unlock_vcpu_destroy; 1857 goto unlock_vcpu_destroy;
1859 } 1858 }
1860 1859
1861 kvm_for_each_vcpu(r, v, kvm) 1860 kvm_for_each_vcpu(r, v, kvm)
1862 if (v->vcpu_id == id) { 1861 if (v->vcpu_id == id) {
1863 r = -EEXIST; 1862 r = -EEXIST;
1864 goto unlock_vcpu_destroy; 1863 goto unlock_vcpu_destroy;
1865 } 1864 }
1866 1865
1867 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1866 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1868 1867
1869 /* Now it's all set up, let userspace reach it */ 1868 /* Now it's all set up, let userspace reach it */
1870 kvm_get_kvm(kvm); 1869 kvm_get_kvm(kvm);
1871 r = create_vcpu_fd(vcpu); 1870 r = create_vcpu_fd(vcpu);
1872 if (r < 0) { 1871 if (r < 0) {
1873 kvm_put_kvm(kvm); 1872 kvm_put_kvm(kvm);
1874 goto unlock_vcpu_destroy; 1873 goto unlock_vcpu_destroy;
1875 } 1874 }
1876 1875
1877 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1876 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1878 smp_wmb(); /* order the vcpus[] store before the online_vcpus increment */ 1877 smp_wmb(); /* order the vcpus[] store before the online_vcpus increment */
1879 atomic_inc(&kvm->online_vcpus); 1878 atomic_inc(&kvm->online_vcpus);
1880 1879
1881 mutex_unlock(&kvm->lock); 1880 mutex_unlock(&kvm->lock);
1882 kvm_arch_vcpu_postcreate(vcpu); 1881 kvm_arch_vcpu_postcreate(vcpu);
1883 return r; 1882 return r;
1884 1883
1885 unlock_vcpu_destroy: 1884 unlock_vcpu_destroy:
1886 mutex_unlock(&kvm->lock); 1885 mutex_unlock(&kvm->lock);
1887 vcpu_destroy: 1886 vcpu_destroy:
1888 kvm_arch_vcpu_destroy(vcpu); 1887 kvm_arch_vcpu_destroy(vcpu);
1889 return r; 1888 return r;
1890 } 1889 }
1891 1890
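kvm_vm_ioctl_create_vcpu() is reached through the KVM_CREATE_VCPU case further down in kvm_vm_ioctl(); it rejects duplicate ids and anything beyond KVM_MAX_VCPUS, and on success hands back a new anonymous-inode vcpu fd via create_vcpu_fd(). A hypothetical userspace sketch of the call (the helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: ask the VM fd for a new vcpu fd. */
static int create_vcpu(int vm_fd, unsigned int id)
{
        return ioctl(vm_fd, KVM_CREATE_VCPU, id);
}
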
1892 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1891 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1893 { 1892 {
1894 if (sigset) { 1893 if (sigset) {
1895 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1894 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1896 vcpu->sigset_active = 1; 1895 vcpu->sigset_active = 1;
1897 vcpu->sigset = *sigset; 1896 vcpu->sigset = *sigset;
1898 } else 1897 } else
1899 vcpu->sigset_active = 0; 1898 vcpu->sigset_active = 0;
1900 return 0; 1899 return 0;
1901 } 1900 }
1902 1901
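kvm_vcpu_ioctl_set_sigmask() always strips SIGKILL and SIGSTOP from the mask; the KVM_SET_SIGNAL_MASK case below copies a variable-length struct kvm_signal_mask whose len must equal the kernel's sigset_t size, which is 8 bytes on x86-64 rather than glibc's 128-byte sigset_t. A hypothetical userspace sketch with that 8-byte size hard-coded as an assumption:

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper; 8 is assumed to be the kernel sigset_t
 * size (true on x86-64). */
static int set_vcpu_sigmask(int vcpu_fd, const sigset_t *mask)
{
        const size_t klen = 8;
        struct kvm_signal_mask *arg;
        int r;

        arg = malloc(sizeof(*arg) + klen);
        if (!arg)
                return -1;
        arg->len = klen;
        memcpy(arg->sigset, mask, klen);        /* low 64 signal bits only */
        r = ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, arg);
        free(arg);
        return r;
}
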
1903 static long kvm_vcpu_ioctl(struct file *filp, 1902 static long kvm_vcpu_ioctl(struct file *filp,
1904 unsigned int ioctl, unsigned long arg) 1903 unsigned int ioctl, unsigned long arg)
1905 { 1904 {
1906 struct kvm_vcpu *vcpu = filp->private_data; 1905 struct kvm_vcpu *vcpu = filp->private_data;
1907 void __user *argp = (void __user *)arg; 1906 void __user *argp = (void __user *)arg;
1908 int r; 1907 int r;
1909 struct kvm_fpu *fpu = NULL; 1908 struct kvm_fpu *fpu = NULL;
1910 struct kvm_sregs *kvm_sregs = NULL; 1909 struct kvm_sregs *kvm_sregs = NULL;
1911 1910
1912 if (vcpu->kvm->mm != current->mm) 1911 if (vcpu->kvm->mm != current->mm)
1913 return -EIO; 1912 return -EIO;
1914 1913
1915 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1914 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1916 /* 1915 /*
1917 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1916 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1918 * so vcpu_load() would break them. 1917 * so vcpu_load() would break them.
1919 */ 1918 */
1920 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1919 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1921 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1920 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1922 #endif 1921 #endif
1923 1922
1924 1923
1925 r = vcpu_load(vcpu); 1924 r = vcpu_load(vcpu);
1926 if (r) 1925 if (r)
1927 return r; 1926 return r;
1928 switch (ioctl) { 1927 switch (ioctl) {
1929 case KVM_RUN: 1928 case KVM_RUN:
1930 r = -EINVAL; 1929 r = -EINVAL;
1931 if (arg) 1930 if (arg)
1932 goto out; 1931 goto out;
1933 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1932 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1934 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 1933 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1935 break; 1934 break;
1936 case KVM_GET_REGS: { 1935 case KVM_GET_REGS: {
1937 struct kvm_regs *kvm_regs; 1936 struct kvm_regs *kvm_regs;
1938 1937
1939 r = -ENOMEM; 1938 r = -ENOMEM;
1940 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1939 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1941 if (!kvm_regs) 1940 if (!kvm_regs)
1942 goto out; 1941 goto out;
1943 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1942 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1944 if (r) 1943 if (r)
1945 goto out_free1; 1944 goto out_free1;
1946 r = -EFAULT; 1945 r = -EFAULT;
1947 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1946 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1948 goto out_free1; 1947 goto out_free1;
1949 r = 0; 1948 r = 0;
1950 out_free1: 1949 out_free1:
1951 kfree(kvm_regs); 1950 kfree(kvm_regs);
1952 break; 1951 break;
1953 } 1952 }
1954 case KVM_SET_REGS: { 1953 case KVM_SET_REGS: {
1955 struct kvm_regs *kvm_regs; 1954 struct kvm_regs *kvm_regs;
1956 1955
1957 r = -ENOMEM; 1956 r = -ENOMEM;
1958 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 1957 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
1959 if (IS_ERR(kvm_regs)) { 1958 if (IS_ERR(kvm_regs)) {
1960 r = PTR_ERR(kvm_regs); 1959 r = PTR_ERR(kvm_regs);
1961 goto out; 1960 goto out;
1962 } 1961 }
1963 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1962 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1964 kfree(kvm_regs); 1963 kfree(kvm_regs);
1965 break; 1964 break;
1966 } 1965 }
1967 case KVM_GET_SREGS: { 1966 case KVM_GET_SREGS: {
1968 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1967 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1969 r = -ENOMEM; 1968 r = -ENOMEM;
1970 if (!kvm_sregs) 1969 if (!kvm_sregs)
1971 goto out; 1970 goto out;
1972 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1971 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1973 if (r) 1972 if (r)
1974 goto out; 1973 goto out;
1975 r = -EFAULT; 1974 r = -EFAULT;
1976 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1975 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1977 goto out; 1976 goto out;
1978 r = 0; 1977 r = 0;
1979 break; 1978 break;
1980 } 1979 }
1981 case KVM_SET_SREGS: { 1980 case KVM_SET_SREGS: {
1982 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 1981 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
1983 if (IS_ERR(kvm_sregs)) { 1982 if (IS_ERR(kvm_sregs)) {
1984 r = PTR_ERR(kvm_sregs); 1983 r = PTR_ERR(kvm_sregs);
1985 kvm_sregs = NULL; 1984 kvm_sregs = NULL;
1986 goto out; 1985 goto out;
1987 } 1986 }
1988 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1987 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1989 break; 1988 break;
1990 } 1989 }
1991 case KVM_GET_MP_STATE: { 1990 case KVM_GET_MP_STATE: {
1992 struct kvm_mp_state mp_state; 1991 struct kvm_mp_state mp_state;
1993 1992
1994 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1993 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1995 if (r) 1994 if (r)
1996 goto out; 1995 goto out;
1997 r = -EFAULT; 1996 r = -EFAULT;
1998 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1997 if (copy_to_user(argp, &mp_state, sizeof mp_state))
1999 goto out; 1998 goto out;
2000 r = 0; 1999 r = 0;
2001 break; 2000 break;
2002 } 2001 }
2003 case KVM_SET_MP_STATE: { 2002 case KVM_SET_MP_STATE: {
2004 struct kvm_mp_state mp_state; 2003 struct kvm_mp_state mp_state;
2005 2004
2006 r = -EFAULT; 2005 r = -EFAULT;
2007 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 2006 if (copy_from_user(&mp_state, argp, sizeof mp_state))
2008 goto out; 2007 goto out;
2009 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2008 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
2010 break; 2009 break;
2011 } 2010 }
2012 case KVM_TRANSLATE: { 2011 case KVM_TRANSLATE: {
2013 struct kvm_translation tr; 2012 struct kvm_translation tr;
2014 2013
2015 r = -EFAULT; 2014 r = -EFAULT;
2016 if (copy_from_user(&tr, argp, sizeof tr)) 2015 if (copy_from_user(&tr, argp, sizeof tr))
2017 goto out; 2016 goto out;
2018 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2017 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
2019 if (r) 2018 if (r)
2020 goto out; 2019 goto out;
2021 r = -EFAULT; 2020 r = -EFAULT;
2022 if (copy_to_user(argp, &tr, sizeof tr)) 2021 if (copy_to_user(argp, &tr, sizeof tr))
2023 goto out; 2022 goto out;
2024 r = 0; 2023 r = 0;
2025 break; 2024 break;
2026 } 2025 }
2027 case KVM_SET_GUEST_DEBUG: { 2026 case KVM_SET_GUEST_DEBUG: {
2028 struct kvm_guest_debug dbg; 2027 struct kvm_guest_debug dbg;
2029 2028
2030 r = -EFAULT; 2029 r = -EFAULT;
2031 if (copy_from_user(&dbg, argp, sizeof dbg)) 2030 if (copy_from_user(&dbg, argp, sizeof dbg))
2032 goto out; 2031 goto out;
2033 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2032 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
2034 break; 2033 break;
2035 } 2034 }
2036 case KVM_SET_SIGNAL_MASK: { 2035 case KVM_SET_SIGNAL_MASK: {
2037 struct kvm_signal_mask __user *sigmask_arg = argp; 2036 struct kvm_signal_mask __user *sigmask_arg = argp;
2038 struct kvm_signal_mask kvm_sigmask; 2037 struct kvm_signal_mask kvm_sigmask;
2039 sigset_t sigset, *p; 2038 sigset_t sigset, *p;
2040 2039
2041 p = NULL; 2040 p = NULL;
2042 if (argp) { 2041 if (argp) {
2043 r = -EFAULT; 2042 r = -EFAULT;
2044 if (copy_from_user(&kvm_sigmask, argp, 2043 if (copy_from_user(&kvm_sigmask, argp,
2045 sizeof kvm_sigmask)) 2044 sizeof kvm_sigmask))
2046 goto out; 2045 goto out;
2047 r = -EINVAL; 2046 r = -EINVAL;
2048 if (kvm_sigmask.len != sizeof sigset) 2047 if (kvm_sigmask.len != sizeof sigset)
2049 goto out; 2048 goto out;
2050 r = -EFAULT; 2049 r = -EFAULT;
2051 if (copy_from_user(&sigset, sigmask_arg->sigset, 2050 if (copy_from_user(&sigset, sigmask_arg->sigset,
2052 sizeof sigset)) 2051 sizeof sigset))
2053 goto out; 2052 goto out;
2054 p = &sigset; 2053 p = &sigset;
2055 } 2054 }
2056 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2055 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2057 break; 2056 break;
2058 } 2057 }
2059 case KVM_GET_FPU: { 2058 case KVM_GET_FPU: {
2060 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2059 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
2061 r = -ENOMEM; 2060 r = -ENOMEM;
2062 if (!fpu) 2061 if (!fpu)
2063 goto out; 2062 goto out;
2064 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2063 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
2065 if (r) 2064 if (r)
2066 goto out; 2065 goto out;
2067 r = -EFAULT; 2066 r = -EFAULT;
2068 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2067 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
2069 goto out; 2068 goto out;
2070 r = 0; 2069 r = 0;
2071 break; 2070 break;
2072 } 2071 }
2073 case KVM_SET_FPU: { 2072 case KVM_SET_FPU: {
2074 fpu = memdup_user(argp, sizeof(*fpu)); 2073 fpu = memdup_user(argp, sizeof(*fpu));
2075 if (IS_ERR(fpu)) { 2074 if (IS_ERR(fpu)) {
2076 r = PTR_ERR(fpu); 2075 r = PTR_ERR(fpu);
2077 fpu = NULL; 2076 fpu = NULL;
2078 goto out; 2077 goto out;
2079 } 2078 }
2080 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2079 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
2081 break; 2080 break;
2082 } 2081 }
2083 default: 2082 default:
2084 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2083 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
2085 } 2084 }
2086 out: 2085 out:
2087 vcpu_put(vcpu); 2086 vcpu_put(vcpu);
2088 kfree(fpu); 2087 kfree(fpu);
2089 kfree(kvm_sregs); 2088 kfree(kvm_sregs);
2090 return r; 2089 return r;
2091 } 2090 }
2092 2091
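Of the cases above, KVM_RUN is the hot path: it re-enters the guest until something needs userspace attention, and the reason comes back through the mmap'ed struct kvm_run. A hypothetical userspace sketch of the usual dispatch loop; the exit reasons handled here are just examples:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace run loop; 'run' is the area mapped from the vcpu fd. */
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;
                switch (run->exit_reason) {
                case KVM_EXIT_HLT:
                        return 0;
                case KVM_EXIT_IO:
                        /* decode run->io and emulate the port access here */
                        break;
                default:
                        fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
                        return -1;
                }
        }
}
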
2093 #ifdef CONFIG_COMPAT 2092 #ifdef CONFIG_COMPAT
2094 static long kvm_vcpu_compat_ioctl(struct file *filp, 2093 static long kvm_vcpu_compat_ioctl(struct file *filp,
2095 unsigned int ioctl, unsigned long arg) 2094 unsigned int ioctl, unsigned long arg)
2096 { 2095 {
2097 struct kvm_vcpu *vcpu = filp->private_data; 2096 struct kvm_vcpu *vcpu = filp->private_data;
2098 void __user *argp = compat_ptr(arg); 2097 void __user *argp = compat_ptr(arg);
2099 int r; 2098 int r;
2100 2099
2101 if (vcpu->kvm->mm != current->mm) 2100 if (vcpu->kvm->mm != current->mm)
2102 return -EIO; 2101 return -EIO;
2103 2102
2104 switch (ioctl) { 2103 switch (ioctl) {
2105 case KVM_SET_SIGNAL_MASK: { 2104 case KVM_SET_SIGNAL_MASK: {
2106 struct kvm_signal_mask __user *sigmask_arg = argp; 2105 struct kvm_signal_mask __user *sigmask_arg = argp;
2107 struct kvm_signal_mask kvm_sigmask; 2106 struct kvm_signal_mask kvm_sigmask;
2108 compat_sigset_t csigset; 2107 compat_sigset_t csigset;
2109 sigset_t sigset; 2108 sigset_t sigset;
2110 2109
2111 if (argp) { 2110 if (argp) {
2112 r = -EFAULT; 2111 r = -EFAULT;
2113 if (copy_from_user(&kvm_sigmask, argp, 2112 if (copy_from_user(&kvm_sigmask, argp,
2114 sizeof kvm_sigmask)) 2113 sizeof kvm_sigmask))
2115 goto out; 2114 goto out;
2116 r = -EINVAL; 2115 r = -EINVAL;
2117 if (kvm_sigmask.len != sizeof csigset) 2116 if (kvm_sigmask.len != sizeof csigset)
2118 goto out; 2117 goto out;
2119 r = -EFAULT; 2118 r = -EFAULT;
2120 if (copy_from_user(&csigset, sigmask_arg->sigset, 2119 if (copy_from_user(&csigset, sigmask_arg->sigset,
2121 sizeof csigset)) 2120 sizeof csigset))
2122 goto out; 2121 goto out;
2123 sigset_from_compat(&sigset, &csigset); 2122 sigset_from_compat(&sigset, &csigset);
2124 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2123 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2125 } else 2124 } else
2126 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2125 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
2127 break; 2126 break;
2128 } 2127 }
2129 default: 2128 default:
2130 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2129 r = kvm_vcpu_ioctl(filp, ioctl, arg);
2131 } 2130 }
2132 2131
2133 out: 2132 out:
2134 return r; 2133 return r;
2135 } 2134 }
2136 #endif 2135 #endif
2137 2136
2138 static long kvm_vm_ioctl(struct file *filp, 2137 static long kvm_vm_ioctl(struct file *filp,
2139 unsigned int ioctl, unsigned long arg) 2138 unsigned int ioctl, unsigned long arg)
2140 { 2139 {
2141 struct kvm *kvm = filp->private_data; 2140 struct kvm *kvm = filp->private_data;
2142 void __user *argp = (void __user *)arg; 2141 void __user *argp = (void __user *)arg;
2143 int r; 2142 int r;
2144 2143
2145 if (kvm->mm != current->mm) 2144 if (kvm->mm != current->mm)
2146 return -EIO; 2145 return -EIO;
2147 switch (ioctl) { 2146 switch (ioctl) {
2148 case KVM_CREATE_VCPU: 2147 case KVM_CREATE_VCPU:
2149 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2148 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2150 break; 2149 break;
2151 case KVM_SET_USER_MEMORY_REGION: { 2150 case KVM_SET_USER_MEMORY_REGION: {
2152 struct kvm_userspace_memory_region kvm_userspace_mem; 2151 struct kvm_userspace_memory_region kvm_userspace_mem;
2153 2152
2154 r = -EFAULT; 2153 r = -EFAULT;
2155 if (copy_from_user(&kvm_userspace_mem, argp, 2154 if (copy_from_user(&kvm_userspace_mem, argp,
2156 sizeof kvm_userspace_mem)) 2155 sizeof kvm_userspace_mem))
2157 goto out; 2156 goto out;
2158 2157
2159 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true); 2158 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
2160 break; 2159 break;
2161 } 2160 }
2162 case KVM_GET_DIRTY_LOG: { 2161 case KVM_GET_DIRTY_LOG: {
2163 struct kvm_dirty_log log; 2162 struct kvm_dirty_log log;
2164 2163
2165 r = -EFAULT; 2164 r = -EFAULT;
2166 if (copy_from_user(&log, argp, sizeof log)) 2165 if (copy_from_user(&log, argp, sizeof log))
2167 goto out; 2166 goto out;
2168 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2167 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2169 break; 2168 break;
2170 } 2169 }
2171 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2170 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2172 case KVM_REGISTER_COALESCED_MMIO: { 2171 case KVM_REGISTER_COALESCED_MMIO: {
2173 struct kvm_coalesced_mmio_zone zone; 2172 struct kvm_coalesced_mmio_zone zone;
2174 r = -EFAULT; 2173 r = -EFAULT;
2175 if (copy_from_user(&zone, argp, sizeof zone)) 2174 if (copy_from_user(&zone, argp, sizeof zone))
2176 goto out; 2175 goto out;
2177 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2176 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2178 break; 2177 break;
2179 } 2178 }
2180 case KVM_UNREGISTER_COALESCED_MMIO: { 2179 case KVM_UNREGISTER_COALESCED_MMIO: {
2181 struct kvm_coalesced_mmio_zone zone; 2180 struct kvm_coalesced_mmio_zone zone;
2182 r = -EFAULT; 2181 r = -EFAULT;
2183 if (copy_from_user(&zone, argp, sizeof zone)) 2182 if (copy_from_user(&zone, argp, sizeof zone))
2184 goto out; 2183 goto out;
2185 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2184 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2186 break; 2185 break;
2187 } 2186 }
2188 #endif 2187 #endif
2189 case KVM_IRQFD: { 2188 case KVM_IRQFD: {
2190 struct kvm_irqfd data; 2189 struct kvm_irqfd data;
2191 2190
2192 r = -EFAULT; 2191 r = -EFAULT;
2193 if (copy_from_user(&data, argp, sizeof data)) 2192 if (copy_from_user(&data, argp, sizeof data))
2194 goto out; 2193 goto out;
2195 r = kvm_irqfd(kvm, &data); 2194 r = kvm_irqfd(kvm, &data);
2196 break; 2195 break;
2197 } 2196 }
2198 case KVM_IOEVENTFD: { 2197 case KVM_IOEVENTFD: {
2199 struct kvm_ioeventfd data; 2198 struct kvm_ioeventfd data;
2200 2199
2201 r = -EFAULT; 2200 r = -EFAULT;
2202 if (copy_from_user(&data, argp, sizeof data)) 2201 if (copy_from_user(&data, argp, sizeof data))
2203 goto out; 2202 goto out;
2204 r = kvm_ioeventfd(kvm, &data); 2203 r = kvm_ioeventfd(kvm, &data);
2205 break; 2204 break;
2206 } 2205 }
2207 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2206 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2208 case KVM_SET_BOOT_CPU_ID: 2207 case KVM_SET_BOOT_CPU_ID:
2209 r = 0; 2208 r = 0;
2210 mutex_lock(&kvm->lock); 2209 mutex_lock(&kvm->lock);
2211 if (atomic_read(&kvm->online_vcpus) != 0) 2210 if (atomic_read(&kvm->online_vcpus) != 0)
2212 r = -EBUSY; 2211 r = -EBUSY;
2213 else 2212 else
2214 kvm->bsp_vcpu_id = arg; 2213 kvm->bsp_vcpu_id = arg;
2215 mutex_unlock(&kvm->lock); 2214 mutex_unlock(&kvm->lock);
2216 break; 2215 break;
2217 #endif 2216 #endif
2218 #ifdef CONFIG_HAVE_KVM_MSI 2217 #ifdef CONFIG_HAVE_KVM_MSI
2219 case KVM_SIGNAL_MSI: { 2218 case KVM_SIGNAL_MSI: {
2220 struct kvm_msi msi; 2219 struct kvm_msi msi;
2221 2220
2222 r = -EFAULT; 2221 r = -EFAULT;
2223 if (copy_from_user(&msi, argp, sizeof msi)) 2222 if (copy_from_user(&msi, argp, sizeof msi))
2224 goto out; 2223 goto out;
2225 r = kvm_send_userspace_msi(kvm, &msi); 2224 r = kvm_send_userspace_msi(kvm, &msi);
2226 break; 2225 break;
2227 } 2226 }
2228 #endif 2227 #endif
2229 #ifdef __KVM_HAVE_IRQ_LINE 2228 #ifdef __KVM_HAVE_IRQ_LINE
2230 case KVM_IRQ_LINE_STATUS: 2229 case KVM_IRQ_LINE_STATUS:
2231 case KVM_IRQ_LINE: { 2230 case KVM_IRQ_LINE: {
2232 struct kvm_irq_level irq_event; 2231 struct kvm_irq_level irq_event;
2233 2232
2234 r = -EFAULT; 2233 r = -EFAULT;
2235 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2234 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2236 goto out; 2235 goto out;
2237 2236
2238 r = kvm_vm_ioctl_irq_line(kvm, &irq_event); 2237 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2239 if (r) 2238 if (r)
2240 goto out; 2239 goto out;
2241 2240
2242 r = -EFAULT; 2241 r = -EFAULT;
2243 if (ioctl == KVM_IRQ_LINE_STATUS) { 2242 if (ioctl == KVM_IRQ_LINE_STATUS) {
2244 if (copy_to_user(argp, &irq_event, sizeof irq_event)) 2243 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2245 goto out; 2244 goto out;
2246 } 2245 }
2247 2246
2248 r = 0; 2247 r = 0;
2249 break; 2248 break;
2250 } 2249 }
2251 #endif 2250 #endif
2252 default: 2251 default:
2253 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2252 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2254 if (r == -ENOTTY) 2253 if (r == -ENOTTY)
2255 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 2254 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2256 } 2255 }
2257 out: 2256 out:
2258 return r; 2257 return r;
2259 } 2258 }
2260 2259
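The KVM_SET_USER_MEMORY_REGION case above installs, moves, or deletes a memory slot: a memory_size of 0 deletes the slot, and KVM_MEM_LOG_DIRTY_PAGES turns on dirty tracking for it. A hypothetical userspace sketch; the slot number and flags are illustrative:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: install (or, with size 0, delete) a slot. */
static int set_mem_region(int vm_fd, uint32_t slot, uint64_t gpa,
                          uint64_t size, void *host_va, int log_dirty)
{
        struct kvm_userspace_memory_region mr = {
                .slot = slot,
                .flags = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0,
                .guest_phys_addr = gpa,
                .memory_size = size,
                .userspace_addr = (uintptr_t)host_va,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mr);
}
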
2261 #ifdef CONFIG_COMPAT 2260 #ifdef CONFIG_COMPAT
2262 struct compat_kvm_dirty_log { 2261 struct compat_kvm_dirty_log {
2263 __u32 slot; 2262 __u32 slot;
2264 __u32 padding1; 2263 __u32 padding1;
2265 union { 2264 union {
2266 compat_uptr_t dirty_bitmap; /* one bit per page */ 2265 compat_uptr_t dirty_bitmap; /* one bit per page */
2267 __u64 padding2; 2266 __u64 padding2;
2268 }; 2267 };
2269 }; 2268 };
2270 2269
2271 static long kvm_vm_compat_ioctl(struct file *filp, 2270 static long kvm_vm_compat_ioctl(struct file *filp,
2272 unsigned int ioctl, unsigned long arg) 2271 unsigned int ioctl, unsigned long arg)
2273 { 2272 {
2274 struct kvm *kvm = filp->private_data; 2273 struct kvm *kvm = filp->private_data;
2275 int r; 2274 int r;
2276 2275
2277 if (kvm->mm != current->mm) 2276 if (kvm->mm != current->mm)
2278 return -EIO; 2277 return -EIO;
2279 switch (ioctl) { 2278 switch (ioctl) {
2280 case KVM_GET_DIRTY_LOG: { 2279 case KVM_GET_DIRTY_LOG: {
2281 struct compat_kvm_dirty_log compat_log; 2280 struct compat_kvm_dirty_log compat_log;
2282 struct kvm_dirty_log log; 2281 struct kvm_dirty_log log;
2283 2282
2284 r = -EFAULT; 2283 r = -EFAULT;
2285 if (copy_from_user(&compat_log, (void __user *)arg, 2284 if (copy_from_user(&compat_log, (void __user *)arg,
2286 sizeof(compat_log))) 2285 sizeof(compat_log)))
2287 goto out; 2286 goto out;
2288 log.slot = compat_log.slot; 2287 log.slot = compat_log.slot;
2289 log.padding1 = compat_log.padding1; 2288 log.padding1 = compat_log.padding1;
2290 log.padding2 = compat_log.padding2; 2289 log.padding2 = compat_log.padding2;
2291 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2290 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2292 2291
2293 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2292 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2294 break; 2293 break;
2295 } 2294 }
2296 default: 2295 default:
2297 r = kvm_vm_ioctl(filp, ioctl, arg); 2296 r = kvm_vm_ioctl(filp, ioctl, arg);
2298 } 2297 }
2299 2298
2300 out: 2299 out:
2301 return r; 2300 return r;
2302 } 2301 }
2303 #endif 2302 #endif
2304 2303
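Both kvm_vm_ioctl() and the compat wrapper above funnel KVM_GET_DIRTY_LOG into kvm_vm_ioctl_get_dirty_log(), which copies one bit per page of the slot into a user-supplied buffer. A hypothetical userspace sketch; the bitmap is rounded up to a multiple of 64 bits to match the size the kernel copies out, and a 4 KiB page size is assumed:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: fetch and then discard one dirty bitmap. */
static int get_dirty_log(int vm_fd, uint32_t slot, uint64_t slot_bytes)
{
        uint64_t pages = slot_bytes / 4096;            /* assumes 4 KiB pages */
        size_t bitmap_bytes = ((pages + 63) / 64) * 8; /* one bit per page */
        struct kvm_dirty_log log = { .slot = slot };
        int r;

        log.dirty_bitmap = calloc(1, bitmap_bytes);
        if (!log.dirty_bitmap)
                return -1;
        r = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        /* ... walk log.dirty_bitmap here ... */
        free(log.dirty_bitmap);
        return r;
}
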
2305 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2304 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2306 { 2305 {
2307 struct page *page[1]; 2306 struct page *page[1];
2308 unsigned long addr; 2307 unsigned long addr;
2309 int npages; 2308 int npages;
2310 gfn_t gfn = vmf->pgoff; 2309 gfn_t gfn = vmf->pgoff;
2311 struct kvm *kvm = vma->vm_file->private_data; 2310 struct kvm *kvm = vma->vm_file->private_data;
2312 2311
2313 addr = gfn_to_hva(kvm, gfn); 2312 addr = gfn_to_hva(kvm, gfn);
2314 if (kvm_is_error_hva(addr)) 2313 if (kvm_is_error_hva(addr))
2315 return VM_FAULT_SIGBUS; 2314 return VM_FAULT_SIGBUS;
2316 2315
2317 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2316 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2318 NULL); 2317 NULL);
2319 if (unlikely(npages != 1)) 2318 if (unlikely(npages != 1))
2320 return VM_FAULT_SIGBUS; 2319 return VM_FAULT_SIGBUS;
2321 2320
2322 vmf->page = page[0]; 2321 vmf->page = page[0];
2323 return 0; 2322 return 0;
2324 } 2323 }
2325 2324
2326 static const struct vm_operations_struct kvm_vm_vm_ops = { 2325 static const struct vm_operations_struct kvm_vm_vm_ops = {
2327 .fault = kvm_vm_fault, 2326 .fault = kvm_vm_fault,
2328 }; 2327 };
2329 2328
2330 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2329 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2331 { 2330 {
2332 vma->vm_ops = &kvm_vm_vm_ops; 2331 vma->vm_ops = &kvm_vm_vm_ops;
2333 return 0; 2332 return 0;
2334 } 2333 }
2335 2334
2336 static struct file_operations kvm_vm_fops = { 2335 static struct file_operations kvm_vm_fops = {
2337 .release = kvm_vm_release, 2336 .release = kvm_vm_release,
2338 .unlocked_ioctl = kvm_vm_ioctl, 2337 .unlocked_ioctl = kvm_vm_ioctl,
2339 #ifdef CONFIG_COMPAT 2338 #ifdef CONFIG_COMPAT
2340 .compat_ioctl = kvm_vm_compat_ioctl, 2339 .compat_ioctl = kvm_vm_compat_ioctl,
2341 #endif 2340 #endif
2342 .mmap = kvm_vm_mmap, 2341 .mmap = kvm_vm_mmap,
2343 .llseek = noop_llseek, 2342 .llseek = noop_llseek,
2344 }; 2343 };
2345 2344
2346 static int kvm_dev_ioctl_create_vm(unsigned long type) 2345 static int kvm_dev_ioctl_create_vm(unsigned long type)
2347 { 2346 {
2348 int r; 2347 int r;
2349 struct kvm *kvm; 2348 struct kvm *kvm;
2350 2349
2351 kvm = kvm_create_vm(type); 2350 kvm = kvm_create_vm(type);
2352 if (IS_ERR(kvm)) 2351 if (IS_ERR(kvm))
2353 return PTR_ERR(kvm); 2352 return PTR_ERR(kvm);
2354 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2353 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2355 r = kvm_coalesced_mmio_init(kvm); 2354 r = kvm_coalesced_mmio_init(kvm);
2356 if (r < 0) { 2355 if (r < 0) {
2357 kvm_put_kvm(kvm); 2356 kvm_put_kvm(kvm);
2358 return r; 2357 return r;
2359 } 2358 }
2360 #endif 2359 #endif
2361 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 2360 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2362 if (r < 0) 2361 if (r < 0)
2363 kvm_put_kvm(kvm); 2362 kvm_put_kvm(kvm);
2364 2363
2365 return r; 2364 return r;
2366 } 2365 }
2367 2366
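kvm_dev_ioctl_create_vm() returns an anonymous-inode VM fd that carries its own reference, so the /dev/kvm fd can be closed afterwards. A hypothetical userspace sketch of the bring-up sequence handled by kvm_dev_ioctl() just below, checking the API version before creating the VM:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: open /dev/kvm, sanity-check the API
 * version, and return a fresh VM fd (or -1). */
static int open_vm(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        int vm_fd;

        if (kvm_fd < 0)
                return -1;
        if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION) {
                close(kvm_fd);
                return -1;
        }
        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);        /* type 0: default VM */
        close(kvm_fd);          /* the VM fd holds its own reference */
        return vm_fd;
}
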
2368 static long kvm_dev_ioctl_check_extension_generic(long arg) 2367 static long kvm_dev_ioctl_check_extension_generic(long arg)
2369 { 2368 {
2370 switch (arg) { 2369 switch (arg) {
2371 case KVM_CAP_USER_MEMORY: 2370 case KVM_CAP_USER_MEMORY:
2372 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2371 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2373 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2372 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2374 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2373 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2375 case KVM_CAP_SET_BOOT_CPU_ID: 2374 case KVM_CAP_SET_BOOT_CPU_ID:
2376 #endif 2375 #endif
2377 case KVM_CAP_INTERNAL_ERROR_DATA: 2376 case KVM_CAP_INTERNAL_ERROR_DATA:
2378 #ifdef CONFIG_HAVE_KVM_MSI 2377 #ifdef CONFIG_HAVE_KVM_MSI
2379 case KVM_CAP_SIGNAL_MSI: 2378 case KVM_CAP_SIGNAL_MSI:
2380 #endif 2379 #endif
2381 return 1; 2380 return 1;
2382 #ifdef KVM_CAP_IRQ_ROUTING 2381 #ifdef KVM_CAP_IRQ_ROUTING
2383 case KVM_CAP_IRQ_ROUTING: 2382 case KVM_CAP_IRQ_ROUTING:
2384 return KVM_MAX_IRQ_ROUTES; 2383 return KVM_MAX_IRQ_ROUTES;
2385 #endif 2384 #endif
2386 default: 2385 default:
2387 break; 2386 break;
2388 } 2387 }
2389 return kvm_dev_ioctl_check_extension(arg); 2388 return kvm_dev_ioctl_check_extension(arg);
2390 } 2389 }
2391 2390
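kvm_dev_ioctl_check_extension_generic() answers the generic capabilities itself and defers everything else to the architecture code; a return of 0 means the capability is absent, and some capabilities return extra data (KVM_CAP_IRQ_ROUTING reports the route limit). A hypothetical userspace sketch:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: treat any positive return as "present". */
static int has_cap(int kvm_fd, long cap)
{
        return ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap) > 0;
}
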
2392 static long kvm_dev_ioctl(struct file *filp, 2391 static long kvm_dev_ioctl(struct file *filp,
2393 unsigned int ioctl, unsigned long arg) 2392 unsigned int ioctl, unsigned long arg)
2394 { 2393 {
2395 long r = -EINVAL; 2394 long r = -EINVAL;
2396 2395
2397 switch (ioctl) { 2396 switch (ioctl) {
2398 case KVM_GET_API_VERSION: 2397 case KVM_GET_API_VERSION:
2399 r = -EINVAL; 2398 r = -EINVAL;
2400 if (arg) 2399 if (arg)
2401 goto out; 2400 goto out;
2402 r = KVM_API_VERSION; 2401 r = KVM_API_VERSION;
2403 break; 2402 break;
2404 case KVM_CREATE_VM: 2403 case KVM_CREATE_VM:
2405 r = kvm_dev_ioctl_create_vm(arg); 2404 r = kvm_dev_ioctl_create_vm(arg);
2406 break; 2405 break;
2407 case KVM_CHECK_EXTENSION: 2406 case KVM_CHECK_EXTENSION:
2408 r = kvm_dev_ioctl_check_extension_generic(arg); 2407 r = kvm_dev_ioctl_check_extension_generic(arg);
2409 break; 2408 break;
2410 case KVM_GET_VCPU_MMAP_SIZE: 2409 case KVM_GET_VCPU_MMAP_SIZE:
2411 r = -EINVAL; 2410 r = -EINVAL;
2412 if (arg) 2411 if (arg)
2413 goto out; 2412 goto out;
2414 r = PAGE_SIZE; /* struct kvm_run */ 2413 r = PAGE_SIZE; /* struct kvm_run */
2415 #ifdef CONFIG_X86 2414 #ifdef CONFIG_X86
2416 r += PAGE_SIZE; /* pio data page */ 2415 r += PAGE_SIZE; /* pio data page */
2417 #endif 2416 #endif
2418 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2417 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2419 r += PAGE_SIZE; /* coalesced mmio ring page */ 2418 r += PAGE_SIZE; /* coalesced mmio ring page */
2420 #endif 2419 #endif
2421 break; 2420 break;
2422 case KVM_TRACE_ENABLE: 2421 case KVM_TRACE_ENABLE:
2423 case KVM_TRACE_PAUSE: 2422 case KVM_TRACE_PAUSE:
2424 case KVM_TRACE_DISABLE: 2423 case KVM_TRACE_DISABLE:
2425 r = -EOPNOTSUPP; 2424 r = -EOPNOTSUPP;
2426 break; 2425 break;
2427 default: 2426 default:
2428 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2427 return kvm_arch_dev_ioctl(filp, ioctl, arg);
2429 } 2428 }
2430 out: 2429 out:
2431 return r; 2430 return r;
2432 } 2431 }
2433 2432
2434 static struct file_operations kvm_chardev_ops = { 2433 static struct file_operations kvm_chardev_ops = {
2435 .unlocked_ioctl = kvm_dev_ioctl, 2434 .unlocked_ioctl = kvm_dev_ioctl,
2436 .compat_ioctl = kvm_dev_ioctl, 2435 .compat_ioctl = kvm_dev_ioctl,
2437 .llseek = noop_llseek, 2436 .llseek = noop_llseek,
2438 }; 2437 };
2439 2438
2440 static struct miscdevice kvm_dev = { 2439 static struct miscdevice kvm_dev = {
2441 KVM_MINOR, 2440 KVM_MINOR,
2442 "kvm", 2441 "kvm",
2443 &kvm_chardev_ops, 2442 &kvm_chardev_ops,
2444 }; 2443 };
2445 2444
2446 static void hardware_enable_nolock(void *junk) 2445 static void hardware_enable_nolock(void *junk)
2447 { 2446 {
2448 int cpu = raw_smp_processor_id(); 2447 int cpu = raw_smp_processor_id();
2449 int r; 2448 int r;
2450 2449
2451 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2450 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2452 return; 2451 return;
2453 2452
2454 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2453 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2455 2454
2456 r = kvm_arch_hardware_enable(NULL); 2455 r = kvm_arch_hardware_enable(NULL);
2457 2456
2458 if (r) { 2457 if (r) {
2459 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2458 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2460 atomic_inc(&hardware_enable_failed); 2459 atomic_inc(&hardware_enable_failed);
2461 printk(KERN_INFO "kvm: enabling virtualization on " 2460 printk(KERN_INFO "kvm: enabling virtualization on "
2462 "CPU%d failed\n", cpu); 2461 "CPU%d failed\n", cpu);
2463 } 2462 }
2464 } 2463 }
2465 2464
2466 static void hardware_enable(void *junk) 2465 static void hardware_enable(void *junk)
2467 { 2466 {
2468 raw_spin_lock(&kvm_lock); 2467 raw_spin_lock(&kvm_lock);
2469 hardware_enable_nolock(junk); 2468 hardware_enable_nolock(junk);
2470 raw_spin_unlock(&kvm_lock); 2469 raw_spin_unlock(&kvm_lock);
2471 } 2470 }
2472 2471
2473 static void hardware_disable_nolock(void *junk) 2472 static void hardware_disable_nolock(void *junk)
2474 { 2473 {
2475 int cpu = raw_smp_processor_id(); 2474 int cpu = raw_smp_processor_id();
2476 2475
2477 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2476 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2478 return; 2477 return;
2479 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2478 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2480 kvm_arch_hardware_disable(NULL); 2479 kvm_arch_hardware_disable(NULL);
2481 } 2480 }
2482 2481
2483 static void hardware_disable(void *junk) 2482 static void hardware_disable(void *junk)
2484 { 2483 {
2485 raw_spin_lock(&kvm_lock); 2484 raw_spin_lock(&kvm_lock);
2486 hardware_disable_nolock(junk); 2485 hardware_disable_nolock(junk);
2487 raw_spin_unlock(&kvm_lock); 2486 raw_spin_unlock(&kvm_lock);
2488 } 2487 }
2489 2488
2490 static void hardware_disable_all_nolock(void) 2489 static void hardware_disable_all_nolock(void)
2491 { 2490 {
2492 BUG_ON(!kvm_usage_count); 2491 BUG_ON(!kvm_usage_count);
2493 2492
2494 kvm_usage_count--; 2493 kvm_usage_count--;
2495 if (!kvm_usage_count) 2494 if (!kvm_usage_count)
2496 on_each_cpu(hardware_disable_nolock, NULL, 1); 2495 on_each_cpu(hardware_disable_nolock, NULL, 1);
2497 } 2496 }
2498 2497
2499 static void hardware_disable_all(void) 2498 static void hardware_disable_all(void)
2500 { 2499 {
2501 raw_spin_lock(&kvm_lock); 2500 raw_spin_lock(&kvm_lock);
2502 hardware_disable_all_nolock(); 2501 hardware_disable_all_nolock();
2503 raw_spin_unlock(&kvm_lock); 2502 raw_spin_unlock(&kvm_lock);
2504 } 2503 }
2505 2504
2506 static int hardware_enable_all(void) 2505 static int hardware_enable_all(void)
2507 { 2506 {
2508 int r = 0; 2507 int r = 0;
2509 2508
2510 raw_spin_lock(&kvm_lock); 2509 raw_spin_lock(&kvm_lock);
2511 2510
2512 kvm_usage_count++; 2511 kvm_usage_count++;
2513 if (kvm_usage_count == 1) { 2512 if (kvm_usage_count == 1) {
2514 atomic_set(&hardware_enable_failed, 0); 2513 atomic_set(&hardware_enable_failed, 0);
2515 on_each_cpu(hardware_enable_nolock, NULL, 1); 2514 on_each_cpu(hardware_enable_nolock, NULL, 1);
2516 2515
2517 if (atomic_read(&hardware_enable_failed)) { 2516 if (atomic_read(&hardware_enable_failed)) {
2518 hardware_disable_all_nolock(); 2517 hardware_disable_all_nolock();
2519 r = -EBUSY; 2518 r = -EBUSY;
2520 } 2519 }
2521 } 2520 }
2522 2521
2523 raw_spin_unlock(&kvm_lock); 2522 raw_spin_unlock(&kvm_lock);
2524 2523
2525 return r; 2524 return r;
2526 } 2525 }
2527 2526
2528 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2527 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2529 void *v) 2528 void *v)
2530 { 2529 {
2531 int cpu = (long)v; 2530 int cpu = (long)v;
2532 2531
2533 if (!kvm_usage_count) 2532 if (!kvm_usage_count)
2534 return NOTIFY_OK; 2533 return NOTIFY_OK;
2535 2534
2536 val &= ~CPU_TASKS_FROZEN; 2535 val &= ~CPU_TASKS_FROZEN;
2537 switch (val) { 2536 switch (val) {
2538 case CPU_DYING: 2537 case CPU_DYING:
2539 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2538 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2540 cpu); 2539 cpu);
2541 hardware_disable(NULL); 2540 hardware_disable(NULL);
2542 break; 2541 break;
2543 case CPU_STARTING: 2542 case CPU_STARTING:
2544 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2543 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2545 cpu); 2544 cpu);
2546 hardware_enable(NULL); 2545 hardware_enable(NULL);
2547 break; 2546 break;
2548 } 2547 }
2549 return NOTIFY_OK; 2548 return NOTIFY_OK;
2550 } 2549 }
2551 2550
2552 2551
2553 asmlinkage void kvm_spurious_fault(void) 2552 asmlinkage void kvm_spurious_fault(void)
2554 { 2553 {
2555 /* Fault while not rebooting. We want the trace. */ 2554 /* Fault while not rebooting. We want the trace. */
2556 BUG(); 2555 BUG();
2557 } 2556 }
2558 EXPORT_SYMBOL_GPL(kvm_spurious_fault); 2557 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2559 2558
2560 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2559 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2561 void *v) 2560 void *v)
2562 { 2561 {
2563 /* 2562 /*
2564 * Some (well, at least mine) BIOSes hang on reboot if 2563 * Some (well, at least mine) BIOSes hang on reboot if
2565 * in vmx root mode. 2564 * in vmx root mode.
2566 * 2565 *
2567 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down. 2566 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down.
2568 */ 2567 */
2569 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2568 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2570 kvm_rebooting = true; 2569 kvm_rebooting = true;
2571 on_each_cpu(hardware_disable_nolock, NULL, 1); 2570 on_each_cpu(hardware_disable_nolock, NULL, 1);
2572 return NOTIFY_OK; 2571 return NOTIFY_OK;
2573 } 2572 }
2574 2573
2575 static struct notifier_block kvm_reboot_notifier = { 2574 static struct notifier_block kvm_reboot_notifier = {
2576 .notifier_call = kvm_reboot, 2575 .notifier_call = kvm_reboot,
2577 .priority = 0, 2576 .priority = 0,
2578 }; 2577 };
2579 2578
2580 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2579 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2581 { 2580 {
2582 int i; 2581 int i;
2583 2582
2584 for (i = 0; i < bus->dev_count; i++) { 2583 for (i = 0; i < bus->dev_count; i++) {
2585 struct kvm_io_device *pos = bus->range[i].dev; 2584 struct kvm_io_device *pos = bus->range[i].dev;
2586 2585
2587 kvm_iodevice_destructor(pos); 2586 kvm_iodevice_destructor(pos);
2588 } 2587 }
2589 kfree(bus); 2588 kfree(bus);
2590 } 2589 }
2591 2590
2592 int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 2591 int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2593 { 2592 {
2594 const struct kvm_io_range *r1 = p1; 2593 const struct kvm_io_range *r1 = p1;
2595 const struct kvm_io_range *r2 = p2; 2594 const struct kvm_io_range *r2 = p2;
2596 2595
2597 if (r1->addr < r2->addr) 2596 if (r1->addr < r2->addr)
2598 return -1; 2597 return -1;
2599 if (r1->addr + r1->len > r2->addr + r2->len) 2598 if (r1->addr + r1->len > r2->addr + r2->len)
2600 return 1; 2599 return 1;
2601 return 0; /* r1 is contained in (or equal to) r2 */ 2600 return 0; /* r1 is contained in (or equal to) r2 */
2602 } 2601 }
2603 2602
2604 int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, 2603 int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2605 gpa_t addr, int len) 2604 gpa_t addr, int len)
2606 { 2605 {
2607 bus->range[bus->dev_count++] = (struct kvm_io_range) { 2606 bus->range[bus->dev_count++] = (struct kvm_io_range) {
2608 .addr = addr, 2607 .addr = addr,
2609 .len = len, 2608 .len = len,
2610 .dev = dev, 2609 .dev = dev,
2611 }; 2610 };
2612 2611
2613 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), 2612 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2614 kvm_io_bus_sort_cmp, NULL); 2613 kvm_io_bus_sort_cmp, NULL);
2615 2614
2616 return 0; 2615 return 0;
2617 } 2616 }
2618 2617
2619 int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 2618 int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2620 gpa_t addr, int len) 2619 gpa_t addr, int len)
2621 { 2620 {
2622 struct kvm_io_range *range, key; 2621 struct kvm_io_range *range, key;
2623 int off; 2622 int off;
2624 2623
2625 key = (struct kvm_io_range) { 2624 key = (struct kvm_io_range) {
2626 .addr = addr, 2625 .addr = addr,
2627 .len = len, 2626 .len = len,
2628 }; 2627 };
2629 2628
2630 range = bsearch(&key, bus->range, bus->dev_count, 2629 range = bsearch(&key, bus->range, bus->dev_count,
2631 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 2630 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2632 if (range == NULL) 2631 if (range == NULL)
2633 return -ENOENT; 2632 return -ENOENT;
2634 2633
2635 off = range - bus->range; 2634 off = range - bus->range;
2636 2635
2637 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) 2636 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2638 off--; 2637 off--;
2639 2638
2640 return off; 2639 return off;
2641 } 2640 }
2642 2641
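kvm_io_bus_get_first_dev() relies on two details worth spelling out: the comparator above treats a key range that is contained in an entry as equal, and since several registered ranges can compare equal, the bsearch() hit is rewound to the first of them. A small standalone illustration of the same idiom, hypothetical plain userspace C with arbitrary sample ranges:

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long addr, len; };

/* Same shape as kvm_io_bus_sort_cmp(): 0 when r1 fits inside r2. */
static int range_cmp(const void *p1, const void *p2)
{
        const struct range *r1 = p1, *r2 = p2;

        if (r1->addr < r2->addr)
                return -1;
        if (r1->addr + r1->len > r2->addr + r2->len)
                return 1;
        return 0;
}

int main(void)
{
        struct range bus[] = {
                { 0x100, 4 }, { 0x200, 4 }, { 0x200, 4 }, { 0x300, 8 },
        };
        struct range key = { 0x200, 4 };
        struct range *hit = bsearch(&key, bus, sizeof(bus) / sizeof(bus[0]),
                                    sizeof(bus[0]), range_cmp);
        int off = hit ? (int)(hit - bus) : -1;

        while (off > 0 && range_cmp(&key, &bus[off - 1]) == 0)
                off--;  /* rewind to the first matching entry */
        printf("first match at index %d\n", off);
        return 0;
}
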
2643 /* kvm_io_bus_write - called under kvm->slots_lock */ 2642 /* kvm_io_bus_write - called under kvm->slots_lock */
2644 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2643 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2645 int len, const void *val) 2644 int len, const void *val)
2646 { 2645 {
2647 int idx; 2646 int idx;
2648 struct kvm_io_bus *bus; 2647 struct kvm_io_bus *bus;
2649 struct kvm_io_range range; 2648 struct kvm_io_range range;
2650 2649
2651 range = (struct kvm_io_range) { 2650 range = (struct kvm_io_range) {
2652 .addr = addr, 2651 .addr = addr,
2653 .len = len, 2652 .len = len,
2654 }; 2653 };
2655 2654
2656 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2655 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2657 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2656 idx = kvm_io_bus_get_first_dev(bus, addr, len);
2658 if (idx < 0) 2657 if (idx < 0)
2659 return -EOPNOTSUPP; 2658 return -EOPNOTSUPP;
2660 2659
2661 while (idx < bus->dev_count && 2660 while (idx < bus->dev_count &&
2662 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { 2661 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2663 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) 2662 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2664 return 0; 2663 return 0;
2665 idx++; 2664 idx++;
2666 } 2665 }
2667 2666
2668 return -EOPNOTSUPP; 2667 return -EOPNOTSUPP;
2669 } 2668 }
2670 2669
2671 /* kvm_io_bus_read - called under kvm->slots_lock */ 2670 /* kvm_io_bus_read - called under kvm->slots_lock */
2672 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2671 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2673 int len, void *val) 2672 int len, void *val)
2674 { 2673 {
2675 int idx; 2674 int idx;
2676 struct kvm_io_bus *bus; 2675 struct kvm_io_bus *bus;
2677 struct kvm_io_range range; 2676 struct kvm_io_range range;
2678 2677
2679 range = (struct kvm_io_range) { 2678 range = (struct kvm_io_range) {
2680 .addr = addr, 2679 .addr = addr,
2681 .len = len, 2680 .len = len,
2682 }; 2681 };
2683 2682
2684 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2683 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2685 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2684 idx = kvm_io_bus_get_first_dev(bus, addr, len);
2686 if (idx < 0) 2685 if (idx < 0)
2687 return -EOPNOTSUPP; 2686 return -EOPNOTSUPP;
2688 2687
2689 while (idx < bus->dev_count && 2688 while (idx < bus->dev_count &&
2690 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { 2689 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2691 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) 2690 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2692 return 0; 2691 return 0;
2693 idx++; 2692 idx++;
2694 } 2693 }
2695 2694
2696 return -EOPNOTSUPP; 2695 return -EOPNOTSUPP;
2697 } 2696 }
2698 2697
2699 /* Caller must hold slots_lock. */ 2698 /* Caller must hold slots_lock. */
2700 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2699 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2701 int len, struct kvm_io_device *dev) 2700 int len, struct kvm_io_device *dev)
2702 { 2701 {
2703 struct kvm_io_bus *new_bus, *bus; 2702 struct kvm_io_bus *new_bus, *bus;
2704 2703
2705 bus = kvm->buses[bus_idx]; 2704 bus = kvm->buses[bus_idx];
2706 if (bus->dev_count > NR_IOBUS_DEVS - 1) 2705 if (bus->dev_count > NR_IOBUS_DEVS - 1)
2707 return -ENOSPC; 2706 return -ENOSPC;
2708 2707
2709 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 2708 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
2710 sizeof(struct kvm_io_range)), GFP_KERNEL); 2709 sizeof(struct kvm_io_range)), GFP_KERNEL);
2711 if (!new_bus) 2710 if (!new_bus)
2712 return -ENOMEM; 2711 return -ENOMEM;
2713 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 2712 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
2714 sizeof(struct kvm_io_range))); 2713 sizeof(struct kvm_io_range)));
2715 kvm_io_bus_insert_dev(new_bus, dev, addr, len); 2714 kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2716 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2715 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2717 synchronize_srcu_expedited(&kvm->srcu); /* wait out readers of the old bus before freeing it */ 2716 synchronize_srcu_expedited(&kvm->srcu); /* wait out readers of the old bus before freeing it */
2718 kfree(bus); 2717 kfree(bus);
2719 2718
2720 return 0; 2719 return 0;
2721 } 2720 }
2722 2721
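kvm_io_bus_register_dev() is how devices such as ioeventfds and the coalesced-MMIO buffer end up on the bus walked by kvm_io_bus_write(): the bus array is copied, the new range inserted and sorted, the pointer swapped with rcu_assign_pointer(), and SRCU is synchronized before the old copy is freed. A hypothetical userspace sketch of one path that leads here, the KVM_IOEVENTFD ioctl from kvm_vm_ioctl() above; flags of 0 mean an MMIO range with no datamatch:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: returns the eventfd that will be signalled
 * on guest writes to [gpa, gpa + len), or -1 on error. */
static int add_mmio_eventfd(int vm_fd, uint64_t gpa, uint32_t len)
{
        struct kvm_ioeventfd io;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;
        memset(&io, 0, sizeof(io));
        io.addr = gpa;
        io.len = len;           /* 1, 2, 4 or 8 bytes */
        io.fd = efd;
        if (ioctl(vm_fd, KVM_IOEVENTFD, &io) < 0) {
                close(efd);
                return -1;
        }
        return efd;
}
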
2723 /* Caller must hold slots_lock. */ 2722 /* Caller must hold slots_lock. */
2724 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 2723 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2725 struct kvm_io_device *dev) 2724 struct kvm_io_device *dev)
2726 { 2725 {
2727 int i, r; 2726 int i, r;
2728 struct kvm_io_bus *new_bus, *bus; 2727 struct kvm_io_bus *new_bus, *bus;
2729 2728
2730 bus = kvm->buses[bus_idx]; 2729 bus = kvm->buses[bus_idx];
2731 r = -ENOENT; 2730 r = -ENOENT;
2732 for (i = 0; i < bus->dev_count; i++) 2731 for (i = 0; i < bus->dev_count; i++)
2733 if (bus->range[i].dev == dev) { 2732 if (bus->range[i].dev == dev) {
2734 r = 0; 2733 r = 0;
2735 break; 2734 break;
2736 } 2735 }
2737 2736
2738 if (r) 2737 if (r)
2739 return r; 2738 return r;
2740 2739
2741 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * 2740 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2742 sizeof(struct kvm_io_range)), GFP_KERNEL); 2741 sizeof(struct kvm_io_range)), GFP_KERNEL);
2743 if (!new_bus) 2742 if (!new_bus)
2744 return -ENOMEM; 2743 return -ENOMEM;
2745 2744
2746 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 2745 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
2747 new_bus->dev_count--; 2746 new_bus->dev_count--;
2748 memcpy(new_bus->range + i, bus->range + i + 1, 2747 memcpy(new_bus->range + i, bus->range + i + 1,
2749 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 2748 (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
2750 2749
2751 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2750 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2752 synchronize_srcu_expedited(&kvm->srcu); 2751 synchronize_srcu_expedited(&kvm->srcu);
2753 kfree(bus); 2752 kfree(bus);
2754 return r; 2753 return r;
2755 } 2754 }
2756 2755
2757 static struct notifier_block kvm_cpu_notifier = { 2756 static struct notifier_block kvm_cpu_notifier = {
2758 .notifier_call = kvm_cpu_hotplug, 2757 .notifier_call = kvm_cpu_hotplug,
2759 }; 2758 };
2760 2759
2761 static int vm_stat_get(void *_offset, u64 *val) 2760 static int vm_stat_get(void *_offset, u64 *val)
2762 { 2761 {
2763 unsigned offset = (long)_offset; 2762 unsigned offset = (long)_offset;
2764 struct kvm *kvm; 2763 struct kvm *kvm;
2765 2764
2766 *val = 0; 2765 *val = 0;
2767 raw_spin_lock(&kvm_lock); 2766 raw_spin_lock(&kvm_lock);
2768 list_for_each_entry(kvm, &vm_list, vm_list) 2767 list_for_each_entry(kvm, &vm_list, vm_list)
2769 *val += *(u32 *)((void *)kvm + offset); 2768 *val += *(u32 *)((void *)kvm + offset);
2770 raw_spin_unlock(&kvm_lock); 2769 raw_spin_unlock(&kvm_lock);
2771 return 0; 2770 return 0;
2772 } 2771 }
2773 2772
2774 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 2773 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2775 2774
2776 static int vcpu_stat_get(void *_offset, u64 *val) 2775 static int vcpu_stat_get(void *_offset, u64 *val)
2777 { 2776 {
2778 unsigned offset = (long)_offset; 2777 unsigned offset = (long)_offset;
2779 struct kvm *kvm; 2778 struct kvm *kvm;
2780 struct kvm_vcpu *vcpu; 2779 struct kvm_vcpu *vcpu;
2781 int i; 2780 int i;
2782 2781
2783 *val = 0; 2782 *val = 0;
2784 raw_spin_lock(&kvm_lock); 2783 raw_spin_lock(&kvm_lock);
2785 list_for_each_entry(kvm, &vm_list, vm_list) 2784 list_for_each_entry(kvm, &vm_list, vm_list)
2786 kvm_for_each_vcpu(i, vcpu, kvm) 2785 kvm_for_each_vcpu(i, vcpu, kvm)
2787 *val += *(u32 *)((void *)vcpu + offset); 2786 *val += *(u32 *)((void *)vcpu + offset);
2788 2787
2789 raw_spin_unlock(&kvm_lock); 2788 raw_spin_unlock(&kvm_lock);
2790 return 0; 2789 return 0;
2791 } 2790 }
2792 2791
2793 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 2792 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2794 2793
2795 static const struct file_operations *stat_fops[] = { 2794 static const struct file_operations *stat_fops[] = {
2796 [KVM_STAT_VCPU] = &vcpu_stat_fops, 2795 [KVM_STAT_VCPU] = &vcpu_stat_fops,
2797 [KVM_STAT_VM] = &vm_stat_fops, 2796 [KVM_STAT_VM] = &vm_stat_fops,
2798 }; 2797 };
2799 2798
2800 static int kvm_init_debug(void) 2799 static int kvm_init_debug(void)
2801 { 2800 {
2802 int r = -EFAULT; 2801 int r = -EFAULT;
2803 struct kvm_stats_debugfs_item *p; 2802 struct kvm_stats_debugfs_item *p;
2804 2803
2805 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2804 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2806 if (kvm_debugfs_dir == NULL) 2805 if (kvm_debugfs_dir == NULL)
2807 goto out; 2806 goto out;
2808 2807
2809 for (p = debugfs_entries; p->name; ++p) { 2808 for (p = debugfs_entries; p->name; ++p) {
2810 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2809 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2811 (void *)(long)p->offset, 2810 (void *)(long)p->offset,
2812 stat_fops[p->kind]); 2811 stat_fops[p->kind]);
2813 if (p->dentry == NULL) 2812 if (p->dentry == NULL)
2814 goto out_dir; 2813 goto out_dir;
2815 } 2814 }
2816 2815
2817 return 0; 2816 return 0;
2818 2817
2819 out_dir: 2818 out_dir:
2820 debugfs_remove_recursive(kvm_debugfs_dir); 2819 debugfs_remove_recursive(kvm_debugfs_dir);
2821 out: 2820 out:
2822 return r; 2821 return r;
2823 } 2822 }
2824 2823
2825 static void kvm_exit_debug(void) 2824 static void kvm_exit_debug(void)
2826 { 2825 {
2827 struct kvm_stats_debugfs_item *p; 2826 struct kvm_stats_debugfs_item *p;
2828 2827
2829 for (p = debugfs_entries; p->name; ++p) 2828 for (p = debugfs_entries; p->name; ++p)
2830 debugfs_remove(p->dentry); 2829 debugfs_remove(p->dentry);
2831 debugfs_remove(kvm_debugfs_dir); 2830 debugfs_remove(kvm_debugfs_dir);
2832 } 2831 }
2833 2832
2834 static int kvm_suspend(void) 2833 static int kvm_suspend(void)
2835 { 2834 {
2836 if (kvm_usage_count) 2835 if (kvm_usage_count)
2837 hardware_disable_nolock(NULL); 2836 hardware_disable_nolock(NULL);
2838 return 0; 2837 return 0;
2839 } 2838 }
2840 2839
2841 static void kvm_resume(void) 2840 static void kvm_resume(void)
2842 { 2841 {
2843 if (kvm_usage_count) { 2842 if (kvm_usage_count) {
2844 WARN_ON(raw_spin_is_locked(&kvm_lock)); 2843 WARN_ON(raw_spin_is_locked(&kvm_lock));
2845 hardware_enable_nolock(NULL); 2844 hardware_enable_nolock(NULL);
2846 } 2845 }
2847 } 2846 }
2848 2847
2849 static struct syscore_ops kvm_syscore_ops = { 2848 static struct syscore_ops kvm_syscore_ops = {
2850 .suspend = kvm_suspend, 2849 .suspend = kvm_suspend,
2851 .resume = kvm_resume, 2850 .resume = kvm_resume,
2852 }; 2851 };
2853 2852
2854 static inline 2853 static inline
2855 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2854 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2856 { 2855 {
2857 return container_of(pn, struct kvm_vcpu, preempt_notifier); 2856 return container_of(pn, struct kvm_vcpu, preempt_notifier);
2858 } 2857 }
2859 2858
2860 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 2859 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2861 { 2860 {
2862 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2861 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2863 2862
2864 kvm_arch_vcpu_load(vcpu, cpu); 2863 kvm_arch_vcpu_load(vcpu, cpu);
2865 } 2864 }
2866 2865
2867 static void kvm_sched_out(struct preempt_notifier *pn, 2866 static void kvm_sched_out(struct preempt_notifier *pn,
2868 struct task_struct *next) 2867 struct task_struct *next)
2869 { 2868 {
2870 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2869 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2871 2870
2872 kvm_arch_vcpu_put(vcpu); 2871 kvm_arch_vcpu_put(vcpu);
2873 } 2872 }
2874 2873
2875 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 2874 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2876 struct module *module) 2875 struct module *module)
2877 { 2876 {
2878 int r; 2877 int r;
2879 int cpu; 2878 int cpu;
2880 2879
2881 r = kvm_arch_init(opaque); 2880 r = kvm_arch_init(opaque);
2882 if (r) 2881 if (r)
2883 goto out_fail; 2882 goto out_fail;
2884 2883
2885 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2884 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2886 r = -ENOMEM; 2885 r = -ENOMEM;
2887 goto out_free_0; 2886 goto out_free_0;
2888 } 2887 }
2889 2888
2890 r = kvm_arch_hardware_setup(); 2889 r = kvm_arch_hardware_setup();
2891 if (r < 0) 2890 if (r < 0)
2892 goto out_free_0a; 2891 goto out_free_0a;
2893 2892
2894 for_each_online_cpu(cpu) { 2893 for_each_online_cpu(cpu) {
2895 smp_call_function_single(cpu, 2894 smp_call_function_single(cpu,
2896 kvm_arch_check_processor_compat, 2895 kvm_arch_check_processor_compat,
2897 &r, 1); 2896 &r, 1);
2898 if (r < 0) 2897 if (r < 0)
2899 goto out_free_1; 2898 goto out_free_1;
2900 } 2899 }
2901 2900
2902 r = register_cpu_notifier(&kvm_cpu_notifier); 2901 r = register_cpu_notifier(&kvm_cpu_notifier);
2903 if (r) 2902 if (r)
2904 goto out_free_2; 2903 goto out_free_2;
2905 register_reboot_notifier(&kvm_reboot_notifier); 2904 register_reboot_notifier(&kvm_reboot_notifier);
2906 2905
2907 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 2906 /* A kmem cache lets us meet the alignment requirements of fx_save. */
2908 if (!vcpu_align) 2907 if (!vcpu_align)
2909 vcpu_align = __alignof__(struct kvm_vcpu); 2908 vcpu_align = __alignof__(struct kvm_vcpu);
2910 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 2909 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2911 0, NULL); 2910 0, NULL);
2912 if (!kvm_vcpu_cache) { 2911 if (!kvm_vcpu_cache) {
2913 r = -ENOMEM; 2912 r = -ENOMEM;
2914 goto out_free_3; 2913 goto out_free_3;
2915 } 2914 }
2916 2915
2917 r = kvm_async_pf_init(); 2916 r = kvm_async_pf_init();
2918 if (r) 2917 if (r)
2919 goto out_free; 2918 goto out_free;
2920 2919
2921 kvm_chardev_ops.owner = module; 2920 kvm_chardev_ops.owner = module;
2922 kvm_vm_fops.owner = module; 2921 kvm_vm_fops.owner = module;
2923 kvm_vcpu_fops.owner = module; 2922 kvm_vcpu_fops.owner = module;
2924 2923
2925 r = misc_register(&kvm_dev); 2924 r = misc_register(&kvm_dev);
2926 if (r) { 2925 if (r) {
2927 printk(KERN_ERR "kvm: misc device register failed\n"); 2926 printk(KERN_ERR "kvm: misc device register failed\n");
2928 goto out_unreg; 2927 goto out_unreg;
2929 } 2928 }
2930 2929
2931 register_syscore_ops(&kvm_syscore_ops); 2930 register_syscore_ops(&kvm_syscore_ops);
2932 2931
2933 kvm_preempt_ops.sched_in = kvm_sched_in; 2932 kvm_preempt_ops.sched_in = kvm_sched_in;
2934 kvm_preempt_ops.sched_out = kvm_sched_out; 2933 kvm_preempt_ops.sched_out = kvm_sched_out;
2935 2934
2936 r = kvm_init_debug(); 2935 r = kvm_init_debug();
2937 if (r) { 2936 if (r) {
2938 printk(KERN_ERR "kvm: create debugfs files failed\n"); 2937 printk(KERN_ERR "kvm: create debugfs files failed\n");
2939 goto out_undebugfs; 2938 goto out_undebugfs;
2940 } 2939 }
2941 2940
2942 return 0; 2941 return 0;
2943 2942
2944 out_undebugfs: 2943 out_undebugfs:
2945 unregister_syscore_ops(&kvm_syscore_ops); 2944 unregister_syscore_ops(&kvm_syscore_ops);
2946 out_unreg: 2945 out_unreg:
2947 kvm_async_pf_deinit(); 2946 kvm_async_pf_deinit();
2948 out_free: 2947 out_free:
2949 kmem_cache_destroy(kvm_vcpu_cache); 2948 kmem_cache_destroy(kvm_vcpu_cache);
2950 out_free_3: 2949 out_free_3:
2951 unregister_reboot_notifier(&kvm_reboot_notifier); 2950 unregister_reboot_notifier(&kvm_reboot_notifier);
2952 unregister_cpu_notifier(&kvm_cpu_notifier); 2951 unregister_cpu_notifier(&kvm_cpu_notifier);
2953 out_free_2: 2952 out_free_2:
2954 out_free_1: 2953 out_free_1:
2955 kvm_arch_hardware_unsetup(); 2954 kvm_arch_hardware_unsetup();
2956 out_free_0a: 2955 out_free_0a:
2957 free_cpumask_var(cpus_hardware_enabled); 2956 free_cpumask_var(cpus_hardware_enabled);
2958 out_free_0: 2957 out_free_0:
2959 kvm_arch_exit(); 2958 kvm_arch_exit();
2960 out_fail: 2959 out_fail:
2961 return r; 2960 return r;
2962 } 2961 }
2963 EXPORT_SYMBOL_GPL(kvm_init); 2962 EXPORT_SYMBOL_GPL(kvm_init);
2964 2963
2965 void kvm_exit(void) 2964 void kvm_exit(void)
2966 { 2965 {
2967 kvm_exit_debug(); 2966 kvm_exit_debug();
2968 misc_deregister(&kvm_dev); 2967 misc_deregister(&kvm_dev);
2969 kmem_cache_destroy(kvm_vcpu_cache); 2968 kmem_cache_destroy(kvm_vcpu_cache);
2970 kvm_async_pf_deinit(); 2969 kvm_async_pf_deinit();
2971 unregister_syscore_ops(&kvm_syscore_ops); 2970 unregister_syscore_ops(&kvm_syscore_ops);
2972 unregister_reboot_notifier(&kvm_reboot_notifier); 2971 unregister_reboot_notifier(&kvm_reboot_notifier);
2973 unregister_cpu_notifier(&kvm_cpu_notifier); 2972 unregister_cpu_notifier(&kvm_cpu_notifier);
2974 on_each_cpu(hardware_disable_nolock, NULL, 1); 2973 on_each_cpu(hardware_disable_nolock, NULL, 1);
2975 kvm_arch_hardware_unsetup(); 2974 kvm_arch_hardware_unsetup();
2976 kvm_arch_exit(); 2975 kvm_arch_exit();
2977 free_cpumask_var(cpus_hardware_enabled); 2976 free_cpumask_var(cpus_hardware_enabled);
2978 } 2977 }
2979 EXPORT_SYMBOL_GPL(kvm_exit); 2978 EXPORT_SYMBOL_GPL(kvm_exit);
2980 2979