Commit c972f3b125d8818748429b94cd2e59f473943a33

Authored by Takuya Yoshikawa
Committed by Gleb Natapov
1 parent aa11e3a8a6

KVM: Write protect the updated slot only when dirty logging is enabled

Calling kvm_mmu_slot_remove_write_access() for a deleted slot does
nothing but search for non-existent mmu pages which have mappings to
that deleted memory; this is safe but a waste of time.

Since we want to make the function rmap based in a later patch, in a
manner which makes it unsafe to be called for a deleted slot, we makes
the caller see if the slot is non-zero and being dirty logged.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>

Showing 2 changed files with 7 additions and 2 deletions Inline Diff

1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
14 * Amit Shah <amit.shah@qumranet.com> 14 * Amit Shah <amit.shah@qumranet.com>
15 * Ben-Ami Yassour <benami@il.ibm.com> 15 * Ben-Ami Yassour <benami@il.ibm.com>
16 * 16 *
17 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * This work is licensed under the terms of the GNU GPL, version 2. See
18 * the COPYING file in the top-level directory. 18 * the COPYING file in the top-level directory.
19 * 19 *
20 */ 20 */
21 21
22 #include <linux/kvm_host.h> 22 #include <linux/kvm_host.h>
23 #include "irq.h" 23 #include "irq.h"
24 #include "mmu.h" 24 #include "mmu.h"
25 #include "i8254.h" 25 #include "i8254.h"
26 #include "tss.h" 26 #include "tss.h"
27 #include "kvm_cache_regs.h" 27 #include "kvm_cache_regs.h"
28 #include "x86.h" 28 #include "x86.h"
29 #include "cpuid.h" 29 #include "cpuid.h"
30 30
31 #include <linux/clocksource.h> 31 #include <linux/clocksource.h>
32 #include <linux/interrupt.h> 32 #include <linux/interrupt.h>
33 #include <linux/kvm.h> 33 #include <linux/kvm.h>
34 #include <linux/fs.h> 34 #include <linux/fs.h>
35 #include <linux/vmalloc.h> 35 #include <linux/vmalloc.h>
36 #include <linux/module.h> 36 #include <linux/module.h>
37 #include <linux/mman.h> 37 #include <linux/mman.h>
38 #include <linux/highmem.h> 38 #include <linux/highmem.h>
39 #include <linux/iommu.h> 39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h> 40 #include <linux/intel-iommu.h>
41 #include <linux/cpufreq.h> 41 #include <linux/cpufreq.h>
42 #include <linux/user-return-notifier.h> 42 #include <linux/user-return-notifier.h>
43 #include <linux/srcu.h> 43 #include <linux/srcu.h>
44 #include <linux/slab.h> 44 #include <linux/slab.h>
45 #include <linux/perf_event.h> 45 #include <linux/perf_event.h>
46 #include <linux/uaccess.h> 46 #include <linux/uaccess.h>
47 #include <linux/hash.h> 47 #include <linux/hash.h>
48 #include <linux/pci.h> 48 #include <linux/pci.h>
49 #include <linux/timekeeper_internal.h> 49 #include <linux/timekeeper_internal.h>
50 #include <linux/pvclock_gtod.h> 50 #include <linux/pvclock_gtod.h>
51 #include <trace/events/kvm.h> 51 #include <trace/events/kvm.h>
52 52
53 #define CREATE_TRACE_POINTS 53 #define CREATE_TRACE_POINTS
54 #include "trace.h" 54 #include "trace.h"
55 55
56 #include <asm/debugreg.h> 56 #include <asm/debugreg.h>
57 #include <asm/msr.h> 57 #include <asm/msr.h>
58 #include <asm/desc.h> 58 #include <asm/desc.h>
59 #include <asm/mtrr.h> 59 #include <asm/mtrr.h>
60 #include <asm/mce.h> 60 #include <asm/mce.h>
61 #include <asm/i387.h> 61 #include <asm/i387.h>
62 #include <asm/fpu-internal.h> /* Ugh! */ 62 #include <asm/fpu-internal.h> /* Ugh! */
63 #include <asm/xcr.h> 63 #include <asm/xcr.h>
64 #include <asm/pvclock.h> 64 #include <asm/pvclock.h>
65 #include <asm/div64.h> 65 #include <asm/div64.h>
66 66
67 #define MAX_IO_MSRS 256 67 #define MAX_IO_MSRS 256
68 #define KVM_MAX_MCE_BANKS 32 68 #define KVM_MAX_MCE_BANKS 32
69 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) 69 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
70 70
71 #define emul_to_vcpu(ctxt) \ 71 #define emul_to_vcpu(ctxt) \
72 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) 72 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
73 73
74 /* EFER defaults: 74 /* EFER defaults:
75 * - enable syscall per default because its emulated by KVM 75 * - enable syscall per default because its emulated by KVM
76 * - enable LME and LMA per default on 64 bit KVM 76 * - enable LME and LMA per default on 64 bit KVM
77 */ 77 */
78 #ifdef CONFIG_X86_64 78 #ifdef CONFIG_X86_64
79 static 79 static
80 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); 80 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
81 #else 81 #else
82 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); 82 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
83 #endif 83 #endif
84 84
85 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 85 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
86 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 86 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
87 87
88 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 88 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
89 static void process_nmi(struct kvm_vcpu *vcpu); 89 static void process_nmi(struct kvm_vcpu *vcpu);
90 90
91 struct kvm_x86_ops *kvm_x86_ops; 91 struct kvm_x86_ops *kvm_x86_ops;
92 EXPORT_SYMBOL_GPL(kvm_x86_ops); 92 EXPORT_SYMBOL_GPL(kvm_x86_ops);
93 93
94 static bool ignore_msrs = 0; 94 static bool ignore_msrs = 0;
95 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 95 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
96 96
97 bool kvm_has_tsc_control; 97 bool kvm_has_tsc_control;
98 EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 98 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
99 u32 kvm_max_guest_tsc_khz; 99 u32 kvm_max_guest_tsc_khz;
100 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 100 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
101 101
102 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 102 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
103 static u32 tsc_tolerance_ppm = 250; 103 static u32 tsc_tolerance_ppm = 250;
104 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 104 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
105 105
106 #define KVM_NR_SHARED_MSRS 16 106 #define KVM_NR_SHARED_MSRS 16
107 107
108 struct kvm_shared_msrs_global { 108 struct kvm_shared_msrs_global {
109 int nr; 109 int nr;
110 u32 msrs[KVM_NR_SHARED_MSRS]; 110 u32 msrs[KVM_NR_SHARED_MSRS];
111 }; 111 };
112 112
113 struct kvm_shared_msrs { 113 struct kvm_shared_msrs {
114 struct user_return_notifier urn; 114 struct user_return_notifier urn;
115 bool registered; 115 bool registered;
116 struct kvm_shared_msr_values { 116 struct kvm_shared_msr_values {
117 u64 host; 117 u64 host;
118 u64 curr; 118 u64 curr;
119 } values[KVM_NR_SHARED_MSRS]; 119 } values[KVM_NR_SHARED_MSRS];
120 }; 120 };
121 121
122 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 122 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
123 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); 123 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
124 124
125 struct kvm_stats_debugfs_item debugfs_entries[] = { 125 struct kvm_stats_debugfs_item debugfs_entries[] = {
126 { "pf_fixed", VCPU_STAT(pf_fixed) }, 126 { "pf_fixed", VCPU_STAT(pf_fixed) },
127 { "pf_guest", VCPU_STAT(pf_guest) }, 127 { "pf_guest", VCPU_STAT(pf_guest) },
128 { "tlb_flush", VCPU_STAT(tlb_flush) }, 128 { "tlb_flush", VCPU_STAT(tlb_flush) },
129 { "invlpg", VCPU_STAT(invlpg) }, 129 { "invlpg", VCPU_STAT(invlpg) },
130 { "exits", VCPU_STAT(exits) }, 130 { "exits", VCPU_STAT(exits) },
131 { "io_exits", VCPU_STAT(io_exits) }, 131 { "io_exits", VCPU_STAT(io_exits) },
132 { "mmio_exits", VCPU_STAT(mmio_exits) }, 132 { "mmio_exits", VCPU_STAT(mmio_exits) },
133 { "signal_exits", VCPU_STAT(signal_exits) }, 133 { "signal_exits", VCPU_STAT(signal_exits) },
134 { "irq_window", VCPU_STAT(irq_window_exits) }, 134 { "irq_window", VCPU_STAT(irq_window_exits) },
135 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 135 { "nmi_window", VCPU_STAT(nmi_window_exits) },
136 { "halt_exits", VCPU_STAT(halt_exits) }, 136 { "halt_exits", VCPU_STAT(halt_exits) },
137 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 137 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
138 { "hypercalls", VCPU_STAT(hypercalls) }, 138 { "hypercalls", VCPU_STAT(hypercalls) },
139 { "request_irq", VCPU_STAT(request_irq_exits) }, 139 { "request_irq", VCPU_STAT(request_irq_exits) },
140 { "irq_exits", VCPU_STAT(irq_exits) }, 140 { "irq_exits", VCPU_STAT(irq_exits) },
141 { "host_state_reload", VCPU_STAT(host_state_reload) }, 141 { "host_state_reload", VCPU_STAT(host_state_reload) },
142 { "efer_reload", VCPU_STAT(efer_reload) }, 142 { "efer_reload", VCPU_STAT(efer_reload) },
143 { "fpu_reload", VCPU_STAT(fpu_reload) }, 143 { "fpu_reload", VCPU_STAT(fpu_reload) },
144 { "insn_emulation", VCPU_STAT(insn_emulation) }, 144 { "insn_emulation", VCPU_STAT(insn_emulation) },
145 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 145 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
146 { "irq_injections", VCPU_STAT(irq_injections) }, 146 { "irq_injections", VCPU_STAT(irq_injections) },
147 { "nmi_injections", VCPU_STAT(nmi_injections) }, 147 { "nmi_injections", VCPU_STAT(nmi_injections) },
148 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 148 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
149 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 149 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
150 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 150 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
151 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 151 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
152 { "mmu_flooded", VM_STAT(mmu_flooded) }, 152 { "mmu_flooded", VM_STAT(mmu_flooded) },
153 { "mmu_recycled", VM_STAT(mmu_recycled) }, 153 { "mmu_recycled", VM_STAT(mmu_recycled) },
154 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 154 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
155 { "mmu_unsync", VM_STAT(mmu_unsync) }, 155 { "mmu_unsync", VM_STAT(mmu_unsync) },
156 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 156 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
157 { "largepages", VM_STAT(lpages) }, 157 { "largepages", VM_STAT(lpages) },
158 { NULL } 158 { NULL }
159 }; 159 };
160 160
161 u64 __read_mostly host_xcr0; 161 u64 __read_mostly host_xcr0;
162 162
163 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164 164
165 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); 165 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
166 166
167 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
168 { 168 {
169 int i; 169 int i;
170 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) 170 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
171 vcpu->arch.apf.gfns[i] = ~0; 171 vcpu->arch.apf.gfns[i] = ~0;
172 } 172 }
173 173
174 static void kvm_on_user_return(struct user_return_notifier *urn) 174 static void kvm_on_user_return(struct user_return_notifier *urn)
175 { 175 {
176 unsigned slot; 176 unsigned slot;
177 struct kvm_shared_msrs *locals 177 struct kvm_shared_msrs *locals
178 = container_of(urn, struct kvm_shared_msrs, urn); 178 = container_of(urn, struct kvm_shared_msrs, urn);
179 struct kvm_shared_msr_values *values; 179 struct kvm_shared_msr_values *values;
180 180
181 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 181 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
182 values = &locals->values[slot]; 182 values = &locals->values[slot];
183 if (values->host != values->curr) { 183 if (values->host != values->curr) {
184 wrmsrl(shared_msrs_global.msrs[slot], values->host); 184 wrmsrl(shared_msrs_global.msrs[slot], values->host);
185 values->curr = values->host; 185 values->curr = values->host;
186 } 186 }
187 } 187 }
188 locals->registered = false; 188 locals->registered = false;
189 user_return_notifier_unregister(urn); 189 user_return_notifier_unregister(urn);
190 } 190 }
191 191
192 static void shared_msr_update(unsigned slot, u32 msr) 192 static void shared_msr_update(unsigned slot, u32 msr)
193 { 193 {
194 struct kvm_shared_msrs *smsr; 194 struct kvm_shared_msrs *smsr;
195 u64 value; 195 u64 value;
196 196
197 smsr = &__get_cpu_var(shared_msrs); 197 smsr = &__get_cpu_var(shared_msrs);
198 /* only read, and nobody should modify it at this time, 198 /* only read, and nobody should modify it at this time,
199 * so don't need lock */ 199 * so don't need lock */
200 if (slot >= shared_msrs_global.nr) { 200 if (slot >= shared_msrs_global.nr) {
201 printk(KERN_ERR "kvm: invalid MSR slot!"); 201 printk(KERN_ERR "kvm: invalid MSR slot!");
202 return; 202 return;
203 } 203 }
204 rdmsrl_safe(msr, &value); 204 rdmsrl_safe(msr, &value);
205 smsr->values[slot].host = value; 205 smsr->values[slot].host = value;
206 smsr->values[slot].curr = value; 206 smsr->values[slot].curr = value;
207 } 207 }
208 208
209 void kvm_define_shared_msr(unsigned slot, u32 msr) 209 void kvm_define_shared_msr(unsigned slot, u32 msr)
210 { 210 {
211 if (slot >= shared_msrs_global.nr) 211 if (slot >= shared_msrs_global.nr)
212 shared_msrs_global.nr = slot + 1; 212 shared_msrs_global.nr = slot + 1;
213 shared_msrs_global.msrs[slot] = msr; 213 shared_msrs_global.msrs[slot] = msr;
214 /* we need ensured the shared_msr_global have been updated */ 214 /* we need ensured the shared_msr_global have been updated */
215 smp_wmb(); 215 smp_wmb();
216 } 216 }
217 EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 217 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
218 218
219 static void kvm_shared_msr_cpu_online(void) 219 static void kvm_shared_msr_cpu_online(void)
220 { 220 {
221 unsigned i; 221 unsigned i;
222 222
223 for (i = 0; i < shared_msrs_global.nr; ++i) 223 for (i = 0; i < shared_msrs_global.nr; ++i)
224 shared_msr_update(i, shared_msrs_global.msrs[i]); 224 shared_msr_update(i, shared_msrs_global.msrs[i]);
225 } 225 }
226 226
227 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 227 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
228 { 228 {
229 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 229 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
230 230
231 if (((value ^ smsr->values[slot].curr) & mask) == 0) 231 if (((value ^ smsr->values[slot].curr) & mask) == 0)
232 return; 232 return;
233 smsr->values[slot].curr = value; 233 smsr->values[slot].curr = value;
234 wrmsrl(shared_msrs_global.msrs[slot], value); 234 wrmsrl(shared_msrs_global.msrs[slot], value);
235 if (!smsr->registered) { 235 if (!smsr->registered) {
236 smsr->urn.on_user_return = kvm_on_user_return; 236 smsr->urn.on_user_return = kvm_on_user_return;
237 user_return_notifier_register(&smsr->urn); 237 user_return_notifier_register(&smsr->urn);
238 smsr->registered = true; 238 smsr->registered = true;
239 } 239 }
240 } 240 }
241 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 241 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
242 242
243 static void drop_user_return_notifiers(void *ignore) 243 static void drop_user_return_notifiers(void *ignore)
244 { 244 {
245 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 245 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
246 246
247 if (smsr->registered) 247 if (smsr->registered)
248 kvm_on_user_return(&smsr->urn); 248 kvm_on_user_return(&smsr->urn);
249 } 249 }
250 250
251 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 251 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
252 { 252 {
253 return vcpu->arch.apic_base; 253 return vcpu->arch.apic_base;
254 } 254 }
255 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 255 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
256 256
257 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 257 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
258 { 258 {
259 /* TODO: reserve bits check */ 259 /* TODO: reserve bits check */
260 kvm_lapic_set_base(vcpu, data); 260 kvm_lapic_set_base(vcpu, data);
261 } 261 }
262 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 262 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
263 263
264 #define EXCPT_BENIGN 0 264 #define EXCPT_BENIGN 0
265 #define EXCPT_CONTRIBUTORY 1 265 #define EXCPT_CONTRIBUTORY 1
266 #define EXCPT_PF 2 266 #define EXCPT_PF 2
267 267
268 static int exception_class(int vector) 268 static int exception_class(int vector)
269 { 269 {
270 switch (vector) { 270 switch (vector) {
271 case PF_VECTOR: 271 case PF_VECTOR:
272 return EXCPT_PF; 272 return EXCPT_PF;
273 case DE_VECTOR: 273 case DE_VECTOR:
274 case TS_VECTOR: 274 case TS_VECTOR:
275 case NP_VECTOR: 275 case NP_VECTOR:
276 case SS_VECTOR: 276 case SS_VECTOR:
277 case GP_VECTOR: 277 case GP_VECTOR:
278 return EXCPT_CONTRIBUTORY; 278 return EXCPT_CONTRIBUTORY;
279 default: 279 default:
280 break; 280 break;
281 } 281 }
282 return EXCPT_BENIGN; 282 return EXCPT_BENIGN;
283 } 283 }
284 284
285 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 285 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
286 unsigned nr, bool has_error, u32 error_code, 286 unsigned nr, bool has_error, u32 error_code,
287 bool reinject) 287 bool reinject)
288 { 288 {
289 u32 prev_nr; 289 u32 prev_nr;
290 int class1, class2; 290 int class1, class2;
291 291
292 kvm_make_request(KVM_REQ_EVENT, vcpu); 292 kvm_make_request(KVM_REQ_EVENT, vcpu);
293 293
294 if (!vcpu->arch.exception.pending) { 294 if (!vcpu->arch.exception.pending) {
295 queue: 295 queue:
296 vcpu->arch.exception.pending = true; 296 vcpu->arch.exception.pending = true;
297 vcpu->arch.exception.has_error_code = has_error; 297 vcpu->arch.exception.has_error_code = has_error;
298 vcpu->arch.exception.nr = nr; 298 vcpu->arch.exception.nr = nr;
299 vcpu->arch.exception.error_code = error_code; 299 vcpu->arch.exception.error_code = error_code;
300 vcpu->arch.exception.reinject = reinject; 300 vcpu->arch.exception.reinject = reinject;
301 return; 301 return;
302 } 302 }
303 303
304 /* to check exception */ 304 /* to check exception */
305 prev_nr = vcpu->arch.exception.nr; 305 prev_nr = vcpu->arch.exception.nr;
306 if (prev_nr == DF_VECTOR) { 306 if (prev_nr == DF_VECTOR) {
307 /* triple fault -> shutdown */ 307 /* triple fault -> shutdown */
308 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 308 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
309 return; 309 return;
310 } 310 }
311 class1 = exception_class(prev_nr); 311 class1 = exception_class(prev_nr);
312 class2 = exception_class(nr); 312 class2 = exception_class(nr);
313 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 313 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
314 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 314 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
315 /* generate double fault per SDM Table 5-5 */ 315 /* generate double fault per SDM Table 5-5 */
316 vcpu->arch.exception.pending = true; 316 vcpu->arch.exception.pending = true;
317 vcpu->arch.exception.has_error_code = true; 317 vcpu->arch.exception.has_error_code = true;
318 vcpu->arch.exception.nr = DF_VECTOR; 318 vcpu->arch.exception.nr = DF_VECTOR;
319 vcpu->arch.exception.error_code = 0; 319 vcpu->arch.exception.error_code = 0;
320 } else 320 } else
321 /* replace previous exception with a new one in a hope 321 /* replace previous exception with a new one in a hope
322 that instruction re-execution will regenerate lost 322 that instruction re-execution will regenerate lost
323 exception */ 323 exception */
324 goto queue; 324 goto queue;
325 } 325 }
326 326
327 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 327 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
328 { 328 {
329 kvm_multiple_exception(vcpu, nr, false, 0, false); 329 kvm_multiple_exception(vcpu, nr, false, 0, false);
330 } 330 }
331 EXPORT_SYMBOL_GPL(kvm_queue_exception); 331 EXPORT_SYMBOL_GPL(kvm_queue_exception);
332 332
333 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 333 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
334 { 334 {
335 kvm_multiple_exception(vcpu, nr, false, 0, true); 335 kvm_multiple_exception(vcpu, nr, false, 0, true);
336 } 336 }
337 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 337 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
338 338
339 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) 339 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
340 { 340 {
341 if (err) 341 if (err)
342 kvm_inject_gp(vcpu, 0); 342 kvm_inject_gp(vcpu, 0);
343 else 343 else
344 kvm_x86_ops->skip_emulated_instruction(vcpu); 344 kvm_x86_ops->skip_emulated_instruction(vcpu);
345 } 345 }
346 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); 346 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
347 347
348 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 348 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
349 { 349 {
350 ++vcpu->stat.pf_guest; 350 ++vcpu->stat.pf_guest;
351 vcpu->arch.cr2 = fault->address; 351 vcpu->arch.cr2 = fault->address;
352 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 352 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
353 } 353 }
354 EXPORT_SYMBOL_GPL(kvm_inject_page_fault); 354 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
355 355
356 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 356 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
357 { 357 {
358 if (mmu_is_nested(vcpu) && !fault->nested_page_fault) 358 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
359 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); 359 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
360 else 360 else
361 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 361 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
362 } 362 }
363 363
364 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 364 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
365 { 365 {
366 atomic_inc(&vcpu->arch.nmi_queued); 366 atomic_inc(&vcpu->arch.nmi_queued);
367 kvm_make_request(KVM_REQ_NMI, vcpu); 367 kvm_make_request(KVM_REQ_NMI, vcpu);
368 } 368 }
369 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 369 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
370 370
371 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 371 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
372 { 372 {
373 kvm_multiple_exception(vcpu, nr, true, error_code, false); 373 kvm_multiple_exception(vcpu, nr, true, error_code, false);
374 } 374 }
375 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 375 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
376 376
377 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 377 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
378 { 378 {
379 kvm_multiple_exception(vcpu, nr, true, error_code, true); 379 kvm_multiple_exception(vcpu, nr, true, error_code, true);
380 } 380 }
381 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); 381 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
382 382
383 /* 383 /*
384 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 384 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
385 * a #GP and return false. 385 * a #GP and return false.
386 */ 386 */
387 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 387 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
388 { 388 {
389 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) 389 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
390 return true; 390 return true;
391 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 391 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
392 return false; 392 return false;
393 } 393 }
394 EXPORT_SYMBOL_GPL(kvm_require_cpl); 394 EXPORT_SYMBOL_GPL(kvm_require_cpl);
395 395
396 /* 396 /*
397 * This function will be used to read from the physical memory of the currently 397 * This function will be used to read from the physical memory of the currently
398 * running guest. The difference to kvm_read_guest_page is that this function 398 * running guest. The difference to kvm_read_guest_page is that this function
399 * can read from guest physical or from the guest's guest physical memory. 399 * can read from guest physical or from the guest's guest physical memory.
400 */ 400 */
401 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 401 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
402 gfn_t ngfn, void *data, int offset, int len, 402 gfn_t ngfn, void *data, int offset, int len,
403 u32 access) 403 u32 access)
404 { 404 {
405 gfn_t real_gfn; 405 gfn_t real_gfn;
406 gpa_t ngpa; 406 gpa_t ngpa;
407 407
408 ngpa = gfn_to_gpa(ngfn); 408 ngpa = gfn_to_gpa(ngfn);
409 real_gfn = mmu->translate_gpa(vcpu, ngpa, access); 409 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
410 if (real_gfn == UNMAPPED_GVA) 410 if (real_gfn == UNMAPPED_GVA)
411 return -EFAULT; 411 return -EFAULT;
412 412
413 real_gfn = gpa_to_gfn(real_gfn); 413 real_gfn = gpa_to_gfn(real_gfn);
414 414
415 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); 415 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
416 } 416 }
417 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 417 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
418 418
419 int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 419 int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
420 void *data, int offset, int len, u32 access) 420 void *data, int offset, int len, u32 access)
421 { 421 {
422 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 422 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
423 data, offset, len, access); 423 data, offset, len, access);
424 } 424 }
425 425
426 /* 426 /*
427 * Load the pae pdptrs. Return true is they are all valid. 427 * Load the pae pdptrs. Return true is they are all valid.
428 */ 428 */
429 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 429 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
430 { 430 {
431 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 431 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
432 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 432 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
433 int i; 433 int i;
434 int ret; 434 int ret;
435 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; 435 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
436 436
437 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, 437 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
438 offset * sizeof(u64), sizeof(pdpte), 438 offset * sizeof(u64), sizeof(pdpte),
439 PFERR_USER_MASK|PFERR_WRITE_MASK); 439 PFERR_USER_MASK|PFERR_WRITE_MASK);
440 if (ret < 0) { 440 if (ret < 0) {
441 ret = 0; 441 ret = 0;
442 goto out; 442 goto out;
443 } 443 }
444 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 444 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
445 if (is_present_gpte(pdpte[i]) && 445 if (is_present_gpte(pdpte[i]) &&
446 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 446 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
447 ret = 0; 447 ret = 0;
448 goto out; 448 goto out;
449 } 449 }
450 } 450 }
451 ret = 1; 451 ret = 1;
452 452
453 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 453 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
454 __set_bit(VCPU_EXREG_PDPTR, 454 __set_bit(VCPU_EXREG_PDPTR,
455 (unsigned long *)&vcpu->arch.regs_avail); 455 (unsigned long *)&vcpu->arch.regs_avail);
456 __set_bit(VCPU_EXREG_PDPTR, 456 __set_bit(VCPU_EXREG_PDPTR,
457 (unsigned long *)&vcpu->arch.regs_dirty); 457 (unsigned long *)&vcpu->arch.regs_dirty);
458 out: 458 out:
459 459
460 return ret; 460 return ret;
461 } 461 }
462 EXPORT_SYMBOL_GPL(load_pdptrs); 462 EXPORT_SYMBOL_GPL(load_pdptrs);
463 463
464 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 464 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
465 { 465 {
466 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; 466 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
467 bool changed = true; 467 bool changed = true;
468 int offset; 468 int offset;
469 gfn_t gfn; 469 gfn_t gfn;
470 int r; 470 int r;
471 471
472 if (is_long_mode(vcpu) || !is_pae(vcpu)) 472 if (is_long_mode(vcpu) || !is_pae(vcpu))
473 return false; 473 return false;
474 474
475 if (!test_bit(VCPU_EXREG_PDPTR, 475 if (!test_bit(VCPU_EXREG_PDPTR,
476 (unsigned long *)&vcpu->arch.regs_avail)) 476 (unsigned long *)&vcpu->arch.regs_avail))
477 return true; 477 return true;
478 478
479 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; 479 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
480 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); 480 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
481 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 481 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
482 PFERR_USER_MASK | PFERR_WRITE_MASK); 482 PFERR_USER_MASK | PFERR_WRITE_MASK);
483 if (r < 0) 483 if (r < 0)
484 goto out; 484 goto out;
485 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; 485 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
486 out: 486 out:
487 487
488 return changed; 488 return changed;
489 } 489 }
490 490
491 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 491 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
492 { 492 {
493 unsigned long old_cr0 = kvm_read_cr0(vcpu); 493 unsigned long old_cr0 = kvm_read_cr0(vcpu);
494 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 494 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
495 X86_CR0_CD | X86_CR0_NW; 495 X86_CR0_CD | X86_CR0_NW;
496 496
497 cr0 |= X86_CR0_ET; 497 cr0 |= X86_CR0_ET;
498 498
499 #ifdef CONFIG_X86_64 499 #ifdef CONFIG_X86_64
500 if (cr0 & 0xffffffff00000000UL) 500 if (cr0 & 0xffffffff00000000UL)
501 return 1; 501 return 1;
502 #endif 502 #endif
503 503
504 cr0 &= ~CR0_RESERVED_BITS; 504 cr0 &= ~CR0_RESERVED_BITS;
505 505
506 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 506 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
507 return 1; 507 return 1;
508 508
509 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 509 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
510 return 1; 510 return 1;
511 511
512 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 512 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
513 #ifdef CONFIG_X86_64 513 #ifdef CONFIG_X86_64
514 if ((vcpu->arch.efer & EFER_LME)) { 514 if ((vcpu->arch.efer & EFER_LME)) {
515 int cs_db, cs_l; 515 int cs_db, cs_l;
516 516
517 if (!is_pae(vcpu)) 517 if (!is_pae(vcpu))
518 return 1; 518 return 1;
519 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 519 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
520 if (cs_l) 520 if (cs_l)
521 return 1; 521 return 1;
522 } else 522 } else
523 #endif 523 #endif
524 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 524 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
525 kvm_read_cr3(vcpu))) 525 kvm_read_cr3(vcpu)))
526 return 1; 526 return 1;
527 } 527 }
528 528
529 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) 529 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
530 return 1; 530 return 1;
531 531
532 kvm_x86_ops->set_cr0(vcpu, cr0); 532 kvm_x86_ops->set_cr0(vcpu, cr0);
533 533
534 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 534 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
535 kvm_clear_async_pf_completion_queue(vcpu); 535 kvm_clear_async_pf_completion_queue(vcpu);
536 kvm_async_pf_hash_reset(vcpu); 536 kvm_async_pf_hash_reset(vcpu);
537 } 537 }
538 538
539 if ((cr0 ^ old_cr0) & update_bits) 539 if ((cr0 ^ old_cr0) & update_bits)
540 kvm_mmu_reset_context(vcpu); 540 kvm_mmu_reset_context(vcpu);
541 return 0; 541 return 0;
542 } 542 }
543 EXPORT_SYMBOL_GPL(kvm_set_cr0); 543 EXPORT_SYMBOL_GPL(kvm_set_cr0);
544 544
545 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 545 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
546 { 546 {
547 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 547 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
548 } 548 }
549 EXPORT_SYMBOL_GPL(kvm_lmsw); 549 EXPORT_SYMBOL_GPL(kvm_lmsw);
550 550
551 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 551 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
552 { 552 {
553 u64 xcr0; 553 u64 xcr0;
554 554
555 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 555 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
556 if (index != XCR_XFEATURE_ENABLED_MASK) 556 if (index != XCR_XFEATURE_ENABLED_MASK)
557 return 1; 557 return 1;
558 xcr0 = xcr; 558 xcr0 = xcr;
559 if (kvm_x86_ops->get_cpl(vcpu) != 0) 559 if (kvm_x86_ops->get_cpl(vcpu) != 0)
560 return 1; 560 return 1;
561 if (!(xcr0 & XSTATE_FP)) 561 if (!(xcr0 & XSTATE_FP))
562 return 1; 562 return 1;
563 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 563 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
564 return 1; 564 return 1;
565 if (xcr0 & ~host_xcr0) 565 if (xcr0 & ~host_xcr0)
566 return 1; 566 return 1;
567 vcpu->arch.xcr0 = xcr0; 567 vcpu->arch.xcr0 = xcr0;
568 vcpu->guest_xcr0_loaded = 0; 568 vcpu->guest_xcr0_loaded = 0;
569 return 0; 569 return 0;
570 } 570 }
571 571
572 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 572 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
573 { 573 {
574 if (__kvm_set_xcr(vcpu, index, xcr)) { 574 if (__kvm_set_xcr(vcpu, index, xcr)) {
575 kvm_inject_gp(vcpu, 0); 575 kvm_inject_gp(vcpu, 0);
576 return 1; 576 return 1;
577 } 577 }
578 return 0; 578 return 0;
579 } 579 }
580 EXPORT_SYMBOL_GPL(kvm_set_xcr); 580 EXPORT_SYMBOL_GPL(kvm_set_xcr);
581 581
582 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 582 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
583 { 583 {
584 unsigned long old_cr4 = kvm_read_cr4(vcpu); 584 unsigned long old_cr4 = kvm_read_cr4(vcpu);
585 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | 585 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
586 X86_CR4_PAE | X86_CR4_SMEP; 586 X86_CR4_PAE | X86_CR4_SMEP;
587 if (cr4 & CR4_RESERVED_BITS) 587 if (cr4 & CR4_RESERVED_BITS)
588 return 1; 588 return 1;
589 589
590 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 590 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
591 return 1; 591 return 1;
592 592
593 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) 593 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
594 return 1; 594 return 1;
595 595
596 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) 596 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
597 return 1; 597 return 1;
598 598
599 if (is_long_mode(vcpu)) { 599 if (is_long_mode(vcpu)) {
600 if (!(cr4 & X86_CR4_PAE)) 600 if (!(cr4 & X86_CR4_PAE))
601 return 1; 601 return 1;
602 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 602 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
603 && ((cr4 ^ old_cr4) & pdptr_bits) 603 && ((cr4 ^ old_cr4) & pdptr_bits)
604 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 604 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
605 kvm_read_cr3(vcpu))) 605 kvm_read_cr3(vcpu)))
606 return 1; 606 return 1;
607 607
608 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { 608 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
609 if (!guest_cpuid_has_pcid(vcpu)) 609 if (!guest_cpuid_has_pcid(vcpu))
610 return 1; 610 return 1;
611 611
612 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ 612 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
613 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) 613 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
614 return 1; 614 return 1;
615 } 615 }
616 616
617 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 617 if (kvm_x86_ops->set_cr4(vcpu, cr4))
618 return 1; 618 return 1;
619 619
620 if (((cr4 ^ old_cr4) & pdptr_bits) || 620 if (((cr4 ^ old_cr4) & pdptr_bits) ||
621 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) 621 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
622 kvm_mmu_reset_context(vcpu); 622 kvm_mmu_reset_context(vcpu);
623 623
624 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 624 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
625 kvm_update_cpuid(vcpu); 625 kvm_update_cpuid(vcpu);
626 626
627 return 0; 627 return 0;
628 } 628 }
629 EXPORT_SYMBOL_GPL(kvm_set_cr4); 629 EXPORT_SYMBOL_GPL(kvm_set_cr4);
630 630
631 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 631 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
632 { 632 {
633 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 633 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
634 kvm_mmu_sync_roots(vcpu); 634 kvm_mmu_sync_roots(vcpu);
635 kvm_mmu_flush_tlb(vcpu); 635 kvm_mmu_flush_tlb(vcpu);
636 return 0; 636 return 0;
637 } 637 }
638 638
639 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
642 return 1; 642 return 1;
643 } else 643 } else
644 if (cr3 & CR3_L_MODE_RESERVED_BITS) 644 if (cr3 & CR3_L_MODE_RESERVED_BITS)
645 return 1; 645 return 1;
646 } else { 646 } else {
647 if (is_pae(vcpu)) { 647 if (is_pae(vcpu)) {
648 if (cr3 & CR3_PAE_RESERVED_BITS) 648 if (cr3 & CR3_PAE_RESERVED_BITS)
649 return 1; 649 return 1;
650 if (is_paging(vcpu) && 650 if (is_paging(vcpu) &&
651 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 651 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
652 return 1; 652 return 1;
653 } 653 }
654 /* 654 /*
655 * We don't check reserved bits in nonpae mode, because 655 * We don't check reserved bits in nonpae mode, because
656 * this isn't enforced, and VMware depends on this. 656 * this isn't enforced, and VMware depends on this.
657 */ 657 */
658 } 658 }
659 659
660 /* 660 /*
661 * Does the new cr3 value map to physical memory? (Note, we 661 * Does the new cr3 value map to physical memory? (Note, we
662 * catch an invalid cr3 even in real-mode, because it would 662 * catch an invalid cr3 even in real-mode, because it would
663 * cause trouble later on when we turn on paging anyway.) 663 * cause trouble later on when we turn on paging anyway.)
664 * 664 *
665 * A real CPU would silently accept an invalid cr3 and would 665 * A real CPU would silently accept an invalid cr3 and would
666 * attempt to use it - with largely undefined (and often hard 666 * attempt to use it - with largely undefined (and often hard
667 * to debug) behavior on the guest side. 667 * to debug) behavior on the guest side.
668 */ 668 */
669 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 669 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
670 return 1; 670 return 1;
671 vcpu->arch.cr3 = cr3; 671 vcpu->arch.cr3 = cr3;
672 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 672 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
673 vcpu->arch.mmu.new_cr3(vcpu); 673 vcpu->arch.mmu.new_cr3(vcpu);
674 return 0; 674 return 0;
675 } 675 }
676 EXPORT_SYMBOL_GPL(kvm_set_cr3); 676 EXPORT_SYMBOL_GPL(kvm_set_cr3);
677 677
678 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 678 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
679 { 679 {
680 if (cr8 & CR8_RESERVED_BITS) 680 if (cr8 & CR8_RESERVED_BITS)
681 return 1; 681 return 1;
682 if (irqchip_in_kernel(vcpu->kvm)) 682 if (irqchip_in_kernel(vcpu->kvm))
683 kvm_lapic_set_tpr(vcpu, cr8); 683 kvm_lapic_set_tpr(vcpu, cr8);
684 else 684 else
685 vcpu->arch.cr8 = cr8; 685 vcpu->arch.cr8 = cr8;
686 return 0; 686 return 0;
687 } 687 }
688 EXPORT_SYMBOL_GPL(kvm_set_cr8); 688 EXPORT_SYMBOL_GPL(kvm_set_cr8);
689 689
690 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 690 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
691 { 691 {
692 if (irqchip_in_kernel(vcpu->kvm)) 692 if (irqchip_in_kernel(vcpu->kvm))
693 return kvm_lapic_get_cr8(vcpu); 693 return kvm_lapic_get_cr8(vcpu);
694 else 694 else
695 return vcpu->arch.cr8; 695 return vcpu->arch.cr8;
696 } 696 }
697 EXPORT_SYMBOL_GPL(kvm_get_cr8); 697 EXPORT_SYMBOL_GPL(kvm_get_cr8);
698 698
699 static void kvm_update_dr7(struct kvm_vcpu *vcpu) 699 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
700 { 700 {
701 unsigned long dr7; 701 unsigned long dr7;
702 702
703 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 703 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
704 dr7 = vcpu->arch.guest_debug_dr7; 704 dr7 = vcpu->arch.guest_debug_dr7;
705 else 705 else
706 dr7 = vcpu->arch.dr7; 706 dr7 = vcpu->arch.dr7;
707 kvm_x86_ops->set_dr7(vcpu, dr7); 707 kvm_x86_ops->set_dr7(vcpu, dr7);
708 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); 708 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
709 } 709 }
710 710
711 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 711 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
712 { 712 {
713 switch (dr) { 713 switch (dr) {
714 case 0 ... 3: 714 case 0 ... 3:
715 vcpu->arch.db[dr] = val; 715 vcpu->arch.db[dr] = val;
716 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 716 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
717 vcpu->arch.eff_db[dr] = val; 717 vcpu->arch.eff_db[dr] = val;
718 break; 718 break;
719 case 4: 719 case 4:
720 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 720 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
721 return 1; /* #UD */ 721 return 1; /* #UD */
722 /* fall through */ 722 /* fall through */
723 case 6: 723 case 6:
724 if (val & 0xffffffff00000000ULL) 724 if (val & 0xffffffff00000000ULL)
725 return -1; /* #GP */ 725 return -1; /* #GP */
726 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 726 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
727 break; 727 break;
728 case 5: 728 case 5:
729 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 729 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
730 return 1; /* #UD */ 730 return 1; /* #UD */
731 /* fall through */ 731 /* fall through */
732 default: /* 7 */ 732 default: /* 7 */
733 if (val & 0xffffffff00000000ULL) 733 if (val & 0xffffffff00000000ULL)
734 return -1; /* #GP */ 734 return -1; /* #GP */
735 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 735 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
736 kvm_update_dr7(vcpu); 736 kvm_update_dr7(vcpu);
737 break; 737 break;
738 } 738 }
739 739
740 return 0; 740 return 0;
741 } 741 }
742 742
743 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 743 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
744 { 744 {
745 int res; 745 int res;
746 746
747 res = __kvm_set_dr(vcpu, dr, val); 747 res = __kvm_set_dr(vcpu, dr, val);
748 if (res > 0) 748 if (res > 0)
749 kvm_queue_exception(vcpu, UD_VECTOR); 749 kvm_queue_exception(vcpu, UD_VECTOR);
750 else if (res < 0) 750 else if (res < 0)
751 kvm_inject_gp(vcpu, 0); 751 kvm_inject_gp(vcpu, 0);
752 752
753 return res; 753 return res;
754 } 754 }
755 EXPORT_SYMBOL_GPL(kvm_set_dr); 755 EXPORT_SYMBOL_GPL(kvm_set_dr);
756 756
757 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 757 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
758 { 758 {
759 switch (dr) { 759 switch (dr) {
760 case 0 ... 3: 760 case 0 ... 3:
761 *val = vcpu->arch.db[dr]; 761 *val = vcpu->arch.db[dr];
762 break; 762 break;
763 case 4: 763 case 4:
764 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 764 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
765 return 1; 765 return 1;
766 /* fall through */ 766 /* fall through */
767 case 6: 767 case 6:
768 *val = vcpu->arch.dr6; 768 *val = vcpu->arch.dr6;
769 break; 769 break;
770 case 5: 770 case 5:
771 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 771 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
772 return 1; 772 return 1;
773 /* fall through */ 773 /* fall through */
774 default: /* 7 */ 774 default: /* 7 */
775 *val = vcpu->arch.dr7; 775 *val = vcpu->arch.dr7;
776 break; 776 break;
777 } 777 }
778 778
779 return 0; 779 return 0;
780 } 780 }
781 781
782 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 782 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
783 { 783 {
784 if (_kvm_get_dr(vcpu, dr, val)) { 784 if (_kvm_get_dr(vcpu, dr, val)) {
785 kvm_queue_exception(vcpu, UD_VECTOR); 785 kvm_queue_exception(vcpu, UD_VECTOR);
786 return 1; 786 return 1;
787 } 787 }
788 return 0; 788 return 0;
789 } 789 }
790 EXPORT_SYMBOL_GPL(kvm_get_dr); 790 EXPORT_SYMBOL_GPL(kvm_get_dr);
791 791
792 bool kvm_rdpmc(struct kvm_vcpu *vcpu) 792 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
793 { 793 {
794 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 794 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
795 u64 data; 795 u64 data;
796 int err; 796 int err;
797 797
798 err = kvm_pmu_read_pmc(vcpu, ecx, &data); 798 err = kvm_pmu_read_pmc(vcpu, ecx, &data);
799 if (err) 799 if (err)
800 return err; 800 return err;
801 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); 801 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
802 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); 802 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
803 return err; 803 return err;
804 } 804 }
805 EXPORT_SYMBOL_GPL(kvm_rdpmc); 805 EXPORT_SYMBOL_GPL(kvm_rdpmc);
806 806
807 /* 807 /*
808 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 808 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
809 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 809 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
810 * 810 *
811 * This list is modified at module load time to reflect the 811 * This list is modified at module load time to reflect the
812 * capabilities of the host cpu. This capabilities test skips MSRs that are 812 * capabilities of the host cpu. This capabilities test skips MSRs that are
813 * kvm-specific. Those are put in the beginning of the list. 813 * kvm-specific. Those are put in the beginning of the list.
814 */ 814 */
815 815
816 #define KVM_SAVE_MSRS_BEGIN 10 816 #define KVM_SAVE_MSRS_BEGIN 10
817 static u32 msrs_to_save[] = { 817 static u32 msrs_to_save[] = {
818 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 818 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
819 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 819 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
820 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 820 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
821 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 821 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
822 MSR_KVM_PV_EOI_EN, 822 MSR_KVM_PV_EOI_EN,
823 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 823 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
824 MSR_STAR, 824 MSR_STAR,
825 #ifdef CONFIG_X86_64 825 #ifdef CONFIG_X86_64
826 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 826 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
827 #endif 827 #endif
828 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 828 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
829 }; 829 };
830 830
831 static unsigned num_msrs_to_save; 831 static unsigned num_msrs_to_save;
832 832
833 static const u32 emulated_msrs[] = { 833 static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST, 834 MSR_IA32_TSC_ADJUST,
835 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
836 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
837 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
838 MSR_IA32_MCG_CTL, 838 MSR_IA32_MCG_CTL,
839 }; 839 };
840 840
841 static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 841 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
842 { 842 {
843 u64 old_efer = vcpu->arch.efer; 843 u64 old_efer = vcpu->arch.efer;
844 844
845 if (efer & efer_reserved_bits) 845 if (efer & efer_reserved_bits)
846 return 1; 846 return 1;
847 847
848 if (is_paging(vcpu) 848 if (is_paging(vcpu)
849 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 849 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
850 return 1; 850 return 1;
851 851
852 if (efer & EFER_FFXSR) { 852 if (efer & EFER_FFXSR) {
853 struct kvm_cpuid_entry2 *feat; 853 struct kvm_cpuid_entry2 *feat;
854 854
855 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 855 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
856 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 856 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
857 return 1; 857 return 1;
858 } 858 }
859 859
860 if (efer & EFER_SVME) { 860 if (efer & EFER_SVME) {
861 struct kvm_cpuid_entry2 *feat; 861 struct kvm_cpuid_entry2 *feat;
862 862
863 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 863 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
864 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 864 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
865 return 1; 865 return 1;
866 } 866 }
867 867
868 efer &= ~EFER_LMA; 868 efer &= ~EFER_LMA;
869 efer |= vcpu->arch.efer & EFER_LMA; 869 efer |= vcpu->arch.efer & EFER_LMA;
870 870
871 kvm_x86_ops->set_efer(vcpu, efer); 871 kvm_x86_ops->set_efer(vcpu, efer);
872 872
873 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 873 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
874 874
875 /* Update reserved bits */ 875 /* Update reserved bits */
876 if ((efer ^ old_efer) & EFER_NX) 876 if ((efer ^ old_efer) & EFER_NX)
877 kvm_mmu_reset_context(vcpu); 877 kvm_mmu_reset_context(vcpu);
878 878
879 return 0; 879 return 0;
880 } 880 }
881 881
882 void kvm_enable_efer_bits(u64 mask) 882 void kvm_enable_efer_bits(u64 mask)
883 { 883 {
884 efer_reserved_bits &= ~mask; 884 efer_reserved_bits &= ~mask;
885 } 885 }
886 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 886 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
887 887
888 888
889 /* 889 /*
890 * Writes msr value into into the appropriate "register". 890 * Writes msr value into into the appropriate "register".
891 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
892 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
893 */ 893 */
894 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 894 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
895 { 895 {
896 return kvm_x86_ops->set_msr(vcpu, msr); 896 return kvm_x86_ops->set_msr(vcpu, msr);
897 } 897 }
898 898
899 /* 899 /*
900 * Adapt set_msr() to msr_io()'s calling convention 900 * Adapt set_msr() to msr_io()'s calling convention
901 */ 901 */
902 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
903 { 903 {
904 struct msr_data msr; 904 struct msr_data msr;
905 905
906 msr.data = *data; 906 msr.data = *data;
907 msr.index = index; 907 msr.index = index;
908 msr.host_initiated = true; 908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr); 909 return kvm_set_msr(vcpu, &msr);
910 } 910 }
911 911
912 #ifdef CONFIG_X86_64 912 #ifdef CONFIG_X86_64
913 struct pvclock_gtod_data { 913 struct pvclock_gtod_data {
914 seqcount_t seq; 914 seqcount_t seq;
915 915
916 struct { /* extract of a clocksource struct */ 916 struct { /* extract of a clocksource struct */
917 int vclock_mode; 917 int vclock_mode;
918 cycle_t cycle_last; 918 cycle_t cycle_last;
919 cycle_t mask; 919 cycle_t mask;
920 u32 mult; 920 u32 mult;
921 u32 shift; 921 u32 shift;
922 } clock; 922 } clock;
923 923
924 /* open coded 'struct timespec' */ 924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec; 925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec; 926 time_t monotonic_time_sec;
927 }; 927 };
928 928
929 static struct pvclock_gtod_data pvclock_gtod_data; 929 static struct pvclock_gtod_data pvclock_gtod_data;
930 930
931 static void update_pvclock_gtod(struct timekeeper *tk) 931 static void update_pvclock_gtod(struct timekeeper *tk)
932 { 932 {
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934 934
935 write_seqcount_begin(&vdata->seq); 935 write_seqcount_begin(&vdata->seq);
936 936
937 /* copy pvclock gtod data */ 937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; 938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last; 939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask; 940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult; 941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift; 942 vdata->clock.shift = tk->shift;
943 943
944 vdata->monotonic_time_sec = tk->xtime_sec 944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec; 945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec 946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec 947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift); 948 << tk->shift);
949 while (vdata->monotonic_time_snsec >= 949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) { 950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -= 951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift; 952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++; 953 vdata->monotonic_time_sec++;
954 } 954 }
955 955
956 write_seqcount_end(&vdata->seq); 956 write_seqcount_end(&vdata->seq);
957 } 957 }
958 #endif 958 #endif
959 959
960 960
961 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
962 { 962 {
963 int version; 963 int version;
964 int r; 964 int r;
965 struct pvclock_wall_clock wc; 965 struct pvclock_wall_clock wc;
966 struct timespec boot; 966 struct timespec boot;
967 967
968 if (!wall_clock) 968 if (!wall_clock)
969 return; 969 return;
970 970
971 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 971 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
972 if (r) 972 if (r)
973 return; 973 return;
974 974
975 if (version & 1) 975 if (version & 1)
976 ++version; /* first time write, random junk */ 976 ++version; /* first time write, random junk */
977 977
978 ++version; 978 ++version;
979 979
980 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 980 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
981 981
982 /* 982 /*
983 * The guest calculates current wall clock time by adding 983 * The guest calculates current wall clock time by adding
984 * system time (updated by kvm_guest_time_update below) to the 984 * system time (updated by kvm_guest_time_update below) to the
985 * wall clock specified here. guest system time equals host 985 * wall clock specified here. guest system time equals host
986 * system time for us, thus we must fill in host boot time here. 986 * system time for us, thus we must fill in host boot time here.
987 */ 987 */
988 getboottime(&boot); 988 getboottime(&boot);
989 989
990 if (kvm->arch.kvmclock_offset) { 990 if (kvm->arch.kvmclock_offset) {
991 struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); 991 struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
992 boot = timespec_sub(boot, ts); 992 boot = timespec_sub(boot, ts);
993 } 993 }
994 wc.sec = boot.tv_sec; 994 wc.sec = boot.tv_sec;
995 wc.nsec = boot.tv_nsec; 995 wc.nsec = boot.tv_nsec;
996 wc.version = version; 996 wc.version = version;
997 997
998 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 998 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
999 999
1000 version++; 1000 version++;
1001 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 1001 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1002 } 1002 }
1003 1003
1004 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 1004 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1005 { 1005 {
1006 uint32_t quotient, remainder; 1006 uint32_t quotient, remainder;
1007 1007
1008 /* Don't try to replace with do_div(), this one calculates 1008 /* Don't try to replace with do_div(), this one calculates
1009 * "(dividend << 32) / divisor" */ 1009 * "(dividend << 32) / divisor" */
1010 __asm__ ( "divl %4" 1010 __asm__ ( "divl %4"
1011 : "=a" (quotient), "=d" (remainder) 1011 : "=a" (quotient), "=d" (remainder)
1012 : "0" (0), "1" (dividend), "r" (divisor) ); 1012 : "0" (0), "1" (dividend), "r" (divisor) );
1013 return quotient; 1013 return quotient;
1014 } 1014 }
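
The inline divl above computes the 32.32 fixed-point fraction "(dividend << 32) / divisor" with the dividend loaded into EDX and zero in EAX. A portable C sketch of the same operation (div_frac_portable is a made-up name; the caller must guarantee a nonzero divisor and a quotient that fits in 32 bits):

#include <stdint.h>

static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
{
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}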
1015 1015
1016 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, 1016 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1017 s8 *pshift, u32 *pmultiplier) 1017 s8 *pshift, u32 *pmultiplier)
1018 { 1018 {
1019 uint64_t scaled64; 1019 uint64_t scaled64;
1020 int32_t shift = 0; 1020 int32_t shift = 0;
1021 uint64_t tps64; 1021 uint64_t tps64;
1022 uint32_t tps32; 1022 uint32_t tps32;
1023 1023
1024 tps64 = base_khz * 1000LL; 1024 tps64 = base_khz * 1000LL;
1025 scaled64 = scaled_khz * 1000LL; 1025 scaled64 = scaled_khz * 1000LL;
1026 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { 1026 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1027 tps64 >>= 1; 1027 tps64 >>= 1;
1028 shift--; 1028 shift--;
1029 } 1029 }
1030 1030
1031 tps32 = (uint32_t)tps64; 1031 tps32 = (uint32_t)tps64;
1032 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { 1032 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1033 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) 1033 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1034 scaled64 >>= 1; 1034 scaled64 >>= 1;
1035 else 1035 else
1036 tps32 <<= 1; 1036 tps32 <<= 1;
1037 shift++; 1037 shift++;
1038 } 1038 }
1039 1039
1040 *pshift = shift; 1040 *pshift = shift;
1041 *pmultiplier = div_frac(scaled64, tps32); 1041 *pmultiplier = div_frac(scaled64, tps32);
1042 1042
1043 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", 1043 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1044 __func__, base_khz, scaled_khz, shift, *pmultiplier); 1044 __func__, base_khz, scaled_khz, shift, *pmultiplier);
1045 } 1045 }
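
kvm_get_time_scale() produces a (shift, multiplier) pair meant to be consumed in the pvclock style: shift the delta, then multiply by the 32.32 fixed-point multiplier and keep the upper 64 bits of the product. A rough sketch of that consumption follows; scale_delta() is a hand-written stand-in that mirrors pvclock_scale_delta() conceptually rather than reproducing the kernel implementation, and it relies on the unsigned __int128 GCC/Clang extension.

#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* high part of a 64x32 multiply: (delta * mul_frac) >> 32 */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}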
1046 1046
1047 static inline u64 get_kernel_ns(void) 1047 static inline u64 get_kernel_ns(void)
1048 { 1048 {
1049 struct timespec ts; 1049 struct timespec ts;
1050 1050
1051 WARN_ON(preemptible()); 1051 WARN_ON(preemptible());
1052 ktime_get_ts(&ts); 1052 ktime_get_ts(&ts);
1053 monotonic_to_bootbased(&ts); 1053 monotonic_to_bootbased(&ts);
1054 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
1055 } 1055 }
1056 1056
1057 #ifdef CONFIG_X86_64 1057 #ifdef CONFIG_X86_64
1058 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); 1058 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059 #endif 1059 #endif
1060 1060
1061 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1062 unsigned long max_tsc_khz; 1062 unsigned long max_tsc_khz;
1063 1063
1064 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 1064 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1065 { 1065 {
1066 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, 1066 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
1067 vcpu->arch.virtual_tsc_shift); 1067 vcpu->arch.virtual_tsc_shift);
1068 } 1068 }
1069 1069
1070 static u32 adjust_tsc_khz(u32 khz, s32 ppm) 1070 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1071 { 1071 {
1072 u64 v = (u64)khz * (1000000 + ppm); 1072 u64 v = (u64)khz * (1000000 + ppm);
1073 do_div(v, 1000000); 1073 do_div(v, 1000000);
1074 return v; 1074 return v;
1075 } 1075 }
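
adjust_tsc_khz() applies a parts-per-million correction, khz * (1000000 + ppm) / 1000000, which kvm_set_tsc_khz() uses to build the tolerance window around the host rate. A worked example; adjust_khz() and the 2.6 GHz host rate are made up for illustration.

#include <stdint.h>
#include <stdio.h>

static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
{
        return (uint32_t)(((uint64_t)khz * (uint64_t)(1000000 + ppm)) / 1000000);
}

int main(void)
{
        /* 2,600,000 kHz +/- 250 ppm => [2599350, 2600650] kHz */
        printf("[%u, %u]\n", adjust_khz(2600000, -250), adjust_khz(2600000, 250));
        return 0;
}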
1076 1076
1077 static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) 1077 static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1078 { 1078 {
1079 u32 thresh_lo, thresh_hi; 1079 u32 thresh_lo, thresh_hi;
1080 int use_scaling = 0; 1080 int use_scaling = 0;
1081 1081
1082 /* Compute a scale to convert nanoseconds in TSC cycles */ 1082 /* Compute a scale to convert nanoseconds in TSC cycles */
1083 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1083 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1084 &vcpu->arch.virtual_tsc_shift, 1084 &vcpu->arch.virtual_tsc_shift,
1085 &vcpu->arch.virtual_tsc_mult); 1085 &vcpu->arch.virtual_tsc_mult);
1086 vcpu->arch.virtual_tsc_khz = this_tsc_khz; 1086 vcpu->arch.virtual_tsc_khz = this_tsc_khz;
1087 1087
1088 /* 1088 /*
1089 * Compute the variation in TSC rate which is acceptable 1089 * Compute the variation in TSC rate which is acceptable
1090 * within the range of tolerance and decide if the 1090 * within the range of tolerance and decide if the
1091 * rate being applied is within the bounds of the hardware 1091 * rate being applied is within the bounds of the hardware
1092 * rate. If so, no scaling or compensation need be done. 1092 * rate. If so, no scaling or compensation need be done.
1093 */ 1093 */
1094 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); 1094 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1095 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); 1095 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1096 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { 1096 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1097 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); 1097 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1098 use_scaling = 1; 1098 use_scaling = 1;
1099 } 1099 }
1100 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); 1100 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1101 } 1101 }
1102 1102
1103 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1103 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1104 { 1104 {
1105 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, 1105 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1106 vcpu->arch.virtual_tsc_mult, 1106 vcpu->arch.virtual_tsc_mult,
1107 vcpu->arch.virtual_tsc_shift); 1107 vcpu->arch.virtual_tsc_shift);
1108 tsc += vcpu->arch.this_tsc_write; 1108 tsc += vcpu->arch.this_tsc_write;
1109 return tsc; 1109 return tsc;
1110 } 1110 }
1111 1111
1112 void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1112 void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113 { 1113 {
1114 #ifdef CONFIG_X86_64 1114 #ifdef CONFIG_X86_64
1115 bool vcpus_matched; 1115 bool vcpus_matched;
1116 bool do_request = false; 1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch; 1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119 1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus)); 1121 atomic_read(&vcpu->kvm->online_vcpus));
1122 1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) 1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock) 1124 if (!ka->use_master_clock)
1125 do_request = 1; 1125 do_request = 1;
1126 1126
1127 if (!vcpus_matched && ka->use_master_clock) 1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1; 1128 do_request = 1;
1129 1129
1130 if (do_request) 1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132 1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, 1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus), 1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode); 1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136 #endif 1136 #endif
1137 } 1137 }
1138 1138
1139 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) 1139 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140 { 1140 {
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); 1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; 1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143 } 1143 }
1144 1144
1145 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) 1145 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1146 { 1146 {
1147 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1148 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1149 unsigned long flags; 1149 unsigned long flags;
1150 s64 usdiff; 1150 s64 usdiff;
1151 bool matched; 1151 bool matched;
1152 u64 data = msr->data; 1152 u64 data = msr->data;
1153 1153
1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1156 ns = get_kernel_ns(); 1156 ns = get_kernel_ns();
1157 elapsed = ns - kvm->arch.last_tsc_nsec; 1157 elapsed = ns - kvm->arch.last_tsc_nsec;
1158 1158
1159 /* n.b - signed multiplication and division required */ 1159 /* n.b - signed multiplication and division required */
1160 usdiff = data - kvm->arch.last_tsc_write; 1160 usdiff = data - kvm->arch.last_tsc_write;
1161 #ifdef CONFIG_X86_64 1161 #ifdef CONFIG_X86_64
1162 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1162 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1163 #else 1163 #else
1164 /* do_div() only does unsigned */ 1164 /* do_div() only does unsigned */
1165 asm("idivl %2; xor %%edx, %%edx" 1165 asm("idivl %2; xor %%edx, %%edx"
1166 : "=A"(usdiff) 1166 : "=A"(usdiff)
1167 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1167 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1168 #endif 1168 #endif
1169 do_div(elapsed, 1000); 1169 do_div(elapsed, 1000);
1170 usdiff -= elapsed; 1170 usdiff -= elapsed;
1171 if (usdiff < 0) 1171 if (usdiff < 0)
1172 usdiff = -usdiff; 1172 usdiff = -usdiff;
1173 1173
1174 /* 1174 /*
1175 * Special case: TSC write with a small delta (1 second) of virtual 1175 * Special case: TSC write with a small delta (1 second) of virtual
1176 * cycle time against real time is interpreted as an attempt to 1176 * cycle time against real time is interpreted as an attempt to
1177 * synchronize the CPU. 1177 * synchronize the CPU.
1178 * 1178 *
1179 * For a reliable TSC, we can match TSC offsets, and for an unstable 1179 * For a reliable TSC, we can match TSC offsets, and for an unstable
1180 * TSC, we add elapsed time in this computation. We could let the 1180 * TSC, we add elapsed time in this computation. We could let the
1181 * compensation code attempt to catch up if we fall behind, but 1181 * compensation code attempt to catch up if we fall behind, but
1182 * it's better to try to match offsets from the beginning. 1182 * it's better to try to match offsets from the beginning.
1183 */ 1183 */
1184 if (usdiff < USEC_PER_SEC && 1184 if (usdiff < USEC_PER_SEC &&
1185 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { 1185 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1186 if (!check_tsc_unstable()) { 1186 if (!check_tsc_unstable()) {
1187 offset = kvm->arch.cur_tsc_offset; 1187 offset = kvm->arch.cur_tsc_offset;
1188 pr_debug("kvm: matched tsc offset for %llu\n", data); 1188 pr_debug("kvm: matched tsc offset for %llu\n", data);
1189 } else { 1189 } else {
1190 u64 delta = nsec_to_cycles(vcpu, elapsed); 1190 u64 delta = nsec_to_cycles(vcpu, elapsed);
1191 data += delta; 1191 data += delta;
1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1194 } 1194 }
1195 matched = true; 1195 matched = true;
1196 } else { 1196 } else {
1197 /* 1197 /*
1198 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
1199 * For each generation, we track the original measured 1199 * For each generation, we track the original measured
1200 * nanosecond time, offset, and write, so if TSCs are in 1200 * nanosecond time, offset, and write, so if TSCs are in
1201 * sync, we can match exact offset, and if not, we can match 1201 * sync, we can match exact offset, and if not, we can match
1202 * exact software computation in compute_guest_tsc() 1202 * exact software computation in compute_guest_tsc()
1203 * 1203 *
1204 * These values are tracked in kvm->arch.cur_xxx variables. 1204 * These values are tracked in kvm->arch.cur_xxx variables.
1205 */ 1205 */
1206 kvm->arch.cur_tsc_generation++; 1206 kvm->arch.cur_tsc_generation++;
1207 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1208 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1209 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false; 1210 matched = false;
1211 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1212 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1213 } 1213 }
1214 1214
1215 /* 1215 /*
1216 * We also track the most recent recorded KHZ, write and time to 1216 * We also track the most recent recorded KHZ, write and time to
1217 * allow the matching interval to be extended at each write. 1217 * allow the matching interval to be extended at each write.
1218 */ 1218 */
1219 kvm->arch.last_tsc_nsec = ns; 1219 kvm->arch.last_tsc_nsec = ns;
1220 kvm->arch.last_tsc_write = data; 1220 kvm->arch.last_tsc_write = data;
1221 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 1221 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1222 1222
1223 /* Reset of TSC must disable overshoot protection below */ 1223 /* Reset of TSC must disable overshoot protection below */
1224 vcpu->arch.hv_clock.tsc_timestamp = 0; 1224 vcpu->arch.hv_clock.tsc_timestamp = 0;
1225 vcpu->arch.last_guest_tsc = data; 1225 vcpu->arch.last_guest_tsc = data;
1226 1226
1227 /* Keep track of which generation this VCPU has synchronized to */ 1227 /* Keep track of which generation this VCPU has synchronized to */
1228 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; 1228 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1231 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) 1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset); 1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1234 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236 1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock); 1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched) 1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++; 1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else 1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0; 1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242 1242
1243 kvm_track_tsc_matching(vcpu); 1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); 1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1245 } 1245 }
1246 1246
1247 EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247 EXPORT_SYMBOL_GPL(kvm_write_tsc);
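
A stripped-down restatement of the matching rule above, assuming the caller has already reduced the write delta (minus elapsed host time) to microseconds. tsc_write_matches() is a hypothetical helper that only makes the decision explicit; it does not mirror the locking or generation bookkeeping.

#include <stdbool.h>
#include <stdint.h>

#define USEC_PER_SEC 1000000LL

/* Treat the write as a synchronization attempt only if it lands within
 * one second of virtual time and the virtual TSC frequency is unchanged. */
static bool tsc_write_matches(int64_t usdiff, uint32_t this_khz,
                              uint32_t last_khz)
{
        if (usdiff < 0)
                usdiff = -usdiff;
        return usdiff < USEC_PER_SEC && this_khz == last_khz;
}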
1248 1248
1249 #ifdef CONFIG_X86_64 1249 #ifdef CONFIG_X86_64
1250 1250
1251 static cycle_t read_tsc(void) 1251 static cycle_t read_tsc(void)
1252 { 1252 {
1253 cycle_t ret; 1253 cycle_t ret;
1254 u64 last; 1254 u64 last;
1255 1255
1256 /* 1256 /*
1257 * Empirically, a fence (of type that depends on the CPU) 1257 * Empirically, a fence (of type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered 1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear 1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads, 1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen. 1261 * but no one has ever seen it happen.
1262 */ 1262 */
1263 rdtsc_barrier(); 1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles(); 1264 ret = (cycle_t)vget_cycles();
1265 1265
1266 last = pvclock_gtod_data.clock.cycle_last; 1266 last = pvclock_gtod_data.clock.cycle_last;
1267 1267
1268 if (likely(ret >= last)) 1268 if (likely(ret >= last))
1269 return ret; 1269 return ret;
1270 1270
1271 /* 1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely 1272 * GCC likes to generate cmov here, but this branch is extremely
1273 * predictable (it's just a function of time and the likely is 1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC 1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because 1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function 1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code. 1277 * ever gets inlined it will generate worse code.
1278 */ 1278 */
1279 asm volatile (""); 1279 asm volatile ("");
1280 return last; 1280 return last;
1281 } 1281 }
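
The essential part of the function above, minus the barrier and the code-generation concern, is a clamp against cycle_last so that a slightly lagging CPU never reports a cycle count older than the last published one. A minimal sketch; clamp_to_cycle_last() is a made-up name.

#include <stdint.h>

static uint64_t clamp_to_cycle_last(uint64_t now, uint64_t cycle_last)
{
        return now >= cycle_last ? now : cycle_last;
}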
1282 1282
1283 static inline u64 vgettsc(cycle_t *cycle_now) 1283 static inline u64 vgettsc(cycle_t *cycle_now)
1284 { 1284 {
1285 long v; 1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287 1287
1288 *cycle_now = read_tsc(); 1288 *cycle_now = read_tsc();
1289 1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; 1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult; 1291 return v * gtod->clock.mult;
1292 } 1292 }
1293 1293
1294 static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) 1294 static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295 { 1295 {
1296 unsigned long seq; 1296 unsigned long seq;
1297 u64 ns; 1297 u64 ns;
1298 int mode; 1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300 1300
1301 ts->tv_nsec = 0; 1301 ts->tv_nsec = 0;
1302 do { 1302 do {
1303 seq = read_seqcount_begin(&gtod->seq); 1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode; 1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec; 1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec; 1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now); 1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift; 1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns); 1310 timespec_add_ns(ts, ns);
1311 1311
1312 return mode; 1312 return mode;
1313 } 1313 }
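
do_monotonic() is a standard seqcount reader against the copy filled in by update_pvclock_gtod(): snapshot the fields and retry if the sequence number was odd or changed. A generic userspace sketch of that retry loop, using C11 atomics in place of the kernel's seqcount API; struct clock_copy and read_consistent() are invented for the sketch, and the plain struct copy is the racy part that the retry masks (the kernel's primitives handle the ordering properly).

#include <stdatomic.h>
#include <stdint.h>

struct snapshot {
        uint64_t sec;
        uint64_t nsec;
};

struct clock_copy {
        _Atomic unsigned int seq;       /* odd while a writer is updating */
        struct snapshot snap;
};

static struct snapshot read_consistent(const struct clock_copy *c)
{
        struct snapshot s;
        unsigned int start;

        do {
                start = atomic_load_explicit(&c->seq, memory_order_acquire);
                s = c->snap;
                atomic_thread_fence(memory_order_acquire);
        } while ((start & 1) ||
                 start != atomic_load_explicit(&c->seq, memory_order_relaxed));
        return s;
}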
1314 1314
1315 /* returns true if host is using tsc clocksource */ 1315 /* returns true if host is using tsc clocksource */
1316 static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) 1316 static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317 { 1317 {
1318 struct timespec ts; 1318 struct timespec ts;
1319 1319
1320 /* checked again under seqlock below */ 1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) 1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false; 1322 return false;
1323 1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) 1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false; 1325 return false;
1326 1326
1327 monotonic_to_bootbased(&ts); 1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts); 1328 *kernel_ns = timespec_to_ns(&ts);
1329 1329
1330 return true; 1330 return true;
1331 } 1331 }
1332 #endif 1332 #endif
1333 1333
1334 /* 1334 /*
1335 * 1335 *
1336 * Assuming a stable TSC across physical CPUs, and a stable TSC 1336 * Assuming a stable TSC across physical CPUs, and a stable TSC
1337 * across virtual CPUs, the following condition is possible. 1337 * across virtual CPUs, the following condition is possible.
1338 * Each numbered line represents an event visible to both 1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event. 1339 * CPUs at the next numbered event.
1340 * 1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents 1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value. 1342 * RDTSC value.
1343 * 1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1 1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 * 1345 *
1346 * 1. read timespec0,tsc0 1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N 1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M 1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest 1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) | 1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1) 1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) 1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 * 1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: 1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 * 1355 *
1356 * - ret0 < ret1 1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) 1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ... 1358 * ...
1359 * - 0 < N - M => M < N 1359 * - 0 < N - M => M < N
1360 * 1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not 1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances 1362 * always the case (the difference between two distinct xtime instances
1363 * might be smaller than the difference between corresponding TSC reads, 1363 * might be smaller than the difference between corresponding TSC reads,
1364 * when updating guest vcpus pvclock areas). 1364 * when updating guest vcpus pvclock areas).
1365 * 1365 *
1366 * To avoid that problem, do not allow visibility of distinct 1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master 1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy 1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep. 1369 * in lockstep.
1370 * 1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity. 1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 * 1372 *
1373 */ 1373 */
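
A quick numeric instance of the derivation above, with made-up values: if the host monotonic delta is N = 10 us but the TSC delta M converts to 12 us, VCPU1 computes a time 2 us earlier than the value VCPU0 already exposed.

#include <stdio.h>

int main(void)
{
        /* hypothetical values, all in microseconds; timespec0 = 0 */
        long long N = 10, M = 12, rdtsc_minus_tsc0 = 100;
        long long ret0 = rdtsc_minus_tsc0;
        long long ret1 = N + (rdtsc_minus_tsc0 - M);
        printf("ret0=%lld ret1=%lld -> not monotonic when M > N\n", ret0, ret1);
        return 0;
}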
1374 1374
1375 static void pvclock_update_vm_gtod_copy(struct kvm *kvm) 1375 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376 { 1376 {
1377 #ifdef CONFIG_X86_64 1377 #ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch; 1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode; 1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched; 1380 bool host_tsc_clocksource, vcpus_matched;
1381 1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus)); 1383 atomic_read(&kvm->online_vcpus));
1384 1384
1385 /* 1385 /*
1386 * If the host uses TSC clock, then passthrough TSC as stable 1386 * If the host uses TSC clock, then passthrough TSC as stable
1387 * to the guest. 1387 * to the guest.
1388 */ 1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread( 1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns, 1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now); 1391 &ka->master_cycle_now);
1392 1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched; 1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394 1394
1395 if (ka->use_master_clock) 1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1); 1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397 1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode; 1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, 1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched); 1400 vcpus_matched);
1401 #endif 1401 #endif
1402 } 1402 }
1403 1403
1404 static int kvm_guest_time_update(struct kvm_vcpu *v) 1404 static int kvm_guest_time_update(struct kvm_vcpu *v)
1405 { 1405 {
1406 unsigned long flags, this_tsc_khz; 1406 unsigned long flags, this_tsc_khz;
1407 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch; 1408 struct kvm_arch *ka = &v->kvm->arch;
1409 void *shared_kaddr; 1409 void *shared_kaddr;
1410 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1411 u64 tsc_timestamp, host_tsc; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock; 1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1413 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock; 1414 bool use_master_clock;
1415 1415
1416 kernel_ns = 0; 1416 kernel_ns = 0;
1417 host_tsc = 0; 1417 host_tsc = 0;
1418 1418
1419 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1420 local_irq_save(flags); 1420 local_irq_save(flags);
1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1422 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1423 local_irq_restore(flags); 1423 local_irq_restore(flags);
1424 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1424 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1425 return 1; 1425 return 1;
1426 } 1426 }
1427 1427
1428 /* 1428 /*
1429 * If the host uses TSC clock, then passthrough TSC as stable 1429 * If the host uses TSC clock, then passthrough TSC as stable
1430 * to the guest. 1430 * to the guest.
1431 */ 1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock); 1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock; 1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) { 1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now; 1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns; 1436 kernel_ns = ka->master_kernel_ns;
1437 } 1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock); 1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) { 1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc(); 1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns(); 1441 kernel_ns = get_kernel_ns();
1442 } 1442 }
1443 1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); 1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445 1445
1446 /* 1446 /*
1447 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1448 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1449 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
1450 * 2) Broken TSC compensation resets the base at each VCPU 1450 * 2) Broken TSC compensation resets the base at each VCPU
1451 * entry to avoid unknown leaps of TSC even when running 1451 * entry to avoid unknown leaps of TSC even when running
1452 * again on the same CPU. This may cause apparent elapsed 1452 * again on the same CPU. This may cause apparent elapsed
1453 * time to disappear, and the guest to stand still or run 1453 * time to disappear, and the guest to stand still or run
1454 * very slowly. 1454 * very slowly.
1455 */ 1455 */
1456 if (vcpu->tsc_catchup) { 1456 if (vcpu->tsc_catchup) {
1457 u64 tsc = compute_guest_tsc(v, kernel_ns); 1457 u64 tsc = compute_guest_tsc(v, kernel_ns);
1458 if (tsc > tsc_timestamp) { 1458 if (tsc > tsc_timestamp) {
1459 adjust_tsc_offset_guest(v, tsc - tsc_timestamp); 1459 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1460 tsc_timestamp = tsc; 1460 tsc_timestamp = tsc;
1461 } 1461 }
1462 } 1462 }
1463 1463
1464 local_irq_restore(flags); 1464 local_irq_restore(flags);
1465 1465
1466 if (!vcpu->time_page) 1466 if (!vcpu->time_page)
1467 return 0; 1467 return 0;
1468 1468
1469 /* 1469 /*
1470 * Time as measured by the TSC may go backwards when resetting the base 1470 * Time as measured by the TSC may go backwards when resetting the base
1471 * tsc_timestamp. The reason for this is that the TSC resolution is 1471 * tsc_timestamp. The reason for this is that the TSC resolution is
1472 * higher than the resolution of the other clock scales. Thus, many 1472 * higher than the resolution of the other clock scales. Thus, many
1473 * possible measurements of the TSC correspond to one measurement of any 1473 * possible measurements of the TSC correspond to one measurement of any
1474 * other clock, and so a spread of values is possible. This is not a 1474 * other clock, and so a spread of values is possible. This is not a
1475 * problem for the computation of the nanosecond clock; with TSC rates 1475 * problem for the computation of the nanosecond clock; with TSC rates
1476 * around 1GHz, there can only be a few cycles which correspond to one 1476 * around 1GHz, there can only be a few cycles which correspond to one
1477 * nanosecond value, and any path through this code will inevitably 1477 * nanosecond value, and any path through this code will inevitably
1478 * take longer than that. However, with the kernel_ns value itself, 1478 * take longer than that. However, with the kernel_ns value itself,
1479 * the precision may be much lower, down to HZ granularity. If the 1479 * the precision may be much lower, down to HZ granularity. If the
1480 * first sampling of TSC against kernel_ns ends in the low part of the 1480 * first sampling of TSC against kernel_ns ends in the low part of the
1481 * range, and the second in the high end of the range, we can get: 1481 * range, and the second in the high end of the range, we can get:
1482 * 1482 *
1483 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new 1483 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1484 * 1484 *
1485 * As the sampling errors potentially range in the thousands of cycles, 1485 * As the sampling errors potentially range in the thousands of cycles,
1486 * it is possible such a time value has already been observed by the 1486 * it is possible such a time value has already been observed by the
1487 * guest. To protect against this, we must compute the system time as 1487 * guest. To protect against this, we must compute the system time as
1488 * observed by the guest and ensure the new system time is greater. 1488 * observed by the guest and ensure the new system time is greater.
1489 */ 1489 */
1490 max_kernel_ns = 0; 1490 max_kernel_ns = 0;
1491 if (vcpu->hv_clock.tsc_timestamp) { 1491 if (vcpu->hv_clock.tsc_timestamp) {
1492 max_kernel_ns = vcpu->last_guest_tsc - 1492 max_kernel_ns = vcpu->last_guest_tsc -
1493 vcpu->hv_clock.tsc_timestamp; 1493 vcpu->hv_clock.tsc_timestamp;
1494 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1494 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1495 vcpu->hv_clock.tsc_to_system_mul, 1495 vcpu->hv_clock.tsc_to_system_mul,
1496 vcpu->hv_clock.tsc_shift); 1496 vcpu->hv_clock.tsc_shift);
1497 max_kernel_ns += vcpu->last_kernel_ns; 1497 max_kernel_ns += vcpu->last_kernel_ns;
1498 } 1498 }
1499 1499
1500 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1500 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1501 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1501 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1502 &vcpu->hv_clock.tsc_shift, 1502 &vcpu->hv_clock.tsc_shift,
1503 &vcpu->hv_clock.tsc_to_system_mul); 1503 &vcpu->hv_clock.tsc_to_system_mul);
1504 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1505 } 1505 }
1506 1506
1507 /* with a master <monotonic time, tsc value> tuple, 1507 /* with a master <monotonic time, tsc value> tuple,
1508 * pvclock clock reads always increase at the (scaled) rate 1508 * pvclock clock reads always increase at the (scaled) rate
1509 * of guest TSC - no need to deal with sampling errors. 1509 * of guest TSC - no need to deal with sampling errors.
1510 */ 1510 */
1511 if (!use_master_clock) { 1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns) 1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns; 1513 kernel_ns = max_kernel_ns;
1514 } 1514 }
1515 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1518 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1519 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1520 1520
1521 /* 1521 /*
1522 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1523 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
1524 * state, we just increase by 2 at the end. 1524 * state, we just increase by 2 at the end.
1525 */ 1525 */
1526 vcpu->hv_clock.version += 2; 1526 vcpu->hv_clock.version += 2;
1527 1527
1528 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1529 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset; 1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531 1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); 1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534 1534
1535 if (vcpu->pvclock_set_guest_stopped_request) { 1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED; 1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false; 1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 } 1538 }
1539 1539
1540 /* If the host uses TSC clocksource, then it is stable */ 1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock) 1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543 1543
1544 vcpu->hv_clock.flags = pvclock_flags; 1544 vcpu->hv_clock.flags = pvclock_flags;
1545 1545
1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1547 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1548 1548
1549 kunmap_atomic(shared_kaddr); 1549 kunmap_atomic(shared_kaddr);
1550 1550
1551 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1551 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
1552 return 0; 1552 return 0;
1553 } 1553 }
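
The version handling above relies on the pvclock convention that an odd version means an update is in progress and an even version means the data is stable; this function can simply add 2 because the guest never observes the intermediate state of the kmap'd copy. A sketch of the general publication protocol follows, with struct time_info and publish_time() as hypothetical stand-ins for pvclock_vcpu_time_info and the barriers only noted in comments.

#include <stdint.h>

/* Hypothetical stand-in for the guest-visible time structure. */
struct time_info {
        uint32_t version;       /* odd: update in progress, even: stable */
        uint64_t tsc_timestamp;
        uint64_t system_time;
};

static void publish_time(struct time_info *shared, const struct time_info *src)
{
        shared->version++;              /* now odd */
        /* writer barrier here (smp_wmb() in the kernel) */
        shared->tsc_timestamp = src->tsc_timestamp;
        shared->system_time = src->system_time;
        /* writer barrier here (smp_wmb() in the kernel) */
        shared->version++;              /* even again: update complete */
}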
1554 1554
1555 static bool msr_mtrr_valid(unsigned msr) 1555 static bool msr_mtrr_valid(unsigned msr)
1556 { 1556 {
1557 switch (msr) { 1557 switch (msr) {
1558 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: 1558 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1559 case MSR_MTRRfix64K_00000: 1559 case MSR_MTRRfix64K_00000:
1560 case MSR_MTRRfix16K_80000: 1560 case MSR_MTRRfix16K_80000:
1561 case MSR_MTRRfix16K_A0000: 1561 case MSR_MTRRfix16K_A0000:
1562 case MSR_MTRRfix4K_C0000: 1562 case MSR_MTRRfix4K_C0000:
1563 case MSR_MTRRfix4K_C8000: 1563 case MSR_MTRRfix4K_C8000:
1564 case MSR_MTRRfix4K_D0000: 1564 case MSR_MTRRfix4K_D0000:
1565 case MSR_MTRRfix4K_D8000: 1565 case MSR_MTRRfix4K_D8000:
1566 case MSR_MTRRfix4K_E0000: 1566 case MSR_MTRRfix4K_E0000:
1567 case MSR_MTRRfix4K_E8000: 1567 case MSR_MTRRfix4K_E8000:
1568 case MSR_MTRRfix4K_F0000: 1568 case MSR_MTRRfix4K_F0000:
1569 case MSR_MTRRfix4K_F8000: 1569 case MSR_MTRRfix4K_F8000:
1570 case MSR_MTRRdefType: 1570 case MSR_MTRRdefType:
1571 case MSR_IA32_CR_PAT: 1571 case MSR_IA32_CR_PAT:
1572 return true; 1572 return true;
1573 case 0x2f8: 1573 case 0x2f8:
1574 return true; 1574 return true;
1575 } 1575 }
1576 return false; 1576 return false;
1577 } 1577 }
1578 1578
1579 static bool valid_pat_type(unsigned t) 1579 static bool valid_pat_type(unsigned t)
1580 { 1580 {
1581 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 1581 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1582 } 1582 }
1583 1583
1584 static bool valid_mtrr_type(unsigned t) 1584 static bool valid_mtrr_type(unsigned t)
1585 { 1585 {
1586 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 1586 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1587 } 1587 }
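
valid_pat_type() and valid_mtrr_type() use a constant as a small membership set: bit t is set exactly when memory type t is allowed (0xf3 encodes {0, 1, 4, 5, 6, 7}; 0x73 encodes {0, 1, 4, 5, 6}). The idiom in isolation; in_set() is a made-up helper.

#include <stdbool.h>
#include <stdio.h>

/* allowed_mask has one bit per permitted value below limit */
static bool in_set(unsigned int v, unsigned int allowed_mask, unsigned int limit)
{
        return v < limit && ((1u << v) & allowed_mask);
}

int main(void)
{
        printf("%d %d\n", in_set(4, 0xf3, 8), in_set(2, 0xf3, 8)); /* 1 0 */
        return 0;
}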
1588 1588
1589 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1589 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1590 { 1590 {
1591 int i; 1591 int i;
1592 1592
1593 if (!msr_mtrr_valid(msr)) 1593 if (!msr_mtrr_valid(msr))
1594 return false; 1594 return false;
1595 1595
1596 if (msr == MSR_IA32_CR_PAT) { 1596 if (msr == MSR_IA32_CR_PAT) {
1597 for (i = 0; i < 8; i++) 1597 for (i = 0; i < 8; i++)
1598 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 1598 if (!valid_pat_type((data >> (i * 8)) & 0xff))
1599 return false; 1599 return false;
1600 return true; 1600 return true;
1601 } else if (msr == MSR_MTRRdefType) { 1601 } else if (msr == MSR_MTRRdefType) {
1602 if (data & ~0xcff) 1602 if (data & ~0xcff)
1603 return false; 1603 return false;
1604 return valid_mtrr_type(data & 0xff); 1604 return valid_mtrr_type(data & 0xff);
1605 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 1605 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1606 for (i = 0; i < 8 ; i++) 1606 for (i = 0; i < 8 ; i++)
1607 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 1607 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1608 return false; 1608 return false;
1609 return true; 1609 return true;
1610 } 1610 }
1611 1611
1612 /* variable MTRRs */ 1612 /* variable MTRRs */
1613 return valid_mtrr_type(data & 0xff); 1613 return valid_mtrr_type(data & 0xff);
1614 } 1614 }
1615 1615
1616 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1616 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1617 { 1617 {
1618 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1618 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1619 1619
1620 if (!mtrr_valid(vcpu, msr, data)) 1620 if (!mtrr_valid(vcpu, msr, data))
1621 return 1; 1621 return 1;
1622 1622
1623 if (msr == MSR_MTRRdefType) { 1623 if (msr == MSR_MTRRdefType) {
1624 vcpu->arch.mtrr_state.def_type = data; 1624 vcpu->arch.mtrr_state.def_type = data;
1625 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 1625 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1626 } else if (msr == MSR_MTRRfix64K_00000) 1626 } else if (msr == MSR_MTRRfix64K_00000)
1627 p[0] = data; 1627 p[0] = data;
1628 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1628 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1629 p[1 + msr - MSR_MTRRfix16K_80000] = data; 1629 p[1 + msr - MSR_MTRRfix16K_80000] = data;
1630 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1630 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1631 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 1631 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1632 else if (msr == MSR_IA32_CR_PAT) 1632 else if (msr == MSR_IA32_CR_PAT)
1633 vcpu->arch.pat = data; 1633 vcpu->arch.pat = data;
1634 else { /* Variable MTRRs */ 1634 else { /* Variable MTRRs */
1635 int idx, is_mtrr_mask; 1635 int idx, is_mtrr_mask;
1636 u64 *pt; 1636 u64 *pt;
1637 1637
1638 idx = (msr - 0x200) / 2; 1638 idx = (msr - 0x200) / 2;
1639 is_mtrr_mask = msr - 0x200 - 2 * idx; 1639 is_mtrr_mask = msr - 0x200 - 2 * idx;
1640 if (!is_mtrr_mask) 1640 if (!is_mtrr_mask)
1641 pt = 1641 pt =
1642 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1642 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1643 else 1643 else
1644 pt = 1644 pt =
1645 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1645 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1646 *pt = data; 1646 *pt = data;
1647 } 1647 }
1648 1648
1649 kvm_mmu_reset_context(vcpu); 1649 kvm_mmu_reset_context(vcpu);
1650 return 0; 1650 return 0;
1651 } 1651 }
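
The variable-MTRR branch above relies on the MSR layout: base/mask registers alternate starting at 0x200, so (msr - 0x200) / 2 is the range index and the low bit selects base versus mask. A tiny decode sketch; decode_var_mtrr() and the MTRR_PHYSBASE0 macro are local to the sketch (0x200 is MTRRphysBase0).

#include <stdint.h>
#include <stdio.h>

#define MTRR_PHYSBASE0 0x200u

static void decode_var_mtrr(uint32_t msr, unsigned int *idx, int *is_mask)
{
        *idx = (msr - MTRR_PHYSBASE0) / 2;
        *is_mask = (msr - MTRR_PHYSBASE0) & 1;
}

int main(void)
{
        unsigned int idx;
        int is_mask;

        decode_var_mtrr(0x203, &idx, &is_mask);
        printf("idx=%u is_mask=%d\n", idx, is_mask);    /* idx=1 is_mask=1 */
        return 0;
}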
1652 1652
1653 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1653 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1654 { 1654 {
1655 u64 mcg_cap = vcpu->arch.mcg_cap; 1655 u64 mcg_cap = vcpu->arch.mcg_cap;
1656 unsigned bank_num = mcg_cap & 0xff; 1656 unsigned bank_num = mcg_cap & 0xff;
1657 1657
1658 switch (msr) { 1658 switch (msr) {
1659 case MSR_IA32_MCG_STATUS: 1659 case MSR_IA32_MCG_STATUS:
1660 vcpu->arch.mcg_status = data; 1660 vcpu->arch.mcg_status = data;
1661 break; 1661 break;
1662 case MSR_IA32_MCG_CTL: 1662 case MSR_IA32_MCG_CTL:
1663 if (!(mcg_cap & MCG_CTL_P)) 1663 if (!(mcg_cap & MCG_CTL_P))
1664 return 1; 1664 return 1;
1665 if (data != 0 && data != ~(u64)0) 1665 if (data != 0 && data != ~(u64)0)
1666 return -1; 1666 return -1;
1667 vcpu->arch.mcg_ctl = data; 1667 vcpu->arch.mcg_ctl = data;
1668 break; 1668 break;
1669 default: 1669 default:
1670 if (msr >= MSR_IA32_MC0_CTL && 1670 if (msr >= MSR_IA32_MC0_CTL &&
1671 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1671 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1672 u32 offset = msr - MSR_IA32_MC0_CTL; 1672 u32 offset = msr - MSR_IA32_MC0_CTL;
1673 /* only 0 or all 1s can be written to IA32_MCi_CTL 1673 /* only 0 or all 1s can be written to IA32_MCi_CTL
1674 * some Linux kernels though clear bit 10 in bank 4 to 1674 * some Linux kernels though clear bit 10 in bank 4 to
1675 * work around a BIOS/GART TBL issue on AMD K8s, ignore 1675 * work around a BIOS/GART TBL issue on AMD K8s, ignore
1676 * this to avoid an uncaught #GP in the guest 1676 * this to avoid an uncaught #GP in the guest
1677 */ 1677 */
1678 if ((offset & 0x3) == 0 && 1678 if ((offset & 0x3) == 0 &&
1679 data != 0 && (data | (1 << 10)) != ~(u64)0) 1679 data != 0 && (data | (1 << 10)) != ~(u64)0)
1680 return -1; 1680 return -1;
1681 vcpu->arch.mce_banks[offset] = data; 1681 vcpu->arch.mce_banks[offset] = data;
1682 break; 1682 break;
1683 } 1683 }
1684 return 1; 1684 return 1;
1685 } 1685 }
1686 return 0; 1686 return 0;
1687 } 1687 }
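
In isolation, the IA32_MCi_CTL value check above accepts only 0 or all-ones, with bit 10 treated as don't-care for the AMD K8 GART workaround mentioned in the comment. mci_ctl_value_ok() is an invented name for this sketch.

#include <stdbool.h>
#include <stdint.h>

static bool mci_ctl_value_ok(uint64_t data)
{
        return data == 0 || (data | (1ull << 10)) == ~(uint64_t)0;
}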
1688 1688
1689 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 1689 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1690 { 1690 {
1691 struct kvm *kvm = vcpu->kvm; 1691 struct kvm *kvm = vcpu->kvm;
1692 int lm = is_long_mode(vcpu); 1692 int lm = is_long_mode(vcpu);
1693 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 1693 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1694 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 1694 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1695 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 1695 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1696 : kvm->arch.xen_hvm_config.blob_size_32; 1696 : kvm->arch.xen_hvm_config.blob_size_32;
1697 u32 page_num = data & ~PAGE_MASK; 1697 u32 page_num = data & ~PAGE_MASK;
1698 u64 page_addr = data & PAGE_MASK; 1698 u64 page_addr = data & PAGE_MASK;
1699 u8 *page; 1699 u8 *page;
1700 int r; 1700 int r;
1701 1701
1702 r = -E2BIG; 1702 r = -E2BIG;
1703 if (page_num >= blob_size) 1703 if (page_num >= blob_size)
1704 goto out; 1704 goto out;
1705 r = -ENOMEM; 1705 r = -ENOMEM;
1706 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); 1706 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1707 if (IS_ERR(page)) { 1707 if (IS_ERR(page)) {
1708 r = PTR_ERR(page); 1708 r = PTR_ERR(page);
1709 goto out; 1709 goto out;
1710 } 1710 }
1711 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1711 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1712 goto out_free; 1712 goto out_free;
1713 r = 0; 1713 r = 0;
1714 out_free: 1714 out_free:
1715 kfree(page); 1715 kfree(page);
1716 out: 1716 out:
1717 return r; 1717 return r;
1718 } 1718 }
1719 1719
1720 static bool kvm_hv_hypercall_enabled(struct kvm *kvm) 1720 static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1721 { 1721 {
1722 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; 1722 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1723 } 1723 }
1724 1724
1725 static bool kvm_hv_msr_partition_wide(u32 msr) 1725 static bool kvm_hv_msr_partition_wide(u32 msr)
1726 { 1726 {
1727 bool r = false; 1727 bool r = false;
1728 switch (msr) { 1728 switch (msr) {
1729 case HV_X64_MSR_GUEST_OS_ID: 1729 case HV_X64_MSR_GUEST_OS_ID:
1730 case HV_X64_MSR_HYPERCALL: 1730 case HV_X64_MSR_HYPERCALL:
1731 r = true; 1731 r = true;
1732 break; 1732 break;
1733 } 1733 }
1734 1734
1735 return r; 1735 return r;
1736 } 1736 }
1737 1737
1738 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1738 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1739 { 1739 {
1740 struct kvm *kvm = vcpu->kvm; 1740 struct kvm *kvm = vcpu->kvm;
1741 1741
1742 switch (msr) { 1742 switch (msr) {
1743 case HV_X64_MSR_GUEST_OS_ID: 1743 case HV_X64_MSR_GUEST_OS_ID:
1744 kvm->arch.hv_guest_os_id = data; 1744 kvm->arch.hv_guest_os_id = data;
1745 /* setting guest os id to zero disables hypercall page */ 1745 /* setting guest os id to zero disables hypercall page */
1746 if (!kvm->arch.hv_guest_os_id) 1746 if (!kvm->arch.hv_guest_os_id)
1747 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; 1747 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1748 break; 1748 break;
1749 case HV_X64_MSR_HYPERCALL: { 1749 case HV_X64_MSR_HYPERCALL: {
1750 u64 gfn; 1750 u64 gfn;
1751 unsigned long addr; 1751 unsigned long addr;
1752 u8 instructions[4]; 1752 u8 instructions[4];
1753 1753
1754 /* if guest os id is not set hypercall should remain disabled */ 1754 /* if guest os id is not set hypercall should remain disabled */
1755 if (!kvm->arch.hv_guest_os_id) 1755 if (!kvm->arch.hv_guest_os_id)
1756 break; 1756 break;
1757 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { 1757 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1758 kvm->arch.hv_hypercall = data; 1758 kvm->arch.hv_hypercall = data;
1759 break; 1759 break;
1760 } 1760 }
1761 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; 1761 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1762 addr = gfn_to_hva(kvm, gfn); 1762 addr = gfn_to_hva(kvm, gfn);
1763 if (kvm_is_error_hva(addr)) 1763 if (kvm_is_error_hva(addr))
1764 return 1; 1764 return 1;
1765 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1765 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1766 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1766 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1767 if (__copy_to_user((void __user *)addr, instructions, 4)) 1767 if (__copy_to_user((void __user *)addr, instructions, 4))
1768 return 1; 1768 return 1;
1769 kvm->arch.hv_hypercall = data; 1769 kvm->arch.hv_hypercall = data;
1770 break; 1770 break;
1771 } 1771 }
1772 default: 1772 default:
1773 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1773 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1774 "data 0x%llx\n", msr, data); 1774 "data 0x%llx\n", msr, data);
1775 return 1; 1775 return 1;
1776 } 1776 }
1777 return 0; 1777 return 0;
1778 } 1778 }
1779 1779
1780 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1780 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1781 { 1781 {
1782 switch (msr) { 1782 switch (msr) {
1783 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1783 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1784 unsigned long addr; 1784 unsigned long addr;
1785 1785
1786 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1786 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1787 vcpu->arch.hv_vapic = data; 1787 vcpu->arch.hv_vapic = data;
1788 break; 1788 break;
1789 } 1789 }
1790 addr = gfn_to_hva(vcpu->kvm, data >> 1790 addr = gfn_to_hva(vcpu->kvm, data >>
1791 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1791 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1792 if (kvm_is_error_hva(addr)) 1792 if (kvm_is_error_hva(addr))
1793 return 1; 1793 return 1;
1794 if (__clear_user((void __user *)addr, PAGE_SIZE)) 1794 if (__clear_user((void __user *)addr, PAGE_SIZE))
1795 return 1; 1795 return 1;
1796 vcpu->arch.hv_vapic = data; 1796 vcpu->arch.hv_vapic = data;
1797 break; 1797 break;
1798 } 1798 }
1799 case HV_X64_MSR_EOI: 1799 case HV_X64_MSR_EOI:
1800 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); 1800 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1801 case HV_X64_MSR_ICR: 1801 case HV_X64_MSR_ICR:
1802 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 1802 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1803 case HV_X64_MSR_TPR: 1803 case HV_X64_MSR_TPR:
1804 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1804 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1805 default: 1805 default:
1806 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1806 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1807 "data 0x%llx\n", msr, data); 1807 "data 0x%llx\n", msr, data);
1808 return 1; 1808 return 1;
1809 } 1809 }
1810 1810
1811 return 0; 1811 return 0;
1812 } 1812 }
1813 1813
1814 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) 1814 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1815 { 1815 {
1816 gpa_t gpa = data & ~0x3f; 1816 gpa_t gpa = data & ~0x3f;
1817 1817
1818 /* Bits 2:5 are reserved, should be zero */ 1818 /* Bits 2:5 are reserved, should be zero */
1819 if (data & 0x3c) 1819 if (data & 0x3c)
1820 return 1; 1820 return 1;
1821 1821
1822 vcpu->arch.apf.msr_val = data; 1822 vcpu->arch.apf.msr_val = data;
1823 1823
1824 if (!(data & KVM_ASYNC_PF_ENABLED)) { 1824 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1825 kvm_clear_async_pf_completion_queue(vcpu); 1825 kvm_clear_async_pf_completion_queue(vcpu);
1826 kvm_async_pf_hash_reset(vcpu); 1826 kvm_async_pf_hash_reset(vcpu);
1827 return 0; 1827 return 0;
1828 } 1828 }
1829 1829
1830 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) 1830 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1831 return 1; 1831 return 1;
1832 1832
1833 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); 1833 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1834 kvm_async_pf_wakeup_all(vcpu); 1834 kvm_async_pf_wakeup_all(vcpu);
1835 return 0; 1835 return 0;
1836 } 1836 }
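
The async-PF MSR layout implied above: bit 0 enables the feature, bit 1 is the send-always flag, bits 2-5 are reserved, and the remaining high bits form the 64-byte-aligned GPA of the shared data area. A sketch of that split; the APF_* macros and parse_async_pf_msr() are invented for illustration (the kernel's flag names are KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS).

#include <stdbool.h>
#include <stdint.h>

#define APF_ENABLED     (1ull << 0)
#define APF_SEND_ALWAYS (1ull << 1)
#define APF_RESERVED    0x3cull         /* bits 2..5 */

static bool parse_async_pf_msr(uint64_t data, uint64_t *gpa,
                               bool *enabled, bool *send_always)
{
        if (data & APF_RESERVED)
                return false;
        *gpa = data & ~0x3full;
        *enabled = data & APF_ENABLED;
        *send_always = data & APF_SEND_ALWAYS;
        return true;
}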
1837 1837
1838 static void kvmclock_reset(struct kvm_vcpu *vcpu) 1838 static void kvmclock_reset(struct kvm_vcpu *vcpu)
1839 { 1839 {
1840 if (vcpu->arch.time_page) { 1840 if (vcpu->arch.time_page) {
1841 kvm_release_page_dirty(vcpu->arch.time_page); 1841 kvm_release_page_dirty(vcpu->arch.time_page);
1842 vcpu->arch.time_page = NULL; 1842 vcpu->arch.time_page = NULL;
1843 } 1843 }
1844 } 1844 }
1845 1845
1846 static void accumulate_steal_time(struct kvm_vcpu *vcpu) 1846 static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1847 { 1847 {
1848 u64 delta; 1848 u64 delta;
1849 1849
1850 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 1850 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1851 return; 1851 return;
1852 1852
1853 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; 1853 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1854 vcpu->arch.st.last_steal = current->sched_info.run_delay; 1854 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1855 vcpu->arch.st.accum_steal = delta; 1855 vcpu->arch.st.accum_steal = delta;
1856 } 1856 }
1857 1857
1858 static void record_steal_time(struct kvm_vcpu *vcpu) 1858 static void record_steal_time(struct kvm_vcpu *vcpu)
1859 { 1859 {
1860 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 1860 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1861 return; 1861 return;
1862 1862
1863 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 1863 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1864 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) 1864 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1865 return; 1865 return;
1866 1866
1867 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; 1867 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1868 vcpu->arch.st.steal.version += 2; 1868 vcpu->arch.st.steal.version += 2;
1869 vcpu->arch.st.accum_steal = 0; 1869 vcpu->arch.st.accum_steal = 0;
1870 1870
1871 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 1871 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1873 } 1873 }
1874 1874
1875 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1875 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1876 { 1876 {
1877 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index; 1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data; 1879 u64 data = msr_info->data;
1880 1880
1881 switch (msr) { 1881 switch (msr) {
1882 case MSR_EFER: 1882 case MSR_EFER:
1883 return set_efer(vcpu, data); 1883 return set_efer(vcpu, data);
1884 case MSR_K7_HWCR: 1884 case MSR_K7_HWCR:
1885 data &= ~(u64)0x40; /* ignore flush filter disable */ 1885 data &= ~(u64)0x40; /* ignore flush filter disable */
1886 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1886 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1887 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1887 data &= ~(u64)0x8; /* ignore TLB cache disable */
1888 if (data != 0) { 1888 if (data != 0) {
1889 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1889 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1890 data); 1890 data);
1891 return 1; 1891 return 1;
1892 } 1892 }
1893 break; 1893 break;
1894 case MSR_FAM10H_MMIO_CONF_BASE: 1894 case MSR_FAM10H_MMIO_CONF_BASE:
1895 if (data != 0) { 1895 if (data != 0) {
1896 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1896 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1897 "0x%llx\n", data); 1897 "0x%llx\n", data);
1898 return 1; 1898 return 1;
1899 } 1899 }
1900 break; 1900 break;
1901 case MSR_AMD64_NB_CFG: 1901 case MSR_AMD64_NB_CFG:
1902 break; 1902 break;
1903 case MSR_IA32_DEBUGCTLMSR: 1903 case MSR_IA32_DEBUGCTLMSR:
1904 if (!data) { 1904 if (!data) {
1905 /* We support the non-activated case already */ 1905 /* We support the non-activated case already */
1906 break; 1906 break;
1907 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 1907 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1908 /* Values other than LBR and BTF are vendor-specific, 1908 /* Values other than LBR and BTF are vendor-specific,
1909 thus reserved and should throw a #GP */ 1909 thus reserved and should throw a #GP */
1910 return 1; 1910 return 1;
1911 } 1911 }
1912 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1912 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1913 __func__, data); 1913 __func__, data);
1914 break; 1914 break;
1915 case MSR_IA32_UCODE_REV: 1915 case MSR_IA32_UCODE_REV:
1916 case MSR_IA32_UCODE_WRITE: 1916 case MSR_IA32_UCODE_WRITE:
1917 case MSR_VM_HSAVE_PA: 1917 case MSR_VM_HSAVE_PA:
1918 case MSR_AMD64_PATCH_LOADER: 1918 case MSR_AMD64_PATCH_LOADER:
1919 break; 1919 break;
1920 case 0x200 ... 0x2ff: 1920 case 0x200 ... 0x2ff:
1921 return set_msr_mtrr(vcpu, msr, data); 1921 return set_msr_mtrr(vcpu, msr, data);
1922 case MSR_IA32_APICBASE: 1922 case MSR_IA32_APICBASE:
1923 kvm_set_apic_base(vcpu, data); 1923 kvm_set_apic_base(vcpu, data);
1924 break; 1924 break;
1925 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1925 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1926 return kvm_x2apic_msr_write(vcpu, msr, data); 1926 return kvm_x2apic_msr_write(vcpu, msr, data);
1927 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1928 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1929 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST: 1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) { 1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) { 1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); 1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 } 1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data; 1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 } 1937 }
1938 break; 1938 break;
1939 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1940 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1941 break; 1941 break;
1942 case MSR_KVM_WALL_CLOCK_NEW: 1942 case MSR_KVM_WALL_CLOCK_NEW:
1943 case MSR_KVM_WALL_CLOCK: 1943 case MSR_KVM_WALL_CLOCK:
1944 vcpu->kvm->arch.wall_clock = data; 1944 vcpu->kvm->arch.wall_clock = data;
1945 kvm_write_wall_clock(vcpu->kvm, data); 1945 kvm_write_wall_clock(vcpu->kvm, data);
1946 break; 1946 break;
1947 case MSR_KVM_SYSTEM_TIME_NEW: 1947 case MSR_KVM_SYSTEM_TIME_NEW:
1948 case MSR_KVM_SYSTEM_TIME: { 1948 case MSR_KVM_SYSTEM_TIME: {
1949 kvmclock_reset(vcpu); 1949 kvmclock_reset(vcpu);
1950 1950
1951 vcpu->arch.time = data; 1951 vcpu->arch.time = data;
1952 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1952 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1953 1953
1954 /* we verify if the enable bit is set... */ 1954 /* we verify if the enable bit is set... */
1955 if (!(data & 1)) 1955 if (!(data & 1))
1956 break; 1956 break;
1957 1957
1958 /* ...but clean it before doing the actual write */ 1958 /* ...but clean it before doing the actual write */
1959 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 1959 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1960 1960
1961 vcpu->arch.time_page = 1961 vcpu->arch.time_page =
1962 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1962 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1963 1963
1964 if (is_error_page(vcpu->arch.time_page)) 1964 if (is_error_page(vcpu->arch.time_page))
1965 vcpu->arch.time_page = NULL; 1965 vcpu->arch.time_page = NULL;
1966 1966
1967 break; 1967 break;
1968 } 1968 }
1969 case MSR_KVM_ASYNC_PF_EN: 1969 case MSR_KVM_ASYNC_PF_EN:
1970 if (kvm_pv_enable_async_pf(vcpu, data)) 1970 if (kvm_pv_enable_async_pf(vcpu, data))
1971 return 1; 1971 return 1;
1972 break; 1972 break;
1973 case MSR_KVM_STEAL_TIME: 1973 case MSR_KVM_STEAL_TIME:
1974 1974
1975 if (unlikely(!sched_info_on())) 1975 if (unlikely(!sched_info_on()))
1976 return 1; 1976 return 1;
1977 1977
1978 if (data & KVM_STEAL_RESERVED_MASK) 1978 if (data & KVM_STEAL_RESERVED_MASK)
1979 return 1; 1979 return 1;
1980 1980
1981 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, 1981 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1982 data & KVM_STEAL_VALID_BITS)) 1982 data & KVM_STEAL_VALID_BITS))
1983 return 1; 1983 return 1;
1984 1984
1985 vcpu->arch.st.msr_val = data; 1985 vcpu->arch.st.msr_val = data;
1986 1986
1987 if (!(data & KVM_MSR_ENABLED)) 1987 if (!(data & KVM_MSR_ENABLED))
1988 break; 1988 break;
1989 1989
1990 vcpu->arch.st.last_steal = current->sched_info.run_delay; 1990 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1991 1991
1992 preempt_disable(); 1992 preempt_disable();
1993 accumulate_steal_time(vcpu); 1993 accumulate_steal_time(vcpu);
1994 preempt_enable(); 1994 preempt_enable();
1995 1995
1996 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1996 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1997 1997
1998 break; 1998 break;
1999 case MSR_KVM_PV_EOI_EN: 1999 case MSR_KVM_PV_EOI_EN:
2000 if (kvm_lapic_enable_pv_eoi(vcpu, data)) 2000 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2001 return 1; 2001 return 1;
2002 break; 2002 break;
2003 2003
2004 case MSR_IA32_MCG_CTL: 2004 case MSR_IA32_MCG_CTL:
2005 case MSR_IA32_MCG_STATUS: 2005 case MSR_IA32_MCG_STATUS:
2006 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 2006 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2007 return set_msr_mce(vcpu, msr, data); 2007 return set_msr_mce(vcpu, msr, data);
2008 2008
2009 /* Performance counters are not protected by a CPUID bit, 2009 /* Performance counters are not protected by a CPUID bit,
2010 * so we should check all of them in the generic path for the sake of 2010 * so we should check all of them in the generic path for the sake of
2011 * cross vendor migration. 2011 * cross vendor migration.
2012 * Writing a zero into the event select MSRs disables them, 2012 * Writing a zero into the event select MSRs disables them,
2013 * which we perfectly emulate ;-). Any other value should be at least 2013 * which we perfectly emulate ;-). Any other value should be at least
2014 * reported, some guests depend on them. 2014 * reported, some guests depend on them.
2015 */ 2015 */
2016 case MSR_K7_EVNTSEL0: 2016 case MSR_K7_EVNTSEL0:
2017 case MSR_K7_EVNTSEL1: 2017 case MSR_K7_EVNTSEL1:
2018 case MSR_K7_EVNTSEL2: 2018 case MSR_K7_EVNTSEL2:
2019 case MSR_K7_EVNTSEL3: 2019 case MSR_K7_EVNTSEL3:
2020 if (data != 0) 2020 if (data != 0)
2021 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " 2021 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2022 "0x%x data 0x%llx\n", msr, data); 2022 "0x%x data 0x%llx\n", msr, data);
2023 break; 2023 break;
2024 /* at least RHEL 4 unconditionally writes to the perfctr registers, 2024 /* at least RHEL 4 unconditionally writes to the perfctr registers,
2025 * so we ignore writes to make it happy. 2025 * so we ignore writes to make it happy.
2026 */ 2026 */
2027 case MSR_K7_PERFCTR0: 2027 case MSR_K7_PERFCTR0:
2028 case MSR_K7_PERFCTR1: 2028 case MSR_K7_PERFCTR1:
2029 case MSR_K7_PERFCTR2: 2029 case MSR_K7_PERFCTR2:
2030 case MSR_K7_PERFCTR3: 2030 case MSR_K7_PERFCTR3:
2031 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " 2031 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2032 "0x%x data 0x%llx\n", msr, data); 2032 "0x%x data 0x%llx\n", msr, data);
2033 break; 2033 break;
2034 case MSR_P6_PERFCTR0: 2034 case MSR_P6_PERFCTR0:
2035 case MSR_P6_PERFCTR1: 2035 case MSR_P6_PERFCTR1:
2036 pr = true; 2036 pr = true;
2037 case MSR_P6_EVNTSEL0: 2037 case MSR_P6_EVNTSEL0:
2038 case MSR_P6_EVNTSEL1: 2038 case MSR_P6_EVNTSEL1:
2039 if (kvm_pmu_msr(vcpu, msr)) 2039 if (kvm_pmu_msr(vcpu, msr))
2040 return kvm_pmu_set_msr(vcpu, msr, data); 2040 return kvm_pmu_set_msr(vcpu, msr, data);
2041 2041
2042 if (pr || data != 0) 2042 if (pr || data != 0)
2043 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 2043 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2044 "0x%x data 0x%llx\n", msr, data); 2044 "0x%x data 0x%llx\n", msr, data);
2045 break; 2045 break;
2046 case MSR_K7_CLK_CTL: 2046 case MSR_K7_CLK_CTL:
2047 /* 2047 /*
2048 * Ignore all writes to this no longer documented MSR. 2048 * Ignore all writes to this no longer documented MSR.
2049 * Writes are only relevant for old K7 processors, 2049 * Writes are only relevant for old K7 processors,
2050 * all pre-dating SVM, but a recommended workaround from 2050 * all pre-dating SVM, but a recommended workaround from
2051 * AMD for these chips. It is possible to specify the 2051 * AMD for these chips. It is possible to specify the
2052 * affected processor models on the command line, hence 2052 * affected processor models on the command line, hence
2053 * the need to ignore the workaround. 2053 * the need to ignore the workaround.
2054 */ 2054 */
2055 break; 2055 break;
2056 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 2056 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2057 if (kvm_hv_msr_partition_wide(msr)) { 2057 if (kvm_hv_msr_partition_wide(msr)) {
2058 int r; 2058 int r;
2059 mutex_lock(&vcpu->kvm->lock); 2059 mutex_lock(&vcpu->kvm->lock);
2060 r = set_msr_hyperv_pw(vcpu, msr, data); 2060 r = set_msr_hyperv_pw(vcpu, msr, data);
2061 mutex_unlock(&vcpu->kvm->lock); 2061 mutex_unlock(&vcpu->kvm->lock);
2062 return r; 2062 return r;
2063 } else 2063 } else
2064 return set_msr_hyperv(vcpu, msr, data); 2064 return set_msr_hyperv(vcpu, msr, data);
2065 break; 2065 break;
2066 case MSR_IA32_BBL_CR_CTL3: 2066 case MSR_IA32_BBL_CR_CTL3:
2067 /* Drop writes to this legacy MSR -- see rdmsr 2067 /* Drop writes to this legacy MSR -- see rdmsr
2068 * counterpart for further detail. 2068 * counterpart for further detail.
2069 */ 2069 */
2070 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 2070 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
2071 break; 2071 break;
2072 case MSR_AMD64_OSVW_ID_LENGTH: 2072 case MSR_AMD64_OSVW_ID_LENGTH:
2073 if (!guest_cpuid_has_osvw(vcpu)) 2073 if (!guest_cpuid_has_osvw(vcpu))
2074 return 1; 2074 return 1;
2075 vcpu->arch.osvw.length = data; 2075 vcpu->arch.osvw.length = data;
2076 break; 2076 break;
2077 case MSR_AMD64_OSVW_STATUS: 2077 case MSR_AMD64_OSVW_STATUS:
2078 if (!guest_cpuid_has_osvw(vcpu)) 2078 if (!guest_cpuid_has_osvw(vcpu))
2079 return 1; 2079 return 1;
2080 vcpu->arch.osvw.status = data; 2080 vcpu->arch.osvw.status = data;
2081 break; 2081 break;
2082 default: 2082 default:
2083 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 2083 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2084 return xen_hvm_config(vcpu, data); 2084 return xen_hvm_config(vcpu, data);
2085 if (kvm_pmu_msr(vcpu, msr)) 2085 if (kvm_pmu_msr(vcpu, msr))
2086 return kvm_pmu_set_msr(vcpu, msr, data); 2086 return kvm_pmu_set_msr(vcpu, msr, data);
2087 if (!ignore_msrs) { 2087 if (!ignore_msrs) {
2088 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2088 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2089 msr, data); 2089 msr, data);
2090 return 1; 2090 return 1;
2091 } else { 2091 } else {
2092 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 2092 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
2093 msr, data); 2093 msr, data);
2094 break; 2094 break;
2095 } 2095 }
2096 } 2096 }
2097 return 0; 2097 return 0;
2098 } 2098 }
2099 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 2099 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2100 2100
2101 2101
2102 /* 2102 /*
2103 * Reads an msr value (of 'msr_index') into 'pdata'. 2103 * Reads an msr value (of 'msr_index') into 'pdata'.
2104 * Returns 0 on success, non-0 otherwise. 2104 * Returns 0 on success, non-0 otherwise.
2105 * Assumes vcpu_load() was already called. 2105 * Assumes vcpu_load() was already called.
2106 */ 2106 */
2107 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2107 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2108 { 2108 {
2109 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2109 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2110 } 2110 }
2111 2111
2112 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2112 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2113 { 2113 {
2114 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 2114 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2115 2115
2116 if (!msr_mtrr_valid(msr)) 2116 if (!msr_mtrr_valid(msr))
2117 return 1; 2117 return 1;
2118 2118
2119 if (msr == MSR_MTRRdefType) 2119 if (msr == MSR_MTRRdefType)
2120 *pdata = vcpu->arch.mtrr_state.def_type + 2120 *pdata = vcpu->arch.mtrr_state.def_type +
2121 (vcpu->arch.mtrr_state.enabled << 10); 2121 (vcpu->arch.mtrr_state.enabled << 10);
2122 else if (msr == MSR_MTRRfix64K_00000) 2122 else if (msr == MSR_MTRRfix64K_00000)
2123 *pdata = p[0]; 2123 *pdata = p[0];
2124 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 2124 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2125 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 2125 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2126 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 2126 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2127 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 2127 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2128 else if (msr == MSR_IA32_CR_PAT) 2128 else if (msr == MSR_IA32_CR_PAT)
2129 *pdata = vcpu->arch.pat; 2129 *pdata = vcpu->arch.pat;
2130 else { /* Variable MTRRs */ 2130 else { /* Variable MTRRs */
2131 int idx, is_mtrr_mask; 2131 int idx, is_mtrr_mask;
2132 u64 *pt; 2132 u64 *pt;
2133 2133
2134 idx = (msr - 0x200) / 2; 2134 idx = (msr - 0x200) / 2;
2135 is_mtrr_mask = msr - 0x200 - 2 * idx; 2135 is_mtrr_mask = msr - 0x200 - 2 * idx;
2136 if (!is_mtrr_mask) 2136 if (!is_mtrr_mask)
2137 pt = 2137 pt =
2138 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 2138 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2139 else 2139 else
2140 pt = 2140 pt =
2141 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 2141 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2142 *pdata = *pt; 2142 *pdata = *pt;
2143 } 2143 }
2144 2144
2145 return 0; 2145 return 0;
2146 } 2146 }
2147 2147
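[Editor's note] The variable-MTRR branch of get_msr_mtrr() above relies on MSR 0x200 being MTRRphysBase0 and on base/mask registers alternating. A minimal stand-alone sketch of the same decode (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int msr = 0x205;                  /* MTRRphysMask2 */
	int idx = (msr - 0x200) / 2;               /* variable range index: 2 */
	int is_mtrr_mask = msr - 0x200 - 2 * idx;  /* 0 = base, 1 = mask */

	printf("MSR 0x%x -> var range %d, %s register\n",
	       msr, idx, is_mtrr_mask ? "mask" : "base");
	return 0;
}
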
2148 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2148 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2149 { 2149 {
2150 u64 data; 2150 u64 data;
2151 u64 mcg_cap = vcpu->arch.mcg_cap; 2151 u64 mcg_cap = vcpu->arch.mcg_cap;
2152 unsigned bank_num = mcg_cap & 0xff; 2152 unsigned bank_num = mcg_cap & 0xff;
2153 2153
2154 switch (msr) { 2154 switch (msr) {
2155 case MSR_IA32_P5_MC_ADDR: 2155 case MSR_IA32_P5_MC_ADDR:
2156 case MSR_IA32_P5_MC_TYPE: 2156 case MSR_IA32_P5_MC_TYPE:
2157 data = 0; 2157 data = 0;
2158 break; 2158 break;
2159 case MSR_IA32_MCG_CAP: 2159 case MSR_IA32_MCG_CAP:
2160 data = vcpu->arch.mcg_cap; 2160 data = vcpu->arch.mcg_cap;
2161 break; 2161 break;
2162 case MSR_IA32_MCG_CTL: 2162 case MSR_IA32_MCG_CTL:
2163 if (!(mcg_cap & MCG_CTL_P)) 2163 if (!(mcg_cap & MCG_CTL_P))
2164 return 1; 2164 return 1;
2165 data = vcpu->arch.mcg_ctl; 2165 data = vcpu->arch.mcg_ctl;
2166 break; 2166 break;
2167 case MSR_IA32_MCG_STATUS: 2167 case MSR_IA32_MCG_STATUS:
2168 data = vcpu->arch.mcg_status; 2168 data = vcpu->arch.mcg_status;
2169 break; 2169 break;
2170 default: 2170 default:
2171 if (msr >= MSR_IA32_MC0_CTL && 2171 if (msr >= MSR_IA32_MC0_CTL &&
2172 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 2172 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
2173 u32 offset = msr - MSR_IA32_MC0_CTL; 2173 u32 offset = msr - MSR_IA32_MC0_CTL;
2174 data = vcpu->arch.mce_banks[offset]; 2174 data = vcpu->arch.mce_banks[offset];
2175 break; 2175 break;
2176 } 2176 }
2177 return 1; 2177 return 1;
2178 } 2178 }
2179 *pdata = data; 2179 *pdata = data;
2180 return 0; 2180 return 0;
2181 } 2181 }
2182 2182
2183 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2183 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2184 { 2184 {
2185 u64 data = 0; 2185 u64 data = 0;
2186 struct kvm *kvm = vcpu->kvm; 2186 struct kvm *kvm = vcpu->kvm;
2187 2187
2188 switch (msr) { 2188 switch (msr) {
2189 case HV_X64_MSR_GUEST_OS_ID: 2189 case HV_X64_MSR_GUEST_OS_ID:
2190 data = kvm->arch.hv_guest_os_id; 2190 data = kvm->arch.hv_guest_os_id;
2191 break; 2191 break;
2192 case HV_X64_MSR_HYPERCALL: 2192 case HV_X64_MSR_HYPERCALL:
2193 data = kvm->arch.hv_hypercall; 2193 data = kvm->arch.hv_hypercall;
2194 break; 2194 break;
2195 default: 2195 default:
2196 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2196 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2197 return 1; 2197 return 1;
2198 } 2198 }
2199 2199
2200 *pdata = data; 2200 *pdata = data;
2201 return 0; 2201 return 0;
2202 } 2202 }
2203 2203
2204 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2204 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2205 { 2205 {
2206 u64 data = 0; 2206 u64 data = 0;
2207 2207
2208 switch (msr) { 2208 switch (msr) {
2209 case HV_X64_MSR_VP_INDEX: { 2209 case HV_X64_MSR_VP_INDEX: {
2210 int r; 2210 int r;
2211 struct kvm_vcpu *v; 2211 struct kvm_vcpu *v;
2212 kvm_for_each_vcpu(r, v, vcpu->kvm) 2212 kvm_for_each_vcpu(r, v, vcpu->kvm)
2213 if (v == vcpu) 2213 if (v == vcpu)
2214 data = r; 2214 data = r;
2215 break; 2215 break;
2216 } 2216 }
2217 case HV_X64_MSR_EOI: 2217 case HV_X64_MSR_EOI:
2218 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); 2218 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
2219 case HV_X64_MSR_ICR: 2219 case HV_X64_MSR_ICR:
2220 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 2220 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
2221 case HV_X64_MSR_TPR: 2221 case HV_X64_MSR_TPR:
2222 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 2222 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
2223 case HV_X64_MSR_APIC_ASSIST_PAGE: 2223 case HV_X64_MSR_APIC_ASSIST_PAGE:
2224 data = vcpu->arch.hv_vapic; 2224 data = vcpu->arch.hv_vapic;
2225 break; 2225 break;
2226 default: 2226 default:
2227 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2227 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2228 return 1; 2228 return 1;
2229 } 2229 }
2230 *pdata = data; 2230 *pdata = data;
2231 return 0; 2231 return 0;
2232 } 2232 }
2233 2233
2234 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2234 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2235 { 2235 {
2236 u64 data; 2236 u64 data;
2237 2237
2238 switch (msr) { 2238 switch (msr) {
2239 case MSR_IA32_PLATFORM_ID: 2239 case MSR_IA32_PLATFORM_ID:
2240 case MSR_IA32_EBL_CR_POWERON: 2240 case MSR_IA32_EBL_CR_POWERON:
2241 case MSR_IA32_DEBUGCTLMSR: 2241 case MSR_IA32_DEBUGCTLMSR:
2242 case MSR_IA32_LASTBRANCHFROMIP: 2242 case MSR_IA32_LASTBRANCHFROMIP:
2243 case MSR_IA32_LASTBRANCHTOIP: 2243 case MSR_IA32_LASTBRANCHTOIP:
2244 case MSR_IA32_LASTINTFROMIP: 2244 case MSR_IA32_LASTINTFROMIP:
2245 case MSR_IA32_LASTINTTOIP: 2245 case MSR_IA32_LASTINTTOIP:
2246 case MSR_K8_SYSCFG: 2246 case MSR_K8_SYSCFG:
2247 case MSR_K7_HWCR: 2247 case MSR_K7_HWCR:
2248 case MSR_VM_HSAVE_PA: 2248 case MSR_VM_HSAVE_PA:
2249 case MSR_K7_EVNTSEL0: 2249 case MSR_K7_EVNTSEL0:
2250 case MSR_K7_PERFCTR0: 2250 case MSR_K7_PERFCTR0:
2251 case MSR_K8_INT_PENDING_MSG: 2251 case MSR_K8_INT_PENDING_MSG:
2252 case MSR_AMD64_NB_CFG: 2252 case MSR_AMD64_NB_CFG:
2253 case MSR_FAM10H_MMIO_CONF_BASE: 2253 case MSR_FAM10H_MMIO_CONF_BASE:
2254 data = 0; 2254 data = 0;
2255 break; 2255 break;
2256 case MSR_P6_PERFCTR0: 2256 case MSR_P6_PERFCTR0:
2257 case MSR_P6_PERFCTR1: 2257 case MSR_P6_PERFCTR1:
2258 case MSR_P6_EVNTSEL0: 2258 case MSR_P6_EVNTSEL0:
2259 case MSR_P6_EVNTSEL1: 2259 case MSR_P6_EVNTSEL1:
2260 if (kvm_pmu_msr(vcpu, msr)) 2260 if (kvm_pmu_msr(vcpu, msr))
2261 return kvm_pmu_get_msr(vcpu, msr, pdata); 2261 return kvm_pmu_get_msr(vcpu, msr, pdata);
2262 data = 0; 2262 data = 0;
2263 break; 2263 break;
2264 case MSR_IA32_UCODE_REV: 2264 case MSR_IA32_UCODE_REV:
2265 data = 0x100000000ULL; 2265 data = 0x100000000ULL;
2266 break; 2266 break;
2267 case MSR_MTRRcap: 2267 case MSR_MTRRcap:
2268 data = 0x500 | KVM_NR_VAR_MTRR; 2268 data = 0x500 | KVM_NR_VAR_MTRR;
2269 break; 2269 break;
2270 case 0x200 ... 0x2ff: 2270 case 0x200 ... 0x2ff:
2271 return get_msr_mtrr(vcpu, msr, pdata); 2271 return get_msr_mtrr(vcpu, msr, pdata);
2272 case 0xcd: /* fsb frequency */ 2272 case 0xcd: /* fsb frequency */
2273 data = 3; 2273 data = 3;
2274 break; 2274 break;
2275 /* 2275 /*
2276 * MSR_EBC_FREQUENCY_ID 2276 * MSR_EBC_FREQUENCY_ID
2277 * Conservative value valid for even the basic CPU models. 2277 * Conservative value valid for even the basic CPU models.
2278 * Models 0,1: 000 in bits 23:21 indicating a bus speed of 2278 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2279 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, 2279 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
2280 * and 266MHz for model 3, or 4. Set Core Clock 2280 * and 266MHz for model 3, or 4. Set Core Clock
2281 * Frequency to System Bus Frequency Ratio to 1 (bits 2281 * Frequency to System Bus Frequency Ratio to 1 (bits
2282 * 31:24) even though these are only valid for CPU 2282 * 31:24) even though these are only valid for CPU
2283 * models > 2, however guests may end up dividing or 2283 * models > 2, however guests may end up dividing or
2284 * multiplying by zero otherwise. 2284 * multiplying by zero otherwise.
2285 */ 2285 */
2286 case MSR_EBC_FREQUENCY_ID: 2286 case MSR_EBC_FREQUENCY_ID:
2287 data = 1 << 24; 2287 data = 1 << 24;
2288 break; 2288 break;
2289 case MSR_IA32_APICBASE: 2289 case MSR_IA32_APICBASE:
2290 data = kvm_get_apic_base(vcpu); 2290 data = kvm_get_apic_base(vcpu);
2291 break; 2291 break;
2292 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 2292 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2293 return kvm_x2apic_msr_read(vcpu, msr, pdata); 2293 return kvm_x2apic_msr_read(vcpu, msr, pdata);
2294 break; 2294 break;
2295 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
2296 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
2297 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST: 2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break; 2300 break;
2301 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
2302 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
2303 break; 2303 break;
2304 case MSR_IA32_PERF_STATUS: 2304 case MSR_IA32_PERF_STATUS:
2305 /* TSC increment by tick */ 2305 /* TSC increment by tick */
2306 data = 1000ULL; 2306 data = 1000ULL;
2307 /* CPU multiplier */ 2307 /* CPU multiplier */
2308 data |= (((uint64_t)4ULL) << 40); 2308 data |= (((uint64_t)4ULL) << 40);
2309 break; 2309 break;
2310 case MSR_EFER: 2310 case MSR_EFER:
2311 data = vcpu->arch.efer; 2311 data = vcpu->arch.efer;
2312 break; 2312 break;
2313 case MSR_KVM_WALL_CLOCK: 2313 case MSR_KVM_WALL_CLOCK:
2314 case MSR_KVM_WALL_CLOCK_NEW: 2314 case MSR_KVM_WALL_CLOCK_NEW:
2315 data = vcpu->kvm->arch.wall_clock; 2315 data = vcpu->kvm->arch.wall_clock;
2316 break; 2316 break;
2317 case MSR_KVM_SYSTEM_TIME: 2317 case MSR_KVM_SYSTEM_TIME:
2318 case MSR_KVM_SYSTEM_TIME_NEW: 2318 case MSR_KVM_SYSTEM_TIME_NEW:
2319 data = vcpu->arch.time; 2319 data = vcpu->arch.time;
2320 break; 2320 break;
2321 case MSR_KVM_ASYNC_PF_EN: 2321 case MSR_KVM_ASYNC_PF_EN:
2322 data = vcpu->arch.apf.msr_val; 2322 data = vcpu->arch.apf.msr_val;
2323 break; 2323 break;
2324 case MSR_KVM_STEAL_TIME: 2324 case MSR_KVM_STEAL_TIME:
2325 data = vcpu->arch.st.msr_val; 2325 data = vcpu->arch.st.msr_val;
2326 break; 2326 break;
2327 case MSR_KVM_PV_EOI_EN: 2327 case MSR_KVM_PV_EOI_EN:
2328 data = vcpu->arch.pv_eoi.msr_val; 2328 data = vcpu->arch.pv_eoi.msr_val;
2329 break; 2329 break;
2330 case MSR_IA32_P5_MC_ADDR: 2330 case MSR_IA32_P5_MC_ADDR:
2331 case MSR_IA32_P5_MC_TYPE: 2331 case MSR_IA32_P5_MC_TYPE:
2332 case MSR_IA32_MCG_CAP: 2332 case MSR_IA32_MCG_CAP:
2333 case MSR_IA32_MCG_CTL: 2333 case MSR_IA32_MCG_CTL:
2334 case MSR_IA32_MCG_STATUS: 2334 case MSR_IA32_MCG_STATUS:
2335 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 2335 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2336 return get_msr_mce(vcpu, msr, pdata); 2336 return get_msr_mce(vcpu, msr, pdata);
2337 case MSR_K7_CLK_CTL: 2337 case MSR_K7_CLK_CTL:
2338 /* 2338 /*
2339 * Provide expected ramp-up count for K7. All other 2339 * Provide expected ramp-up count for K7. All other
2340 * are set to zero, indicating minimum divisors for 2340 * are set to zero, indicating minimum divisors for
2341 * every field. 2341 * every field.
2342 * 2342 *
2343 * This prevents guest kernels on AMD host with CPU 2343 * This prevents guest kernels on AMD host with CPU
2344 * type 6, model 8 and higher from exploding due to 2344 * type 6, model 8 and higher from exploding due to
2345 * the rdmsr failing. 2345 * the rdmsr failing.
2346 */ 2346 */
2347 data = 0x20000000; 2347 data = 0x20000000;
2348 break; 2348 break;
2349 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 2349 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2350 if (kvm_hv_msr_partition_wide(msr)) { 2350 if (kvm_hv_msr_partition_wide(msr)) {
2351 int r; 2351 int r;
2352 mutex_lock(&vcpu->kvm->lock); 2352 mutex_lock(&vcpu->kvm->lock);
2353 r = get_msr_hyperv_pw(vcpu, msr, pdata); 2353 r = get_msr_hyperv_pw(vcpu, msr, pdata);
2354 mutex_unlock(&vcpu->kvm->lock); 2354 mutex_unlock(&vcpu->kvm->lock);
2355 return r; 2355 return r;
2356 } else 2356 } else
2357 return get_msr_hyperv(vcpu, msr, pdata); 2357 return get_msr_hyperv(vcpu, msr, pdata);
2358 break; 2358 break;
2359 case MSR_IA32_BBL_CR_CTL3: 2359 case MSR_IA32_BBL_CR_CTL3:
2360 /* This legacy MSR exists but isn't fully documented in current 2360 /* This legacy MSR exists but isn't fully documented in current
2361 * silicon. It is however accessed by winxp in very narrow 2361 * silicon. It is however accessed by winxp in very narrow
2362 * scenarios where it sets bit #19, itself documented as 2362 * scenarios where it sets bit #19, itself documented as
2363 * a "reserved" bit. Best effort attempt to source coherent 2363 * a "reserved" bit. Best effort attempt to source coherent
2364 * read data here should the balance of the register be 2364 * read data here should the balance of the register be
2365 * interpreted by the guest: 2365 * interpreted by the guest:
2366 * 2366 *
2367 * L2 cache control register 3: 64GB range, 256KB size, 2367 * L2 cache control register 3: 64GB range, 256KB size,
2368 * enabled, latency 0x1, configured 2368 * enabled, latency 0x1, configured
2369 */ 2369 */
2370 data = 0xbe702111; 2370 data = 0xbe702111;
2371 break; 2371 break;
2372 case MSR_AMD64_OSVW_ID_LENGTH: 2372 case MSR_AMD64_OSVW_ID_LENGTH:
2373 if (!guest_cpuid_has_osvw(vcpu)) 2373 if (!guest_cpuid_has_osvw(vcpu))
2374 return 1; 2374 return 1;
2375 data = vcpu->arch.osvw.length; 2375 data = vcpu->arch.osvw.length;
2376 break; 2376 break;
2377 case MSR_AMD64_OSVW_STATUS: 2377 case MSR_AMD64_OSVW_STATUS:
2378 if (!guest_cpuid_has_osvw(vcpu)) 2378 if (!guest_cpuid_has_osvw(vcpu))
2379 return 1; 2379 return 1;
2380 data = vcpu->arch.osvw.status; 2380 data = vcpu->arch.osvw.status;
2381 break; 2381 break;
2382 default: 2382 default:
2383 if (kvm_pmu_msr(vcpu, msr)) 2383 if (kvm_pmu_msr(vcpu, msr))
2384 return kvm_pmu_get_msr(vcpu, msr, pdata); 2384 return kvm_pmu_get_msr(vcpu, msr, pdata);
2385 if (!ignore_msrs) { 2385 if (!ignore_msrs) {
2386 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2386 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2387 return 1; 2387 return 1;
2388 } else { 2388 } else {
2389 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2389 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2390 data = 0; 2390 data = 0;
2391 } 2391 }
2392 break; 2392 break;
2393 } 2393 }
2394 *pdata = data; 2394 *pdata = data;
2395 return 0; 2395 return 0;
2396 } 2396 }
2397 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 2397 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2398 2398
2399 /* 2399 /*
2400 * Read or write a bunch of msrs. All parameters are kernel addresses. 2400 * Read or write a bunch of msrs. All parameters are kernel addresses.
2401 * 2401 *
2402 * @return number of msrs set successfully. 2402 * @return number of msrs set successfully.
2403 */ 2403 */
2404 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 2404 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2405 struct kvm_msr_entry *entries, 2405 struct kvm_msr_entry *entries,
2406 int (*do_msr)(struct kvm_vcpu *vcpu, 2406 int (*do_msr)(struct kvm_vcpu *vcpu,
2407 unsigned index, u64 *data)) 2407 unsigned index, u64 *data))
2408 { 2408 {
2409 int i, idx; 2409 int i, idx;
2410 2410
2411 idx = srcu_read_lock(&vcpu->kvm->srcu); 2411 idx = srcu_read_lock(&vcpu->kvm->srcu);
2412 for (i = 0; i < msrs->nmsrs; ++i) 2412 for (i = 0; i < msrs->nmsrs; ++i)
2413 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 2413 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2414 break; 2414 break;
2415 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2415 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2416 2416
2417 return i; 2417 return i;
2418 } 2418 }
2419 2419
2420 /* 2420 /*
2421 * Read or write a bunch of msrs. Parameters are user addresses. 2421 * Read or write a bunch of msrs. Parameters are user addresses.
2422 * 2422 *
2423 * @return number of msrs set successfully. 2423 * @return number of msrs set successfully.
2424 */ 2424 */
2425 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 2425 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2426 int (*do_msr)(struct kvm_vcpu *vcpu, 2426 int (*do_msr)(struct kvm_vcpu *vcpu,
2427 unsigned index, u64 *data), 2427 unsigned index, u64 *data),
2428 int writeback) 2428 int writeback)
2429 { 2429 {
2430 struct kvm_msrs msrs; 2430 struct kvm_msrs msrs;
2431 struct kvm_msr_entry *entries; 2431 struct kvm_msr_entry *entries;
2432 int r, n; 2432 int r, n;
2433 unsigned size; 2433 unsigned size;
2434 2434
2435 r = -EFAULT; 2435 r = -EFAULT;
2436 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 2436 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2437 goto out; 2437 goto out;
2438 2438
2439 r = -E2BIG; 2439 r = -E2BIG;
2440 if (msrs.nmsrs >= MAX_IO_MSRS) 2440 if (msrs.nmsrs >= MAX_IO_MSRS)
2441 goto out; 2441 goto out;
2442 2442
2443 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 2443 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2444 entries = memdup_user(user_msrs->entries, size); 2444 entries = memdup_user(user_msrs->entries, size);
2445 if (IS_ERR(entries)) { 2445 if (IS_ERR(entries)) {
2446 r = PTR_ERR(entries); 2446 r = PTR_ERR(entries);
2447 goto out; 2447 goto out;
2448 } 2448 }
2449 2449
2450 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 2450 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2451 if (r < 0) 2451 if (r < 0)
2452 goto out_free; 2452 goto out_free;
2453 2453
2454 r = -EFAULT; 2454 r = -EFAULT;
2455 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 2455 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2456 goto out_free; 2456 goto out_free;
2457 2457
2458 r = n; 2458 r = n;
2459 2459
2460 out_free: 2460 out_free:
2461 kfree(entries); 2461 kfree(entries);
2462 out: 2462 out:
2463 return r; 2463 return r;
2464 } 2464 }
2465 2465
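[Editor's note] msr_io() above backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls, and its return value is the number of entries processed rather than a plain success code. A hedged userspace sketch of the read path, assuming vcpu_fd is a vCPU file descriptor already obtained via KVM_CREATE_VCPU:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static unsigned long long read_one_msr(int vcpu_fd, unsigned int index)
{
	struct kvm_msrs *msrs;
	unsigned long long data;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs) {
		perror("calloc");
		exit(1);
	}
	msrs->nmsrs = 1;
	msrs->entries[0].index = index;

	/* The ioctl returns how many MSRs were actually read. */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, msrs) != 1) {
		perror("KVM_GET_MSRS");
		exit(1);
	}
	data = msrs->entries[0].data;
	free(msrs);
	return data;
}
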
2466 int kvm_dev_ioctl_check_extension(long ext) 2466 int kvm_dev_ioctl_check_extension(long ext)
2467 { 2467 {
2468 int r; 2468 int r;
2469 2469
2470 switch (ext) { 2470 switch (ext) {
2471 case KVM_CAP_IRQCHIP: 2471 case KVM_CAP_IRQCHIP:
2472 case KVM_CAP_HLT: 2472 case KVM_CAP_HLT:
2473 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 2473 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2474 case KVM_CAP_SET_TSS_ADDR: 2474 case KVM_CAP_SET_TSS_ADDR:
2475 case KVM_CAP_EXT_CPUID: 2475 case KVM_CAP_EXT_CPUID:
2476 case KVM_CAP_CLOCKSOURCE: 2476 case KVM_CAP_CLOCKSOURCE:
2477 case KVM_CAP_PIT: 2477 case KVM_CAP_PIT:
2478 case KVM_CAP_NOP_IO_DELAY: 2478 case KVM_CAP_NOP_IO_DELAY:
2479 case KVM_CAP_MP_STATE: 2479 case KVM_CAP_MP_STATE:
2480 case KVM_CAP_SYNC_MMU: 2480 case KVM_CAP_SYNC_MMU:
2481 case KVM_CAP_USER_NMI: 2481 case KVM_CAP_USER_NMI:
2482 case KVM_CAP_REINJECT_CONTROL: 2482 case KVM_CAP_REINJECT_CONTROL:
2483 case KVM_CAP_IRQ_INJECT_STATUS: 2483 case KVM_CAP_IRQ_INJECT_STATUS:
2484 case KVM_CAP_ASSIGN_DEV_IRQ: 2484 case KVM_CAP_ASSIGN_DEV_IRQ:
2485 case KVM_CAP_IRQFD: 2485 case KVM_CAP_IRQFD:
2486 case KVM_CAP_IOEVENTFD: 2486 case KVM_CAP_IOEVENTFD:
2487 case KVM_CAP_PIT2: 2487 case KVM_CAP_PIT2:
2488 case KVM_CAP_PIT_STATE2: 2488 case KVM_CAP_PIT_STATE2:
2489 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 2489 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2490 case KVM_CAP_XEN_HVM: 2490 case KVM_CAP_XEN_HVM:
2491 case KVM_CAP_ADJUST_CLOCK: 2491 case KVM_CAP_ADJUST_CLOCK:
2492 case KVM_CAP_VCPU_EVENTS: 2492 case KVM_CAP_VCPU_EVENTS:
2493 case KVM_CAP_HYPERV: 2493 case KVM_CAP_HYPERV:
2494 case KVM_CAP_HYPERV_VAPIC: 2494 case KVM_CAP_HYPERV_VAPIC:
2495 case KVM_CAP_HYPERV_SPIN: 2495 case KVM_CAP_HYPERV_SPIN:
2496 case KVM_CAP_PCI_SEGMENT: 2496 case KVM_CAP_PCI_SEGMENT:
2497 case KVM_CAP_DEBUGREGS: 2497 case KVM_CAP_DEBUGREGS:
2498 case KVM_CAP_X86_ROBUST_SINGLESTEP: 2498 case KVM_CAP_X86_ROBUST_SINGLESTEP:
2499 case KVM_CAP_XSAVE: 2499 case KVM_CAP_XSAVE:
2500 case KVM_CAP_ASYNC_PF: 2500 case KVM_CAP_ASYNC_PF:
2501 case KVM_CAP_GET_TSC_KHZ: 2501 case KVM_CAP_GET_TSC_KHZ:
2502 case KVM_CAP_PCI_2_3: 2502 case KVM_CAP_PCI_2_3:
2503 case KVM_CAP_KVMCLOCK_CTRL: 2503 case KVM_CAP_KVMCLOCK_CTRL:
2504 case KVM_CAP_READONLY_MEM: 2504 case KVM_CAP_READONLY_MEM:
2505 case KVM_CAP_IRQFD_RESAMPLE: 2505 case KVM_CAP_IRQFD_RESAMPLE:
2506 r = 1; 2506 r = 1;
2507 break; 2507 break;
2508 case KVM_CAP_COALESCED_MMIO: 2508 case KVM_CAP_COALESCED_MMIO:
2509 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 2509 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2510 break; 2510 break;
2511 case KVM_CAP_VAPIC: 2511 case KVM_CAP_VAPIC:
2512 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 2512 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2513 break; 2513 break;
2514 case KVM_CAP_NR_VCPUS: 2514 case KVM_CAP_NR_VCPUS:
2515 r = KVM_SOFT_MAX_VCPUS; 2515 r = KVM_SOFT_MAX_VCPUS;
2516 break; 2516 break;
2517 case KVM_CAP_MAX_VCPUS: 2517 case KVM_CAP_MAX_VCPUS:
2518 r = KVM_MAX_VCPUS; 2518 r = KVM_MAX_VCPUS;
2519 break; 2519 break;
2520 case KVM_CAP_NR_MEMSLOTS: 2520 case KVM_CAP_NR_MEMSLOTS:
2521 r = KVM_USER_MEM_SLOTS; 2521 r = KVM_USER_MEM_SLOTS;
2522 break; 2522 break;
2523 case KVM_CAP_PV_MMU: /* obsolete */ 2523 case KVM_CAP_PV_MMU: /* obsolete */
2524 r = 0; 2524 r = 0;
2525 break; 2525 break;
2526 case KVM_CAP_IOMMU: 2526 case KVM_CAP_IOMMU:
2527 r = iommu_present(&pci_bus_type); 2527 r = iommu_present(&pci_bus_type);
2528 break; 2528 break;
2529 case KVM_CAP_MCE: 2529 case KVM_CAP_MCE:
2530 r = KVM_MAX_MCE_BANKS; 2530 r = KVM_MAX_MCE_BANKS;
2531 break; 2531 break;
2532 case KVM_CAP_XCRS: 2532 case KVM_CAP_XCRS:
2533 r = cpu_has_xsave; 2533 r = cpu_has_xsave;
2534 break; 2534 break;
2535 case KVM_CAP_TSC_CONTROL: 2535 case KVM_CAP_TSC_CONTROL:
2536 r = kvm_has_tsc_control; 2536 r = kvm_has_tsc_control;
2537 break; 2537 break;
2538 case KVM_CAP_TSC_DEADLINE_TIMER: 2538 case KVM_CAP_TSC_DEADLINE_TIMER:
2539 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); 2539 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2540 break; 2540 break;
2541 default: 2541 default:
2542 r = 0; 2542 r = 0;
2543 break; 2543 break;
2544 } 2544 }
2545 return r; 2545 return r;
2546 2546
2547 } 2547 }
2548 2548
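[Editor's note] The capability switch in kvm_dev_ioctl_check_extension() above is what userspace reaches through KVM_CHECK_EXTENSION. A hedged sketch, not from this commit, querying the memslot limit reported via KVM_CAP_NR_MEMSLOTS:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int slots;

	if (kvm_fd < 0) {
		perror("/dev/kvm");
		return 1;
	}

	/* Returns KVM_USER_MEM_SLOTS for this capability, 0 if unknown. */
	slots = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);
	printf("user-visible memory slots: %d\n", slots);
	return 0;
}
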
2549 long kvm_arch_dev_ioctl(struct file *filp, 2549 long kvm_arch_dev_ioctl(struct file *filp,
2550 unsigned int ioctl, unsigned long arg) 2550 unsigned int ioctl, unsigned long arg)
2551 { 2551 {
2552 void __user *argp = (void __user *)arg; 2552 void __user *argp = (void __user *)arg;
2553 long r; 2553 long r;
2554 2554
2555 switch (ioctl) { 2555 switch (ioctl) {
2556 case KVM_GET_MSR_INDEX_LIST: { 2556 case KVM_GET_MSR_INDEX_LIST: {
2557 struct kvm_msr_list __user *user_msr_list = argp; 2557 struct kvm_msr_list __user *user_msr_list = argp;
2558 struct kvm_msr_list msr_list; 2558 struct kvm_msr_list msr_list;
2559 unsigned n; 2559 unsigned n;
2560 2560
2561 r = -EFAULT; 2561 r = -EFAULT;
2562 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 2562 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2563 goto out; 2563 goto out;
2564 n = msr_list.nmsrs; 2564 n = msr_list.nmsrs;
2565 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 2565 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2566 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 2566 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2567 goto out; 2567 goto out;
2568 r = -E2BIG; 2568 r = -E2BIG;
2569 if (n < msr_list.nmsrs) 2569 if (n < msr_list.nmsrs)
2570 goto out; 2570 goto out;
2571 r = -EFAULT; 2571 r = -EFAULT;
2572 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 2572 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2573 num_msrs_to_save * sizeof(u32))) 2573 num_msrs_to_save * sizeof(u32)))
2574 goto out; 2574 goto out;
2575 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 2575 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2576 &emulated_msrs, 2576 &emulated_msrs,
2577 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 2577 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2578 goto out; 2578 goto out;
2579 r = 0; 2579 r = 0;
2580 break; 2580 break;
2581 } 2581 }
2582 case KVM_GET_SUPPORTED_CPUID: { 2582 case KVM_GET_SUPPORTED_CPUID: {
2583 struct kvm_cpuid2 __user *cpuid_arg = argp; 2583 struct kvm_cpuid2 __user *cpuid_arg = argp;
2584 struct kvm_cpuid2 cpuid; 2584 struct kvm_cpuid2 cpuid;
2585 2585
2586 r = -EFAULT; 2586 r = -EFAULT;
2587 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2587 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2588 goto out; 2588 goto out;
2589 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 2589 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
2590 cpuid_arg->entries); 2590 cpuid_arg->entries);
2591 if (r) 2591 if (r)
2592 goto out; 2592 goto out;
2593 2593
2594 r = -EFAULT; 2594 r = -EFAULT;
2595 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2595 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2596 goto out; 2596 goto out;
2597 r = 0; 2597 r = 0;
2598 break; 2598 break;
2599 } 2599 }
2600 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 2600 case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2601 u64 mce_cap; 2601 u64 mce_cap;
2602 2602
2603 mce_cap = KVM_MCE_CAP_SUPPORTED; 2603 mce_cap = KVM_MCE_CAP_SUPPORTED;
2604 r = -EFAULT; 2604 r = -EFAULT;
2605 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 2605 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2606 goto out; 2606 goto out;
2607 r = 0; 2607 r = 0;
2608 break; 2608 break;
2609 } 2609 }
2610 default: 2610 default:
2611 r = -EINVAL; 2611 r = -EINVAL;
2612 } 2612 }
2613 out: 2613 out:
2614 return r; 2614 return r;
2615 } 2615 }
2616 2616
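[Editor's note] The KVM_GET_MSR_INDEX_LIST case at the top of kvm_arch_dev_ioctl() above deliberately writes the required count back to userspace before failing with -E2BIG, so callers usually probe once with nmsrs = 0 and then retry with a right-sized buffer. A hedged userspace sketch of that pattern (kvm_fd is assumed to be an open /dev/kvm descriptor):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe, *list;

	/* First pass: learn how many indices the kernel wants to return. */
	memset(&probe, 0, sizeof(probe));
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe); /* fails with E2BIG */

	/* Second pass: provide enough room for all of them. */
	list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}
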
2617 static void wbinvd_ipi(void *garbage) 2617 static void wbinvd_ipi(void *garbage)
2618 { 2618 {
2619 wbinvd(); 2619 wbinvd();
2620 } 2620 }
2621 2621
2622 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 2622 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2623 { 2623 {
2624 return vcpu->kvm->arch.iommu_domain && 2624 return vcpu->kvm->arch.iommu_domain &&
2625 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); 2625 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
2626 } 2626 }
2627 2627
2628 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2628 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2629 { 2629 {
2630 /* Address WBINVD may be executed by guest */ 2630 /* Address WBINVD may be executed by guest */
2631 if (need_emulate_wbinvd(vcpu)) { 2631 if (need_emulate_wbinvd(vcpu)) {
2632 if (kvm_x86_ops->has_wbinvd_exit()) 2632 if (kvm_x86_ops->has_wbinvd_exit())
2633 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 2633 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2634 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 2634 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2635 smp_call_function_single(vcpu->cpu, 2635 smp_call_function_single(vcpu->cpu,
2636 wbinvd_ipi, NULL, 1); 2636 wbinvd_ipi, NULL, 1);
2637 } 2637 }
2638 2638
2639 kvm_x86_ops->vcpu_load(vcpu, cpu); 2639 kvm_x86_ops->vcpu_load(vcpu, cpu);
2640 2640
2641 /* Apply any externally detected TSC adjustments (due to suspend) */ 2641 /* Apply any externally detected TSC adjustments (due to suspend) */
2642 if (unlikely(vcpu->arch.tsc_offset_adjustment)) { 2642 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2643 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); 2643 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2644 vcpu->arch.tsc_offset_adjustment = 0; 2644 vcpu->arch.tsc_offset_adjustment = 0;
2645 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 2645 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2646 } 2646 }
2647 2647
2648 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2648 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2649 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 2649 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2650 native_read_tsc() - vcpu->arch.last_host_tsc; 2650 native_read_tsc() - vcpu->arch.last_host_tsc;
2651 if (tsc_delta < 0) 2651 if (tsc_delta < 0)
2652 mark_tsc_unstable("KVM discovered backwards TSC"); 2652 mark_tsc_unstable("KVM discovered backwards TSC");
2653 if (check_tsc_unstable()) { 2653 if (check_tsc_unstable()) {
2654 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, 2654 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2655 vcpu->arch.last_guest_tsc); 2655 vcpu->arch.last_guest_tsc);
2656 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2657 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2658 } 2658 }
2659 /* 2659 /*
2660 * On a host with synchronized TSC, there is no need to update 2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration 2661 * kvmclock on vcpu->cpu migration
2662 */ 2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2665 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2666 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2667 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
2668 } 2668 }
2669 2669
2670 accumulate_steal_time(vcpu); 2670 accumulate_steal_time(vcpu);
2671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2672 } 2672 }
2673 2673
2674 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2674 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2675 { 2675 {
2676 kvm_x86_ops->vcpu_put(vcpu); 2676 kvm_x86_ops->vcpu_put(vcpu);
2677 kvm_put_guest_fpu(vcpu); 2677 kvm_put_guest_fpu(vcpu);
2678 vcpu->arch.last_host_tsc = native_read_tsc(); 2678 vcpu->arch.last_host_tsc = native_read_tsc();
2679 } 2679 }
2680 2680
2681 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2681 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2682 struct kvm_lapic_state *s) 2682 struct kvm_lapic_state *s)
2683 { 2683 {
2684 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2684 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2685 2685
2686 return 0; 2686 return 0;
2687 } 2687 }
2688 2688
2689 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2689 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2690 struct kvm_lapic_state *s) 2690 struct kvm_lapic_state *s)
2691 { 2691 {
2692 kvm_apic_post_state_restore(vcpu, s); 2692 kvm_apic_post_state_restore(vcpu, s);
2693 update_cr8_intercept(vcpu); 2693 update_cr8_intercept(vcpu);
2694 2694
2695 return 0; 2695 return 0;
2696 } 2696 }
2697 2697
2698 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2698 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2699 struct kvm_interrupt *irq) 2699 struct kvm_interrupt *irq)
2700 { 2700 {
2701 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) 2701 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
2702 return -EINVAL; 2702 return -EINVAL;
2703 if (irqchip_in_kernel(vcpu->kvm)) 2703 if (irqchip_in_kernel(vcpu->kvm))
2704 return -ENXIO; 2704 return -ENXIO;
2705 2705
2706 kvm_queue_interrupt(vcpu, irq->irq, false); 2706 kvm_queue_interrupt(vcpu, irq->irq, false);
2707 kvm_make_request(KVM_REQ_EVENT, vcpu); 2707 kvm_make_request(KVM_REQ_EVENT, vcpu);
2708 2708
2709 return 0; 2709 return 0;
2710 } 2710 }
2711 2711
2712 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2712 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2713 { 2713 {
2714 kvm_inject_nmi(vcpu); 2714 kvm_inject_nmi(vcpu);
2715 2715
2716 return 0; 2716 return 0;
2717 } 2717 }
2718 2718
2719 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 2719 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2720 struct kvm_tpr_access_ctl *tac) 2720 struct kvm_tpr_access_ctl *tac)
2721 { 2721 {
2722 if (tac->flags) 2722 if (tac->flags)
2723 return -EINVAL; 2723 return -EINVAL;
2724 vcpu->arch.tpr_access_reporting = !!tac->enabled; 2724 vcpu->arch.tpr_access_reporting = !!tac->enabled;
2725 return 0; 2725 return 0;
2726 } 2726 }
2727 2727
2728 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 2728 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2729 u64 mcg_cap) 2729 u64 mcg_cap)
2730 { 2730 {
2731 int r; 2731 int r;
2732 unsigned bank_num = mcg_cap & 0xff, bank; 2732 unsigned bank_num = mcg_cap & 0xff, bank;
2733 2733
2734 r = -EINVAL; 2734 r = -EINVAL;
2735 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2735 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2736 goto out; 2736 goto out;
2737 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 2737 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2738 goto out; 2738 goto out;
2739 r = 0; 2739 r = 0;
2740 vcpu->arch.mcg_cap = mcg_cap; 2740 vcpu->arch.mcg_cap = mcg_cap;
2741 /* Init IA32_MCG_CTL to all 1s */ 2741 /* Init IA32_MCG_CTL to all 1s */
2742 if (mcg_cap & MCG_CTL_P) 2742 if (mcg_cap & MCG_CTL_P)
2743 vcpu->arch.mcg_ctl = ~(u64)0; 2743 vcpu->arch.mcg_ctl = ~(u64)0;
2744 /* Init IA32_MCi_CTL to all 1s */ 2744 /* Init IA32_MCi_CTL to all 1s */
2745 for (bank = 0; bank < bank_num; bank++) 2745 for (bank = 0; bank < bank_num; bank++)
2746 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2746 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2747 out: 2747 out:
2748 return r; 2748 return r;
2749 } 2749 }
2750 2750
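[Editor's note] kvm_vcpu_ioctl_x86_setup_mce() above validates the bank count carried in the low byte of mcg_cap and rejects capability bits the host does not advertise. A hedged sketch of how userspace might pair it with the KVM_X86_GET_MCE_CAP_SUPPORTED ioctl handled earlier (kvm_fd is the /dev/kvm descriptor, vcpu_fd a vCPU descriptor, both assumed already open):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sketch_setup_mce(int kvm_fd, int vcpu_fd)
{
	__u64 mcg_cap;

	if (ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap) < 0)
		return -1;

	/*
	 * The supported-cap value carries only the control/recovery flags;
	 * the caller still chooses a bank count (low byte, which must be
	 * non-zero and below KVM_MAX_MCE_BANKS) before handing it over.
	 */
	mcg_cap |= 10;	/* a modest, commonly used bank count */

	return ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
}
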
2751 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 2751 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2752 struct kvm_x86_mce *mce) 2752 struct kvm_x86_mce *mce)
2753 { 2753 {
2754 u64 mcg_cap = vcpu->arch.mcg_cap; 2754 u64 mcg_cap = vcpu->arch.mcg_cap;
2755 unsigned bank_num = mcg_cap & 0xff; 2755 unsigned bank_num = mcg_cap & 0xff;
2756 u64 *banks = vcpu->arch.mce_banks; 2756 u64 *banks = vcpu->arch.mce_banks;
2757 2757
2758 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 2758 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2759 return -EINVAL; 2759 return -EINVAL;
2760 /* 2760 /*
2761 * if IA32_MCG_CTL is not all 1s, the uncorrected error 2761 * if IA32_MCG_CTL is not all 1s, the uncorrected error
2762 * reporting is disabled 2762 * reporting is disabled
2763 */ 2763 */
2764 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 2764 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2765 vcpu->arch.mcg_ctl != ~(u64)0) 2765 vcpu->arch.mcg_ctl != ~(u64)0)
2766 return 0; 2766 return 0;
2767 banks += 4 * mce->bank; 2767 banks += 4 * mce->bank;
2768 /* 2768 /*
2769 * if IA32_MCi_CTL is not all 1s, the uncorrected error 2769 * if IA32_MCi_CTL is not all 1s, the uncorrected error
2770 * reporting is disabled for the bank 2770 * reporting is disabled for the bank
2771 */ 2771 */
2772 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 2772 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2773 return 0; 2773 return 0;
2774 if (mce->status & MCI_STATUS_UC) { 2774 if (mce->status & MCI_STATUS_UC) {
2775 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2775 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2776 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2776 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2777 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2777 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2778 return 0; 2778 return 0;
2779 } 2779 }
2780 if (banks[1] & MCI_STATUS_VAL) 2780 if (banks[1] & MCI_STATUS_VAL)
2781 mce->status |= MCI_STATUS_OVER; 2781 mce->status |= MCI_STATUS_OVER;
2782 banks[2] = mce->addr; 2782 banks[2] = mce->addr;
2783 banks[3] = mce->misc; 2783 banks[3] = mce->misc;
2784 vcpu->arch.mcg_status = mce->mcg_status; 2784 vcpu->arch.mcg_status = mce->mcg_status;
2785 banks[1] = mce->status; 2785 banks[1] = mce->status;
2786 kvm_queue_exception(vcpu, MC_VECTOR); 2786 kvm_queue_exception(vcpu, MC_VECTOR);
2787 } else if (!(banks[1] & MCI_STATUS_VAL) 2787 } else if (!(banks[1] & MCI_STATUS_VAL)
2788 || !(banks[1] & MCI_STATUS_UC)) { 2788 || !(banks[1] & MCI_STATUS_UC)) {
2789 if (banks[1] & MCI_STATUS_VAL) 2789 if (banks[1] & MCI_STATUS_VAL)
2790 mce->status |= MCI_STATUS_OVER; 2790 mce->status |= MCI_STATUS_OVER;
2791 banks[2] = mce->addr; 2791 banks[2] = mce->addr;
2792 banks[3] = mce->misc; 2792 banks[3] = mce->misc;
2793 banks[1] = mce->status; 2793 banks[1] = mce->status;
2794 } else 2794 } else
2795 banks[1] |= MCI_STATUS_OVER; 2795 banks[1] |= MCI_STATUS_OVER;
2796 return 0; 2796 return 0;
2797 } 2797 }
2798 2798
2799 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2799 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2800 struct kvm_vcpu_events *events) 2800 struct kvm_vcpu_events *events)
2801 { 2801 {
2802 process_nmi(vcpu); 2802 process_nmi(vcpu);
2803 events->exception.injected = 2803 events->exception.injected =
2804 vcpu->arch.exception.pending && 2804 vcpu->arch.exception.pending &&
2805 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2805 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2806 events->exception.nr = vcpu->arch.exception.nr; 2806 events->exception.nr = vcpu->arch.exception.nr;
2807 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2807 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2808 events->exception.pad = 0; 2808 events->exception.pad = 0;
2809 events->exception.error_code = vcpu->arch.exception.error_code; 2809 events->exception.error_code = vcpu->arch.exception.error_code;
2810 2810
2811 events->interrupt.injected = 2811 events->interrupt.injected =
2812 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; 2812 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2813 events->interrupt.nr = vcpu->arch.interrupt.nr; 2813 events->interrupt.nr = vcpu->arch.interrupt.nr;
2814 events->interrupt.soft = 0; 2814 events->interrupt.soft = 0;
2815 events->interrupt.shadow = 2815 events->interrupt.shadow =
2816 kvm_x86_ops->get_interrupt_shadow(vcpu, 2816 kvm_x86_ops->get_interrupt_shadow(vcpu,
2817 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); 2817 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2818 2818
2819 events->nmi.injected = vcpu->arch.nmi_injected; 2819 events->nmi.injected = vcpu->arch.nmi_injected;
2820 events->nmi.pending = vcpu->arch.nmi_pending != 0; 2820 events->nmi.pending = vcpu->arch.nmi_pending != 0;
2821 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2821 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2822 events->nmi.pad = 0; 2822 events->nmi.pad = 0;
2823 2823
2824 events->sipi_vector = vcpu->arch.sipi_vector; 2824 events->sipi_vector = vcpu->arch.sipi_vector;
2825 2825
2826 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2826 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2827 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2827 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2828 | KVM_VCPUEVENT_VALID_SHADOW); 2828 | KVM_VCPUEVENT_VALID_SHADOW);
2829 memset(&events->reserved, 0, sizeof(events->reserved)); 2829 memset(&events->reserved, 0, sizeof(events->reserved));
2830 } 2830 }
2831 2831
2832 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2832 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2833 struct kvm_vcpu_events *events) 2833 struct kvm_vcpu_events *events)
2834 { 2834 {
2835 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2835 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2836 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2836 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2837 | KVM_VCPUEVENT_VALID_SHADOW)) 2837 | KVM_VCPUEVENT_VALID_SHADOW))
2838 return -EINVAL; 2838 return -EINVAL;
2839 2839
2840 process_nmi(vcpu); 2840 process_nmi(vcpu);
2841 vcpu->arch.exception.pending = events->exception.injected; 2841 vcpu->arch.exception.pending = events->exception.injected;
2842 vcpu->arch.exception.nr = events->exception.nr; 2842 vcpu->arch.exception.nr = events->exception.nr;
2843 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2843 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2844 vcpu->arch.exception.error_code = events->exception.error_code; 2844 vcpu->arch.exception.error_code = events->exception.error_code;
2845 2845
2846 vcpu->arch.interrupt.pending = events->interrupt.injected; 2846 vcpu->arch.interrupt.pending = events->interrupt.injected;
2847 vcpu->arch.interrupt.nr = events->interrupt.nr; 2847 vcpu->arch.interrupt.nr = events->interrupt.nr;
2848 vcpu->arch.interrupt.soft = events->interrupt.soft; 2848 vcpu->arch.interrupt.soft = events->interrupt.soft;
2849 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2849 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2850 kvm_x86_ops->set_interrupt_shadow(vcpu, 2850 kvm_x86_ops->set_interrupt_shadow(vcpu,
2851 events->interrupt.shadow); 2851 events->interrupt.shadow);
2852 2852
2853 vcpu->arch.nmi_injected = events->nmi.injected; 2853 vcpu->arch.nmi_injected = events->nmi.injected;
2854 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2854 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2855 vcpu->arch.nmi_pending = events->nmi.pending; 2855 vcpu->arch.nmi_pending = events->nmi.pending;
2856 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2856 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2857 2857
2858 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2858 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2859 vcpu->arch.sipi_vector = events->sipi_vector; 2859 vcpu->arch.sipi_vector = events->sipi_vector;
2860 2860
2861 kvm_make_request(KVM_REQ_EVENT, vcpu); 2861 kvm_make_request(KVM_REQ_EVENT, vcpu);
2862 2862
2863 return 0; 2863 return 0;
2864 } 2864 }
2865 2865
2866 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2866 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2867 struct kvm_debugregs *dbgregs) 2867 struct kvm_debugregs *dbgregs)
2868 { 2868 {
2869 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2869 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2870 dbgregs->dr6 = vcpu->arch.dr6; 2870 dbgregs->dr6 = vcpu->arch.dr6;
2871 dbgregs->dr7 = vcpu->arch.dr7; 2871 dbgregs->dr7 = vcpu->arch.dr7;
2872 dbgregs->flags = 0; 2872 dbgregs->flags = 0;
2873 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 2873 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2874 } 2874 }
2875 2875
2876 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2876 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2877 struct kvm_debugregs *dbgregs) 2877 struct kvm_debugregs *dbgregs)
2878 { 2878 {
2879 if (dbgregs->flags) 2879 if (dbgregs->flags)
2880 return -EINVAL; 2880 return -EINVAL;
2881 2881
2882 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2882 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2883 vcpu->arch.dr6 = dbgregs->dr6; 2883 vcpu->arch.dr6 = dbgregs->dr6;
2884 vcpu->arch.dr7 = dbgregs->dr7; 2884 vcpu->arch.dr7 = dbgregs->dr7;
2885 2885
2886 return 0; 2886 return 0;
2887 } 2887 }
2888 2888
2889 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2889 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2890 struct kvm_xsave *guest_xsave) 2890 struct kvm_xsave *guest_xsave)
2891 { 2891 {
2892 if (cpu_has_xsave) 2892 if (cpu_has_xsave)
2893 memcpy(guest_xsave->region, 2893 memcpy(guest_xsave->region,
2894 &vcpu->arch.guest_fpu.state->xsave, 2894 &vcpu->arch.guest_fpu.state->xsave,
2895 xstate_size); 2895 xstate_size);
2896 else { 2896 else {
2897 memcpy(guest_xsave->region, 2897 memcpy(guest_xsave->region,
2898 &vcpu->arch.guest_fpu.state->fxsave, 2898 &vcpu->arch.guest_fpu.state->fxsave,
2899 sizeof(struct i387_fxsave_struct)); 2899 sizeof(struct i387_fxsave_struct));
2900 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 2900 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2901 XSTATE_FPSSE; 2901 XSTATE_FPSSE;
2902 } 2902 }
2903 } 2903 }
2904 2904
2905 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 2905 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2906 struct kvm_xsave *guest_xsave) 2906 struct kvm_xsave *guest_xsave)
2907 { 2907 {
2908 u64 xstate_bv = 2908 u64 xstate_bv =
2909 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 2909 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2910 2910
2911 if (cpu_has_xsave) 2911 if (cpu_has_xsave)
2912 memcpy(&vcpu->arch.guest_fpu.state->xsave, 2912 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2913 guest_xsave->region, xstate_size); 2913 guest_xsave->region, xstate_size);
2914 else { 2914 else {
2915 if (xstate_bv & ~XSTATE_FPSSE) 2915 if (xstate_bv & ~XSTATE_FPSSE)
2916 return -EINVAL; 2916 return -EINVAL;
2917 memcpy(&vcpu->arch.guest_fpu.state->fxsave, 2917 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2918 guest_xsave->region, sizeof(struct i387_fxsave_struct)); 2918 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2919 } 2919 }
2920 return 0; 2920 return 0;
2921 } 2921 }
2922 2922
2923 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 2923 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2924 struct kvm_xcrs *guest_xcrs) 2924 struct kvm_xcrs *guest_xcrs)
2925 { 2925 {
2926 if (!cpu_has_xsave) { 2926 if (!cpu_has_xsave) {
2927 guest_xcrs->nr_xcrs = 0; 2927 guest_xcrs->nr_xcrs = 0;
2928 return; 2928 return;
2929 } 2929 }
2930 2930
2931 guest_xcrs->nr_xcrs = 1; 2931 guest_xcrs->nr_xcrs = 1;
2932 guest_xcrs->flags = 0; 2932 guest_xcrs->flags = 0;
2933 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 2933 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2934 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 2934 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2935 } 2935 }
2936 2936
2937 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 2937 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2938 struct kvm_xcrs *guest_xcrs) 2938 struct kvm_xcrs *guest_xcrs)
2939 { 2939 {
2940 int i, r = 0; 2940 int i, r = 0;
2941 2941
2942 if (!cpu_has_xsave) 2942 if (!cpu_has_xsave)
2943 return -EINVAL; 2943 return -EINVAL;
2944 2944
2945 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 2945 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2946 return -EINVAL; 2946 return -EINVAL;
2947 2947
2948 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 2948 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2949 /* Only support XCR0 currently */ 2949 /* Only support XCR0 currently */
2950 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { 2950 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2951 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 2951 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2952 guest_xcrs->xcrs[0].value); 2952 guest_xcrs->xcrs[0].value);
2953 break; 2953 break;
2954 } 2954 }
2955 if (r) 2955 if (r)
2956 r = -EINVAL; 2956 r = -EINVAL;
2957 return r; 2957 return r;
2958 } 2958 }
2959 2959
2960 /* 2960 /*
2961 * kvm_set_guest_paused() indicates to the guest kernel that it has been 2961 * kvm_set_guest_paused() indicates to the guest kernel that it has been
2962 * stopped by the hypervisor. This function will be called from the host only. 2962 * stopped by the hypervisor. This function will be called from the host only.
2963 * EINVAL is returned when the host attempts to set the flag for a guest that 2963 * EINVAL is returned when the host attempts to set the flag for a guest that
2964 * does not support pv clocks. 2964 * does not support pv clocks.
2965 */ 2965 */
2966 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 2966 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2967 { 2967 {
2968 if (!vcpu->arch.time_page) 2968 if (!vcpu->arch.time_page)
2969 return -EINVAL; 2969 return -EINVAL;
2970 vcpu->arch.pvclock_set_guest_stopped_request = true; 2970 vcpu->arch.pvclock_set_guest_stopped_request = true;
2971 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2971 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2972 return 0; 2972 return 0;
2973 } 2973 }
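
kvm_set_guest_paused() only records a request here; the stopped indication is propagated to the guest's pvclock area on the next KVM_REQ_CLOCK_UPDATE. For reference, a minimal userspace sketch of driving the corresponding KVM_KVMCLOCK_CTRL vcpu ioctl (dispatched further down in kvm_arch_vcpu_ioctl); the fd name and error handling are illustrative only, not part of this commit:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Tell the guest that the stop it is about to observe was host-initiated. */
    static int mark_vcpu_paused(int vcpu_fd)
    {
            /* KVM_KVMCLOCK_CTRL takes no payload; the handler above returns
             * -EINVAL (seen as errno EINVAL here) when the guest has not
             * registered a pvclock time page. */
            return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
    }
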
2974 2974
2975 long kvm_arch_vcpu_ioctl(struct file *filp, 2975 long kvm_arch_vcpu_ioctl(struct file *filp,
2976 unsigned int ioctl, unsigned long arg) 2976 unsigned int ioctl, unsigned long arg)
2977 { 2977 {
2978 struct kvm_vcpu *vcpu = filp->private_data; 2978 struct kvm_vcpu *vcpu = filp->private_data;
2979 void __user *argp = (void __user *)arg; 2979 void __user *argp = (void __user *)arg;
2980 int r; 2980 int r;
2981 union { 2981 union {
2982 struct kvm_lapic_state *lapic; 2982 struct kvm_lapic_state *lapic;
2983 struct kvm_xsave *xsave; 2983 struct kvm_xsave *xsave;
2984 struct kvm_xcrs *xcrs; 2984 struct kvm_xcrs *xcrs;
2985 void *buffer; 2985 void *buffer;
2986 } u; 2986 } u;
2987 2987
2988 u.buffer = NULL; 2988 u.buffer = NULL;
2989 switch (ioctl) { 2989 switch (ioctl) {
2990 case KVM_GET_LAPIC: { 2990 case KVM_GET_LAPIC: {
2991 r = -EINVAL; 2991 r = -EINVAL;
2992 if (!vcpu->arch.apic) 2992 if (!vcpu->arch.apic)
2993 goto out; 2993 goto out;
2994 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2994 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2995 2995
2996 r = -ENOMEM; 2996 r = -ENOMEM;
2997 if (!u.lapic) 2997 if (!u.lapic)
2998 goto out; 2998 goto out;
2999 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 2999 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3000 if (r) 3000 if (r)
3001 goto out; 3001 goto out;
3002 r = -EFAULT; 3002 r = -EFAULT;
3003 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 3003 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3004 goto out; 3004 goto out;
3005 r = 0; 3005 r = 0;
3006 break; 3006 break;
3007 } 3007 }
3008 case KVM_SET_LAPIC: { 3008 case KVM_SET_LAPIC: {
3009 r = -EINVAL; 3009 r = -EINVAL;
3010 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
3011 goto out; 3011 goto out;
3012 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
3013 if (IS_ERR(u.lapic)) 3013 if (IS_ERR(u.lapic))
3014 return PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
3015 3015
3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3017 break; 3017 break;
3018 } 3018 }
3019 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
3020 struct kvm_interrupt irq; 3020 struct kvm_interrupt irq;
3021 3021
3022 r = -EFAULT; 3022 r = -EFAULT;
3023 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
3024 goto out; 3024 goto out;
3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3026 break; 3026 break;
3027 } 3027 }
3028 case KVM_NMI: { 3028 case KVM_NMI: {
3029 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
3030 break; 3030 break;
3031 } 3031 }
3032 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
3033 struct kvm_cpuid __user *cpuid_arg = argp; 3033 struct kvm_cpuid __user *cpuid_arg = argp;
3034 struct kvm_cpuid cpuid; 3034 struct kvm_cpuid cpuid;
3035 3035
3036 r = -EFAULT; 3036 r = -EFAULT;
3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3038 goto out; 3038 goto out;
3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3040 break; 3040 break;
3041 } 3041 }
3042 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
3043 struct kvm_cpuid2 __user *cpuid_arg = argp; 3043 struct kvm_cpuid2 __user *cpuid_arg = argp;
3044 struct kvm_cpuid2 cpuid; 3044 struct kvm_cpuid2 cpuid;
3045 3045
3046 r = -EFAULT; 3046 r = -EFAULT;
3047 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3047 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3048 goto out; 3048 goto out;
3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3050 cpuid_arg->entries); 3050 cpuid_arg->entries);
3051 break; 3051 break;
3052 } 3052 }
3053 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
3054 struct kvm_cpuid2 __user *cpuid_arg = argp; 3054 struct kvm_cpuid2 __user *cpuid_arg = argp;
3055 struct kvm_cpuid2 cpuid; 3055 struct kvm_cpuid2 cpuid;
3056 3056
3057 r = -EFAULT; 3057 r = -EFAULT;
3058 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3058 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3059 goto out; 3059 goto out;
3060 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 3060 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3061 cpuid_arg->entries); 3061 cpuid_arg->entries);
3062 if (r) 3062 if (r)
3063 goto out; 3063 goto out;
3064 r = -EFAULT; 3064 r = -EFAULT;
3065 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 3065 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3066 goto out; 3066 goto out;
3067 r = 0; 3067 r = 0;
3068 break; 3068 break;
3069 } 3069 }
3070 case KVM_GET_MSRS: 3070 case KVM_GET_MSRS:
3071 r = msr_io(vcpu, argp, kvm_get_msr, 1); 3071 r = msr_io(vcpu, argp, kvm_get_msr, 1);
3072 break; 3072 break;
3073 case KVM_SET_MSRS: 3073 case KVM_SET_MSRS:
3074 r = msr_io(vcpu, argp, do_set_msr, 0); 3074 r = msr_io(vcpu, argp, do_set_msr, 0);
3075 break; 3075 break;
3076 case KVM_TPR_ACCESS_REPORTING: { 3076 case KVM_TPR_ACCESS_REPORTING: {
3077 struct kvm_tpr_access_ctl tac; 3077 struct kvm_tpr_access_ctl tac;
3078 3078
3079 r = -EFAULT; 3079 r = -EFAULT;
3080 if (copy_from_user(&tac, argp, sizeof tac)) 3080 if (copy_from_user(&tac, argp, sizeof tac))
3081 goto out; 3081 goto out;
3082 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 3082 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3083 if (r) 3083 if (r)
3084 goto out; 3084 goto out;
3085 r = -EFAULT; 3085 r = -EFAULT;
3086 if (copy_to_user(argp, &tac, sizeof tac)) 3086 if (copy_to_user(argp, &tac, sizeof tac))
3087 goto out; 3087 goto out;
3088 r = 0; 3088 r = 0;
3089 break; 3089 break;
3090 }; 3090 };
3091 case KVM_SET_VAPIC_ADDR: { 3091 case KVM_SET_VAPIC_ADDR: {
3092 struct kvm_vapic_addr va; 3092 struct kvm_vapic_addr va;
3093 3093
3094 r = -EINVAL; 3094 r = -EINVAL;
3095 if (!irqchip_in_kernel(vcpu->kvm)) 3095 if (!irqchip_in_kernel(vcpu->kvm))
3096 goto out; 3096 goto out;
3097 r = -EFAULT; 3097 r = -EFAULT;
3098 if (copy_from_user(&va, argp, sizeof va)) 3098 if (copy_from_user(&va, argp, sizeof va))
3099 goto out; 3099 goto out;
3100 r = 0; 3100 r = 0;
3101 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 3101 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3102 break; 3102 break;
3103 } 3103 }
3104 case KVM_X86_SETUP_MCE: { 3104 case KVM_X86_SETUP_MCE: {
3105 u64 mcg_cap; 3105 u64 mcg_cap;
3106 3106
3107 r = -EFAULT; 3107 r = -EFAULT;
3108 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 3108 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3109 goto out; 3109 goto out;
3110 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 3110 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3111 break; 3111 break;
3112 } 3112 }
3113 case KVM_X86_SET_MCE: { 3113 case KVM_X86_SET_MCE: {
3114 struct kvm_x86_mce mce; 3114 struct kvm_x86_mce mce;
3115 3115
3116 r = -EFAULT; 3116 r = -EFAULT;
3117 if (copy_from_user(&mce, argp, sizeof mce)) 3117 if (copy_from_user(&mce, argp, sizeof mce))
3118 goto out; 3118 goto out;
3119 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 3119 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3120 break; 3120 break;
3121 } 3121 }
3122 case KVM_GET_VCPU_EVENTS: { 3122 case KVM_GET_VCPU_EVENTS: {
3123 struct kvm_vcpu_events events; 3123 struct kvm_vcpu_events events;
3124 3124
3125 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 3125 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3126 3126
3127 r = -EFAULT; 3127 r = -EFAULT;
3128 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 3128 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3129 break; 3129 break;
3130 r = 0; 3130 r = 0;
3131 break; 3131 break;
3132 } 3132 }
3133 case KVM_SET_VCPU_EVENTS: { 3133 case KVM_SET_VCPU_EVENTS: {
3134 struct kvm_vcpu_events events; 3134 struct kvm_vcpu_events events;
3135 3135
3136 r = -EFAULT; 3136 r = -EFAULT;
3137 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 3137 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3138 break; 3138 break;
3139 3139
3140 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 3140 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3141 break; 3141 break;
3142 } 3142 }
3143 case KVM_GET_DEBUGREGS: { 3143 case KVM_GET_DEBUGREGS: {
3144 struct kvm_debugregs dbgregs; 3144 struct kvm_debugregs dbgregs;
3145 3145
3146 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 3146 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3147 3147
3148 r = -EFAULT; 3148 r = -EFAULT;
3149 if (copy_to_user(argp, &dbgregs, 3149 if (copy_to_user(argp, &dbgregs,
3150 sizeof(struct kvm_debugregs))) 3150 sizeof(struct kvm_debugregs)))
3151 break; 3151 break;
3152 r = 0; 3152 r = 0;
3153 break; 3153 break;
3154 } 3154 }
3155 case KVM_SET_DEBUGREGS: { 3155 case KVM_SET_DEBUGREGS: {
3156 struct kvm_debugregs dbgregs; 3156 struct kvm_debugregs dbgregs;
3157 3157
3158 r = -EFAULT; 3158 r = -EFAULT;
3159 if (copy_from_user(&dbgregs, argp, 3159 if (copy_from_user(&dbgregs, argp,
3160 sizeof(struct kvm_debugregs))) 3160 sizeof(struct kvm_debugregs)))
3161 break; 3161 break;
3162 3162
3163 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 3163 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3164 break; 3164 break;
3165 } 3165 }
3166 case KVM_GET_XSAVE: { 3166 case KVM_GET_XSAVE: {
3167 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 3167 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3168 r = -ENOMEM; 3168 r = -ENOMEM;
3169 if (!u.xsave) 3169 if (!u.xsave)
3170 break; 3170 break;
3171 3171
3172 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 3172 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3173 3173
3174 r = -EFAULT; 3174 r = -EFAULT;
3175 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 3175 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3176 break; 3176 break;
3177 r = 0; 3177 r = 0;
3178 break; 3178 break;
3179 } 3179 }
3180 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
3181 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
3182 if (IS_ERR(u.xsave)) 3182 if (IS_ERR(u.xsave))
3183 return PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
3184 3184
3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3186 break; 3186 break;
3187 } 3187 }
3188 case KVM_GET_XCRS: { 3188 case KVM_GET_XCRS: {
3189 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 3189 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3190 r = -ENOMEM; 3190 r = -ENOMEM;
3191 if (!u.xcrs) 3191 if (!u.xcrs)
3192 break; 3192 break;
3193 3193
3194 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 3194 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3195 3195
3196 r = -EFAULT; 3196 r = -EFAULT;
3197 if (copy_to_user(argp, u.xcrs, 3197 if (copy_to_user(argp, u.xcrs,
3198 sizeof(struct kvm_xcrs))) 3198 sizeof(struct kvm_xcrs)))
3199 break; 3199 break;
3200 r = 0; 3200 r = 0;
3201 break; 3201 break;
3202 } 3202 }
3203 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3205 if (IS_ERR(u.xcrs)) 3205 if (IS_ERR(u.xcrs))
3206 return PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
3207 3207
3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3209 break; 3209 break;
3210 } 3210 }
3211 case KVM_SET_TSC_KHZ: { 3211 case KVM_SET_TSC_KHZ: {
3212 u32 user_tsc_khz; 3212 u32 user_tsc_khz;
3213 3213
3214 r = -EINVAL; 3214 r = -EINVAL;
3215 user_tsc_khz = (u32)arg; 3215 user_tsc_khz = (u32)arg;
3216 3216
3217 if (user_tsc_khz >= kvm_max_guest_tsc_khz) 3217 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3218 goto out; 3218 goto out;
3219 3219
3220 if (user_tsc_khz == 0) 3220 if (user_tsc_khz == 0)
3221 user_tsc_khz = tsc_khz; 3221 user_tsc_khz = tsc_khz;
3222 3222
3223 kvm_set_tsc_khz(vcpu, user_tsc_khz); 3223 kvm_set_tsc_khz(vcpu, user_tsc_khz);
3224 3224
3225 r = 0; 3225 r = 0;
3226 goto out; 3226 goto out;
3227 } 3227 }
3228 case KVM_GET_TSC_KHZ: { 3228 case KVM_GET_TSC_KHZ: {
3229 r = vcpu->arch.virtual_tsc_khz; 3229 r = vcpu->arch.virtual_tsc_khz;
3230 goto out; 3230 goto out;
3231 } 3231 }
3232 case KVM_KVMCLOCK_CTRL: { 3232 case KVM_KVMCLOCK_CTRL: {
3233 r = kvm_set_guest_paused(vcpu); 3233 r = kvm_set_guest_paused(vcpu);
3234 goto out; 3234 goto out;
3235 } 3235 }
3236 default: 3236 default:
3237 r = -EINVAL; 3237 r = -EINVAL;
3238 } 3238 }
3239 out: 3239 out:
3240 kfree(u.buffer); 3240 kfree(u.buffer);
3241 return r; 3241 return r;
3242 } 3242 }
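
As the KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ cases above show, the TSC frequency is carried directly in the ioctl argument and return value rather than through a struct. A minimal userspace sketch, with assumed fd names and error handling:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int copy_tsc_khz(int src_vcpu_fd, int dst_vcpu_fd)
    {
            int khz = ioctl(src_vcpu_fd, KVM_GET_TSC_KHZ, 0);  /* kHz on success */

            if (khz <= 0)
                    return -1;
            /* Passing 0 would mean "use the host tsc_khz", per the handler above. */
            return ioctl(dst_vcpu_fd, KVM_SET_TSC_KHZ, khz);
    }
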
3243 3243
3244 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 3244 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
3245 { 3245 {
3246 return VM_FAULT_SIGBUS; 3246 return VM_FAULT_SIGBUS;
3247 } 3247 }
3248 3248
3249 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 3249 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3250 { 3250 {
3251 int ret; 3251 int ret;
3252 3252
3253 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
3254 return -EINVAL; 3254 return -EINVAL;
3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3256 return ret; 3256 return ret;
3257 } 3257 }
3258 3258
3259 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 3259 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3260 u64 ident_addr) 3260 u64 ident_addr)
3261 { 3261 {
3262 kvm->arch.ept_identity_map_addr = ident_addr; 3262 kvm->arch.ept_identity_map_addr = ident_addr;
3263 return 0; 3263 return 0;
3264 } 3264 }
3265 3265
3266 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 3266 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3267 u32 kvm_nr_mmu_pages) 3267 u32 kvm_nr_mmu_pages)
3268 { 3268 {
3269 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 3269 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3270 return -EINVAL; 3270 return -EINVAL;
3271 3271
3272 mutex_lock(&kvm->slots_lock); 3272 mutex_lock(&kvm->slots_lock);
3273 spin_lock(&kvm->mmu_lock); 3273 spin_lock(&kvm->mmu_lock);
3274 3274
3275 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3275 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3276 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3276 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3277 3277
3278 spin_unlock(&kvm->mmu_lock); 3278 spin_unlock(&kvm->mmu_lock);
3279 mutex_unlock(&kvm->slots_lock); 3279 mutex_unlock(&kvm->slots_lock);
3280 return 0; 3280 return 0;
3281 } 3281 }
3282 3282
3283 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3283 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3284 { 3284 {
3285 return kvm->arch.n_max_mmu_pages; 3285 return kvm->arch.n_max_mmu_pages;
3286 } 3286 }
3287 3287
3288 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3288 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3289 { 3289 {
3290 int r; 3290 int r;
3291 3291
3292 r = 0; 3292 r = 0;
3293 switch (chip->chip_id) { 3293 switch (chip->chip_id) {
3294 case KVM_IRQCHIP_PIC_MASTER: 3294 case KVM_IRQCHIP_PIC_MASTER:
3295 memcpy(&chip->chip.pic, 3295 memcpy(&chip->chip.pic,
3296 &pic_irqchip(kvm)->pics[0], 3296 &pic_irqchip(kvm)->pics[0],
3297 sizeof(struct kvm_pic_state)); 3297 sizeof(struct kvm_pic_state));
3298 break; 3298 break;
3299 case KVM_IRQCHIP_PIC_SLAVE: 3299 case KVM_IRQCHIP_PIC_SLAVE:
3300 memcpy(&chip->chip.pic, 3300 memcpy(&chip->chip.pic,
3301 &pic_irqchip(kvm)->pics[1], 3301 &pic_irqchip(kvm)->pics[1],
3302 sizeof(struct kvm_pic_state)); 3302 sizeof(struct kvm_pic_state));
3303 break; 3303 break;
3304 case KVM_IRQCHIP_IOAPIC: 3304 case KVM_IRQCHIP_IOAPIC:
3305 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 3305 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3306 break; 3306 break;
3307 default: 3307 default:
3308 r = -EINVAL; 3308 r = -EINVAL;
3309 break; 3309 break;
3310 } 3310 }
3311 return r; 3311 return r;
3312 } 3312 }
3313 3313
3314 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3314 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3315 { 3315 {
3316 int r; 3316 int r;
3317 3317
3318 r = 0; 3318 r = 0;
3319 switch (chip->chip_id) { 3319 switch (chip->chip_id) {
3320 case KVM_IRQCHIP_PIC_MASTER: 3320 case KVM_IRQCHIP_PIC_MASTER:
3321 spin_lock(&pic_irqchip(kvm)->lock); 3321 spin_lock(&pic_irqchip(kvm)->lock);
3322 memcpy(&pic_irqchip(kvm)->pics[0], 3322 memcpy(&pic_irqchip(kvm)->pics[0],
3323 &chip->chip.pic, 3323 &chip->chip.pic,
3324 sizeof(struct kvm_pic_state)); 3324 sizeof(struct kvm_pic_state));
3325 spin_unlock(&pic_irqchip(kvm)->lock); 3325 spin_unlock(&pic_irqchip(kvm)->lock);
3326 break; 3326 break;
3327 case KVM_IRQCHIP_PIC_SLAVE: 3327 case KVM_IRQCHIP_PIC_SLAVE:
3328 spin_lock(&pic_irqchip(kvm)->lock); 3328 spin_lock(&pic_irqchip(kvm)->lock);
3329 memcpy(&pic_irqchip(kvm)->pics[1], 3329 memcpy(&pic_irqchip(kvm)->pics[1],
3330 &chip->chip.pic, 3330 &chip->chip.pic,
3331 sizeof(struct kvm_pic_state)); 3331 sizeof(struct kvm_pic_state));
3332 spin_unlock(&pic_irqchip(kvm)->lock); 3332 spin_unlock(&pic_irqchip(kvm)->lock);
3333 break; 3333 break;
3334 case KVM_IRQCHIP_IOAPIC: 3334 case KVM_IRQCHIP_IOAPIC:
3335 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3335 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3336 break; 3336 break;
3337 default: 3337 default:
3338 r = -EINVAL; 3338 r = -EINVAL;
3339 break; 3339 break;
3340 } 3340 }
3341 kvm_pic_update_irq(pic_irqchip(kvm)); 3341 kvm_pic_update_irq(pic_irqchip(kvm));
3342 return r; 3342 return r;
3343 } 3343 }
3344 3344
3345 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3345 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3346 { 3346 {
3347 int r = 0; 3347 int r = 0;
3348 3348
3349 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3349 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3350 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 3350 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3351 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3351 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3352 return r; 3352 return r;
3353 } 3353 }
3354 3354
3355 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3355 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3356 { 3356 {
3357 int r = 0; 3357 int r = 0;
3358 3358
3359 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3359 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3360 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 3360 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3361 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 3361 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3362 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3362 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3363 return r; 3363 return r;
3364 } 3364 }
3365 3365
3366 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3366 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3367 { 3367 {
3368 int r = 0; 3368 int r = 0;
3369 3369
3370 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3370 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3371 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 3371 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3372 sizeof(ps->channels)); 3372 sizeof(ps->channels));
3373 ps->flags = kvm->arch.vpit->pit_state.flags; 3373 ps->flags = kvm->arch.vpit->pit_state.flags;
3374 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3374 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3375 memset(&ps->reserved, 0, sizeof(ps->reserved)); 3375 memset(&ps->reserved, 0, sizeof(ps->reserved));
3376 return r; 3376 return r;
3377 } 3377 }
3378 3378
3379 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3379 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3380 { 3380 {
3381 int r = 0, start = 0; 3381 int r = 0, start = 0;
3382 u32 prev_legacy, cur_legacy; 3382 u32 prev_legacy, cur_legacy;
3383 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3383 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3384 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 3384 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
3385 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 3385 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3386 if (!prev_legacy && cur_legacy) 3386 if (!prev_legacy && cur_legacy)
3387 start = 1; 3387 start = 1;
3388 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 3388 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3389 sizeof(kvm->arch.vpit->pit_state.channels)); 3389 sizeof(kvm->arch.vpit->pit_state.channels));
3390 kvm->arch.vpit->pit_state.flags = ps->flags; 3390 kvm->arch.vpit->pit_state.flags = ps->flags;
3391 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 3391 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3392 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3392 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3393 return r; 3393 return r;
3394 } 3394 }
3395 3395
3396 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 3396 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3397 struct kvm_reinject_control *control) 3397 struct kvm_reinject_control *control)
3398 { 3398 {
3399 if (!kvm->arch.vpit) 3399 if (!kvm->arch.vpit)
3400 return -ENXIO; 3400 return -ENXIO;
3401 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3401 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3402 kvm->arch.vpit->pit_state.reinject = control->pit_reinject; 3402 kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3403 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3403 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3404 return 0; 3404 return 0;
3405 } 3405 }
3406 3406
3407 /** 3407 /**
3408 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 3408 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3409 * @kvm: kvm instance 3409 * @kvm: kvm instance
3410 * @log: slot id and address to which we copy the log 3410 * @log: slot id and address to which we copy the log
3411 * 3411 *
3412 * We need to keep it in mind that VCPU threads can write to the bitmap 3412 * We need to keep it in mind that VCPU threads can write to the bitmap
3413 * concurrently. So, to avoid losing data, we keep the following order for 3413 * concurrently. So, to avoid losing data, we keep the following order for
3414 * each bit: 3414 * each bit:
3415 * 3415 *
3416 * 1. Take a snapshot of the bit and clear it if needed. 3416 * 1. Take a snapshot of the bit and clear it if needed.
3417 * 2. Write protect the corresponding page. 3417 * 2. Write protect the corresponding page.
3418 * 3. Flush TLB's if needed. 3418 * 3. Flush TLB's if needed.
3419 * 4. Copy the snapshot to the userspace. 3419 * 4. Copy the snapshot to the userspace.
3420 * 3420 *
3421 * Between 2 and 3, the guest may write to the page using the remaining TLB 3421 * Between 2 and 3, the guest may write to the page using the remaining TLB
3422 * entry. This is not a problem because the page will be reported dirty at 3422 * entry. This is not a problem because the page will be reported dirty at
3423 * step 4 using the snapshot taken before and step 3 ensures that successive 3423 * step 4 using the snapshot taken before and step 3 ensures that successive
3424 * writes will be logged for the next call. 3424 * writes will be logged for the next call.
3425 */ 3425 */
3426 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 3426 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3427 { 3427 {
3428 int r; 3428 int r;
3429 struct kvm_memory_slot *memslot; 3429 struct kvm_memory_slot *memslot;
3430 unsigned long n, i; 3430 unsigned long n, i;
3431 unsigned long *dirty_bitmap; 3431 unsigned long *dirty_bitmap;
3432 unsigned long *dirty_bitmap_buffer; 3432 unsigned long *dirty_bitmap_buffer;
3433 bool is_dirty = false; 3433 bool is_dirty = false;
3434 3434
3435 mutex_lock(&kvm->slots_lock); 3435 mutex_lock(&kvm->slots_lock);
3436 3436
3437 r = -EINVAL; 3437 r = -EINVAL;
3438 if (log->slot >= KVM_USER_MEM_SLOTS) 3438 if (log->slot >= KVM_USER_MEM_SLOTS)
3439 goto out; 3439 goto out;
3440 3440
3441 memslot = id_to_memslot(kvm->memslots, log->slot); 3441 memslot = id_to_memslot(kvm->memslots, log->slot);
3442 3442
3443 dirty_bitmap = memslot->dirty_bitmap; 3443 dirty_bitmap = memslot->dirty_bitmap;
3444 r = -ENOENT; 3444 r = -ENOENT;
3445 if (!dirty_bitmap) 3445 if (!dirty_bitmap)
3446 goto out; 3446 goto out;
3447 3447
3448 n = kvm_dirty_bitmap_bytes(memslot); 3448 n = kvm_dirty_bitmap_bytes(memslot);
3449 3449
3450 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 3450 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3451 memset(dirty_bitmap_buffer, 0, n); 3451 memset(dirty_bitmap_buffer, 0, n);
3452 3452
3453 spin_lock(&kvm->mmu_lock); 3453 spin_lock(&kvm->mmu_lock);
3454 3454
3455 for (i = 0; i < n / sizeof(long); i++) { 3455 for (i = 0; i < n / sizeof(long); i++) {
3456 unsigned long mask; 3456 unsigned long mask;
3457 gfn_t offset; 3457 gfn_t offset;
3458 3458
3459 if (!dirty_bitmap[i]) 3459 if (!dirty_bitmap[i])
3460 continue; 3460 continue;
3461 3461
3462 is_dirty = true; 3462 is_dirty = true;
3463 3463
3464 mask = xchg(&dirty_bitmap[i], 0); 3464 mask = xchg(&dirty_bitmap[i], 0);
3465 dirty_bitmap_buffer[i] = mask; 3465 dirty_bitmap_buffer[i] = mask;
3466 3466
3467 offset = i * BITS_PER_LONG; 3467 offset = i * BITS_PER_LONG;
3468 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); 3468 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3469 } 3469 }
3470 if (is_dirty) 3470 if (is_dirty)
3471 kvm_flush_remote_tlbs(kvm); 3471 kvm_flush_remote_tlbs(kvm);
3472 3472
3473 spin_unlock(&kvm->mmu_lock); 3473 spin_unlock(&kvm->mmu_lock);
3474 3474
3475 r = -EFAULT; 3475 r = -EFAULT;
3476 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 3476 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3477 goto out; 3477 goto out;
3478 3478
3479 r = 0; 3479 r = 0;
3480 out: 3480 out:
3481 mutex_unlock(&kvm->slots_lock); 3481 mutex_unlock(&kvm->slots_lock);
3482 return r; 3482 return r;
3483 } 3483 }
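
On the userspace side, KVM_GET_DIRTY_LOG takes a struct kvm_dirty_log naming the slot plus a buffer large enough for one bit per page of that slot. A minimal sketch of fetching and walking one round of the bitmap produced by the handler above, assuming a 64-bit host and a caller that already knows the slot's page count (fd and helper names are illustrative):

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static long count_dirty_pages(int vm_fd, __u32 slot, __u64 npages)
    {
            /* Buffer size mirrors kvm_dirty_bitmap_bytes(): one bit per page,
             * rounded up to a multiple of sizeof(long) (64-bit host assumed). */
            size_t bytes = ((npages + 63) / 64) * 8;
            unsigned long *bitmap = calloc(1, bytes);
            struct kvm_dirty_log log = { .slot = slot };
            long dirty = 0;
            __u64 i;

            if (!bitmap)
                    return -1;
            log.dirty_bitmap = bitmap;

            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                    free(bitmap);
                    return -1;
            }
            for (i = 0; i < npages; i++)
                    if (bitmap[i / 64] & (1UL << (i % 64)))
                            dirty++;   /* page i was written since the last call */
            free(bitmap);
            return dirty;
    }
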
3484 3484
3485 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) 3485 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
3486 { 3486 {
3487 if (!irqchip_in_kernel(kvm)) 3487 if (!irqchip_in_kernel(kvm))
3488 return -ENXIO; 3488 return -ENXIO;
3489 3489
3490 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 3490 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3491 irq_event->irq, irq_event->level); 3491 irq_event->irq, irq_event->level);
3492 return 0; 3492 return 0;
3493 } 3493 }
3494 3494
3495 long kvm_arch_vm_ioctl(struct file *filp, 3495 long kvm_arch_vm_ioctl(struct file *filp,
3496 unsigned int ioctl, unsigned long arg) 3496 unsigned int ioctl, unsigned long arg)
3497 { 3497 {
3498 struct kvm *kvm = filp->private_data; 3498 struct kvm *kvm = filp->private_data;
3499 void __user *argp = (void __user *)arg; 3499 void __user *argp = (void __user *)arg;
3500 int r = -ENOTTY; 3500 int r = -ENOTTY;
3501 /* 3501 /*
3502 * This union makes it completely explicit to gcc-3.x 3502 * This union makes it completely explicit to gcc-3.x
3503 * that these two variables' stack usage should be 3503 * that these two variables' stack usage should be
3504 * combined, not added together. 3504 * combined, not added together.
3505 */ 3505 */
3506 union { 3506 union {
3507 struct kvm_pit_state ps; 3507 struct kvm_pit_state ps;
3508 struct kvm_pit_state2 ps2; 3508 struct kvm_pit_state2 ps2;
3509 struct kvm_pit_config pit_config; 3509 struct kvm_pit_config pit_config;
3510 } u; 3510 } u;
3511 3511
3512 switch (ioctl) { 3512 switch (ioctl) {
3513 case KVM_SET_TSS_ADDR: 3513 case KVM_SET_TSS_ADDR:
3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3515 break; 3515 break;
3516 case KVM_SET_IDENTITY_MAP_ADDR: { 3516 case KVM_SET_IDENTITY_MAP_ADDR: {
3517 u64 ident_addr; 3517 u64 ident_addr;
3518 3518
3519 r = -EFAULT; 3519 r = -EFAULT;
3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3521 goto out; 3521 goto out;
3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3523 break; 3523 break;
3524 } 3524 }
3525 case KVM_SET_NR_MMU_PAGES: 3525 case KVM_SET_NR_MMU_PAGES:
3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3527 break; 3527 break;
3528 case KVM_GET_NR_MMU_PAGES: 3528 case KVM_GET_NR_MMU_PAGES:
3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3530 break; 3530 break;
3531 case KVM_CREATE_IRQCHIP: { 3531 case KVM_CREATE_IRQCHIP: {
3532 struct kvm_pic *vpic; 3532 struct kvm_pic *vpic;
3533 3533
3534 mutex_lock(&kvm->lock); 3534 mutex_lock(&kvm->lock);
3535 r = -EEXIST; 3535 r = -EEXIST;
3536 if (kvm->arch.vpic) 3536 if (kvm->arch.vpic)
3537 goto create_irqchip_unlock; 3537 goto create_irqchip_unlock;
3538 r = -EINVAL; 3538 r = -EINVAL;
3539 if (atomic_read(&kvm->online_vcpus)) 3539 if (atomic_read(&kvm->online_vcpus))
3540 goto create_irqchip_unlock; 3540 goto create_irqchip_unlock;
3541 r = -ENOMEM; 3541 r = -ENOMEM;
3542 vpic = kvm_create_pic(kvm); 3542 vpic = kvm_create_pic(kvm);
3543 if (vpic) { 3543 if (vpic) {
3544 r = kvm_ioapic_init(kvm); 3544 r = kvm_ioapic_init(kvm);
3545 if (r) { 3545 if (r) {
3546 mutex_lock(&kvm->slots_lock); 3546 mutex_lock(&kvm->slots_lock);
3547 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3547 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3548 &vpic->dev_master); 3548 &vpic->dev_master);
3549 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3549 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3550 &vpic->dev_slave); 3550 &vpic->dev_slave);
3551 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3551 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3552 &vpic->dev_eclr); 3552 &vpic->dev_eclr);
3553 mutex_unlock(&kvm->slots_lock); 3553 mutex_unlock(&kvm->slots_lock);
3554 kfree(vpic); 3554 kfree(vpic);
3555 goto create_irqchip_unlock; 3555 goto create_irqchip_unlock;
3556 } 3556 }
3557 } else 3557 } else
3558 goto create_irqchip_unlock; 3558 goto create_irqchip_unlock;
3559 smp_wmb(); 3559 smp_wmb();
3560 kvm->arch.vpic = vpic; 3560 kvm->arch.vpic = vpic;
3561 smp_wmb(); 3561 smp_wmb();
3562 r = kvm_setup_default_irq_routing(kvm); 3562 r = kvm_setup_default_irq_routing(kvm);
3563 if (r) { 3563 if (r) {
3564 mutex_lock(&kvm->slots_lock); 3564 mutex_lock(&kvm->slots_lock);
3565 mutex_lock(&kvm->irq_lock); 3565 mutex_lock(&kvm->irq_lock);
3566 kvm_ioapic_destroy(kvm); 3566 kvm_ioapic_destroy(kvm);
3567 kvm_destroy_pic(kvm); 3567 kvm_destroy_pic(kvm);
3568 mutex_unlock(&kvm->irq_lock); 3568 mutex_unlock(&kvm->irq_lock);
3569 mutex_unlock(&kvm->slots_lock); 3569 mutex_unlock(&kvm->slots_lock);
3570 } 3570 }
3571 create_irqchip_unlock: 3571 create_irqchip_unlock:
3572 mutex_unlock(&kvm->lock); 3572 mutex_unlock(&kvm->lock);
3573 break; 3573 break;
3574 } 3574 }
3575 case KVM_CREATE_PIT: 3575 case KVM_CREATE_PIT:
3576 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 3576 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3577 goto create_pit; 3577 goto create_pit;
3578 case KVM_CREATE_PIT2: 3578 case KVM_CREATE_PIT2:
3579 r = -EFAULT; 3579 r = -EFAULT;
3580 if (copy_from_user(&u.pit_config, argp, 3580 if (copy_from_user(&u.pit_config, argp,
3581 sizeof(struct kvm_pit_config))) 3581 sizeof(struct kvm_pit_config)))
3582 goto out; 3582 goto out;
3583 create_pit: 3583 create_pit:
3584 mutex_lock(&kvm->slots_lock); 3584 mutex_lock(&kvm->slots_lock);
3585 r = -EEXIST; 3585 r = -EEXIST;
3586 if (kvm->arch.vpit) 3586 if (kvm->arch.vpit)
3587 goto create_pit_unlock; 3587 goto create_pit_unlock;
3588 r = -ENOMEM; 3588 r = -ENOMEM;
3589 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 3589 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3590 if (kvm->arch.vpit) 3590 if (kvm->arch.vpit)
3591 r = 0; 3591 r = 0;
3592 create_pit_unlock: 3592 create_pit_unlock:
3593 mutex_unlock(&kvm->slots_lock); 3593 mutex_unlock(&kvm->slots_lock);
3594 break; 3594 break;
3595 case KVM_GET_IRQCHIP: { 3595 case KVM_GET_IRQCHIP: {
3596 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3596 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3597 struct kvm_irqchip *chip; 3597 struct kvm_irqchip *chip;
3598 3598
3599 chip = memdup_user(argp, sizeof(*chip)); 3599 chip = memdup_user(argp, sizeof(*chip));
3600 if (IS_ERR(chip)) { 3600 if (IS_ERR(chip)) {
3601 r = PTR_ERR(chip); 3601 r = PTR_ERR(chip);
3602 goto out; 3602 goto out;
3603 } 3603 }
3604 3604
3605 r = -ENXIO; 3605 r = -ENXIO;
3606 if (!irqchip_in_kernel(kvm)) 3606 if (!irqchip_in_kernel(kvm))
3607 goto get_irqchip_out; 3607 goto get_irqchip_out;
3608 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3608 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3609 if (r) 3609 if (r)
3610 goto get_irqchip_out; 3610 goto get_irqchip_out;
3611 r = -EFAULT; 3611 r = -EFAULT;
3612 if (copy_to_user(argp, chip, sizeof *chip)) 3612 if (copy_to_user(argp, chip, sizeof *chip))
3613 goto get_irqchip_out; 3613 goto get_irqchip_out;
3614 r = 0; 3614 r = 0;
3615 get_irqchip_out: 3615 get_irqchip_out:
3616 kfree(chip); 3616 kfree(chip);
3617 break; 3617 break;
3618 } 3618 }
3619 case KVM_SET_IRQCHIP: { 3619 case KVM_SET_IRQCHIP: {
3620 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3620 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3621 struct kvm_irqchip *chip; 3621 struct kvm_irqchip *chip;
3622 3622
3623 chip = memdup_user(argp, sizeof(*chip)); 3623 chip = memdup_user(argp, sizeof(*chip));
3624 if (IS_ERR(chip)) { 3624 if (IS_ERR(chip)) {
3625 r = PTR_ERR(chip); 3625 r = PTR_ERR(chip);
3626 goto out; 3626 goto out;
3627 } 3627 }
3628 3628
3629 r = -ENXIO; 3629 r = -ENXIO;
3630 if (!irqchip_in_kernel(kvm)) 3630 if (!irqchip_in_kernel(kvm))
3631 goto set_irqchip_out; 3631 goto set_irqchip_out;
3632 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3632 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3633 if (r) 3633 if (r)
3634 goto set_irqchip_out; 3634 goto set_irqchip_out;
3635 r = 0; 3635 r = 0;
3636 set_irqchip_out: 3636 set_irqchip_out:
3637 kfree(chip); 3637 kfree(chip);
3638 break; 3638 break;
3639 } 3639 }
3640 case KVM_GET_PIT: { 3640 case KVM_GET_PIT: {
3641 r = -EFAULT; 3641 r = -EFAULT;
3642 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 3642 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3643 goto out; 3643 goto out;
3644 r = -ENXIO; 3644 r = -ENXIO;
3645 if (!kvm->arch.vpit) 3645 if (!kvm->arch.vpit)
3646 goto out; 3646 goto out;
3647 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 3647 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3648 if (r) 3648 if (r)
3649 goto out; 3649 goto out;
3650 r = -EFAULT; 3650 r = -EFAULT;
3651 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 3651 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3652 goto out; 3652 goto out;
3653 r = 0; 3653 r = 0;
3654 break; 3654 break;
3655 } 3655 }
3656 case KVM_SET_PIT: { 3656 case KVM_SET_PIT: {
3657 r = -EFAULT; 3657 r = -EFAULT;
3658 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 3658 if (copy_from_user(&u.ps, argp, sizeof u.ps))
3659 goto out; 3659 goto out;
3660 r = -ENXIO; 3660 r = -ENXIO;
3661 if (!kvm->arch.vpit) 3661 if (!kvm->arch.vpit)
3662 goto out; 3662 goto out;
3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3664 break; 3664 break;
3665 } 3665 }
3666 case KVM_GET_PIT2: { 3666 case KVM_GET_PIT2: {
3667 r = -ENXIO; 3667 r = -ENXIO;
3668 if (!kvm->arch.vpit) 3668 if (!kvm->arch.vpit)
3669 goto out; 3669 goto out;
3670 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 3670 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3671 if (r) 3671 if (r)
3672 goto out; 3672 goto out;
3673 r = -EFAULT; 3673 r = -EFAULT;
3674 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 3674 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3675 goto out; 3675 goto out;
3676 r = 0; 3676 r = 0;
3677 break; 3677 break;
3678 } 3678 }
3679 case KVM_SET_PIT2: { 3679 case KVM_SET_PIT2: {
3680 r = -EFAULT; 3680 r = -EFAULT;
3681 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 3681 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3682 goto out; 3682 goto out;
3683 r = -ENXIO; 3683 r = -ENXIO;
3684 if (!kvm->arch.vpit) 3684 if (!kvm->arch.vpit)
3685 goto out; 3685 goto out;
3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3687 break; 3687 break;
3688 } 3688 }
3689 case KVM_REINJECT_CONTROL: { 3689 case KVM_REINJECT_CONTROL: {
3690 struct kvm_reinject_control control; 3690 struct kvm_reinject_control control;
3691 r = -EFAULT; 3691 r = -EFAULT;
3692 if (copy_from_user(&control, argp, sizeof(control))) 3692 if (copy_from_user(&control, argp, sizeof(control)))
3693 goto out; 3693 goto out;
3694 r = kvm_vm_ioctl_reinject(kvm, &control); 3694 r = kvm_vm_ioctl_reinject(kvm, &control);
3695 break; 3695 break;
3696 } 3696 }
3697 case KVM_XEN_HVM_CONFIG: { 3697 case KVM_XEN_HVM_CONFIG: {
3698 r = -EFAULT; 3698 r = -EFAULT;
3699 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 3699 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3700 sizeof(struct kvm_xen_hvm_config))) 3700 sizeof(struct kvm_xen_hvm_config)))
3701 goto out; 3701 goto out;
3702 r = -EINVAL; 3702 r = -EINVAL;
3703 if (kvm->arch.xen_hvm_config.flags) 3703 if (kvm->arch.xen_hvm_config.flags)
3704 goto out; 3704 goto out;
3705 r = 0; 3705 r = 0;
3706 break; 3706 break;
3707 } 3707 }
3708 case KVM_SET_CLOCK: { 3708 case KVM_SET_CLOCK: {
3709 struct kvm_clock_data user_ns; 3709 struct kvm_clock_data user_ns;
3710 u64 now_ns; 3710 u64 now_ns;
3711 s64 delta; 3711 s64 delta;
3712 3712
3713 r = -EFAULT; 3713 r = -EFAULT;
3714 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 3714 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3715 goto out; 3715 goto out;
3716 3716
3717 r = -EINVAL; 3717 r = -EINVAL;
3718 if (user_ns.flags) 3718 if (user_ns.flags)
3719 goto out; 3719 goto out;
3720 3720
3721 r = 0; 3721 r = 0;
3722 local_irq_disable(); 3722 local_irq_disable();
3723 now_ns = get_kernel_ns(); 3723 now_ns = get_kernel_ns();
3724 delta = user_ns.clock - now_ns; 3724 delta = user_ns.clock - now_ns;
3725 local_irq_enable(); 3725 local_irq_enable();
3726 kvm->arch.kvmclock_offset = delta; 3726 kvm->arch.kvmclock_offset = delta;
3727 break; 3727 break;
3728 } 3728 }
3729 case KVM_GET_CLOCK: { 3729 case KVM_GET_CLOCK: {
3730 struct kvm_clock_data user_ns; 3730 struct kvm_clock_data user_ns;
3731 u64 now_ns; 3731 u64 now_ns;
3732 3732
3733 local_irq_disable(); 3733 local_irq_disable();
3734 now_ns = get_kernel_ns(); 3734 now_ns = get_kernel_ns();
3735 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3735 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3736 local_irq_enable(); 3736 local_irq_enable();
3737 user_ns.flags = 0; 3737 user_ns.flags = 0;
3738 memset(&user_ns.pad, 0, sizeof(user_ns.pad)); 3738 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3739 3739
3740 r = -EFAULT; 3740 r = -EFAULT;
3741 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3741 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3742 goto out; 3742 goto out;
3743 r = 0; 3743 r = 0;
3744 break; 3744 break;
3745 } 3745 }
3746 3746
3747 default: 3747 default:
3748 ; 3748 ;
3749 } 3749 }
3750 out: 3750 out:
3751 return r; 3751 return r;
3752 } 3752 }
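
The KVM_GET_CLOCK/KVM_SET_CLOCK cases above exchange a struct kvm_clock_data whose flags field must be zero. A minimal save/restore sketch, as a VMM might use around migration or suspend; fd names are assumptions, not part of this commit:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int save_clock(int vm_fd, struct kvm_clock_data *data)
    {
            return ioctl(vm_fd, KVM_GET_CLOCK, data);   /* fills .clock, .flags = 0 */
    }

    static int restore_clock(int vm_fd, const struct kvm_clock_data *data)
    {
            /* The handler above rejects any non-zero flags with -EINVAL. */
            return ioctl(vm_fd, KVM_SET_CLOCK, data);
    }
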
3753 3753
3754 static void kvm_init_msr_list(void) 3754 static void kvm_init_msr_list(void)
3755 { 3755 {
3756 u32 dummy[2]; 3756 u32 dummy[2];
3757 unsigned i, j; 3757 unsigned i, j;
3758 3758
3759 /* skip the first msrs in the list. KVM-specific */ 3759 /* skip the first msrs in the list. KVM-specific */
3760 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 3760 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3761 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 3761 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3762 continue; 3762 continue;
3763 if (j < i) 3763 if (j < i)
3764 msrs_to_save[j] = msrs_to_save[i]; 3764 msrs_to_save[j] = msrs_to_save[i];
3765 j++; 3765 j++;
3766 } 3766 }
3767 num_msrs_to_save = j; 3767 num_msrs_to_save = j;
3768 } 3768 }
3769 3769
3770 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3770 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3771 const void *v) 3771 const void *v)
3772 { 3772 {
3773 int handled = 0; 3773 int handled = 0;
3774 int n; 3774 int n;
3775 3775
3776 do { 3776 do {
3777 n = min(len, 8); 3777 n = min(len, 8);
3778 if (!(vcpu->arch.apic && 3778 if (!(vcpu->arch.apic &&
3779 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) 3779 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3780 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 3780 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3781 break; 3781 break;
3782 handled += n; 3782 handled += n;
3783 addr += n; 3783 addr += n;
3784 len -= n; 3784 len -= n;
3785 v += n; 3785 v += n;
3786 } while (len); 3786 } while (len);
3787 3787
3788 return handled; 3788 return handled;
3789 } 3789 }
3790 3790
3791 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3791 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3792 { 3792 {
3793 int handled = 0; 3793 int handled = 0;
3794 int n; 3794 int n;
3795 3795
3796 do { 3796 do {
3797 n = min(len, 8); 3797 n = min(len, 8);
3798 if (!(vcpu->arch.apic && 3798 if (!(vcpu->arch.apic &&
3799 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) 3799 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3800 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) 3800 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3801 break; 3801 break;
3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); 3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3803 handled += n; 3803 handled += n;
3804 addr += n; 3804 addr += n;
3805 len -= n; 3805 len -= n;
3806 v += n; 3806 v += n;
3807 } while (len); 3807 } while (len);
3808 3808
3809 return handled; 3809 return handled;
3810 } 3810 }
3811 3811
3812 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3812 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3813 struct kvm_segment *var, int seg) 3813 struct kvm_segment *var, int seg)
3814 { 3814 {
3815 kvm_x86_ops->set_segment(vcpu, var, seg); 3815 kvm_x86_ops->set_segment(vcpu, var, seg);
3816 } 3816 }
3817 3817
3818 void kvm_get_segment(struct kvm_vcpu *vcpu, 3818 void kvm_get_segment(struct kvm_vcpu *vcpu,
3819 struct kvm_segment *var, int seg) 3819 struct kvm_segment *var, int seg)
3820 { 3820 {
3821 kvm_x86_ops->get_segment(vcpu, var, seg); 3821 kvm_x86_ops->get_segment(vcpu, var, seg);
3822 } 3822 }
3823 3823
3824 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3824 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3825 { 3825 {
3826 gpa_t t_gpa; 3826 gpa_t t_gpa;
3827 struct x86_exception exception; 3827 struct x86_exception exception;
3828 3828
3829 BUG_ON(!mmu_is_nested(vcpu)); 3829 BUG_ON(!mmu_is_nested(vcpu));
3830 3830
3831 /* NPT walks are always user-walks */ 3831 /* NPT walks are always user-walks */
3832 access |= PFERR_USER_MASK; 3832 access |= PFERR_USER_MASK;
3833 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); 3833 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3834 3834
3835 return t_gpa; 3835 return t_gpa;
3836 } 3836 }
3837 3837
3838 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 3838 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3839 struct x86_exception *exception) 3839 struct x86_exception *exception)
3840 { 3840 {
3841 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3841 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3842 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3842 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3843 } 3843 }
3844 3844
3845 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 3845 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3846 struct x86_exception *exception) 3846 struct x86_exception *exception)
3847 { 3847 {
3848 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3848 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3849 access |= PFERR_FETCH_MASK; 3849 access |= PFERR_FETCH_MASK;
3850 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3850 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3851 } 3851 }
3852 3852
3853 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, 3853 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3854 struct x86_exception *exception) 3854 struct x86_exception *exception)
3855 { 3855 {
3856 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3856 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3857 access |= PFERR_WRITE_MASK; 3857 access |= PFERR_WRITE_MASK;
3858 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3858 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3859 } 3859 }
3860 3860
3861 /* uses this to access any guest's mapped memory without checking CPL */ 3861 /* uses this to access any guest's mapped memory without checking CPL */
3862 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, 3862 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3863 struct x86_exception *exception) 3863 struct x86_exception *exception)
3864 { 3864 {
3865 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); 3865 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3866 } 3866 }
3867 3867
3868 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3868 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3869 struct kvm_vcpu *vcpu, u32 access, 3869 struct kvm_vcpu *vcpu, u32 access,
3870 struct x86_exception *exception) 3870 struct x86_exception *exception)
3871 { 3871 {
3872 void *data = val; 3872 void *data = val;
3873 int r = X86EMUL_CONTINUE; 3873 int r = X86EMUL_CONTINUE;
3874 3874
3875 while (bytes) { 3875 while (bytes) {
3876 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3876 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3877 exception); 3877 exception);
3878 unsigned offset = addr & (PAGE_SIZE-1); 3878 unsigned offset = addr & (PAGE_SIZE-1);
3879 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3879 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3880 int ret; 3880 int ret;
3881 3881
3882 if (gpa == UNMAPPED_GVA) 3882 if (gpa == UNMAPPED_GVA)
3883 return X86EMUL_PROPAGATE_FAULT; 3883 return X86EMUL_PROPAGATE_FAULT;
3884 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3884 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3885 if (ret < 0) { 3885 if (ret < 0) {
3886 r = X86EMUL_IO_NEEDED; 3886 r = X86EMUL_IO_NEEDED;
3887 goto out; 3887 goto out;
3888 } 3888 }
3889 3889
3890 bytes -= toread; 3890 bytes -= toread;
3891 data += toread; 3891 data += toread;
3892 addr += toread; 3892 addr += toread;
3893 } 3893 }
3894 out: 3894 out:
3895 return r; 3895 return r;
3896 } 3896 }
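
The loop above (and its twin in kvm_write_guest_virt_system below) translates and copies at most one page per iteration, splitting an access at page boundaries so each chunk gets its own gva_to_gpa walk. A standalone sketch of just that chunking arithmetic, with illustrative values:

    #include <stdio.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
            unsigned long addr = 0x1ff8;   /* read crosses a page boundary */
            unsigned int bytes = 16, chunk;

            while (bytes) {
                    unsigned int offset = addr & (PAGE_SIZE - 1);

                    chunk = bytes < PAGE_SIZE - offset ? bytes : PAGE_SIZE - offset;
                    printf("translate gva 0x%lx, copy %u bytes\n", addr, chunk);
                    bytes -= chunk;
                    addr += chunk;
            }
            return 0;   /* prints one 8-byte chunk per page */
    }
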
3897 3897
3898 /* used for instruction fetching */ 3898 /* used for instruction fetching */
3899 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, 3899 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3900 gva_t addr, void *val, unsigned int bytes, 3900 gva_t addr, void *val, unsigned int bytes,
3901 struct x86_exception *exception) 3901 struct x86_exception *exception)
3902 { 3902 {
3903 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3903 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3904 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3904 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3905 3905
3906 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3906 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3907 access | PFERR_FETCH_MASK, 3907 access | PFERR_FETCH_MASK,
3908 exception); 3908 exception);
3909 } 3909 }
3910 3910
3911 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 3911 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3912 gva_t addr, void *val, unsigned int bytes, 3912 gva_t addr, void *val, unsigned int bytes,
3913 struct x86_exception *exception) 3913 struct x86_exception *exception)
3914 { 3914 {
3915 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3915 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3916 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3916 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3917 3917
3918 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3918 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3919 exception); 3919 exception);
3920 } 3920 }
3921 EXPORT_SYMBOL_GPL(kvm_read_guest_virt); 3921 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3922 3922
3923 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3923 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3924 gva_t addr, void *val, unsigned int bytes, 3924 gva_t addr, void *val, unsigned int bytes,
3925 struct x86_exception *exception) 3925 struct x86_exception *exception)
3926 { 3926 {
3927 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3927 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3928 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3928 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3929 } 3929 }
3930 3930
3931 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3931 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3932 gva_t addr, void *val, 3932 gva_t addr, void *val,
3933 unsigned int bytes, 3933 unsigned int bytes,
3934 struct x86_exception *exception) 3934 struct x86_exception *exception)
3935 { 3935 {
3936 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3936 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3937 void *data = val; 3937 void *data = val;
3938 int r = X86EMUL_CONTINUE; 3938 int r = X86EMUL_CONTINUE;
3939 3939
3940 while (bytes) { 3940 while (bytes) {
3941 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3941 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3942 PFERR_WRITE_MASK, 3942 PFERR_WRITE_MASK,
3943 exception); 3943 exception);
3944 unsigned offset = addr & (PAGE_SIZE-1); 3944 unsigned offset = addr & (PAGE_SIZE-1);
3945 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3945 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3946 int ret; 3946 int ret;
3947 3947
3948 if (gpa == UNMAPPED_GVA) 3948 if (gpa == UNMAPPED_GVA)
3949 return X86EMUL_PROPAGATE_FAULT; 3949 return X86EMUL_PROPAGATE_FAULT;
3950 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3950 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3951 if (ret < 0) { 3951 if (ret < 0) {
3952 r = X86EMUL_IO_NEEDED; 3952 r = X86EMUL_IO_NEEDED;
3953 goto out; 3953 goto out;
3954 } 3954 }
3955 3955
3956 bytes -= towrite; 3956 bytes -= towrite;
3957 data += towrite; 3957 data += towrite;
3958 addr += towrite; 3958 addr += towrite;
3959 } 3959 }
3960 out: 3960 out:
3961 return r; 3961 return r;
3962 } 3962 }
3963 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); 3963 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
3964 3964
3965 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 3965 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
3966 gpa_t *gpa, struct x86_exception *exception, 3966 gpa_t *gpa, struct x86_exception *exception,
3967 bool write) 3967 bool write)
3968 { 3968 {
3969 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) 3969 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
3970 | (write ? PFERR_WRITE_MASK : 0); 3970 | (write ? PFERR_WRITE_MASK : 0);
3971 3971
3972 if (vcpu_match_mmio_gva(vcpu, gva) 3972 if (vcpu_match_mmio_gva(vcpu, gva)
3973 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { 3973 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
3974 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 3974 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
3975 (gva & (PAGE_SIZE - 1)); 3975 (gva & (PAGE_SIZE - 1));
3976 trace_vcpu_match_mmio(gva, *gpa, write, false); 3976 trace_vcpu_match_mmio(gva, *gpa, write, false);
3977 return 1; 3977 return 1;
3978 } 3978 }
3979 3979
3980 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3980 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3981 3981
3982 if (*gpa == UNMAPPED_GVA) 3982 if (*gpa == UNMAPPED_GVA)
3983 return -1; 3983 return -1;
3984 3984
3985 /* For APIC access vmexit */ 3985 /* For APIC access vmexit */
3986 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3986 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3987 return 1; 3987 return 1;
3988 3988
3989 if (vcpu_match_mmio_gpa(vcpu, *gpa)) { 3989 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
3990 trace_vcpu_match_mmio(gva, *gpa, write, true); 3990 trace_vcpu_match_mmio(gva, *gpa, write, true);
3991 return 1; 3991 return 1;
3992 } 3992 }
3993 3993
3994 return 0; 3994 return 0;
3995 } 3995 }
3996 3996
3997 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3997 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3998 const void *val, int bytes) 3998 const void *val, int bytes)
3999 { 3999 {
4000 int ret; 4000 int ret;
4001 4001
4002 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 4002 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4003 if (ret < 0) 4003 if (ret < 0)
4004 return 0; 4004 return 0;
4005 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 4005 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4006 return 1; 4006 return 1;
4007 } 4007 }
4008 4008
4009 struct read_write_emulator_ops { 4009 struct read_write_emulator_ops {
4010 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, 4010 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4011 int bytes); 4011 int bytes);
4012 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, 4012 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4013 void *val, int bytes); 4013 void *val, int bytes);
4014 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 4014 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4015 int bytes, void *val); 4015 int bytes, void *val);
4016 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 4016 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4017 void *val, int bytes); 4017 void *val, int bytes);
4018 bool write; 4018 bool write;
4019 }; 4019 };
4020 4020
4021 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 4021 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4022 { 4022 {
4023 if (vcpu->mmio_read_completed) { 4023 if (vcpu->mmio_read_completed) {
4024 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 4024 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4025 vcpu->mmio_fragments[0].gpa, *(u64 *)val); 4025 vcpu->mmio_fragments[0].gpa, *(u64 *)val);
4026 vcpu->mmio_read_completed = 0; 4026 vcpu->mmio_read_completed = 0;
4027 return 1; 4027 return 1;
4028 } 4028 }
4029 4029
4030 return 0; 4030 return 0;
4031 } 4031 }
4032 4032
4033 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4033 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4034 void *val, int bytes) 4034 void *val, int bytes)
4035 { 4035 {
4036 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); 4036 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4037 } 4037 }
4038 4038
4039 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4039 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4040 void *val, int bytes) 4040 void *val, int bytes)
4041 { 4041 {
4042 return emulator_write_phys(vcpu, gpa, val, bytes); 4042 return emulator_write_phys(vcpu, gpa, val, bytes);
4043 } 4043 }
4044 4044
4045 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) 4045 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4046 { 4046 {
4047 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 4047 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4048 return vcpu_mmio_write(vcpu, gpa, bytes, val); 4048 return vcpu_mmio_write(vcpu, gpa, bytes, val);
4049 } 4049 }
4050 4050
4051 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 4051 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4052 void *val, int bytes) 4052 void *val, int bytes)
4053 { 4053 {
4054 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 4054 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4055 return X86EMUL_IO_NEEDED; 4055 return X86EMUL_IO_NEEDED;
4056 } 4056 }
4057 4057
4058 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 4058 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4059 void *val, int bytes) 4059 void *val, int bytes)
4060 { 4060 {
4061 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; 4061 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
4062 4062
4063 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); 4063 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
4064 return X86EMUL_CONTINUE; 4064 return X86EMUL_CONTINUE;
4065 } 4065 }
4066 4066
4067 static const struct read_write_emulator_ops read_emultor = { 4067 static const struct read_write_emulator_ops read_emultor = {
4068 .read_write_prepare = read_prepare, 4068 .read_write_prepare = read_prepare,
4069 .read_write_emulate = read_emulate, 4069 .read_write_emulate = read_emulate,
4070 .read_write_mmio = vcpu_mmio_read, 4070 .read_write_mmio = vcpu_mmio_read,
4071 .read_write_exit_mmio = read_exit_mmio, 4071 .read_write_exit_mmio = read_exit_mmio,
4072 }; 4072 };
4073 4073
4074 static const struct read_write_emulator_ops write_emultor = { 4074 static const struct read_write_emulator_ops write_emultor = {
4075 .read_write_emulate = write_emulate, 4075 .read_write_emulate = write_emulate,
4076 .read_write_mmio = write_mmio, 4076 .read_write_mmio = write_mmio,
4077 .read_write_exit_mmio = write_exit_mmio, 4077 .read_write_exit_mmio = write_exit_mmio,
4078 .write = true, 4078 .write = true,
4079 }; 4079 };
4080 4080
4081 static int emulator_read_write_onepage(unsigned long addr, void *val, 4081 static int emulator_read_write_onepage(unsigned long addr, void *val,
4082 unsigned int bytes, 4082 unsigned int bytes,
4083 struct x86_exception *exception, 4083 struct x86_exception *exception,
4084 struct kvm_vcpu *vcpu, 4084 struct kvm_vcpu *vcpu,
4085 const struct read_write_emulator_ops *ops) 4085 const struct read_write_emulator_ops *ops)
4086 { 4086 {
4087 gpa_t gpa; 4087 gpa_t gpa;
4088 int handled, ret; 4088 int handled, ret;
4089 bool write = ops->write; 4089 bool write = ops->write;
4090 struct kvm_mmio_fragment *frag; 4090 struct kvm_mmio_fragment *frag;
4091 4091
4092 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4092 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4093 4093
4094 if (ret < 0) 4094 if (ret < 0)
4095 return X86EMUL_PROPAGATE_FAULT; 4095 return X86EMUL_PROPAGATE_FAULT;
4096 4096
4097 /* For APIC access vmexit */ 4097 /* For APIC access vmexit */
4098 if (ret) 4098 if (ret)
4099 goto mmio; 4099 goto mmio;
4100 4100
4101 if (ops->read_write_emulate(vcpu, gpa, val, bytes)) 4101 if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4102 return X86EMUL_CONTINUE; 4102 return X86EMUL_CONTINUE;
4103 4103
4104 mmio: 4104 mmio:
4105 /* 4105 /*
4106 * Is this MMIO handled locally? 4106 * Is this MMIO handled locally?
4107 */ 4107 */
4108 handled = ops->read_write_mmio(vcpu, gpa, bytes, val); 4108 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4109 if (handled == bytes) 4109 if (handled == bytes)
4110 return X86EMUL_CONTINUE; 4110 return X86EMUL_CONTINUE;
4111 4111
4112 gpa += handled; 4112 gpa += handled;
4113 bytes -= handled; 4113 bytes -= handled;
4114 val += handled; 4114 val += handled;
4115 4115
4116 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); 4116 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
4117 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; 4117 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
4118 frag->gpa = gpa; 4118 frag->gpa = gpa;
4119 frag->data = val; 4119 frag->data = val;
4120 frag->len = bytes; 4120 frag->len = bytes;
4121 return X86EMUL_CONTINUE; 4121 return X86EMUL_CONTINUE;
4122 } 4122 }
4123 4123
4124 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 4124 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4125 void *val, unsigned int bytes, 4125 void *val, unsigned int bytes,
4126 struct x86_exception *exception, 4126 struct x86_exception *exception,
4127 const struct read_write_emulator_ops *ops) 4127 const struct read_write_emulator_ops *ops)
4128 { 4128 {
4129 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4129 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4130 gpa_t gpa; 4130 gpa_t gpa;
4131 int rc; 4131 int rc;
4132 4132
4133 if (ops->read_write_prepare && 4133 if (ops->read_write_prepare &&
4134 ops->read_write_prepare(vcpu, val, bytes)) 4134 ops->read_write_prepare(vcpu, val, bytes))
4135 return X86EMUL_CONTINUE; 4135 return X86EMUL_CONTINUE;
4136 4136
4137 vcpu->mmio_nr_fragments = 0; 4137 vcpu->mmio_nr_fragments = 0;
4138 4138
4139 /* Crossing a page boundary? */ 4139 /* Crossing a page boundary? */
4140 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 4140 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4141 int now; 4141 int now;
4142 4142
4143 now = -addr & ~PAGE_MASK; 4143 now = -addr & ~PAGE_MASK;
4144 rc = emulator_read_write_onepage(addr, val, now, exception, 4144 rc = emulator_read_write_onepage(addr, val, now, exception,
4145 vcpu, ops); 4145 vcpu, ops);
4146 4146
4147 if (rc != X86EMUL_CONTINUE) 4147 if (rc != X86EMUL_CONTINUE)
4148 return rc; 4148 return rc;
4149 addr += now; 4149 addr += now;
4150 val += now; 4150 val += now;
4151 bytes -= now; 4151 bytes -= now;
4152 } 4152 }
4153 4153
4154 rc = emulator_read_write_onepage(addr, val, bytes, exception, 4154 rc = emulator_read_write_onepage(addr, val, bytes, exception,
4155 vcpu, ops); 4155 vcpu, ops);
4156 if (rc != X86EMUL_CONTINUE) 4156 if (rc != X86EMUL_CONTINUE)
4157 return rc; 4157 return rc;
4158 4158
4159 if (!vcpu->mmio_nr_fragments) 4159 if (!vcpu->mmio_nr_fragments)
4160 return rc; 4160 return rc;
4161 4161
4162 gpa = vcpu->mmio_fragments[0].gpa; 4162 gpa = vcpu->mmio_fragments[0].gpa;
4163 4163
4164 vcpu->mmio_needed = 1; 4164 vcpu->mmio_needed = 1;
4165 vcpu->mmio_cur_fragment = 0; 4165 vcpu->mmio_cur_fragment = 0;
4166 4166
4167 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); 4167 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
4168 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; 4168 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
4169 vcpu->run->exit_reason = KVM_EXIT_MMIO; 4169 vcpu->run->exit_reason = KVM_EXIT_MMIO;
4170 vcpu->run->mmio.phys_addr = gpa; 4170 vcpu->run->mmio.phys_addr = gpa;
4171 4171
4172 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 4172 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4173 } 4173 }
4174 4174
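The page-boundary split in emulator_read_write() above relies on the expression now = -addr & ~PAGE_MASK, which yields the number of bytes left before the next page boundary; each chunk is then handled by a separate emulator_read_write_onepage() call. A minimal user-space sketch of the same arithmetic, assuming a 4 KiB page (the DEMO_* names are illustrative, not kernel symbols):

#include <stdio.h>

#define DEMO_PAGE_SIZE 0x1000UL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x1ffe;   /* 2 bytes short of a page boundary */
	unsigned long bytes = 8;

	/* Same test as emulator_read_write(): does the access cross a page? */
	if (((addr + bytes - 1) ^ addr) & DEMO_PAGE_MASK) {
		unsigned long now = -addr & ~DEMO_PAGE_MASK;

		/* Prints "first chunk: 2 bytes, remainder: 6 bytes" */
		printf("first chunk: %lu bytes, remainder: %lu bytes\n",
		       now, bytes - now);
	}
	return 0;
}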
4175 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4175 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4176 unsigned long addr, 4176 unsigned long addr,
4177 void *val, 4177 void *val,
4178 unsigned int bytes, 4178 unsigned int bytes,
4179 struct x86_exception *exception) 4179 struct x86_exception *exception)
4180 { 4180 {
4181 return emulator_read_write(ctxt, addr, val, bytes, 4181 return emulator_read_write(ctxt, addr, val, bytes,
4182 exception, &read_emultor); 4182 exception, &read_emultor);
4183 } 4183 }
4184 4184
4185 int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 4185 int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4186 unsigned long addr, 4186 unsigned long addr,
4187 const void *val, 4187 const void *val,
4188 unsigned int bytes, 4188 unsigned int bytes,
4189 struct x86_exception *exception) 4189 struct x86_exception *exception)
4190 { 4190 {
4191 return emulator_read_write(ctxt, addr, (void *)val, bytes, 4191 return emulator_read_write(ctxt, addr, (void *)val, bytes,
4192 exception, &write_emultor); 4192 exception, &write_emultor);
4193 } 4193 }
4194 4194
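When the read/write emulation above cannot be completed in the kernel, emulator_read_write() records the first MMIO fragment in vcpu->run and exits to user space with KVM_EXIT_MMIO. Below is a hedged sketch of the user-space side, using only the documented struct kvm_run mmio fields; demo_device_read()/demo_device_write() are hypothetical placeholders for a real VMM's device routing, not part of the KVM API:

#include <stdint.h>
#include <string.h>
#include <linux/kvm.h>

/* Hypothetical device-model hooks -- stand-ins for a VMM's MMIO dispatch. */
void demo_device_write(uint64_t gpa, const void *data, uint32_t len)
{
	(void)gpa; (void)data; (void)len;
}

void demo_device_read(uint64_t gpa, void *data, uint32_t len)
{
	(void)gpa;
	memset(data, 0, len);   /* pretend the device returned zeroes */
}

void demo_handle_mmio_exit(struct kvm_run *run)
{
	/* Each exit carries at most one fragment of up to 8 bytes. */
	if (run->mmio.is_write)
		demo_device_write(run->mmio.phys_addr, run->mmio.data,
				  run->mmio.len);
	else
		demo_device_read(run->mmio.phys_addr, run->mmio.data,
				 run->mmio.len);
	/* On the next KVM_RUN, the kernel side (complete_emulated_mmio)
	 * moves on to the next fragment, if any. */
}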
4195 #define CMPXCHG_TYPE(t, ptr, old, new) \ 4195 #define CMPXCHG_TYPE(t, ptr, old, new) \
4196 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 4196 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
4197 4197
4198 #ifdef CONFIG_X86_64 4198 #ifdef CONFIG_X86_64
4199 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) 4199 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
4200 #else 4200 #else
4201 # define CMPXCHG64(ptr, old, new) \ 4201 # define CMPXCHG64(ptr, old, new) \
4202 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4202 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
4203 #endif 4203 #endif
4204 4204
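CMPXCHG_TYPE() reports success only when the value read back equals the expected old value. A rough user-space analogue of that contract, written with C11 atomics rather than the kernel's cmpxchg() (sketch only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

bool demo_cmpxchg_u32(_Atomic uint32_t *ptr, uint32_t old, uint32_t new)
{
	/* true iff *ptr still held 'old' and was atomically replaced by
	 * 'new', mirroring "cmpxchg(ptr, old, new) == old". */
	return atomic_compare_exchange_strong(ptr, &old, new);
}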
4205 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, 4205 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4206 unsigned long addr, 4206 unsigned long addr,
4207 const void *old, 4207 const void *old,
4208 const void *new, 4208 const void *new,
4209 unsigned int bytes, 4209 unsigned int bytes,
4210 struct x86_exception *exception) 4210 struct x86_exception *exception)
4211 { 4211 {
4212 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4212 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4213 gpa_t gpa; 4213 gpa_t gpa;
4214 struct page *page; 4214 struct page *page;
4215 char *kaddr; 4215 char *kaddr;
4216 bool exchanged; 4216 bool exchanged;
4217 4217
4218 /* a guest cmpxchg8b has to be emulated atomically */ 4218 /* a guest cmpxchg8b has to be emulated atomically */
4219 if (bytes > 8 || (bytes & (bytes - 1))) 4219 if (bytes > 8 || (bytes & (bytes - 1)))
4220 goto emul_write; 4220 goto emul_write;
4221 4221
4222 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 4222 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4223 4223
4224 if (gpa == UNMAPPED_GVA || 4224 if (gpa == UNMAPPED_GVA ||
4225 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 4225 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4226 goto emul_write; 4226 goto emul_write;
4227 4227
4228 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 4228 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4229 goto emul_write; 4229 goto emul_write;
4230 4230
4231 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4231 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4232 if (is_error_page(page)) 4232 if (is_error_page(page))
4233 goto emul_write; 4233 goto emul_write;
4234 4234
4235 kaddr = kmap_atomic(page); 4235 kaddr = kmap_atomic(page);
4236 kaddr += offset_in_page(gpa); 4236 kaddr += offset_in_page(gpa);
4237 switch (bytes) { 4237 switch (bytes) {
4238 case 1: 4238 case 1:
4239 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 4239 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4240 break; 4240 break;
4241 case 2: 4241 case 2:
4242 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); 4242 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4243 break; 4243 break;
4244 case 4: 4244 case 4:
4245 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); 4245 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4246 break; 4246 break;
4247 case 8: 4247 case 8:
4248 exchanged = CMPXCHG64(kaddr, old, new); 4248 exchanged = CMPXCHG64(kaddr, old, new);
4249 break; 4249 break;
4250 default: 4250 default:
4251 BUG(); 4251 BUG();
4252 } 4252 }
4253 kunmap_atomic(kaddr); 4253 kunmap_atomic(kaddr);
4254 kvm_release_page_dirty(page); 4254 kvm_release_page_dirty(page);
4255 4255
4256 if (!exchanged) 4256 if (!exchanged)
4257 return X86EMUL_CMPXCHG_FAILED; 4257 return X86EMUL_CMPXCHG_FAILED;
4258 4258
4259 kvm_mmu_pte_write(vcpu, gpa, new, bytes); 4259 kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4260 4260
4261 return X86EMUL_CONTINUE; 4261 return X86EMUL_CONTINUE;
4262 4262
4263 emul_write: 4263 emul_write:
4264 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4264 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4265 4265
4266 return emulator_write_emulated(ctxt, addr, new, bytes, exception); 4266 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4267 } 4267 }
4268 4268
4269 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4269 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4270 { 4270 {
4271 /* TODO: String I/O for in kernel device */ 4271 /* TODO: String I/O for in kernel device */
4272 int r; 4272 int r;
4273 4273
4274 if (vcpu->arch.pio.in) 4274 if (vcpu->arch.pio.in)
4275 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 4275 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4276 vcpu->arch.pio.size, pd); 4276 vcpu->arch.pio.size, pd);
4277 else 4277 else
4278 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 4278 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4279 vcpu->arch.pio.port, vcpu->arch.pio.size, 4279 vcpu->arch.pio.port, vcpu->arch.pio.size,
4280 pd); 4280 pd);
4281 return r; 4281 return r;
4282 } 4282 }
4283 4283
4284 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, 4284 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4285 unsigned short port, void *val, 4285 unsigned short port, void *val,
4286 unsigned int count, bool in) 4286 unsigned int count, bool in)
4287 { 4287 {
4288 trace_kvm_pio(!in, port, size, count); 4288 trace_kvm_pio(!in, port, size, count);
4289 4289
4290 vcpu->arch.pio.port = port; 4290 vcpu->arch.pio.port = port;
4291 vcpu->arch.pio.in = in; 4291 vcpu->arch.pio.in = in;
4292 vcpu->arch.pio.count = count; 4292 vcpu->arch.pio.count = count;
4293 vcpu->arch.pio.size = size; 4293 vcpu->arch.pio.size = size;
4294 4294
4295 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 4295 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4296 vcpu->arch.pio.count = 0; 4296 vcpu->arch.pio.count = 0;
4297 return 1; 4297 return 1;
4298 } 4298 }
4299 4299
4300 vcpu->run->exit_reason = KVM_EXIT_IO; 4300 vcpu->run->exit_reason = KVM_EXIT_IO;
4301 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 4301 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4302 vcpu->run->io.size = size; 4302 vcpu->run->io.size = size;
4303 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 4303 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4304 vcpu->run->io.count = count; 4304 vcpu->run->io.count = count;
4305 vcpu->run->io.port = port; 4305 vcpu->run->io.port = port;
4306 4306
4307 return 0; 4307 return 0;
4308 } 4308 }
4309 4309
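When no in-kernel device claims the port, emulator_pio_in_out() fills vcpu->run->io and returns 0, so the vcpu exits to user space with KVM_EXIT_IO; the data itself sits in the pio page mapped after kvm_run. A hedged user-space sketch of consuming that exit, using only documented kvm_run fields (the printf/memset bodies are placeholders for real port handling):

#include <stdio.h>
#include <string.h>
#include <linux/kvm.h>

void demo_handle_io_exit(struct kvm_run *run)
{
	/* The in/out data lives io.data_offset bytes past kvm_run itself. */
	unsigned char *data = (unsigned char *)run + run->io.data_offset;
	unsigned int i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT)
			printf("out: port 0x%x, %u bytes\n",
			       (unsigned int)run->io.port,
			       (unsigned int)run->io.size);
		else
			memset(data, 0xff, run->io.size); /* fake device read */
	}
}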
4310 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, 4310 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4311 int size, unsigned short port, void *val, 4311 int size, unsigned short port, void *val,
4312 unsigned int count) 4312 unsigned int count)
4313 { 4313 {
4314 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4314 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4315 int ret; 4315 int ret;
4316 4316
4317 if (vcpu->arch.pio.count) 4317 if (vcpu->arch.pio.count)
4318 goto data_avail; 4318 goto data_avail;
4319 4319
4320 ret = emulator_pio_in_out(vcpu, size, port, val, count, true); 4320 ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
4321 if (ret) { 4321 if (ret) {
4322 data_avail: 4322 data_avail:
4323 memcpy(val, vcpu->arch.pio_data, size * count); 4323 memcpy(val, vcpu->arch.pio_data, size * count);
4324 vcpu->arch.pio.count = 0; 4324 vcpu->arch.pio.count = 0;
4325 return 1; 4325 return 1;
4326 } 4326 }
4327 4327
4328 return 0; 4328 return 0;
4329 } 4329 }
4330 4330
4331 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, 4331 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4332 int size, unsigned short port, 4332 int size, unsigned short port,
4333 const void *val, unsigned int count) 4333 const void *val, unsigned int count)
4334 { 4334 {
4335 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4335 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4336 4336
4337 memcpy(vcpu->arch.pio_data, val, size * count); 4337 memcpy(vcpu->arch.pio_data, val, size * count);
4338 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); 4338 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4339 } 4339 }
4340 4340
4341 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 4341 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4342 { 4342 {
4343 return kvm_x86_ops->get_segment_base(vcpu, seg); 4343 return kvm_x86_ops->get_segment_base(vcpu, seg);
4344 } 4344 }
4345 4345
4346 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) 4346 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4347 { 4347 {
4348 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); 4348 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4349 } 4349 }
4350 4350
4351 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4351 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4352 { 4352 {
4353 if (!need_emulate_wbinvd(vcpu)) 4353 if (!need_emulate_wbinvd(vcpu))
4354 return X86EMUL_CONTINUE; 4354 return X86EMUL_CONTINUE;
4355 4355
4356 if (kvm_x86_ops->has_wbinvd_exit()) { 4356 if (kvm_x86_ops->has_wbinvd_exit()) {
4357 int cpu = get_cpu(); 4357 int cpu = get_cpu();
4358 4358
4359 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 4359 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4360 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4360 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4361 wbinvd_ipi, NULL, 1); 4361 wbinvd_ipi, NULL, 1);
4362 put_cpu(); 4362 put_cpu();
4363 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4363 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4364 } else 4364 } else
4365 wbinvd(); 4365 wbinvd();
4366 return X86EMUL_CONTINUE; 4366 return X86EMUL_CONTINUE;
4367 } 4367 }
4368 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4368 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4369 4369
4370 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) 4370 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4371 { 4371 {
4372 kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); 4372 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4373 } 4373 }
4374 4374
4375 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 4375 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4376 { 4376 {
4377 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); 4377 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4378 } 4378 }
4379 4379
4380 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 4380 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4381 { 4381 {
4382 4382
4383 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); 4383 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4384 } 4384 }
4385 4385
4386 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4386 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4387 { 4387 {
4388 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4388 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4389 } 4389 }
4390 4390
4391 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) 4391 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4392 { 4392 {
4393 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4393 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4394 unsigned long value; 4394 unsigned long value;
4395 4395
4396 switch (cr) { 4396 switch (cr) {
4397 case 0: 4397 case 0:
4398 value = kvm_read_cr0(vcpu); 4398 value = kvm_read_cr0(vcpu);
4399 break; 4399 break;
4400 case 2: 4400 case 2:
4401 value = vcpu->arch.cr2; 4401 value = vcpu->arch.cr2;
4402 break; 4402 break;
4403 case 3: 4403 case 3:
4404 value = kvm_read_cr3(vcpu); 4404 value = kvm_read_cr3(vcpu);
4405 break; 4405 break;
4406 case 4: 4406 case 4:
4407 value = kvm_read_cr4(vcpu); 4407 value = kvm_read_cr4(vcpu);
4408 break; 4408 break;
4409 case 8: 4409 case 8:
4410 value = kvm_get_cr8(vcpu); 4410 value = kvm_get_cr8(vcpu);
4411 break; 4411 break;
4412 default: 4412 default:
4413 kvm_err("%s: unexpected cr %u\n", __func__, cr); 4413 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4414 return 0; 4414 return 0;
4415 } 4415 }
4416 4416
4417 return value; 4417 return value;
4418 } 4418 }
4419 4419
4420 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) 4420 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4421 { 4421 {
4422 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4422 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4423 int res = 0; 4423 int res = 0;
4424 4424
4425 switch (cr) { 4425 switch (cr) {
4426 case 0: 4426 case 0:
4427 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 4427 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4428 break; 4428 break;
4429 case 2: 4429 case 2:
4430 vcpu->arch.cr2 = val; 4430 vcpu->arch.cr2 = val;
4431 break; 4431 break;
4432 case 3: 4432 case 3:
4433 res = kvm_set_cr3(vcpu, val); 4433 res = kvm_set_cr3(vcpu, val);
4434 break; 4434 break;
4435 case 4: 4435 case 4:
4436 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4436 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4437 break; 4437 break;
4438 case 8: 4438 case 8:
4439 res = kvm_set_cr8(vcpu, val); 4439 res = kvm_set_cr8(vcpu, val);
4440 break; 4440 break;
4441 default: 4441 default:
4442 kvm_err("%s: unexpected cr %u\n", __func__, cr); 4442 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4443 res = -1; 4443 res = -1;
4444 } 4444 }
4445 4445
4446 return res; 4446 return res;
4447 } 4447 }
4448 4448
4449 static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) 4449 static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4450 { 4450 {
4451 kvm_set_rflags(emul_to_vcpu(ctxt), val); 4451 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4452 } 4452 }
4453 4453
4454 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4454 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4455 { 4455 {
4456 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4456 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4457 } 4457 }
4458 4458
4459 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4459 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4460 { 4460 {
4461 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); 4461 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4462 } 4462 }
4463 4463
4464 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4464 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4465 { 4465 {
4466 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); 4466 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4467 } 4467 }
4468 4468
4469 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4469 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4470 { 4470 {
4471 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); 4471 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4472 } 4472 }
4473 4473
4474 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 4474 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4475 { 4475 {
4476 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); 4476 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4477 } 4477 }
4478 4478
4479 static unsigned long emulator_get_cached_segment_base( 4479 static unsigned long emulator_get_cached_segment_base(
4480 struct x86_emulate_ctxt *ctxt, int seg) 4480 struct x86_emulate_ctxt *ctxt, int seg)
4481 { 4481 {
4482 return get_segment_base(emul_to_vcpu(ctxt), seg); 4482 return get_segment_base(emul_to_vcpu(ctxt), seg);
4483 } 4483 }
4484 4484
4485 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, 4485 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4486 struct desc_struct *desc, u32 *base3, 4486 struct desc_struct *desc, u32 *base3,
4487 int seg) 4487 int seg)
4488 { 4488 {
4489 struct kvm_segment var; 4489 struct kvm_segment var;
4490 4490
4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4492 *selector = var.selector; 4492 *selector = var.selector;
4493 4493
4494 if (var.unusable) 4494 if (var.unusable)
4495 return false; 4495 return false;
4496 4496
4497 if (var.g) 4497 if (var.g)
4498 var.limit >>= 12; 4498 var.limit >>= 12;
4499 set_desc_limit(desc, var.limit); 4499 set_desc_limit(desc, var.limit);
4500 set_desc_base(desc, (unsigned long)var.base); 4500 set_desc_base(desc, (unsigned long)var.base);
4501 #ifdef CONFIG_X86_64 4501 #ifdef CONFIG_X86_64
4502 if (base3) 4502 if (base3)
4503 *base3 = var.base >> 32; 4503 *base3 = var.base >> 32;
4504 #endif 4504 #endif
4505 desc->type = var.type; 4505 desc->type = var.type;
4506 desc->s = var.s; 4506 desc->s = var.s;
4507 desc->dpl = var.dpl; 4507 desc->dpl = var.dpl;
4508 desc->p = var.present; 4508 desc->p = var.present;
4509 desc->avl = var.avl; 4509 desc->avl = var.avl;
4510 desc->l = var.l; 4510 desc->l = var.l;
4511 desc->d = var.db; 4511 desc->d = var.db;
4512 desc->g = var.g; 4512 desc->g = var.g;
4513 4513
4514 return true; 4514 return true;
4515 } 4515 }
4516 4516
4517 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, 4517 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4518 struct desc_struct *desc, u32 base3, 4518 struct desc_struct *desc, u32 base3,
4519 int seg) 4519 int seg)
4520 { 4520 {
4521 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4521 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4522 struct kvm_segment var; 4522 struct kvm_segment var;
4523 4523
4524 var.selector = selector; 4524 var.selector = selector;
4525 var.base = get_desc_base(desc); 4525 var.base = get_desc_base(desc);
4526 #ifdef CONFIG_X86_64 4526 #ifdef CONFIG_X86_64
4527 var.base |= ((u64)base3) << 32; 4527 var.base |= ((u64)base3) << 32;
4528 #endif 4528 #endif
4529 var.limit = get_desc_limit(desc); 4529 var.limit = get_desc_limit(desc);
4530 if (desc->g) 4530 if (desc->g)
4531 var.limit = (var.limit << 12) | 0xfff; 4531 var.limit = (var.limit << 12) | 0xfff;
4532 var.type = desc->type; 4532 var.type = desc->type;
4533 var.present = desc->p; 4533 var.present = desc->p;
4534 var.dpl = desc->dpl; 4534 var.dpl = desc->dpl;
4535 var.db = desc->d; 4535 var.db = desc->d;
4536 var.s = desc->s; 4536 var.s = desc->s;
4537 var.l = desc->l; 4537 var.l = desc->l;
4538 var.g = desc->g; 4538 var.g = desc->g;
4539 var.avl = desc->avl; 4539 var.avl = desc->avl;
4540 var.present = desc->p; 4540 var.present = desc->p;
4541 var.unusable = !var.present; 4541 var.unusable = !var.present;
4542 var.padding = 0; 4542 var.padding = 0;
4543 4543
4544 kvm_set_segment(vcpu, &var, seg); 4544 kvm_set_segment(vcpu, &var, seg);
4545 return; 4545 return;
4546 } 4546 }
4547 4547
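The granularity handling above scales the 20-bit descriptor limit by 4 KiB: with desc->g set, a stored limit of 0xfffff expands to 0xffffffff, and emulator_get_segment() reverses the conversion with its >>= 12. A tiny worked sketch of the same arithmetic (plain C, illustration only):

#include <assert.h>

int main(void)
{
	unsigned int desc_limit = 0xfffff;                /* 20-bit field, G=1 */
	unsigned int byte_limit = (desc_limit << 12) | 0xfff;

	assert(byte_limit == 0xffffffff);                 /* full 4 GiB limit */
	assert((byte_limit >> 12) == desc_limit);         /* get_segment() path */
	return 0;
}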
4548 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 4548 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4549 u32 msr_index, u64 *pdata) 4549 u32 msr_index, u64 *pdata)
4550 { 4550 {
4551 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); 4551 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4552 } 4552 }
4553 4553
4554 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4554 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4555 u32 msr_index, u64 data) 4555 u32 msr_index, u64 data)
4556 { 4556 {
4557 struct msr_data msr; 4557 struct msr_data msr;
4558 4558
4559 msr.data = data; 4559 msr.data = data;
4560 msr.index = msr_index; 4560 msr.index = msr_index;
4561 msr.host_initiated = false; 4561 msr.host_initiated = false;
4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr); 4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4563 } 4563 }
4564 4564
4565 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4565 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4566 u32 pmc, u64 *pdata) 4566 u32 pmc, u64 *pdata)
4567 { 4567 {
4568 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); 4568 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4569 } 4569 }
4570 4570
4571 static void emulator_halt(struct x86_emulate_ctxt *ctxt) 4571 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4572 { 4572 {
4573 emul_to_vcpu(ctxt)->arch.halt_request = 1; 4573 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4574 } 4574 }
4575 4575
4576 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) 4576 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4577 { 4577 {
4578 preempt_disable(); 4578 preempt_disable();
4579 kvm_load_guest_fpu(emul_to_vcpu(ctxt)); 4579 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4580 /* 4580 /*
4581 * CR0.TS may reference the host fpu state, not the guest fpu state, 4581 * CR0.TS may reference the host fpu state, not the guest fpu state,
4582 * so it may be clear at this point. 4582 * so it may be clear at this point.
4583 */ 4583 */
4584 clts(); 4584 clts();
4585 } 4585 }
4586 4586
4587 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) 4587 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4588 { 4588 {
4589 preempt_enable(); 4589 preempt_enable();
4590 } 4590 }
4591 4591
4592 static int emulator_intercept(struct x86_emulate_ctxt *ctxt, 4592 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4593 struct x86_instruction_info *info, 4593 struct x86_instruction_info *info,
4594 enum x86_intercept_stage stage) 4594 enum x86_intercept_stage stage)
4595 { 4595 {
4596 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4596 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4597 } 4597 }
4598 4598
4599 static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4599 static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4600 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4600 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4601 { 4601 {
4602 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 4602 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4603 } 4603 }
4604 4604
4605 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) 4605 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4606 { 4606 {
4607 return kvm_register_read(emul_to_vcpu(ctxt), reg); 4607 return kvm_register_read(emul_to_vcpu(ctxt), reg);
4608 } 4608 }
4609 4609
4610 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) 4610 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4611 { 4611 {
4612 kvm_register_write(emul_to_vcpu(ctxt), reg, val); 4612 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4613 } 4613 }
4614 4614
4615 static const struct x86_emulate_ops emulate_ops = { 4615 static const struct x86_emulate_ops emulate_ops = {
4616 .read_gpr = emulator_read_gpr, 4616 .read_gpr = emulator_read_gpr,
4617 .write_gpr = emulator_write_gpr, 4617 .write_gpr = emulator_write_gpr,
4618 .read_std = kvm_read_guest_virt_system, 4618 .read_std = kvm_read_guest_virt_system,
4619 .write_std = kvm_write_guest_virt_system, 4619 .write_std = kvm_write_guest_virt_system,
4620 .fetch = kvm_fetch_guest_virt, 4620 .fetch = kvm_fetch_guest_virt,
4621 .read_emulated = emulator_read_emulated, 4621 .read_emulated = emulator_read_emulated,
4622 .write_emulated = emulator_write_emulated, 4622 .write_emulated = emulator_write_emulated,
4623 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4623 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4624 .invlpg = emulator_invlpg, 4624 .invlpg = emulator_invlpg,
4625 .pio_in_emulated = emulator_pio_in_emulated, 4625 .pio_in_emulated = emulator_pio_in_emulated,
4626 .pio_out_emulated = emulator_pio_out_emulated, 4626 .pio_out_emulated = emulator_pio_out_emulated,
4627 .get_segment = emulator_get_segment, 4627 .get_segment = emulator_get_segment,
4628 .set_segment = emulator_set_segment, 4628 .set_segment = emulator_set_segment,
4629 .get_cached_segment_base = emulator_get_cached_segment_base, 4629 .get_cached_segment_base = emulator_get_cached_segment_base,
4630 .get_gdt = emulator_get_gdt, 4630 .get_gdt = emulator_get_gdt,
4631 .get_idt = emulator_get_idt, 4631 .get_idt = emulator_get_idt,
4632 .set_gdt = emulator_set_gdt, 4632 .set_gdt = emulator_set_gdt,
4633 .set_idt = emulator_set_idt, 4633 .set_idt = emulator_set_idt,
4634 .get_cr = emulator_get_cr, 4634 .get_cr = emulator_get_cr,
4635 .set_cr = emulator_set_cr, 4635 .set_cr = emulator_set_cr,
4636 .set_rflags = emulator_set_rflags, 4636 .set_rflags = emulator_set_rflags,
4637 .cpl = emulator_get_cpl, 4637 .cpl = emulator_get_cpl,
4638 .get_dr = emulator_get_dr, 4638 .get_dr = emulator_get_dr,
4639 .set_dr = emulator_set_dr, 4639 .set_dr = emulator_set_dr,
4640 .set_msr = emulator_set_msr, 4640 .set_msr = emulator_set_msr,
4641 .get_msr = emulator_get_msr, 4641 .get_msr = emulator_get_msr,
4642 .read_pmc = emulator_read_pmc, 4642 .read_pmc = emulator_read_pmc,
4643 .halt = emulator_halt, 4643 .halt = emulator_halt,
4644 .wbinvd = emulator_wbinvd, 4644 .wbinvd = emulator_wbinvd,
4645 .fix_hypercall = emulator_fix_hypercall, 4645 .fix_hypercall = emulator_fix_hypercall,
4646 .get_fpu = emulator_get_fpu, 4646 .get_fpu = emulator_get_fpu,
4647 .put_fpu = emulator_put_fpu, 4647 .put_fpu = emulator_put_fpu,
4648 .intercept = emulator_intercept, 4648 .intercept = emulator_intercept,
4649 .get_cpuid = emulator_get_cpuid, 4649 .get_cpuid = emulator_get_cpuid,
4650 }; 4650 };
4651 4651
4652 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4652 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4653 { 4653 {
4654 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4654 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
4655 /* 4655 /*
4656 * an sti; sti; sequence only disables interrupts for the first 4656 * an sti; sti; sequence only disables interrupts for the first
4657 * instruction. So, if the last instruction, be it emulated or 4657 * instruction. So, if the last instruction, be it emulated or
4658 * not, left the system with the INT_STI flag enabled, it 4658 * not, left the system with the INT_STI flag enabled, it
4659 * means that the last instruction is an sti. We should not 4659 * means that the last instruction is an sti. We should not
4660 * leave the flag on in this case. The same goes for mov ss. 4660 * leave the flag on in this case. The same goes for mov ss.
4661 */ 4661 */
4662 if (!(int_shadow & mask)) 4662 if (!(int_shadow & mask))
4663 kvm_x86_ops->set_interrupt_shadow(vcpu, mask); 4663 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
4664 } 4664 }
4665 4665
4666 static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4666 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4667 { 4667 {
4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4669 if (ctxt->exception.vector == PF_VECTOR) 4669 if (ctxt->exception.vector == PF_VECTOR)
4670 kvm_propagate_fault(vcpu, &ctxt->exception); 4670 kvm_propagate_fault(vcpu, &ctxt->exception);
4671 else if (ctxt->exception.error_code_valid) 4671 else if (ctxt->exception.error_code_valid)
4672 kvm_queue_exception_e(vcpu, ctxt->exception.vector, 4672 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4673 ctxt->exception.error_code); 4673 ctxt->exception.error_code);
4674 else 4674 else
4675 kvm_queue_exception(vcpu, ctxt->exception.vector); 4675 kvm_queue_exception(vcpu, ctxt->exception.vector);
4676 } 4676 }
4677 4677
4678 static void init_decode_cache(struct x86_emulate_ctxt *ctxt) 4678 static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4679 { 4679 {
4680 memset(&ctxt->twobyte, 0, 4680 memset(&ctxt->twobyte, 0,
4681 (void *)&ctxt->_regs - (void *)&ctxt->twobyte); 4681 (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
4682 4682
4683 ctxt->fetch.start = 0; 4683 ctxt->fetch.start = 0;
4684 ctxt->fetch.end = 0; 4684 ctxt->fetch.end = 0;
4685 ctxt->io_read.pos = 0; 4685 ctxt->io_read.pos = 0;
4686 ctxt->io_read.end = 0; 4686 ctxt->io_read.end = 0;
4687 ctxt->mem_read.pos = 0; 4687 ctxt->mem_read.pos = 0;
4688 ctxt->mem_read.end = 0; 4688 ctxt->mem_read.end = 0;
4689 } 4689 }
4690 4690
4691 static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4691 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4692 { 4692 {
4693 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4693 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4694 int cs_db, cs_l; 4694 int cs_db, cs_l;
4695 4695
4696 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4696 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4697 4697
4698 ctxt->eflags = kvm_get_rflags(vcpu); 4698 ctxt->eflags = kvm_get_rflags(vcpu);
4699 ctxt->eip = kvm_rip_read(vcpu); 4699 ctxt->eip = kvm_rip_read(vcpu);
4700 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4700 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4701 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : 4701 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4702 cs_l ? X86EMUL_MODE_PROT64 : 4702 cs_l ? X86EMUL_MODE_PROT64 :
4703 cs_db ? X86EMUL_MODE_PROT32 : 4703 cs_db ? X86EMUL_MODE_PROT32 :
4704 X86EMUL_MODE_PROT16; 4704 X86EMUL_MODE_PROT16;
4705 ctxt->guest_mode = is_guest_mode(vcpu); 4705 ctxt->guest_mode = is_guest_mode(vcpu);
4706 4706
4707 init_decode_cache(ctxt); 4707 init_decode_cache(ctxt);
4708 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4708 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4709 } 4709 }
4710 4710
4711 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 4711 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4712 { 4712 {
4713 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4713 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4714 int ret; 4714 int ret;
4715 4715
4716 init_emulate_ctxt(vcpu); 4716 init_emulate_ctxt(vcpu);
4717 4717
4718 ctxt->op_bytes = 2; 4718 ctxt->op_bytes = 2;
4719 ctxt->ad_bytes = 2; 4719 ctxt->ad_bytes = 2;
4720 ctxt->_eip = ctxt->eip + inc_eip; 4720 ctxt->_eip = ctxt->eip + inc_eip;
4721 ret = emulate_int_real(ctxt, irq); 4721 ret = emulate_int_real(ctxt, irq);
4722 4722
4723 if (ret != X86EMUL_CONTINUE) 4723 if (ret != X86EMUL_CONTINUE)
4724 return EMULATE_FAIL; 4724 return EMULATE_FAIL;
4725 4725
4726 ctxt->eip = ctxt->_eip; 4726 ctxt->eip = ctxt->_eip;
4727 kvm_rip_write(vcpu, ctxt->eip); 4727 kvm_rip_write(vcpu, ctxt->eip);
4728 kvm_set_rflags(vcpu, ctxt->eflags); 4728 kvm_set_rflags(vcpu, ctxt->eflags);
4729 4729
4730 if (irq == NMI_VECTOR) 4730 if (irq == NMI_VECTOR)
4731 vcpu->arch.nmi_pending = 0; 4731 vcpu->arch.nmi_pending = 0;
4732 else 4732 else
4733 vcpu->arch.interrupt.pending = false; 4733 vcpu->arch.interrupt.pending = false;
4734 4734
4735 return EMULATE_DONE; 4735 return EMULATE_DONE;
4736 } 4736 }
4737 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); 4737 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4738 4738
4739 static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4739 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4740 { 4740 {
4741 int r = EMULATE_DONE; 4741 int r = EMULATE_DONE;
4742 4742
4743 ++vcpu->stat.insn_emulation_fail; 4743 ++vcpu->stat.insn_emulation_fail;
4744 trace_kvm_emulate_insn_failed(vcpu); 4744 trace_kvm_emulate_insn_failed(vcpu);
4745 if (!is_guest_mode(vcpu)) { 4745 if (!is_guest_mode(vcpu)) {
4746 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4746 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4747 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4747 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4748 vcpu->run->internal.ndata = 0; 4748 vcpu->run->internal.ndata = 0;
4749 r = EMULATE_FAIL; 4749 r = EMULATE_FAIL;
4750 } 4750 }
4751 kvm_queue_exception(vcpu, UD_VECTOR); 4751 kvm_queue_exception(vcpu, UD_VECTOR);
4752 4752
4753 return r; 4753 return r;
4754 } 4754 }
4755 4755
4756 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4756 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4757 { 4757 {
4758 gpa_t gpa; 4758 gpa_t gpa;
4759 pfn_t pfn; 4759 pfn_t pfn;
4760 4760
4761 if (tdp_enabled) 4761 if (tdp_enabled)
4762 return false; 4762 return false;
4763 4763
4764 /* 4764 /*
4765 * If emulation was due to an access to a shadowed page table 4765 * If emulation was due to an access to a shadowed page table
4766 * and it failed, try to unshadow the page and re-enter the 4766 * and it failed, try to unshadow the page and re-enter the
4767 * guest to let the CPU execute the instruction. 4767 * guest to let the CPU execute the instruction.
4768 */ 4768 */
4769 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 4769 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4770 return true; 4770 return true;
4771 4771
4772 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); 4772 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
4773 4773
4774 if (gpa == UNMAPPED_GVA) 4774 if (gpa == UNMAPPED_GVA)
4775 return true; /* let cpu generate fault */ 4775 return true; /* let cpu generate fault */
4776 4776
4777 /* 4777 /*
4778 * Do not retry the unhandleable instruction if it faults on the 4778 * Do not retry the unhandleable instruction if it faults on the
4779 * read-only host memory; otherwise it will go into an infinite loop: 4779 * read-only host memory; otherwise it will go into an infinite loop:
4780 * retry instruction -> write #PF -> emulation fail -> retry 4780 * retry instruction -> write #PF -> emulation fail -> retry
4781 * instruction -> ... 4781 * instruction -> ...
4782 */ 4782 */
4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4784 if (!is_error_noslot_pfn(pfn)) { 4784 if (!is_error_noslot_pfn(pfn)) {
4785 kvm_release_pfn_clean(pfn); 4785 kvm_release_pfn_clean(pfn);
4786 return true; 4786 return true;
4787 } 4787 }
4788 4788
4789 return false; 4789 return false;
4790 } 4790 }
4791 4791
4792 static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 4792 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4793 unsigned long cr2, int emulation_type) 4793 unsigned long cr2, int emulation_type)
4794 { 4794 {
4795 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4795 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4796 unsigned long last_retry_eip, last_retry_addr, gpa = cr2; 4796 unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
4797 4797
4798 last_retry_eip = vcpu->arch.last_retry_eip; 4798 last_retry_eip = vcpu->arch.last_retry_eip;
4799 last_retry_addr = vcpu->arch.last_retry_addr; 4799 last_retry_addr = vcpu->arch.last_retry_addr;
4800 4800
4801 /* 4801 /*
4802 * If the emulation is caused by #PF and it is non-page_table 4802 * If the emulation is caused by #PF and it is non-page_table
4803 * writing instruction, it means the VM-EXIT is caused by the shadow 4803 * writing instruction, it means the VM-EXIT is caused by the shadow
4804 * page being write-protected; we can zap the shadow page and retry this 4804 * page being write-protected; we can zap the shadow page and retry this
4805 * instruction directly. 4805 * instruction directly.
4806 * 4806 *
4807 * Note: if the guest uses a non-page-table modifying instruction 4807 * Note: if the guest uses a non-page-table modifying instruction
4808 * on the PDE that points to the instruction, then we will unmap 4808 * on the PDE that points to the instruction, then we will unmap
4809 * the instruction and go to an infinite loop. So, we cache the 4809 * the instruction and go to an infinite loop. So, we cache the
4810 * last retried eip and the last fault address; if we meet the eip 4810 * last retried eip and the last fault address; if we meet the eip
4811 * and the address again, we can break out of the potential infinite 4811 * and the address again, we can break out of the potential infinite
4812 * loop. 4812 * loop.
4813 */ 4813 */
4814 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; 4814 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
4815 4815
4816 if (!(emulation_type & EMULTYPE_RETRY)) 4816 if (!(emulation_type & EMULTYPE_RETRY))
4817 return false; 4817 return false;
4818 4818
4819 if (x86_page_table_writing_insn(ctxt)) 4819 if (x86_page_table_writing_insn(ctxt))
4820 return false; 4820 return false;
4821 4821
4822 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) 4822 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
4823 return false; 4823 return false;
4824 4824
4825 vcpu->arch.last_retry_eip = ctxt->eip; 4825 vcpu->arch.last_retry_eip = ctxt->eip;
4826 vcpu->arch.last_retry_addr = cr2; 4826 vcpu->arch.last_retry_addr = cr2;
4827 4827
4828 if (!vcpu->arch.mmu.direct_map) 4828 if (!vcpu->arch.mmu.direct_map)
4829 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 4829 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4830 4830
4831 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4831 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4832 4832
4833 return true; 4833 return true;
4834 } 4834 }
4835 4835
4836 static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4836 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4837 static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4837 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4838 4838
4839 int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4839 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4840 unsigned long cr2, 4840 unsigned long cr2,
4841 int emulation_type, 4841 int emulation_type,
4842 void *insn, 4842 void *insn,
4843 int insn_len) 4843 int insn_len)
4844 { 4844 {
4845 int r; 4845 int r;
4846 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4846 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4847 bool writeback = true; 4847 bool writeback = true;
4848 4848
4849 kvm_clear_exception_queue(vcpu); 4849 kvm_clear_exception_queue(vcpu);
4850 4850
4851 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4851 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4852 init_emulate_ctxt(vcpu); 4852 init_emulate_ctxt(vcpu);
4853 ctxt->interruptibility = 0; 4853 ctxt->interruptibility = 0;
4854 ctxt->have_exception = false; 4854 ctxt->have_exception = false;
4855 ctxt->perm_ok = false; 4855 ctxt->perm_ok = false;
4856 4856
4857 ctxt->only_vendor_specific_insn 4857 ctxt->only_vendor_specific_insn
4858 = emulation_type & EMULTYPE_TRAP_UD; 4858 = emulation_type & EMULTYPE_TRAP_UD;
4859 4859
4860 r = x86_decode_insn(ctxt, insn, insn_len); 4860 r = x86_decode_insn(ctxt, insn, insn_len);
4861 4861
4862 trace_kvm_emulate_insn_start(vcpu); 4862 trace_kvm_emulate_insn_start(vcpu);
4863 ++vcpu->stat.insn_emulation; 4863 ++vcpu->stat.insn_emulation;
4864 if (r != EMULATION_OK) { 4864 if (r != EMULATION_OK) {
4865 if (emulation_type & EMULTYPE_TRAP_UD) 4865 if (emulation_type & EMULTYPE_TRAP_UD)
4866 return EMULATE_FAIL; 4866 return EMULATE_FAIL;
4867 if (reexecute_instruction(vcpu, cr2)) 4867 if (reexecute_instruction(vcpu, cr2))
4868 return EMULATE_DONE; 4868 return EMULATE_DONE;
4869 if (emulation_type & EMULTYPE_SKIP) 4869 if (emulation_type & EMULTYPE_SKIP)
4870 return EMULATE_FAIL; 4870 return EMULATE_FAIL;
4871 return handle_emulation_failure(vcpu); 4871 return handle_emulation_failure(vcpu);
4872 } 4872 }
4873 } 4873 }
4874 4874
4875 if (emulation_type & EMULTYPE_SKIP) { 4875 if (emulation_type & EMULTYPE_SKIP) {
4876 kvm_rip_write(vcpu, ctxt->_eip); 4876 kvm_rip_write(vcpu, ctxt->_eip);
4877 return EMULATE_DONE; 4877 return EMULATE_DONE;
4878 } 4878 }
4879 4879
4880 if (retry_instruction(ctxt, cr2, emulation_type)) 4880 if (retry_instruction(ctxt, cr2, emulation_type))
4881 return EMULATE_DONE; 4881 return EMULATE_DONE;
4882 4882
4883 /* this is needed for the vmware backdoor interface to work since it 4883 /* this is needed for the vmware backdoor interface to work since it
4884 changes register values during the IO operation */ 4884 changes register values during the IO operation */
4885 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4885 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4886 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4886 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4887 emulator_invalidate_register_cache(ctxt); 4887 emulator_invalidate_register_cache(ctxt);
4888 } 4888 }
4889 4889
4890 restart: 4890 restart:
4891 r = x86_emulate_insn(ctxt); 4891 r = x86_emulate_insn(ctxt);
4892 4892
4893 if (r == EMULATION_INTERCEPTED) 4893 if (r == EMULATION_INTERCEPTED)
4894 return EMULATE_DONE; 4894 return EMULATE_DONE;
4895 4895
4896 if (r == EMULATION_FAILED) { 4896 if (r == EMULATION_FAILED) {
4897 if (reexecute_instruction(vcpu, cr2)) 4897 if (reexecute_instruction(vcpu, cr2))
4898 return EMULATE_DONE; 4898 return EMULATE_DONE;
4899 4899
4900 return handle_emulation_failure(vcpu); 4900 return handle_emulation_failure(vcpu);
4901 } 4901 }
4902 4902
4903 if (ctxt->have_exception) { 4903 if (ctxt->have_exception) {
4904 inject_emulated_exception(vcpu); 4904 inject_emulated_exception(vcpu);
4905 r = EMULATE_DONE; 4905 r = EMULATE_DONE;
4906 } else if (vcpu->arch.pio.count) { 4906 } else if (vcpu->arch.pio.count) {
4907 if (!vcpu->arch.pio.in) 4907 if (!vcpu->arch.pio.in)
4908 vcpu->arch.pio.count = 0; 4908 vcpu->arch.pio.count = 0;
4909 else { 4909 else {
4910 writeback = false; 4910 writeback = false;
4911 vcpu->arch.complete_userspace_io = complete_emulated_pio; 4911 vcpu->arch.complete_userspace_io = complete_emulated_pio;
4912 } 4912 }
4913 r = EMULATE_DO_MMIO; 4913 r = EMULATE_DO_MMIO;
4914 } else if (vcpu->mmio_needed) { 4914 } else if (vcpu->mmio_needed) {
4915 if (!vcpu->mmio_is_write) 4915 if (!vcpu->mmio_is_write)
4916 writeback = false; 4916 writeback = false;
4917 r = EMULATE_DO_MMIO; 4917 r = EMULATE_DO_MMIO;
4918 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 4918 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
4919 } else if (r == EMULATION_RESTART) 4919 } else if (r == EMULATION_RESTART)
4920 goto restart; 4920 goto restart;
4921 else 4921 else
4922 r = EMULATE_DONE; 4922 r = EMULATE_DONE;
4923 4923
4924 if (writeback) { 4924 if (writeback) {
4925 toggle_interruptibility(vcpu, ctxt->interruptibility); 4925 toggle_interruptibility(vcpu, ctxt->interruptibility);
4926 kvm_set_rflags(vcpu, ctxt->eflags); 4926 kvm_set_rflags(vcpu, ctxt->eflags);
4927 kvm_make_request(KVM_REQ_EVENT, vcpu); 4927 kvm_make_request(KVM_REQ_EVENT, vcpu);
4928 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4928 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4929 kvm_rip_write(vcpu, ctxt->eip); 4929 kvm_rip_write(vcpu, ctxt->eip);
4930 } else 4930 } else
4931 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 4931 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4932 4932
4933 return r; 4933 return r;
4934 } 4934 }
4935 EXPORT_SYMBOL_GPL(x86_emulate_instruction); 4935 EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4936 4936
4937 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4937 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4938 { 4938 {
4939 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4939 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4940 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, 4940 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4941 size, port, &val, 1); 4941 size, port, &val, 1);
4942 /* do not return to emulator after return from userspace */ 4942 /* do not return to emulator after return from userspace */
4943 vcpu->arch.pio.count = 0; 4943 vcpu->arch.pio.count = 0;
4944 return ret; 4944 return ret;
4945 } 4945 }
4946 EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4946 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4947 4947
4948 static void tsc_bad(void *info) 4948 static void tsc_bad(void *info)
4949 { 4949 {
4950 __this_cpu_write(cpu_tsc_khz, 0); 4950 __this_cpu_write(cpu_tsc_khz, 0);
4951 } 4951 }
4952 4952
4953 static void tsc_khz_changed(void *data) 4953 static void tsc_khz_changed(void *data)
4954 { 4954 {
4955 struct cpufreq_freqs *freq = data; 4955 struct cpufreq_freqs *freq = data;
4956 unsigned long khz = 0; 4956 unsigned long khz = 0;
4957 4957
4958 if (data) 4958 if (data)
4959 khz = freq->new; 4959 khz = freq->new;
4960 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4960 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4961 khz = cpufreq_quick_get(raw_smp_processor_id()); 4961 khz = cpufreq_quick_get(raw_smp_processor_id());
4962 if (!khz) 4962 if (!khz)
4963 khz = tsc_khz; 4963 khz = tsc_khz;
4964 __this_cpu_write(cpu_tsc_khz, khz); 4964 __this_cpu_write(cpu_tsc_khz, khz);
4965 } 4965 }
4966 4966
4967 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4967 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
4968 void *data) 4968 void *data)
4969 { 4969 {
4970 struct cpufreq_freqs *freq = data; 4970 struct cpufreq_freqs *freq = data;
4971 struct kvm *kvm; 4971 struct kvm *kvm;
4972 struct kvm_vcpu *vcpu; 4972 struct kvm_vcpu *vcpu;
4973 int i, send_ipi = 0; 4973 int i, send_ipi = 0;
4974 4974
4975 /* 4975 /*
4976 * We allow guests to temporarily run on slowing clocks, 4976 * We allow guests to temporarily run on slowing clocks,
4977 * provided we notify them after, or to run on accelerating 4977 * provided we notify them after, or to run on accelerating
4978 * clocks, provided we notify them before. Thus time never 4978 * clocks, provided we notify them before. Thus time never
4979 * goes backwards. 4979 * goes backwards.
4980 * 4980 *
4981 * However, we have a problem. We can't atomically update 4981 * However, we have a problem. We can't atomically update
4982 * the frequency of a given CPU from this function; it is 4982 * the frequency of a given CPU from this function; it is
4983 * merely a notifier, which can be called from any CPU. 4983 * merely a notifier, which can be called from any CPU.
4984 * Changing the TSC frequency at arbitrary points in time 4984 * Changing the TSC frequency at arbitrary points in time
4985 * requires a recomputation of local variables related to 4985 * requires a recomputation of local variables related to
4986 * the TSC for each VCPU. We must flag these local variables 4986 * the TSC for each VCPU. We must flag these local variables
4987 * to be updated and be sure the update takes place with the 4987 * to be updated and be sure the update takes place with the
4988 * new frequency before any guests proceed. 4988 * new frequency before any guests proceed.
4989 * 4989 *
4990 * Unfortunately, the combination of hotplug CPU and frequency 4990 * Unfortunately, the combination of hotplug CPU and frequency
4991 * change creates an intractable locking scenario; the order 4991 * change creates an intractable locking scenario; the order
4992 * of when these callouts happen is undefined with respect to 4992 * of when these callouts happen is undefined with respect to
4993 * CPU hotplug, and they can race with each other. As such, 4993 * CPU hotplug, and they can race with each other. As such,
4994 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is 4994 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4995 * undefined; you can actually have a CPU frequency change take 4995 * undefined; you can actually have a CPU frequency change take
4996 * place in between the computation of X and the setting of the 4996 * place in between the computation of X and the setting of the
4997 * variable. To protect against this problem, all updates of 4997 * variable. To protect against this problem, all updates of
4998 * the per_cpu tsc_khz variable are done in an interrupt 4998 * the per_cpu tsc_khz variable are done in an interrupt
4999 * protected IPI, and all callers wishing to update the value 4999 * protected IPI, and all callers wishing to update the value
5000 * must wait for a synchronous IPI to complete (which is trivial 5000 * must wait for a synchronous IPI to complete (which is trivial
5001 * if the caller is on the CPU already). This establishes the 5001 * if the caller is on the CPU already). This establishes the
5002 * necessary total order on variable updates. 5002 * necessary total order on variable updates.
5003 * 5003 *
5004 * Note that because a guest time update may take place 5004 * Note that because a guest time update may take place
5005 * anytime after the setting of the VCPU's request bit, the 5005 * anytime after the setting of the VCPU's request bit, the
5006 * correct TSC value must be set before the request. However, 5006 * correct TSC value must be set before the request. However,
5007 * to ensure the update actually makes it to any guest which 5007 * to ensure the update actually makes it to any guest which
5008 * starts running in hardware virtualization between the set 5008 * starts running in hardware virtualization between the set
5009 * and the acquisition of the spinlock, we must also ping the 5009 * and the acquisition of the spinlock, we must also ping the
5010 * CPU after setting the request bit. 5010 * CPU after setting the request bit.
5011 * 5011 *
5012 */ 5012 */
5013 5013
5014 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 5014 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
5015 return 0; 5015 return 0;
5016 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 5016 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
5017 return 0; 5017 return 0;
5018 5018
5019 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 5019 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5020 5020
5021 raw_spin_lock(&kvm_lock); 5021 raw_spin_lock(&kvm_lock);
5022 list_for_each_entry(kvm, &vm_list, vm_list) { 5022 list_for_each_entry(kvm, &vm_list, vm_list) {
5023 kvm_for_each_vcpu(i, vcpu, kvm) { 5023 kvm_for_each_vcpu(i, vcpu, kvm) {
5024 if (vcpu->cpu != freq->cpu) 5024 if (vcpu->cpu != freq->cpu)
5025 continue; 5025 continue;
5026 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5026 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5027 if (vcpu->cpu != smp_processor_id()) 5027 if (vcpu->cpu != smp_processor_id())
5028 send_ipi = 1; 5028 send_ipi = 1;
5029 } 5029 }
5030 } 5030 }
5031 raw_spin_unlock(&kvm_lock); 5031 raw_spin_unlock(&kvm_lock);
5032 5032
5033 if (freq->old < freq->new && send_ipi) { 5033 if (freq->old < freq->new && send_ipi) {
5034 /* 5034 /*
5035 * We upscale the frequency. We must make sure the 5035 * We upscale the frequency. We must make sure the
5036 * guest doesn't see old kvmclock values while running 5036 * guest doesn't see old kvmclock values while running
5037 * with the new frequency; otherwise we risk that the 5037 * with the new frequency; otherwise we risk that the
5038 * guest sees time go backwards. 5038 * guest sees time go backwards.
5039 * 5039 *
5040 * In case we update the frequency for another cpu 5040 * In case we update the frequency for another cpu
5041 * (which might be in guest context) send an interrupt 5041 * (which might be in guest context) send an interrupt
5042 * to kick the cpu out of guest context. Next time 5042 * to kick the cpu out of guest context. Next time
5043 * guest context is entered kvmclock will be updated, 5043 * guest context is entered kvmclock will be updated,
5044 * so the guest will not see stale values. 5044 * so the guest will not see stale values.
5045 */ 5045 */
5046 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 5046 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5047 } 5047 }
5048 return 0; 5048 return 0;
5049 } 5049 }
5050 5050
5051 static struct notifier_block kvmclock_cpufreq_notifier_block = { 5051 static struct notifier_block kvmclock_cpufreq_notifier_block = {
5052 .notifier_call = kvmclock_cpufreq_notifier 5052 .notifier_call = kvmclock_cpufreq_notifier
5053 }; 5053 };
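Taken together with the long comment above, the two early returns in kvmclock_cpufreq_notifier() mean the per-cpu khz value is refreshed before a frequency increase (at PRECHANGE) and after a decrease (at POSTCHANGE), so guest time can never run ahead. The standalone sketch below restates just that decision; the enum, the should_update() helper and the frequency values are made up for illustration and are not part of the kernel sources.

/* Hypothetical illustration of when kvmclock_cpufreq_notifier() above acts:
 * before a frequency increase and after a decrease, so the guest-visible
 * clock can never run ahead of the hardware. Not kernel code. */
#include <stdio.h>
#include <stdbool.h>

enum phase { PRECHANGE, POSTCHANGE };   /* stand-ins for CPUFREQ_* values */

static bool should_update(enum phase val, unsigned long old_khz,
                          unsigned long new_khz)
{
        if (val == PRECHANGE && old_khz > new_khz)      /* slowing down: act at POSTCHANGE */
                return false;
        if (val == POSTCHANGE && old_khz < new_khz)     /* speeding up: already acted at PRECHANGE */
                return false;
        return true;
}

int main(void)
{
        printf("2GHz->3GHz PRECHANGE:  %d\n", should_update(PRECHANGE, 2000000, 3000000));
        printf("2GHz->3GHz POSTCHANGE: %d\n", should_update(POSTCHANGE, 2000000, 3000000));
        printf("3GHz->2GHz PRECHANGE:  %d\n", should_update(PRECHANGE, 3000000, 2000000));
        printf("3GHz->2GHz POSTCHANGE: %d\n", should_update(POSTCHANGE, 3000000, 2000000));
        return 0;
}

Compiled with a plain C compiler this prints 1/0/0/1, i.e. the update fires exactly once per transition, on the side that matches the "notify before acceleration, after deceleration" rule spelled out in the comment.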
5054 5054
5055 static int kvmclock_cpu_notifier(struct notifier_block *nfb, 5055 static int kvmclock_cpu_notifier(struct notifier_block *nfb,
5056 unsigned long action, void *hcpu) 5056 unsigned long action, void *hcpu)
5057 { 5057 {
5058 unsigned int cpu = (unsigned long)hcpu; 5058 unsigned int cpu = (unsigned long)hcpu;
5059 5059
5060 switch (action) { 5060 switch (action) {
5061 case CPU_ONLINE: 5061 case CPU_ONLINE:
5062 case CPU_DOWN_FAILED: 5062 case CPU_DOWN_FAILED:
5063 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 5063 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5064 break; 5064 break;
5065 case CPU_DOWN_PREPARE: 5065 case CPU_DOWN_PREPARE:
5066 smp_call_function_single(cpu, tsc_bad, NULL, 1); 5066 smp_call_function_single(cpu, tsc_bad, NULL, 1);
5067 break; 5067 break;
5068 } 5068 }
5069 return NOTIFY_OK; 5069 return NOTIFY_OK;
5070 } 5070 }
5071 5071
5072 static struct notifier_block kvmclock_cpu_notifier_block = { 5072 static struct notifier_block kvmclock_cpu_notifier_block = {
5073 .notifier_call = kvmclock_cpu_notifier, 5073 .notifier_call = kvmclock_cpu_notifier,
5074 .priority = -INT_MAX 5074 .priority = -INT_MAX
5075 }; 5075 };
5076 5076
5077 static void kvm_timer_init(void) 5077 static void kvm_timer_init(void)
5078 { 5078 {
5079 int cpu; 5079 int cpu;
5080 5080
5081 max_tsc_khz = tsc_khz; 5081 max_tsc_khz = tsc_khz;
5082 register_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5082 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5083 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5083 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5084 #ifdef CONFIG_CPU_FREQ 5084 #ifdef CONFIG_CPU_FREQ
5085 struct cpufreq_policy policy; 5085 struct cpufreq_policy policy;
5086 memset(&policy, 0, sizeof(policy)); 5086 memset(&policy, 0, sizeof(policy));
5087 cpu = get_cpu(); 5087 cpu = get_cpu();
5088 cpufreq_get_policy(&policy, cpu); 5088 cpufreq_get_policy(&policy, cpu);
5089 if (policy.cpuinfo.max_freq) 5089 if (policy.cpuinfo.max_freq)
5090 max_tsc_khz = policy.cpuinfo.max_freq; 5090 max_tsc_khz = policy.cpuinfo.max_freq;
5091 put_cpu(); 5091 put_cpu();
5092 #endif 5092 #endif
5093 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 5093 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5094 CPUFREQ_TRANSITION_NOTIFIER); 5094 CPUFREQ_TRANSITION_NOTIFIER);
5095 } 5095 }
5096 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); 5096 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5097 for_each_online_cpu(cpu) 5097 for_each_online_cpu(cpu)
5098 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 5098 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5099 } 5099 }
5100 5100
5101 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 5101 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5102 5102
5103 int kvm_is_in_guest(void) 5103 int kvm_is_in_guest(void)
5104 { 5104 {
5105 return __this_cpu_read(current_vcpu) != NULL; 5105 return __this_cpu_read(current_vcpu) != NULL;
5106 } 5106 }
5107 5107
5108 static int kvm_is_user_mode(void) 5108 static int kvm_is_user_mode(void)
5109 { 5109 {
5110 int user_mode = 3; 5110 int user_mode = 3;
5111 5111
5112 if (__this_cpu_read(current_vcpu)) 5112 if (__this_cpu_read(current_vcpu))
5113 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); 5113 user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5114 5114
5115 return user_mode != 0; 5115 return user_mode != 0;
5116 } 5116 }
5117 5117
5118 static unsigned long kvm_get_guest_ip(void) 5118 static unsigned long kvm_get_guest_ip(void)
5119 { 5119 {
5120 unsigned long ip = 0; 5120 unsigned long ip = 0;
5121 5121
5122 if (__this_cpu_read(current_vcpu)) 5122 if (__this_cpu_read(current_vcpu))
5123 ip = kvm_rip_read(__this_cpu_read(current_vcpu)); 5123 ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5124 5124
5125 return ip; 5125 return ip;
5126 } 5126 }
5127 5127
5128 static struct perf_guest_info_callbacks kvm_guest_cbs = { 5128 static struct perf_guest_info_callbacks kvm_guest_cbs = {
5129 .is_in_guest = kvm_is_in_guest, 5129 .is_in_guest = kvm_is_in_guest,
5130 .is_user_mode = kvm_is_user_mode, 5130 .is_user_mode = kvm_is_user_mode,
5131 .get_guest_ip = kvm_get_guest_ip, 5131 .get_guest_ip = kvm_get_guest_ip,
5132 }; 5132 };
5133 5133
5134 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 5134 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5135 { 5135 {
5136 __this_cpu_write(current_vcpu, vcpu); 5136 __this_cpu_write(current_vcpu, vcpu);
5137 } 5137 }
5138 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 5138 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5139 5139
5140 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 5140 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5141 { 5141 {
5142 __this_cpu_write(current_vcpu, NULL); 5142 __this_cpu_write(current_vcpu, NULL);
5143 } 5143 }
5144 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 5144 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5145 5145
5146 static void kvm_set_mmio_spte_mask(void) 5146 static void kvm_set_mmio_spte_mask(void)
5147 { 5147 {
5148 u64 mask; 5148 u64 mask;
5149 int maxphyaddr = boot_cpu_data.x86_phys_bits; 5149 int maxphyaddr = boot_cpu_data.x86_phys_bits;
5150 5150
5151 /* 5151 /*
5152 * Set the reserved bits and the present bit of a paging-structure 5152 * Set the reserved bits and the present bit of a paging-structure
5153 * entry to generate a page fault with PFER.RSV = 1. 5153 * entry to generate a page fault with PFER.RSV = 1.
5154 */ 5154 */
5155 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; 5155 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5156 mask |= 1ull; 5156 mask |= 1ull;
5157 5157
5158 #ifdef CONFIG_X86_64 5158 #ifdef CONFIG_X86_64
5159 /* 5159 /*
5160 * If reserved bit is not supported, clear the present bit to disable 5160 * If reserved bit is not supported, clear the present bit to disable
5161 * mmio page fault. 5161 * mmio page fault.
5162 */ 5162 */
5163 if (maxphyaddr == 52) 5163 if (maxphyaddr == 52)
5164 mask &= ~1ull; 5164 mask &= ~1ull;
5165 #endif 5165 #endif
5166 5166
5167 kvm_mmu_set_mmio_spte_mask(mask); 5167 kvm_mmu_set_mmio_spte_mask(mask);
5168 } 5168 }
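The mask arithmetic in kvm_set_mmio_spte_mask() is easier to see with concrete numbers: it sets bits [62:maxphyaddr] plus the present bit, and drops the present bit again when maxphyaddr is 52 and no reserved bits remain. Below is a small userspace sketch, with assumed MAXPHYADDR values chosen only for illustration (and with the CONFIG_X86_64-only special case applied unconditionally).

/* Hypothetical userspace illustration of the reserved-bit mask computed by
 * kvm_set_mmio_spte_mask() above; not part of the kernel sources. */
#include <stdio.h>
#include <stdint.h>

static uint64_t mmio_spte_mask(int maxphyaddr)
{
        /* Set bits [62:maxphyaddr], i.e. the reserved physical-address bits
         * of a PTE, plus the present bit (bit 0). */
        uint64_t mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;

        mask |= 1ull;
        if (maxphyaddr == 52)   /* no reserved bits left: drop the present bit */
                mask &= ~1ull;
        return mask;
}

int main(void)
{
        int widths[] = { 36, 40, 46, 52 };      /* example MAXPHYADDR values */

        for (unsigned i = 0; i < sizeof(widths) / sizeof(widths[0]); i++)
                printf("maxphyaddr=%d -> mask=0x%016llx\n", widths[i],
                       (unsigned long long)mmio_spte_mask(widths[i]));
        return 0;
}

For maxphyaddr = 40, for example, this yields 0x7fffff0000000001: bits 40-62 plus the present bit, exactly the pattern that makes a marked SPTE fault with the reserved bit set.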
5169 5169
5170 #ifdef CONFIG_X86_64 5170 #ifdef CONFIG_X86_64
5171 static void pvclock_gtod_update_fn(struct work_struct *work) 5171 static void pvclock_gtod_update_fn(struct work_struct *work)
5172 { 5172 {
5173 struct kvm *kvm; 5173 struct kvm *kvm;
5174 5174
5175 struct kvm_vcpu *vcpu; 5175 struct kvm_vcpu *vcpu;
5176 int i; 5176 int i;
5177 5177
5178 raw_spin_lock(&kvm_lock); 5178 raw_spin_lock(&kvm_lock);
5179 list_for_each_entry(kvm, &vm_list, vm_list) 5179 list_for_each_entry(kvm, &vm_list, vm_list)
5180 kvm_for_each_vcpu(i, vcpu, kvm) 5180 kvm_for_each_vcpu(i, vcpu, kvm)
5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); 5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5182 atomic_set(&kvm_guest_has_master_clock, 0); 5182 atomic_set(&kvm_guest_has_master_clock, 0);
5183 raw_spin_unlock(&kvm_lock); 5183 raw_spin_unlock(&kvm_lock);
5184 } 5184 }
5185 5185
5186 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 5186 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5187 5187
5188 /* 5188 /*
5189 * Notification about pvclock gtod data update. 5189 * Notification about pvclock gtod data update.
5190 */ 5190 */
5191 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, 5191 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5192 void *priv) 5192 void *priv)
5193 { 5193 {
5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5195 struct timekeeper *tk = priv; 5195 struct timekeeper *tk = priv;
5196 5196
5197 update_pvclock_gtod(tk); 5197 update_pvclock_gtod(tk);
5198 5198
5199 /* Disable the master clock if the host does not trust, 5199 /* Disable the master clock if the host does not trust,
5200 * or does not use, the TSC clocksource. 5200 * or does not use, the TSC clocksource.
5201 */ 5201 */
5202 if (gtod->clock.vclock_mode != VCLOCK_TSC && 5202 if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5203 atomic_read(&kvm_guest_has_master_clock) != 0) 5203 atomic_read(&kvm_guest_has_master_clock) != 0)
5204 queue_work(system_long_wq, &pvclock_gtod_work); 5204 queue_work(system_long_wq, &pvclock_gtod_work);
5205 5205
5206 return 0; 5206 return 0;
5207 } 5207 }
5208 5208
5209 static struct notifier_block pvclock_gtod_notifier = { 5209 static struct notifier_block pvclock_gtod_notifier = {
5210 .notifier_call = pvclock_gtod_notify, 5210 .notifier_call = pvclock_gtod_notify,
5211 }; 5211 };
5212 #endif 5212 #endif
5213 5213
5214 int kvm_arch_init(void *opaque) 5214 int kvm_arch_init(void *opaque)
5215 { 5215 {
5216 int r; 5216 int r;
5217 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5217 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
5218 5218
5219 if (kvm_x86_ops) { 5219 if (kvm_x86_ops) {
5220 printk(KERN_ERR "kvm: already loaded the other module\n"); 5220 printk(KERN_ERR "kvm: already loaded the other module\n");
5221 r = -EEXIST; 5221 r = -EEXIST;
5222 goto out; 5222 goto out;
5223 } 5223 }
5224 5224
5225 if (!ops->cpu_has_kvm_support()) { 5225 if (!ops->cpu_has_kvm_support()) {
5226 printk(KERN_ERR "kvm: no hardware support\n"); 5226 printk(KERN_ERR "kvm: no hardware support\n");
5227 r = -EOPNOTSUPP; 5227 r = -EOPNOTSUPP;
5228 goto out; 5228 goto out;
5229 } 5229 }
5230 if (ops->disabled_by_bios()) { 5230 if (ops->disabled_by_bios()) {
5231 printk(KERN_ERR "kvm: disabled by bios\n"); 5231 printk(KERN_ERR "kvm: disabled by bios\n");
5232 r = -EOPNOTSUPP; 5232 r = -EOPNOTSUPP;
5233 goto out; 5233 goto out;
5234 } 5234 }
5235 5235
5236 r = kvm_mmu_module_init(); 5236 r = kvm_mmu_module_init();
5237 if (r) 5237 if (r)
5238 goto out; 5238 goto out;
5239 5239
5240 kvm_set_mmio_spte_mask(); 5240 kvm_set_mmio_spte_mask();
5241 kvm_init_msr_list(); 5241 kvm_init_msr_list();
5242 5242
5243 kvm_x86_ops = ops; 5243 kvm_x86_ops = ops;
5244 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5244 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5245 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5245 PT_DIRTY_MASK, PT64_NX_MASK, 0);
5246 5246
5247 kvm_timer_init(); 5247 kvm_timer_init();
5248 5248
5249 perf_register_guest_info_callbacks(&kvm_guest_cbs); 5249 perf_register_guest_info_callbacks(&kvm_guest_cbs);
5250 5250
5251 if (cpu_has_xsave) 5251 if (cpu_has_xsave)
5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5253 5253
5254 kvm_lapic_init(); 5254 kvm_lapic_init();
5255 #ifdef CONFIG_X86_64 5255 #ifdef CONFIG_X86_64
5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); 5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5257 #endif 5257 #endif
5258 5258
5259 return 0; 5259 return 0;
5260 5260
5261 out: 5261 out:
5262 return r; 5262 return r;
5263 } 5263 }
5264 5264
5265 void kvm_arch_exit(void) 5265 void kvm_arch_exit(void)
5266 { 5266 {
5267 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 5267 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5268 5268
5269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 5269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5271 CPUFREQ_TRANSITION_NOTIFIER); 5271 CPUFREQ_TRANSITION_NOTIFIER);
5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5273 #ifdef CONFIG_X86_64 5273 #ifdef CONFIG_X86_64
5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); 5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5275 #endif 5275 #endif
5276 kvm_x86_ops = NULL; 5276 kvm_x86_ops = NULL;
5277 kvm_mmu_module_exit(); 5277 kvm_mmu_module_exit();
5278 } 5278 }
5279 5279
5280 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 5280 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5281 { 5281 {
5282 ++vcpu->stat.halt_exits; 5282 ++vcpu->stat.halt_exits;
5283 if (irqchip_in_kernel(vcpu->kvm)) { 5283 if (irqchip_in_kernel(vcpu->kvm)) {
5284 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 5284 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5285 return 1; 5285 return 1;
5286 } else { 5286 } else {
5287 vcpu->run->exit_reason = KVM_EXIT_HLT; 5287 vcpu->run->exit_reason = KVM_EXIT_HLT;
5288 return 0; 5288 return 0;
5289 } 5289 }
5290 } 5290 }
5291 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 5291 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5292 5292
5293 int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 5293 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5294 { 5294 {
5295 u64 param, ingpa, outgpa, ret; 5295 u64 param, ingpa, outgpa, ret;
5296 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; 5296 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5297 bool fast, longmode; 5297 bool fast, longmode;
5298 int cs_db, cs_l; 5298 int cs_db, cs_l;
5299 5299
5300 /* 5300 /*
5301 * A hypercall generates #UD from non-zero CPL or real mode, 5301 * A hypercall generates #UD from non-zero CPL or real mode,
5302 * per the Hyper-V spec. 5302 * per the Hyper-V spec.
5303 */ 5303 */
5304 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { 5304 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5305 kvm_queue_exception(vcpu, UD_VECTOR); 5305 kvm_queue_exception(vcpu, UD_VECTOR);
5306 return 0; 5306 return 0;
5307 } 5307 }
5308 5308
5309 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 5309 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5310 longmode = is_long_mode(vcpu) && cs_l == 1; 5310 longmode = is_long_mode(vcpu) && cs_l == 1;
5311 5311
5312 if (!longmode) { 5312 if (!longmode) {
5313 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | 5313 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5314 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); 5314 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5315 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | 5315 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5316 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); 5316 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5317 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | 5317 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5318 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); 5318 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5319 } 5319 }
5320 #ifdef CONFIG_X86_64 5320 #ifdef CONFIG_X86_64
5321 else { 5321 else {
5322 param = kvm_register_read(vcpu, VCPU_REGS_RCX); 5322 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5323 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); 5323 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5324 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); 5324 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5325 } 5325 }
5326 #endif 5326 #endif
5327 5327
5328 code = param & 0xffff; 5328 code = param & 0xffff;
5329 fast = (param >> 16) & 0x1; 5329 fast = (param >> 16) & 0x1;
5330 rep_cnt = (param >> 32) & 0xfff; 5330 rep_cnt = (param >> 32) & 0xfff;
5331 rep_idx = (param >> 48) & 0xfff; 5331 rep_idx = (param >> 48) & 0xfff;
5332 5332
5333 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); 5333 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5334 5334
5335 switch (code) { 5335 switch (code) {
5336 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: 5336 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5337 kvm_vcpu_on_spin(vcpu); 5337 kvm_vcpu_on_spin(vcpu);
5338 break; 5338 break;
5339 default: 5339 default:
5340 res = HV_STATUS_INVALID_HYPERCALL_CODE; 5340 res = HV_STATUS_INVALID_HYPERCALL_CODE;
5341 break; 5341 break;
5342 } 5342 }
5343 5343
5344 ret = res | (((u64)rep_done & 0xfff) << 32); 5344 ret = res | (((u64)rep_done & 0xfff) << 32);
5345 if (longmode) { 5345 if (longmode) {
5346 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 5346 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5347 } else { 5347 } else {
5348 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); 5348 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5349 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); 5349 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
5350 } 5350 }
5351 5351
5352 return 1; 5352 return 1;
5353 } 5353 }
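The field extraction near the top of kvm_hv_hypercall() follows the Hyper-V hypercall input layout: the call code sits in bits 15:0, the fast-call flag in bit 16, the rep count in bits 43:32 and the rep start index in bits 59:48. A minimal sketch of that decoding, using an arbitrary example value rather than anything a real guest would pass:

/* Hypothetical illustration of the Hyper-V hypercall input decoding done in
 * kvm_hv_hypercall() above; the sample 'param' value is made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t param   = 0x0012003400010008ull;       /* arbitrary example */
        uint16_t code    = param & 0xffff;              /* bits  15:0  */
        int      fast    = (param >> 16) & 0x1;         /* bit   16    */
        uint16_t rep_cnt = (param >> 32) & 0xfff;       /* bits 43:32  */
        uint16_t rep_idx = (param >> 48) & 0xfff;       /* bits 59:48  */

        printf("code=%u fast=%d rep_cnt=%u rep_idx=%u\n",
               code, fast, rep_cnt, rep_idx);
        return 0;
}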
5354 5354
5355 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5355 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5356 { 5356 {
5357 unsigned long nr, a0, a1, a2, a3, ret; 5357 unsigned long nr, a0, a1, a2, a3, ret;
5358 int r = 1; 5358 int r = 1;
5359 5359
5360 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 5360 if (kvm_hv_hypercall_enabled(vcpu->kvm))
5361 return kvm_hv_hypercall(vcpu); 5361 return kvm_hv_hypercall(vcpu);
5362 5362
5363 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 5363 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5364 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 5364 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5365 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 5365 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5366 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 5366 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5367 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 5367 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5368 5368
5369 trace_kvm_hypercall(nr, a0, a1, a2, a3); 5369 trace_kvm_hypercall(nr, a0, a1, a2, a3);
5370 5370
5371 if (!is_long_mode(vcpu)) { 5371 if (!is_long_mode(vcpu)) {
5372 nr &= 0xFFFFFFFF; 5372 nr &= 0xFFFFFFFF;
5373 a0 &= 0xFFFFFFFF; 5373 a0 &= 0xFFFFFFFF;
5374 a1 &= 0xFFFFFFFF; 5374 a1 &= 0xFFFFFFFF;
5375 a2 &= 0xFFFFFFFF; 5375 a2 &= 0xFFFFFFFF;
5376 a3 &= 0xFFFFFFFF; 5376 a3 &= 0xFFFFFFFF;
5377 } 5377 }
5378 5378
5379 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 5379 if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5380 ret = -KVM_EPERM; 5380 ret = -KVM_EPERM;
5381 goto out; 5381 goto out;
5382 } 5382 }
5383 5383
5384 switch (nr) { 5384 switch (nr) {
5385 case KVM_HC_VAPIC_POLL_IRQ: 5385 case KVM_HC_VAPIC_POLL_IRQ:
5386 ret = 0; 5386 ret = 0;
5387 break; 5387 break;
5388 default: 5388 default:
5389 ret = -KVM_ENOSYS; 5389 ret = -KVM_ENOSYS;
5390 break; 5390 break;
5391 } 5391 }
5392 out: 5392 out:
5393 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 5393 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5394 ++vcpu->stat.hypercalls; 5394 ++vcpu->stat.hypercalls;
5395 return r; 5395 return r;
5396 } 5396 }
5397 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5397 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5398 5398
5399 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5399 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5400 { 5400 {
5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5402 char instruction[3]; 5402 char instruction[3];
5403 unsigned long rip = kvm_rip_read(vcpu); 5403 unsigned long rip = kvm_rip_read(vcpu);
5404 5404
5405 /* 5405 /*
5406 * Blow out the MMU so that no other VCPU has an active mapping, 5406 * Blow out the MMU so that no other VCPU has an active mapping,
5407 * ensuring that the updated hypercall appears atomically across all 5407 * ensuring that the updated hypercall appears atomically across all
5408 * VCPUs. 5408 * VCPUs.
5409 */ 5409 */
5410 kvm_mmu_zap_all(vcpu->kvm); 5410 kvm_mmu_zap_all(vcpu->kvm);
5411 5411
5412 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5412 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5413 5413
5414 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 5414 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5415 } 5415 }
5416 5416
5417 /* 5417 /*
5418 * Check if userspace requested an interrupt window, and that the 5418 * Check if userspace requested an interrupt window, and that the
5419 * interrupt window is open. 5419 * interrupt window is open.
5420 * 5420 *
5421 * No need to exit to userspace if we already have an interrupt queued. 5421 * No need to exit to userspace if we already have an interrupt queued.
5422 */ 5422 */
5423 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 5423 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5424 { 5424 {
5425 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 5425 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5426 vcpu->run->request_interrupt_window && 5426 vcpu->run->request_interrupt_window &&
5427 kvm_arch_interrupt_allowed(vcpu)); 5427 kvm_arch_interrupt_allowed(vcpu));
5428 } 5428 }
5429 5429
5430 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 5430 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5431 { 5431 {
5432 struct kvm_run *kvm_run = vcpu->run; 5432 struct kvm_run *kvm_run = vcpu->run;
5433 5433
5434 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 5434 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5435 kvm_run->cr8 = kvm_get_cr8(vcpu); 5435 kvm_run->cr8 = kvm_get_cr8(vcpu);
5436 kvm_run->apic_base = kvm_get_apic_base(vcpu); 5436 kvm_run->apic_base = kvm_get_apic_base(vcpu);
5437 if (irqchip_in_kernel(vcpu->kvm)) 5437 if (irqchip_in_kernel(vcpu->kvm))
5438 kvm_run->ready_for_interrupt_injection = 1; 5438 kvm_run->ready_for_interrupt_injection = 1;
5439 else 5439 else
5440 kvm_run->ready_for_interrupt_injection = 5440 kvm_run->ready_for_interrupt_injection =
5441 kvm_arch_interrupt_allowed(vcpu) && 5441 kvm_arch_interrupt_allowed(vcpu) &&
5442 !kvm_cpu_has_interrupt(vcpu) && 5442 !kvm_cpu_has_interrupt(vcpu) &&
5443 !kvm_event_needs_reinjection(vcpu); 5443 !kvm_event_needs_reinjection(vcpu);
5444 } 5444 }
5445 5445
5446 static int vapic_enter(struct kvm_vcpu *vcpu) 5446 static int vapic_enter(struct kvm_vcpu *vcpu)
5447 { 5447 {
5448 struct kvm_lapic *apic = vcpu->arch.apic; 5448 struct kvm_lapic *apic = vcpu->arch.apic;
5449 struct page *page; 5449 struct page *page;
5450 5450
5451 if (!apic || !apic->vapic_addr) 5451 if (!apic || !apic->vapic_addr)
5452 return 0; 5452 return 0;
5453 5453
5454 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5454 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
5455 if (is_error_page(page)) 5455 if (is_error_page(page))
5456 return -EFAULT; 5456 return -EFAULT;
5457 5457
5458 vcpu->arch.apic->vapic_page = page; 5458 vcpu->arch.apic->vapic_page = page;
5459 return 0; 5459 return 0;
5460 } 5460 }
5461 5461
5462 static void vapic_exit(struct kvm_vcpu *vcpu) 5462 static void vapic_exit(struct kvm_vcpu *vcpu)
5463 { 5463 {
5464 struct kvm_lapic *apic = vcpu->arch.apic; 5464 struct kvm_lapic *apic = vcpu->arch.apic;
5465 int idx; 5465 int idx;
5466 5466
5467 if (!apic || !apic->vapic_addr) 5467 if (!apic || !apic->vapic_addr)
5468 return; 5468 return;
5469 5469
5470 idx = srcu_read_lock(&vcpu->kvm->srcu); 5470 idx = srcu_read_lock(&vcpu->kvm->srcu);
5471 kvm_release_page_dirty(apic->vapic_page); 5471 kvm_release_page_dirty(apic->vapic_page);
5472 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5472 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
5473 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5473 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5474 } 5474 }
5475 5475
5476 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 5476 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5477 { 5477 {
5478 int max_irr, tpr; 5478 int max_irr, tpr;
5479 5479
5480 if (!kvm_x86_ops->update_cr8_intercept) 5480 if (!kvm_x86_ops->update_cr8_intercept)
5481 return; 5481 return;
5482 5482
5483 if (!vcpu->arch.apic) 5483 if (!vcpu->arch.apic)
5484 return; 5484 return;
5485 5485
5486 if (!vcpu->arch.apic->vapic_addr) 5486 if (!vcpu->arch.apic->vapic_addr)
5487 max_irr = kvm_lapic_find_highest_irr(vcpu); 5487 max_irr = kvm_lapic_find_highest_irr(vcpu);
5488 else 5488 else
5489 max_irr = -1; 5489 max_irr = -1;
5490 5490
5491 if (max_irr != -1) 5491 if (max_irr != -1)
5492 max_irr >>= 4; 5492 max_irr >>= 4;
5493 5493
5494 tpr = kvm_lapic_get_cr8(vcpu); 5494 tpr = kvm_lapic_get_cr8(vcpu);
5495 5495
5496 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 5496 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
5497 } 5497 }
5498 5498
5499 static void inject_pending_event(struct kvm_vcpu *vcpu) 5499 static void inject_pending_event(struct kvm_vcpu *vcpu)
5500 { 5500 {
5501 /* try to reinject previous events if any */ 5501 /* try to reinject previous events if any */
5502 if (vcpu->arch.exception.pending) { 5502 if (vcpu->arch.exception.pending) {
5503 trace_kvm_inj_exception(vcpu->arch.exception.nr, 5503 trace_kvm_inj_exception(vcpu->arch.exception.nr,
5504 vcpu->arch.exception.has_error_code, 5504 vcpu->arch.exception.has_error_code,
5505 vcpu->arch.exception.error_code); 5505 vcpu->arch.exception.error_code);
5506 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 5506 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
5507 vcpu->arch.exception.has_error_code, 5507 vcpu->arch.exception.has_error_code,
5508 vcpu->arch.exception.error_code, 5508 vcpu->arch.exception.error_code,
5509 vcpu->arch.exception.reinject); 5509 vcpu->arch.exception.reinject);
5510 return; 5510 return;
5511 } 5511 }
5512 5512
5513 if (vcpu->arch.nmi_injected) { 5513 if (vcpu->arch.nmi_injected) {
5514 kvm_x86_ops->set_nmi(vcpu); 5514 kvm_x86_ops->set_nmi(vcpu);
5515 return; 5515 return;
5516 } 5516 }
5517 5517
5518 if (vcpu->arch.interrupt.pending) { 5518 if (vcpu->arch.interrupt.pending) {
5519 kvm_x86_ops->set_irq(vcpu); 5519 kvm_x86_ops->set_irq(vcpu);
5520 return; 5520 return;
5521 } 5521 }
5522 5522
5523 /* try to inject new event if pending */ 5523 /* try to inject new event if pending */
5524 if (vcpu->arch.nmi_pending) { 5524 if (vcpu->arch.nmi_pending) {
5525 if (kvm_x86_ops->nmi_allowed(vcpu)) { 5525 if (kvm_x86_ops->nmi_allowed(vcpu)) {
5526 --vcpu->arch.nmi_pending; 5526 --vcpu->arch.nmi_pending;
5527 vcpu->arch.nmi_injected = true; 5527 vcpu->arch.nmi_injected = true;
5528 kvm_x86_ops->set_nmi(vcpu); 5528 kvm_x86_ops->set_nmi(vcpu);
5529 } 5529 }
5530 } else if (kvm_cpu_has_interrupt(vcpu)) { 5530 } else if (kvm_cpu_has_interrupt(vcpu)) {
5531 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5531 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5532 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5532 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5533 false); 5533 false);
5534 kvm_x86_ops->set_irq(vcpu); 5534 kvm_x86_ops->set_irq(vcpu);
5535 } 5535 }
5536 } 5536 }
5537 } 5537 }
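inject_pending_event() encodes a strict priority: re-injection of an event that was already being delivered (exception, then NMI, then interrupt) always precedes injecting anything new, and a newly pending NMI is preferred over an external interrupt. The compact restatement below is hypothetical; the struct and the next_event() helper are simplified stand-ins, not the real kvm_vcpu layout.

/* Hypothetical restatement of the priority used by inject_pending_event()
 * above; not kernel code. */
#include <stdio.h>
#include <stdbool.h>

struct pending {                        /* assumed, simplified vcpu state */
        bool exception;                 /* vcpu->arch.exception.pending   */
        bool nmi_injected;              /* re-inject an interrupted NMI   */
        bool irq_injected;              /* vcpu->arch.interrupt.pending   */
        unsigned nmi_pending;           /* new NMIs queued                */
        bool irq_ready;                 /* kvm_cpu_has_interrupt()        */
};

static const char *next_event(const struct pending *p)
{
        if (p->exception)       return "re-inject exception";
        if (p->nmi_injected)    return "re-inject NMI";
        if (p->irq_injected)    return "re-inject IRQ";
        if (p->nmi_pending)     return "inject new NMI (if allowed)";
        if (p->irq_ready)       return "inject new IRQ (if allowed)";
        return "nothing to inject";
}

int main(void)
{
        struct pending p = { .nmi_pending = 1, .irq_ready = true };

        printf("%s\n", next_event(&p));  /* the NMI wins over the IRQ */
        return 0;
}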
5538 5538
5539 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) 5539 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
5540 { 5540 {
5541 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && 5541 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
5542 !vcpu->guest_xcr0_loaded) { 5542 !vcpu->guest_xcr0_loaded) {
5543 /* kvm_set_xcr() also depends on this */ 5543 /* kvm_set_xcr() also depends on this */
5544 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 5544 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
5545 vcpu->guest_xcr0_loaded = 1; 5545 vcpu->guest_xcr0_loaded = 1;
5546 } 5546 }
5547 } 5547 }
5548 5548
5549 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) 5549 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5550 { 5550 {
5551 if (vcpu->guest_xcr0_loaded) { 5551 if (vcpu->guest_xcr0_loaded) {
5552 if (vcpu->arch.xcr0 != host_xcr0) 5552 if (vcpu->arch.xcr0 != host_xcr0)
5553 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 5553 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
5554 vcpu->guest_xcr0_loaded = 0; 5554 vcpu->guest_xcr0_loaded = 0;
5555 } 5555 }
5556 } 5556 }
5557 5557
5558 static void process_nmi(struct kvm_vcpu *vcpu) 5558 static void process_nmi(struct kvm_vcpu *vcpu)
5559 { 5559 {
5560 unsigned limit = 2; 5560 unsigned limit = 2;
5561 5561
5562 /* 5562 /*
5563 * x86 is limited to one NMI running, and one NMI pending after it. 5563 * x86 is limited to one NMI running, and one NMI pending after it.
5564 * If an NMI is already in progress, limit further NMIs to just one. 5564 * If an NMI is already in progress, limit further NMIs to just one.
5565 * Otherwise, allow two (and we'll inject the first one immediately). 5565 * Otherwise, allow two (and we'll inject the first one immediately).
5566 */ 5566 */
5567 if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) 5567 if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
5568 limit = 1; 5568 limit = 1;
5569 5569
5570 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); 5570 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
5571 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); 5571 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
5572 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 kvm_make_request(KVM_REQ_EVENT, vcpu);
5573 } 5573 }
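process_nmi() drains the atomically queued NMIs into nmi_pending and clamps the result to two, or to one if an NMI is already being handled, matching the comment above. A tiny userspace model of that clamping, with made-up inputs:

/* Hypothetical model of the NMI collapsing done by process_nmi() above:
 * queued NMIs move to 'pending', capped at 2 (or 1 while one is running). */
#include <stdio.h>

static unsigned process_nmi(unsigned pending, unsigned queued, int nmi_in_progress)
{
        unsigned limit = nmi_in_progress ? 1 : 2;

        pending += queued;                              /* drain the queue */
        return pending < limit ? pending : limit;       /* min(pending, limit) */
}

int main(void)
{
        printf("%u\n", process_nmi(0, 5, 0));   /* 5 queued, idle        -> 2 */
        printf("%u\n", process_nmi(0, 5, 1));   /* 5 queued, NMI running -> 1 */
        printf("%u\n", process_nmi(1, 0, 0));   /* nothing new           -> 1 */
        return 0;
}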
5574 5574
5575 static void kvm_gen_update_masterclock(struct kvm *kvm) 5575 static void kvm_gen_update_masterclock(struct kvm *kvm)
5576 { 5576 {
5577 #ifdef CONFIG_X86_64 5577 #ifdef CONFIG_X86_64
5578 int i; 5578 int i;
5579 struct kvm_vcpu *vcpu; 5579 struct kvm_vcpu *vcpu;
5580 struct kvm_arch *ka = &kvm->arch; 5580 struct kvm_arch *ka = &kvm->arch;
5581 5581
5582 spin_lock(&ka->pvclock_gtod_sync_lock); 5582 spin_lock(&ka->pvclock_gtod_sync_lock);
5583 kvm_make_mclock_inprogress_request(kvm); 5583 kvm_make_mclock_inprogress_request(kvm);
5584 /* no guest entries from this point */ 5584 /* no guest entries from this point */
5585 pvclock_update_vm_gtod_copy(kvm); 5585 pvclock_update_vm_gtod_copy(kvm);
5586 5586
5587 kvm_for_each_vcpu(i, vcpu, kvm) 5587 kvm_for_each_vcpu(i, vcpu, kvm)
5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5589 5589
5590 /* guest entries allowed */ 5590 /* guest entries allowed */
5591 kvm_for_each_vcpu(i, vcpu, kvm) 5591 kvm_for_each_vcpu(i, vcpu, kvm)
5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); 5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5593 5593
5594 spin_unlock(&ka->pvclock_gtod_sync_lock); 5594 spin_unlock(&ka->pvclock_gtod_sync_lock);
5595 #endif 5595 #endif
5596 } 5596 }
5597 5597
5598 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5598 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5599 { 5599 {
5600 int r; 5600 int r;
5601 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5601 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5602 vcpu->run->request_interrupt_window; 5602 vcpu->run->request_interrupt_window;
5603 bool req_immediate_exit = 0; 5603 bool req_immediate_exit = 0;
5604 5604
5605 if (vcpu->requests) { 5605 if (vcpu->requests) {
5606 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 5606 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
5607 kvm_mmu_unload(vcpu); 5607 kvm_mmu_unload(vcpu);
5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5609 __kvm_migrate_timers(vcpu); 5609 __kvm_migrate_timers(vcpu);
5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5611 kvm_gen_update_masterclock(vcpu->kvm); 5611 kvm_gen_update_masterclock(vcpu->kvm);
5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5613 r = kvm_guest_time_update(vcpu); 5613 r = kvm_guest_time_update(vcpu);
5614 if (unlikely(r)) 5614 if (unlikely(r))
5615 goto out; 5615 goto out;
5616 } 5616 }
5617 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5617 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
5618 kvm_mmu_sync_roots(vcpu); 5618 kvm_mmu_sync_roots(vcpu);
5619 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5619 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
5620 kvm_x86_ops->tlb_flush(vcpu); 5620 kvm_x86_ops->tlb_flush(vcpu);
5621 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 5621 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
5622 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 5622 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
5623 r = 0; 5623 r = 0;
5624 goto out; 5624 goto out;
5625 } 5625 }
5626 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 5626 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
5627 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5627 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5628 r = 0; 5628 r = 0;
5629 goto out; 5629 goto out;
5630 } 5630 }
5631 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { 5631 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
5632 vcpu->fpu_active = 0; 5632 vcpu->fpu_active = 0;
5633 kvm_x86_ops->fpu_deactivate(vcpu); 5633 kvm_x86_ops->fpu_deactivate(vcpu);
5634 } 5634 }
5635 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 5635 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5636 /* Page is swapped out. Do synthetic halt */ 5636 /* Page is swapped out. Do synthetic halt */
5637 vcpu->arch.apf.halted = true; 5637 vcpu->arch.apf.halted = true;
5638 r = 1; 5638 r = 1;
5639 goto out; 5639 goto out;
5640 } 5640 }
5641 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) 5641 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5642 record_steal_time(vcpu); 5642 record_steal_time(vcpu);
5643 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 5643 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5644 process_nmi(vcpu); 5644 process_nmi(vcpu);
5645 req_immediate_exit = 5645 req_immediate_exit =
5646 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 5646 kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
5647 if (kvm_check_request(KVM_REQ_PMU, vcpu)) 5647 if (kvm_check_request(KVM_REQ_PMU, vcpu))
5648 kvm_handle_pmu_event(vcpu); 5648 kvm_handle_pmu_event(vcpu);
5649 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5649 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5650 kvm_deliver_pmi(vcpu); 5650 kvm_deliver_pmi(vcpu);
5651 } 5651 }
5652 5652
5653 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5653 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5654 inject_pending_event(vcpu); 5654 inject_pending_event(vcpu);
5655 5655
5656 /* enable NMI/IRQ window open exits if needed */ 5656 /* enable NMI/IRQ window open exits if needed */
5657 if (vcpu->arch.nmi_pending) 5657 if (vcpu->arch.nmi_pending)
5658 kvm_x86_ops->enable_nmi_window(vcpu); 5658 kvm_x86_ops->enable_nmi_window(vcpu);
5659 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5659 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5660 kvm_x86_ops->enable_irq_window(vcpu); 5660 kvm_x86_ops->enable_irq_window(vcpu);
5661 5661
5662 if (kvm_lapic_enabled(vcpu)) { 5662 if (kvm_lapic_enabled(vcpu)) {
5663 update_cr8_intercept(vcpu); 5663 update_cr8_intercept(vcpu);
5664 kvm_lapic_sync_to_vapic(vcpu); 5664 kvm_lapic_sync_to_vapic(vcpu);
5665 } 5665 }
5666 } 5666 }
5667 5667
5668 r = kvm_mmu_reload(vcpu); 5668 r = kvm_mmu_reload(vcpu);
5669 if (unlikely(r)) { 5669 if (unlikely(r)) {
5670 goto cancel_injection; 5670 goto cancel_injection;
5671 } 5671 }
5672 5672
5673 preempt_disable(); 5673 preempt_disable();
5674 5674
5675 kvm_x86_ops->prepare_guest_switch(vcpu); 5675 kvm_x86_ops->prepare_guest_switch(vcpu);
5676 if (vcpu->fpu_active) 5676 if (vcpu->fpu_active)
5677 kvm_load_guest_fpu(vcpu); 5677 kvm_load_guest_fpu(vcpu);
5678 kvm_load_guest_xcr0(vcpu); 5678 kvm_load_guest_xcr0(vcpu);
5679 5679
5680 vcpu->mode = IN_GUEST_MODE; 5680 vcpu->mode = IN_GUEST_MODE;
5681 5681
5682 /* We should set ->mode before checking ->requests; 5682 /* We should set ->mode before checking ->requests;
5683 * see the comment in make_all_cpus_request. 5683 * see the comment in make_all_cpus_request.
5684 */ 5684 */
5685 smp_mb(); 5685 smp_mb();
5686 5686
5687 local_irq_disable(); 5687 local_irq_disable();
5688 5688
5689 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests 5689 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5690 || need_resched() || signal_pending(current)) { 5690 || need_resched() || signal_pending(current)) {
5691 vcpu->mode = OUTSIDE_GUEST_MODE; 5691 vcpu->mode = OUTSIDE_GUEST_MODE;
5692 smp_wmb(); 5692 smp_wmb();
5693 local_irq_enable(); 5693 local_irq_enable();
5694 preempt_enable(); 5694 preempt_enable();
5695 r = 1; 5695 r = 1;
5696 goto cancel_injection; 5696 goto cancel_injection;
5697 } 5697 }
5698 5698
5699 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5699 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5700 5700
5701 if (req_immediate_exit) 5701 if (req_immediate_exit)
5702 smp_send_reschedule(vcpu->cpu); 5702 smp_send_reschedule(vcpu->cpu);
5703 5703
5704 kvm_guest_enter(); 5704 kvm_guest_enter();
5705 5705
5706 if (unlikely(vcpu->arch.switch_db_regs)) { 5706 if (unlikely(vcpu->arch.switch_db_regs)) {
5707 set_debugreg(0, 7); 5707 set_debugreg(0, 7);
5708 set_debugreg(vcpu->arch.eff_db[0], 0); 5708 set_debugreg(vcpu->arch.eff_db[0], 0);
5709 set_debugreg(vcpu->arch.eff_db[1], 1); 5709 set_debugreg(vcpu->arch.eff_db[1], 1);
5710 set_debugreg(vcpu->arch.eff_db[2], 2); 5710 set_debugreg(vcpu->arch.eff_db[2], 2);
5711 set_debugreg(vcpu->arch.eff_db[3], 3); 5711 set_debugreg(vcpu->arch.eff_db[3], 3);
5712 } 5712 }
5713 5713
5714 trace_kvm_entry(vcpu->vcpu_id); 5714 trace_kvm_entry(vcpu->vcpu_id);
5715 kvm_x86_ops->run(vcpu); 5715 kvm_x86_ops->run(vcpu);
5716 5716
5717 /* 5717 /*
5718 * If the guest has used debug registers, at least dr7 5718 * If the guest has used debug registers, at least dr7
5719 * will be disabled while returning to the host. 5719 * will be disabled while returning to the host.
5720 * If we don't have active breakpoints in the host, we don't 5720 * If we don't have active breakpoints in the host, we don't
5721 * care about the messed up debug address registers. But if 5721 * care about the messed up debug address registers. But if
5722 * we have some of them active, restore the old state. 5722 * we have some of them active, restore the old state.
5723 */ 5723 */
5724 if (hw_breakpoint_active()) 5724 if (hw_breakpoint_active())
5725 hw_breakpoint_restore(); 5725 hw_breakpoint_restore();
5726 5726
5727 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 5727 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
5728 native_read_tsc()); 5728 native_read_tsc());
5729 5729
5730 vcpu->mode = OUTSIDE_GUEST_MODE; 5730 vcpu->mode = OUTSIDE_GUEST_MODE;
5731 smp_wmb(); 5731 smp_wmb();
5732 local_irq_enable(); 5732 local_irq_enable();
5733 5733
5734 ++vcpu->stat.exits; 5734 ++vcpu->stat.exits;
5735 5735
5736 /* 5736 /*
5737 * We must have an instruction between local_irq_enable() and 5737 * We must have an instruction between local_irq_enable() and
5738 * kvm_guest_exit(), so the timer interrupt isn't delayed by 5738 * kvm_guest_exit(), so the timer interrupt isn't delayed by
5739 * the interrupt shadow. The stat.exits increment will do nicely. 5739 * the interrupt shadow. The stat.exits increment will do nicely.
5740 * But we need to prevent reordering, hence this barrier(): 5740 * But we need to prevent reordering, hence this barrier():
5741 */ 5741 */
5742 barrier(); 5742 barrier();
5743 5743
5744 kvm_guest_exit(); 5744 kvm_guest_exit();
5745 5745
5746 preempt_enable(); 5746 preempt_enable();
5747 5747
5748 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5748 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5749 5749
5750 /* 5750 /*
5751 * Profile KVM exit RIPs: 5751 * Profile KVM exit RIPs:
5752 */ 5752 */
5753 if (unlikely(prof_on == KVM_PROFILING)) { 5753 if (unlikely(prof_on == KVM_PROFILING)) {
5754 unsigned long rip = kvm_rip_read(vcpu); 5754 unsigned long rip = kvm_rip_read(vcpu);
5755 profile_hit(KVM_PROFILING, (void *)rip); 5755 profile_hit(KVM_PROFILING, (void *)rip);
5756 } 5756 }
5757 5757
5758 if (unlikely(vcpu->arch.tsc_always_catchup)) 5758 if (unlikely(vcpu->arch.tsc_always_catchup))
5759 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5759 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5760 5760
5761 if (vcpu->arch.apic_attention) 5761 if (vcpu->arch.apic_attention)
5762 kvm_lapic_sync_from_vapic(vcpu); 5762 kvm_lapic_sync_from_vapic(vcpu);
5763 5763
5764 r = kvm_x86_ops->handle_exit(vcpu); 5764 r = kvm_x86_ops->handle_exit(vcpu);
5765 return r; 5765 return r;
5766 5766
5767 cancel_injection: 5767 cancel_injection:
5768 kvm_x86_ops->cancel_injection(vcpu); 5768 kvm_x86_ops->cancel_injection(vcpu);
5769 if (unlikely(vcpu->arch.apic_attention)) 5769 if (unlikely(vcpu->arch.apic_attention))
5770 kvm_lapic_sync_from_vapic(vcpu); 5770 kvm_lapic_sync_from_vapic(vcpu);
5771 out: 5771 out:
5772 return r; 5772 return r;
5773 } 5773 }
5774 5774
5775 5775
5776 static int __vcpu_run(struct kvm_vcpu *vcpu) 5776 static int __vcpu_run(struct kvm_vcpu *vcpu)
5777 { 5777 {
5778 int r; 5778 int r;
5779 struct kvm *kvm = vcpu->kvm; 5779 struct kvm *kvm = vcpu->kvm;
5780 5780
5781 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 5781 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
5782 pr_debug("vcpu %d received sipi with vector # %x\n", 5782 pr_debug("vcpu %d received sipi with vector # %x\n",
5783 vcpu->vcpu_id, vcpu->arch.sipi_vector); 5783 vcpu->vcpu_id, vcpu->arch.sipi_vector);
5784 kvm_lapic_reset(vcpu); 5784 kvm_lapic_reset(vcpu);
5785 r = kvm_vcpu_reset(vcpu); 5785 r = kvm_vcpu_reset(vcpu);
5786 if (r) 5786 if (r)
5787 return r; 5787 return r;
5788 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5788 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5789 } 5789 }
5790 5790
5791 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5791 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5792 r = vapic_enter(vcpu); 5792 r = vapic_enter(vcpu);
5793 if (r) { 5793 if (r) {
5794 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5794 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5795 return r; 5795 return r;
5796 } 5796 }
5797 5797
5798 r = 1; 5798 r = 1;
5799 while (r > 0) { 5799 while (r > 0) {
5800 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 5800 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5801 !vcpu->arch.apf.halted) 5801 !vcpu->arch.apf.halted)
5802 r = vcpu_enter_guest(vcpu); 5802 r = vcpu_enter_guest(vcpu);
5803 else { 5803 else {
5804 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5804 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5805 kvm_vcpu_block(vcpu); 5805 kvm_vcpu_block(vcpu);
5806 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5806 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5807 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 5807 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
5808 { 5808 {
5809 switch(vcpu->arch.mp_state) { 5809 switch(vcpu->arch.mp_state) {
5810 case KVM_MP_STATE_HALTED: 5810 case KVM_MP_STATE_HALTED:
5811 vcpu->arch.mp_state = 5811 vcpu->arch.mp_state =
5812 KVM_MP_STATE_RUNNABLE; 5812 KVM_MP_STATE_RUNNABLE;
5813 case KVM_MP_STATE_RUNNABLE: 5813 case KVM_MP_STATE_RUNNABLE:
5814 vcpu->arch.apf.halted = false; 5814 vcpu->arch.apf.halted = false;
5815 break; 5815 break;
5816 case KVM_MP_STATE_SIPI_RECEIVED: 5816 case KVM_MP_STATE_SIPI_RECEIVED:
5817 default: 5817 default:
5818 r = -EINTR; 5818 r = -EINTR;
5819 break; 5819 break;
5820 } 5820 }
5821 } 5821 }
5822 } 5822 }
5823 5823
5824 if (r <= 0) 5824 if (r <= 0)
5825 break; 5825 break;
5826 5826
5827 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 5827 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
5828 if (kvm_cpu_has_pending_timer(vcpu)) 5828 if (kvm_cpu_has_pending_timer(vcpu))
5829 kvm_inject_pending_timer_irqs(vcpu); 5829 kvm_inject_pending_timer_irqs(vcpu);
5830 5830
5831 if (dm_request_for_irq_injection(vcpu)) { 5831 if (dm_request_for_irq_injection(vcpu)) {
5832 r = -EINTR; 5832 r = -EINTR;
5833 vcpu->run->exit_reason = KVM_EXIT_INTR; 5833 vcpu->run->exit_reason = KVM_EXIT_INTR;
5834 ++vcpu->stat.request_irq_exits; 5834 ++vcpu->stat.request_irq_exits;
5835 } 5835 }
5836 5836
5837 kvm_check_async_pf_completion(vcpu); 5837 kvm_check_async_pf_completion(vcpu);
5838 5838
5839 if (signal_pending(current)) { 5839 if (signal_pending(current)) {
5840 r = -EINTR; 5840 r = -EINTR;
5841 vcpu->run->exit_reason = KVM_EXIT_INTR; 5841 vcpu->run->exit_reason = KVM_EXIT_INTR;
5842 ++vcpu->stat.signal_exits; 5842 ++vcpu->stat.signal_exits;
5843 } 5843 }
5844 if (need_resched()) { 5844 if (need_resched()) {
5845 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5845 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5846 kvm_resched(vcpu); 5846 kvm_resched(vcpu);
5847 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5847 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5848 } 5848 }
5849 } 5849 }
5850 5850
5851 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5851 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5852 5852
5853 vapic_exit(vcpu); 5853 vapic_exit(vcpu);
5854 5854
5855 return r; 5855 return r;
5856 } 5856 }
5857 5857
5858 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) 5858 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
5859 { 5859 {
5860 int r; 5860 int r;
5861 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5861 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5862 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5862 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5863 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5863 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5864 if (r != EMULATE_DONE) 5864 if (r != EMULATE_DONE)
5865 return 0; 5865 return 0;
5866 return 1; 5866 return 1;
5867 } 5867 }
5868 5868
5869 static int complete_emulated_pio(struct kvm_vcpu *vcpu) 5869 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
5870 { 5870 {
5871 BUG_ON(!vcpu->arch.pio.count); 5871 BUG_ON(!vcpu->arch.pio.count);
5872 5872
5873 return complete_emulated_io(vcpu); 5873 return complete_emulated_io(vcpu);
5874 } 5874 }
5875 5875
5876 /* 5876 /*
5877 * Implements the following, as a state machine: 5877 * Implements the following, as a state machine:
5878 * 5878 *
5879 * read: 5879 * read:
5880 * for each fragment 5880 * for each fragment
5881 * for each mmio piece in the fragment 5881 * for each mmio piece in the fragment
5882 * write gpa, len 5882 * write gpa, len
5883 * exit 5883 * exit
5884 * copy data 5884 * copy data
5885 * execute insn 5885 * execute insn
5886 * 5886 *
5887 * write: 5887 * write:
5888 * for each fragment 5888 * for each fragment
5889 * for each mmio piece in the fragment 5889 * for each mmio piece in the fragment
5890 * write gpa, len 5890 * write gpa, len
5891 * copy data 5891 * copy data
5892 * exit 5892 * exit
5893 */ 5893 */
5894 static int complete_emulated_mmio(struct kvm_vcpu *vcpu) 5894 static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5895 { 5895 {
5896 struct kvm_run *run = vcpu->run; 5896 struct kvm_run *run = vcpu->run;
5897 struct kvm_mmio_fragment *frag; 5897 struct kvm_mmio_fragment *frag;
5898 unsigned len; 5898 unsigned len;
5899 5899
5900 BUG_ON(!vcpu->mmio_needed); 5900 BUG_ON(!vcpu->mmio_needed);
5901 5901
5902 /* Complete previous fragment */ 5902 /* Complete previous fragment */
5903 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; 5903 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
5904 len = min(8u, frag->len); 5904 len = min(8u, frag->len);
5905 if (!vcpu->mmio_is_write) 5905 if (!vcpu->mmio_is_write)
5906 memcpy(frag->data, run->mmio.data, len); 5906 memcpy(frag->data, run->mmio.data, len);
5907 5907
5908 if (frag->len <= 8) { 5908 if (frag->len <= 8) {
5909 /* Switch to the next fragment. */ 5909 /* Switch to the next fragment. */
5910 frag++; 5910 frag++;
5911 vcpu->mmio_cur_fragment++; 5911 vcpu->mmio_cur_fragment++;
5912 } else { 5912 } else {
5913 /* Go forward to the next mmio piece. */ 5913 /* Go forward to the next mmio piece. */
5914 frag->data += len; 5914 frag->data += len;
5915 frag->gpa += len; 5915 frag->gpa += len;
5916 frag->len -= len; 5916 frag->len -= len;
5917 } 5917 }
5918 5918
5919 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 5919 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5920 vcpu->mmio_needed = 0; 5920 vcpu->mmio_needed = 0;
5921 if (vcpu->mmio_is_write) 5921 if (vcpu->mmio_is_write)
5922 return 1; 5922 return 1;
5923 vcpu->mmio_read_completed = 1; 5923 vcpu->mmio_read_completed = 1;
5924 return complete_emulated_io(vcpu); 5924 return complete_emulated_io(vcpu);
5925 } 5925 }
5926 5926
5927 run->exit_reason = KVM_EXIT_MMIO; 5927 run->exit_reason = KVM_EXIT_MMIO;
5928 run->mmio.phys_addr = frag->gpa; 5928 run->mmio.phys_addr = frag->gpa;
5929 if (vcpu->mmio_is_write) 5929 if (vcpu->mmio_is_write)
5930 memcpy(run->mmio.data, frag->data, min(8u, frag->len)); 5930 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
5931 run->mmio.len = min(8u, frag->len); 5931 run->mmio.len = min(8u, frag->len);
5932 run->mmio.is_write = vcpu->mmio_is_write; 5932 run->mmio.is_write = vcpu->mmio_is_write;
5933 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5933 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5934 return 0; 5934 return 0;
5935 } 5935 }
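The state machine documented above boils down to this: each exit to userspace carries at most 8 bytes, and a fragment longer than that is advanced in place until it is exhausted. The sketch below walks a single oversized fragment through that loop; the struct and the 20-byte example access are simplifications, not the real kvm_mmio_fragment handling across multiple fragments.

/* Hypothetical walk-through of the fragment splitting performed by
 * complete_emulated_mmio() above; not kernel code. */
#include <stdio.h>

struct frag {                           /* simplified kvm_mmio_fragment */
        unsigned long long gpa;
        unsigned len;
};

int main(void)
{
        struct frag f = { .gpa = 0xfed00000, .len = 20 };  /* example 20-byte access */
        int exits = 0;

        while (f.len) {
                unsigned piece = f.len < 8 ? f.len : 8;    /* min(8, frag->len) */

                printf("exit %d: gpa=0x%llx len=%u\n", ++exits, f.gpa, piece);
                f.gpa += piece;
                f.len -= piece;
        }
        return 0;       /* 20 bytes -> three exits of 8, 8 and 4 bytes */
}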
5936 5936
5937 5937
5938 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5938 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5939 { 5939 {
5940 int r; 5940 int r;
5941 sigset_t sigsaved; 5941 sigset_t sigsaved;
5942 5942
5943 if (!tsk_used_math(current) && init_fpu(current)) 5943 if (!tsk_used_math(current) && init_fpu(current))
5944 return -ENOMEM; 5944 return -ENOMEM;
5945 5945
5946 if (vcpu->sigset_active) 5946 if (vcpu->sigset_active)
5947 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5947 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5948 5948
5949 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 5949 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
5950 kvm_vcpu_block(vcpu); 5950 kvm_vcpu_block(vcpu);
5951 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 5951 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
5952 r = -EAGAIN; 5952 r = -EAGAIN;
5953 goto out; 5953 goto out;
5954 } 5954 }
5955 5955
5956 /* re-sync apic's tpr */ 5956 /* re-sync apic's tpr */
5957 if (!irqchip_in_kernel(vcpu->kvm)) { 5957 if (!irqchip_in_kernel(vcpu->kvm)) {
5958 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 5958 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5959 r = -EINVAL; 5959 r = -EINVAL;
5960 goto out; 5960 goto out;
5961 } 5961 }
5962 } 5962 }
5963 5963
5964 if (unlikely(vcpu->arch.complete_userspace_io)) { 5964 if (unlikely(vcpu->arch.complete_userspace_io)) {
5965 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; 5965 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
5966 vcpu->arch.complete_userspace_io = NULL; 5966 vcpu->arch.complete_userspace_io = NULL;
5967 r = cui(vcpu); 5967 r = cui(vcpu);
5968 if (r <= 0) 5968 if (r <= 0)
5969 goto out; 5969 goto out;
5970 } else 5970 } else
5971 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 5971 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
5972 5972
5973 r = __vcpu_run(vcpu); 5973 r = __vcpu_run(vcpu);
5974 5974
5975 out: 5975 out:
5976 post_kvm_run_save(vcpu); 5976 post_kvm_run_save(vcpu);
5977 if (vcpu->sigset_active) 5977 if (vcpu->sigset_active)
5978 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 5978 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
5979 5979
5980 return r; 5980 return r;
5981 } 5981 }
5982 5982
5983 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5983 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5984 { 5984 {
5985 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { 5985 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5986 /* 5986 /*
5987 * We are here if userspace calls get_regs() in the middle of 5987 * We are here if userspace calls get_regs() in the middle of
5988 * instruction emulation. Register state needs to be copied 5988 * instruction emulation. Register state needs to be copied
5989 * back from emulation context to vcpu. Userspace shouldn't do 5989 * back from emulation context to vcpu. Userspace shouldn't do
5990 * that usually, but some badly designed PV devices (vmware 5990 * that usually, but some badly designed PV devices (vmware
5991 * backdoor interface) need this to work 5991 * backdoor interface) need this to work
5992 */ 5992 */
5993 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); 5993 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
5994 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5994 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5995 } 5995 }
5996 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5996 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5997 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5997 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5998 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5998 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5999 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 5999 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
6000 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 6000 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
6001 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 6001 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
6002 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 6002 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6003 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 6003 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
6004 #ifdef CONFIG_X86_64 6004 #ifdef CONFIG_X86_64
6005 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 6005 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
6006 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 6006 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
6007 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 6007 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
6008 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 6008 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
6009 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 6009 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
6010 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 6010 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
6011 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 6011 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
6012 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 6012 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
6013 #endif 6013 #endif
6014 6014
6015 regs->rip = kvm_rip_read(vcpu); 6015 regs->rip = kvm_rip_read(vcpu);
6016 regs->rflags = kvm_get_rflags(vcpu); 6016 regs->rflags = kvm_get_rflags(vcpu);
6017 6017
6018 return 0; 6018 return 0;
6019 } 6019 }
6020 6020
6021 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 6021 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6022 { 6022 {
6023 vcpu->arch.emulate_regs_need_sync_from_vcpu = true; 6023 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
6024 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 6024 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6025 6025
6026 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 6026 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
6027 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 6027 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
6028 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 6028 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
6029 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 6029 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
6030 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 6030 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
6031 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 6031 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
6032 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 6032 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
6033 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 6033 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
6034 #ifdef CONFIG_X86_64 6034 #ifdef CONFIG_X86_64
6035 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 6035 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
6036 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 6036 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
6037 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 6037 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
6038 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 6038 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
6039 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 6039 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
6040 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 6040 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
6041 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 6041 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
6042 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 6042 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
6043 #endif 6043 #endif
6044 6044
6045 kvm_rip_write(vcpu, regs->rip); 6045 kvm_rip_write(vcpu, regs->rip);
6046 kvm_set_rflags(vcpu, regs->rflags); 6046 kvm_set_rflags(vcpu, regs->rflags);
6047 6047
6048 vcpu->arch.exception.pending = false; 6048 vcpu->arch.exception.pending = false;
6049 6049
6050 kvm_make_request(KVM_REQ_EVENT, vcpu); 6050 kvm_make_request(KVM_REQ_EVENT, vcpu);
6051 6051
6052 return 0; 6052 return 0;
6053 } 6053 }
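
These two handlers sit behind the KVM_GET_REGS and KVM_SET_REGS vcpu ioctls. As a small illustration (not part of this commit), userspace can round-trip the general-purpose registers through struct kvm_regs as sketched below; vcpu_fd is assumed to be a vcpu file descriptor obtained from KVM_CREATE_VCPU.

/* Illustrative only: read the guest GPRs, advance RIP by one byte
 * (e.g. to step over an int3 that userspace planted), and write them back. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int skip_breakpoint_byte(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		return -1;
	regs.rip += 1;
	return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}

As the set_regs handler above shows, KVM_SET_REGS also drops any pending exception and queues a KVM_REQ_EVENT re-check, so injected state is re-evaluated on the next guest entry.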
6054 6054
6055 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 6055 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
6056 { 6056 {
6057 struct kvm_segment cs; 6057 struct kvm_segment cs;
6058 6058
6059 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 6059 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6060 *db = cs.db; 6060 *db = cs.db;
6061 *l = cs.l; 6061 *l = cs.l;
6062 } 6062 }
6063 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 6063 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
6064 6064
6065 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 6065 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
6066 struct kvm_sregs *sregs) 6066 struct kvm_sregs *sregs)
6067 { 6067 {
6068 struct desc_ptr dt; 6068 struct desc_ptr dt;
6069 6069
6070 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 6070 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6071 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 6071 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6072 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 6072 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6073 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 6073 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6074 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 6074 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6075 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 6075 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6076 6076
6077 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 6077 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6078 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 6078 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6079 6079
6080 kvm_x86_ops->get_idt(vcpu, &dt); 6080 kvm_x86_ops->get_idt(vcpu, &dt);
6081 sregs->idt.limit = dt.size; 6081 sregs->idt.limit = dt.size;
6082 sregs->idt.base = dt.address; 6082 sregs->idt.base = dt.address;
6083 kvm_x86_ops->get_gdt(vcpu, &dt); 6083 kvm_x86_ops->get_gdt(vcpu, &dt);
6084 sregs->gdt.limit = dt.size; 6084 sregs->gdt.limit = dt.size;
6085 sregs->gdt.base = dt.address; 6085 sregs->gdt.base = dt.address;
6086 6086
6087 sregs->cr0 = kvm_read_cr0(vcpu); 6087 sregs->cr0 = kvm_read_cr0(vcpu);
6088 sregs->cr2 = vcpu->arch.cr2; 6088 sregs->cr2 = vcpu->arch.cr2;
6089 sregs->cr3 = kvm_read_cr3(vcpu); 6089 sregs->cr3 = kvm_read_cr3(vcpu);
6090 sregs->cr4 = kvm_read_cr4(vcpu); 6090 sregs->cr4 = kvm_read_cr4(vcpu);
6091 sregs->cr8 = kvm_get_cr8(vcpu); 6091 sregs->cr8 = kvm_get_cr8(vcpu);
6092 sregs->efer = vcpu->arch.efer; 6092 sregs->efer = vcpu->arch.efer;
6093 sregs->apic_base = kvm_get_apic_base(vcpu); 6093 sregs->apic_base = kvm_get_apic_base(vcpu);
6094 6094
6095 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 6095 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
6096 6096
6097 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 6097 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
6098 set_bit(vcpu->arch.interrupt.nr, 6098 set_bit(vcpu->arch.interrupt.nr,
6099 (unsigned long *)sregs->interrupt_bitmap); 6099 (unsigned long *)sregs->interrupt_bitmap);
6100 6100
6101 return 0; 6101 return 0;
6102 } 6102 }
6103 6103
6104 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 6104 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6105 struct kvm_mp_state *mp_state) 6105 struct kvm_mp_state *mp_state)
6106 { 6106 {
6107 mp_state->mp_state = vcpu->arch.mp_state; 6107 mp_state->mp_state = vcpu->arch.mp_state;
6108 return 0; 6108 return 0;
6109 } 6109 }
6110 6110
6111 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 6111 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6112 struct kvm_mp_state *mp_state) 6112 struct kvm_mp_state *mp_state)
6113 { 6113 {
6114 vcpu->arch.mp_state = mp_state->mp_state; 6114 vcpu->arch.mp_state = mp_state->mp_state;
6115 kvm_make_request(KVM_REQ_EVENT, vcpu); 6115 kvm_make_request(KVM_REQ_EVENT, vcpu);
6116 return 0; 6116 return 0;
6117 } 6117 }
6118 6118
6119 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, 6119 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
6120 int reason, bool has_error_code, u32 error_code) 6120 int reason, bool has_error_code, u32 error_code)
6121 { 6121 {
6122 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 6122 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6123 int ret; 6123 int ret;
6124 6124
6125 init_emulate_ctxt(vcpu); 6125 init_emulate_ctxt(vcpu);
6126 6126
6127 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, 6127 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
6128 has_error_code, error_code); 6128 has_error_code, error_code);
6129 6129
6130 if (ret) 6130 if (ret)
6131 return EMULATE_FAIL; 6131 return EMULATE_FAIL;
6132 6132
6133 kvm_rip_write(vcpu, ctxt->eip); 6133 kvm_rip_write(vcpu, ctxt->eip);
6134 kvm_set_rflags(vcpu, ctxt->eflags); 6134 kvm_set_rflags(vcpu, ctxt->eflags);
6135 kvm_make_request(KVM_REQ_EVENT, vcpu); 6135 kvm_make_request(KVM_REQ_EVENT, vcpu);
6136 return EMULATE_DONE; 6136 return EMULATE_DONE;
6137 } 6137 }
6138 EXPORT_SYMBOL_GPL(kvm_task_switch); 6138 EXPORT_SYMBOL_GPL(kvm_task_switch);
6139 6139
6140 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 6140 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6141 struct kvm_sregs *sregs) 6141 struct kvm_sregs *sregs)
6142 { 6142 {
6143 int mmu_reset_needed = 0; 6143 int mmu_reset_needed = 0;
6144 int pending_vec, max_bits, idx; 6144 int pending_vec, max_bits, idx;
6145 struct desc_ptr dt; 6145 struct desc_ptr dt;
6146 6146
6147 if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) 6147 if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
6148 return -EINVAL; 6148 return -EINVAL;
6149 6149
6150 dt.size = sregs->idt.limit; 6150 dt.size = sregs->idt.limit;
6151 dt.address = sregs->idt.base; 6151 dt.address = sregs->idt.base;
6152 kvm_x86_ops->set_idt(vcpu, &dt); 6152 kvm_x86_ops->set_idt(vcpu, &dt);
6153 dt.size = sregs->gdt.limit; 6153 dt.size = sregs->gdt.limit;
6154 dt.address = sregs->gdt.base; 6154 dt.address = sregs->gdt.base;
6155 kvm_x86_ops->set_gdt(vcpu, &dt); 6155 kvm_x86_ops->set_gdt(vcpu, &dt);
6156 6156
6157 vcpu->arch.cr2 = sregs->cr2; 6157 vcpu->arch.cr2 = sregs->cr2;
6158 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 6158 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6159 vcpu->arch.cr3 = sregs->cr3; 6159 vcpu->arch.cr3 = sregs->cr3;
6160 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 6160 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6161 6161
6162 kvm_set_cr8(vcpu, sregs->cr8); 6162 kvm_set_cr8(vcpu, sregs->cr8);
6163 6163
6164 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 6164 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6165 kvm_x86_ops->set_efer(vcpu, sregs->efer); 6165 kvm_x86_ops->set_efer(vcpu, sregs->efer);
6166 kvm_set_apic_base(vcpu, sregs->apic_base); 6166 kvm_set_apic_base(vcpu, sregs->apic_base);
6167 6167
6168 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 6168 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6169 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 6169 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6170 vcpu->arch.cr0 = sregs->cr0; 6170 vcpu->arch.cr0 = sregs->cr0;
6171 6171
6172 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 6172 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6173 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 6173 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6174 if (sregs->cr4 & X86_CR4_OSXSAVE) 6174 if (sregs->cr4 & X86_CR4_OSXSAVE)
6175 kvm_update_cpuid(vcpu); 6175 kvm_update_cpuid(vcpu);
6176 6176
6177 idx = srcu_read_lock(&vcpu->kvm->srcu); 6177 idx = srcu_read_lock(&vcpu->kvm->srcu);
6178 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 6178 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6179 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 6179 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6180 mmu_reset_needed = 1; 6180 mmu_reset_needed = 1;
6181 } 6181 }
6182 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6182 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6183 6183
6184 if (mmu_reset_needed) 6184 if (mmu_reset_needed)
6185 kvm_mmu_reset_context(vcpu); 6185 kvm_mmu_reset_context(vcpu);
6186 6186
6187 max_bits = KVM_NR_INTERRUPTS; 6187 max_bits = KVM_NR_INTERRUPTS;
6188 pending_vec = find_first_bit( 6188 pending_vec = find_first_bit(
6189 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 6189 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
6190 if (pending_vec < max_bits) { 6190 if (pending_vec < max_bits) {
6191 kvm_queue_interrupt(vcpu, pending_vec, false); 6191 kvm_queue_interrupt(vcpu, pending_vec, false);
6192 pr_debug("Set back pending irq %d\n", pending_vec); 6192 pr_debug("Set back pending irq %d\n", pending_vec);
6193 } 6193 }
6194 6194
6195 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 6195 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6196 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 6196 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6197 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 6197 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6198 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 6198 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6199 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 6199 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6200 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 6200 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6201 6201
6202 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 6202 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6203 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 6203 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6204 6204
6205 update_cr8_intercept(vcpu); 6205 update_cr8_intercept(vcpu);
6206 6206
6207 /* Older userspace won't unhalt the vcpu on reset. */ 6207 /* Older userspace won't unhalt the vcpu on reset. */
6208 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 6208 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6209 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 6209 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6210 !is_protmode(vcpu)) 6210 !is_protmode(vcpu))
6211 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6211 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6212 6212
6213 kvm_make_request(KVM_REQ_EVENT, vcpu); 6213 kvm_make_request(KVM_REQ_EVENT, vcpu);
6214 6214
6215 return 0; 6215 return 0;
6216 } 6216 }
6217 6217
6218 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 6218 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6219 struct kvm_guest_debug *dbg) 6219 struct kvm_guest_debug *dbg)
6220 { 6220 {
6221 unsigned long rflags; 6221 unsigned long rflags;
6222 int i, r; 6222 int i, r;
6223 6223
6224 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 6224 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6225 r = -EBUSY; 6225 r = -EBUSY;
6226 if (vcpu->arch.exception.pending) 6226 if (vcpu->arch.exception.pending)
6227 goto out; 6227 goto out;
6228 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 6228 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6229 kvm_queue_exception(vcpu, DB_VECTOR); 6229 kvm_queue_exception(vcpu, DB_VECTOR);
6230 else 6230 else
6231 kvm_queue_exception(vcpu, BP_VECTOR); 6231 kvm_queue_exception(vcpu, BP_VECTOR);
6232 } 6232 }
6233 6233
6234 /* 6234 /*
6235 * Read rflags as long as potentially injected trace flags are still 6235 * Read rflags as long as potentially injected trace flags are still
6236 * filtered out. 6236 * filtered out.
6237 */ 6237 */
6238 rflags = kvm_get_rflags(vcpu); 6238 rflags = kvm_get_rflags(vcpu);
6239 6239
6240 vcpu->guest_debug = dbg->control; 6240 vcpu->guest_debug = dbg->control;
6241 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 6241 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6242 vcpu->guest_debug = 0; 6242 vcpu->guest_debug = 0;
6243 6243
6244 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 6244 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6245 for (i = 0; i < KVM_NR_DB_REGS; ++i) 6245 for (i = 0; i < KVM_NR_DB_REGS; ++i)
6246 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 6246 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6247 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; 6247 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
6248 } else { 6248 } else {
6249 for (i = 0; i < KVM_NR_DB_REGS; i++) 6249 for (i = 0; i < KVM_NR_DB_REGS; i++)
6250 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 6250 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6251 } 6251 }
6252 kvm_update_dr7(vcpu); 6252 kvm_update_dr7(vcpu);
6253 6253
6254 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6254 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6255 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 6255 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6256 get_segment_base(vcpu, VCPU_SREG_CS); 6256 get_segment_base(vcpu, VCPU_SREG_CS);
6257 6257
6258 /* 6258 /*
6259 * Trigger an rflags update that will inject or remove the trace 6259 * Trigger an rflags update that will inject or remove the trace
6260 * flags. 6260 * flags.
6261 */ 6261 */
6262 kvm_set_rflags(vcpu, rflags); 6262 kvm_set_rflags(vcpu, rflags);
6263 6263
6264 kvm_x86_ops->update_db_bp_intercept(vcpu); 6264 kvm_x86_ops->update_db_bp_intercept(vcpu);
6265 6265
6266 r = 0; 6266 r = 0;
6267 6267
6268 out: 6268 out:
6269 6269
6270 return r; 6270 return r;
6271 } 6271 }
6272 6272
6273 /* 6273 /*
6274 * Translate a guest virtual address to a guest physical address. 6274 * Translate a guest virtual address to a guest physical address.
6275 */ 6275 */
6276 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 6276 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6277 struct kvm_translation *tr) 6277 struct kvm_translation *tr)
6278 { 6278 {
6279 unsigned long vaddr = tr->linear_address; 6279 unsigned long vaddr = tr->linear_address;
6280 gpa_t gpa; 6280 gpa_t gpa;
6281 int idx; 6281 int idx;
6282 6282
6283 idx = srcu_read_lock(&vcpu->kvm->srcu); 6283 idx = srcu_read_lock(&vcpu->kvm->srcu);
6284 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 6284 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6285 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6285 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6286 tr->physical_address = gpa; 6286 tr->physical_address = gpa;
6287 tr->valid = gpa != UNMAPPED_GVA; 6287 tr->valid = gpa != UNMAPPED_GVA;
6288 tr->writeable = 1; 6288 tr->writeable = 1;
6289 tr->usermode = 0; 6289 tr->usermode = 0;
6290 6290
6291 return 0; 6291 return 0;
6292 } 6292 }
6293 6293
6294 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 6294 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6295 { 6295 {
6296 struct i387_fxsave_struct *fxsave = 6296 struct i387_fxsave_struct *fxsave =
6297 &vcpu->arch.guest_fpu.state->fxsave; 6297 &vcpu->arch.guest_fpu.state->fxsave;
6298 6298
6299 memcpy(fpu->fpr, fxsave->st_space, 128); 6299 memcpy(fpu->fpr, fxsave->st_space, 128);
6300 fpu->fcw = fxsave->cwd; 6300 fpu->fcw = fxsave->cwd;
6301 fpu->fsw = fxsave->swd; 6301 fpu->fsw = fxsave->swd;
6302 fpu->ftwx = fxsave->twd; 6302 fpu->ftwx = fxsave->twd;
6303 fpu->last_opcode = fxsave->fop; 6303 fpu->last_opcode = fxsave->fop;
6304 fpu->last_ip = fxsave->rip; 6304 fpu->last_ip = fxsave->rip;
6305 fpu->last_dp = fxsave->rdp; 6305 fpu->last_dp = fxsave->rdp;
6306 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 6306 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6307 6307
6308 return 0; 6308 return 0;
6309 } 6309 }
6310 6310
6311 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 6311 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6312 { 6312 {
6313 struct i387_fxsave_struct *fxsave = 6313 struct i387_fxsave_struct *fxsave =
6314 &vcpu->arch.guest_fpu.state->fxsave; 6314 &vcpu->arch.guest_fpu.state->fxsave;
6315 6315
6316 memcpy(fxsave->st_space, fpu->fpr, 128); 6316 memcpy(fxsave->st_space, fpu->fpr, 128);
6317 fxsave->cwd = fpu->fcw; 6317 fxsave->cwd = fpu->fcw;
6318 fxsave->swd = fpu->fsw; 6318 fxsave->swd = fpu->fsw;
6319 fxsave->twd = fpu->ftwx; 6319 fxsave->twd = fpu->ftwx;
6320 fxsave->fop = fpu->last_opcode; 6320 fxsave->fop = fpu->last_opcode;
6321 fxsave->rip = fpu->last_ip; 6321 fxsave->rip = fpu->last_ip;
6322 fxsave->rdp = fpu->last_dp; 6322 fxsave->rdp = fpu->last_dp;
6323 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 6323 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6324 6324
6325 return 0; 6325 return 0;
6326 } 6326 }
6327 6327
6328 int fx_init(struct kvm_vcpu *vcpu) 6328 int fx_init(struct kvm_vcpu *vcpu)
6329 { 6329 {
6330 int err; 6330 int err;
6331 6331
6332 err = fpu_alloc(&vcpu->arch.guest_fpu); 6332 err = fpu_alloc(&vcpu->arch.guest_fpu);
6333 if (err) 6333 if (err)
6334 return err; 6334 return err;
6335 6335
6336 fpu_finit(&vcpu->arch.guest_fpu); 6336 fpu_finit(&vcpu->arch.guest_fpu);
6337 6337
6338 /* 6338 /*
6339 * Ensure guest xcr0 is valid for loading 6339 * Ensure guest xcr0 is valid for loading
6340 */ 6340 */
6341 vcpu->arch.xcr0 = XSTATE_FP; 6341 vcpu->arch.xcr0 = XSTATE_FP;
6342 6342
6343 vcpu->arch.cr0 |= X86_CR0_ET; 6343 vcpu->arch.cr0 |= X86_CR0_ET;
6344 6344
6345 return 0; 6345 return 0;
6346 } 6346 }
6347 EXPORT_SYMBOL_GPL(fx_init); 6347 EXPORT_SYMBOL_GPL(fx_init);
6348 6348
6349 static void fx_free(struct kvm_vcpu *vcpu) 6349 static void fx_free(struct kvm_vcpu *vcpu)
6350 { 6350 {
6351 fpu_free(&vcpu->arch.guest_fpu); 6351 fpu_free(&vcpu->arch.guest_fpu);
6352 } 6352 }
6353 6353
6354 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 6354 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6355 { 6355 {
6356 if (vcpu->guest_fpu_loaded) 6356 if (vcpu->guest_fpu_loaded)
6357 return; 6357 return;
6358 6358
6359 /* 6359 /*
6360 * Restore all possible states in the guest, 6360 * Restore all possible states in the guest,
6361 * and assume host would use all available bits. 6361 * and assume host would use all available bits.
6362 * Guest xcr0 would be loaded later. 6362 * Guest xcr0 would be loaded later.
6363 */ 6363 */
6364 kvm_put_guest_xcr0(vcpu); 6364 kvm_put_guest_xcr0(vcpu);
6365 vcpu->guest_fpu_loaded = 1; 6365 vcpu->guest_fpu_loaded = 1;
6366 __kernel_fpu_begin(); 6366 __kernel_fpu_begin();
6367 fpu_restore_checking(&vcpu->arch.guest_fpu); 6367 fpu_restore_checking(&vcpu->arch.guest_fpu);
6368 trace_kvm_fpu(1); 6368 trace_kvm_fpu(1);
6369 } 6369 }
6370 6370
6371 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 6371 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6372 { 6372 {
6373 kvm_put_guest_xcr0(vcpu); 6373 kvm_put_guest_xcr0(vcpu);
6374 6374
6375 if (!vcpu->guest_fpu_loaded) 6375 if (!vcpu->guest_fpu_loaded)
6376 return; 6376 return;
6377 6377
6378 vcpu->guest_fpu_loaded = 0; 6378 vcpu->guest_fpu_loaded = 0;
6379 fpu_save_init(&vcpu->arch.guest_fpu); 6379 fpu_save_init(&vcpu->arch.guest_fpu);
6380 __kernel_fpu_end(); 6380 __kernel_fpu_end();
6381 ++vcpu->stat.fpu_reload; 6381 ++vcpu->stat.fpu_reload;
6382 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 6382 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
6383 trace_kvm_fpu(0); 6383 trace_kvm_fpu(0);
6384 } 6384 }
6385 6385
6386 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 6386 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
6387 { 6387 {
6388 kvmclock_reset(vcpu); 6388 kvmclock_reset(vcpu);
6389 6389
6390 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6390 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6391 fx_free(vcpu); 6391 fx_free(vcpu);
6392 kvm_x86_ops->vcpu_free(vcpu); 6392 kvm_x86_ops->vcpu_free(vcpu);
6393 } 6393 }
6394 6394
6395 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 6395 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
6396 unsigned int id) 6396 unsigned int id)
6397 { 6397 {
6398 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) 6398 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6399 printk_once(KERN_WARNING 6399 printk_once(KERN_WARNING
6400 "kvm: SMP vm created on host with unstable TSC; " 6400 "kvm: SMP vm created on host with unstable TSC; "
6401 "guest TSC will not be reliable\n"); 6401 "guest TSC will not be reliable\n");
6402 return kvm_x86_ops->vcpu_create(kvm, id); 6402 return kvm_x86_ops->vcpu_create(kvm, id);
6403 } 6403 }
6404 6404
6405 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 6405 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6406 { 6406 {
6407 int r; 6407 int r;
6408 6408
6409 vcpu->arch.mtrr_state.have_fixed = 1; 6409 vcpu->arch.mtrr_state.have_fixed = 1;
6410 r = vcpu_load(vcpu); 6410 r = vcpu_load(vcpu);
6411 if (r) 6411 if (r)
6412 return r; 6412 return r;
6413 r = kvm_vcpu_reset(vcpu); 6413 r = kvm_vcpu_reset(vcpu);
6414 if (r == 0) 6414 if (r == 0)
6415 r = kvm_mmu_setup(vcpu); 6415 r = kvm_mmu_setup(vcpu);
6416 vcpu_put(vcpu); 6416 vcpu_put(vcpu);
6417 6417
6418 return r; 6418 return r;
6419 } 6419 }
6420 6420
6421 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 6421 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6422 { 6422 {
6423 int r; 6423 int r;
6424 struct msr_data msr; 6424 struct msr_data msr;
6425 6425
6426 r = vcpu_load(vcpu); 6426 r = vcpu_load(vcpu);
6427 if (r) 6427 if (r)
6428 return r; 6428 return r;
6429 msr.data = 0x0; 6429 msr.data = 0x0;
6430 msr.index = MSR_IA32_TSC; 6430 msr.index = MSR_IA32_TSC;
6431 msr.host_initiated = true; 6431 msr.host_initiated = true;
6432 kvm_write_tsc(vcpu, &msr); 6432 kvm_write_tsc(vcpu, &msr);
6433 vcpu_put(vcpu); 6433 vcpu_put(vcpu);
6434 6434
6435 return r; 6435 return r;
6436 } 6436 }
6437 6437
6438 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6438 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6439 { 6439 {
6440 int r; 6440 int r;
6441 vcpu->arch.apf.msr_val = 0; 6441 vcpu->arch.apf.msr_val = 0;
6442 6442
6443 r = vcpu_load(vcpu); 6443 r = vcpu_load(vcpu);
6444 BUG_ON(r); 6444 BUG_ON(r);
6445 kvm_mmu_unload(vcpu); 6445 kvm_mmu_unload(vcpu);
6446 vcpu_put(vcpu); 6446 vcpu_put(vcpu);
6447 6447
6448 fx_free(vcpu); 6448 fx_free(vcpu);
6449 kvm_x86_ops->vcpu_free(vcpu); 6449 kvm_x86_ops->vcpu_free(vcpu);
6450 } 6450 }
6451 6451
6452 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) 6452 static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6453 { 6453 {
6454 atomic_set(&vcpu->arch.nmi_queued, 0); 6454 atomic_set(&vcpu->arch.nmi_queued, 0);
6455 vcpu->arch.nmi_pending = 0; 6455 vcpu->arch.nmi_pending = 0;
6456 vcpu->arch.nmi_injected = false; 6456 vcpu->arch.nmi_injected = false;
6457 6457
6458 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6458 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6459 vcpu->arch.dr6 = DR6_FIXED_1; 6459 vcpu->arch.dr6 = DR6_FIXED_1;
6460 vcpu->arch.dr7 = DR7_FIXED_1; 6460 vcpu->arch.dr7 = DR7_FIXED_1;
6461 kvm_update_dr7(vcpu); 6461 kvm_update_dr7(vcpu);
6462 6462
6463 kvm_make_request(KVM_REQ_EVENT, vcpu); 6463 kvm_make_request(KVM_REQ_EVENT, vcpu);
6464 vcpu->arch.apf.msr_val = 0; 6464 vcpu->arch.apf.msr_val = 0;
6465 vcpu->arch.st.msr_val = 0; 6465 vcpu->arch.st.msr_val = 0;
6466 6466
6467 kvmclock_reset(vcpu); 6467 kvmclock_reset(vcpu);
6468 6468
6469 kvm_clear_async_pf_completion_queue(vcpu); 6469 kvm_clear_async_pf_completion_queue(vcpu);
6470 kvm_async_pf_hash_reset(vcpu); 6470 kvm_async_pf_hash_reset(vcpu);
6471 vcpu->arch.apf.halted = false; 6471 vcpu->arch.apf.halted = false;
6472 6472
6473 kvm_pmu_reset(vcpu); 6473 kvm_pmu_reset(vcpu);
6474 6474
6475 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 6475 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
6476 vcpu->arch.regs_avail = ~0; 6476 vcpu->arch.regs_avail = ~0;
6477 vcpu->arch.regs_dirty = ~0; 6477 vcpu->arch.regs_dirty = ~0;
6478 6478
6479 return kvm_x86_ops->vcpu_reset(vcpu); 6479 return kvm_x86_ops->vcpu_reset(vcpu);
6480 } 6480 }
6481 6481
6482 int kvm_arch_hardware_enable(void *garbage) 6482 int kvm_arch_hardware_enable(void *garbage)
6483 { 6483 {
6484 struct kvm *kvm; 6484 struct kvm *kvm;
6485 struct kvm_vcpu *vcpu; 6485 struct kvm_vcpu *vcpu;
6486 int i; 6486 int i;
6487 int ret; 6487 int ret;
6488 u64 local_tsc; 6488 u64 local_tsc;
6489 u64 max_tsc = 0; 6489 u64 max_tsc = 0;
6490 bool stable, backwards_tsc = false; 6490 bool stable, backwards_tsc = false;
6491 6491
6492 kvm_shared_msr_cpu_online(); 6492 kvm_shared_msr_cpu_online();
6493 ret = kvm_x86_ops->hardware_enable(garbage); 6493 ret = kvm_x86_ops->hardware_enable(garbage);
6494 if (ret != 0) 6494 if (ret != 0)
6495 return ret; 6495 return ret;
6496 6496
6497 local_tsc = native_read_tsc(); 6497 local_tsc = native_read_tsc();
6498 stable = !check_tsc_unstable(); 6498 stable = !check_tsc_unstable();
6499 list_for_each_entry(kvm, &vm_list, vm_list) { 6499 list_for_each_entry(kvm, &vm_list, vm_list) {
6500 kvm_for_each_vcpu(i, vcpu, kvm) { 6500 kvm_for_each_vcpu(i, vcpu, kvm) {
6501 if (!stable && vcpu->cpu == smp_processor_id()) 6501 if (!stable && vcpu->cpu == smp_processor_id())
6502 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 6502 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6503 if (stable && vcpu->arch.last_host_tsc > local_tsc) { 6503 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6504 backwards_tsc = true; 6504 backwards_tsc = true;
6505 if (vcpu->arch.last_host_tsc > max_tsc) 6505 if (vcpu->arch.last_host_tsc > max_tsc)
6506 max_tsc = vcpu->arch.last_host_tsc; 6506 max_tsc = vcpu->arch.last_host_tsc;
6507 } 6507 }
6508 } 6508 }
6509 } 6509 }
6510 6510
6511 /* 6511 /*
6512 * Sometimes, even reliable TSCs go backwards. This happens on 6512 * Sometimes, even reliable TSCs go backwards. This happens on
6513 * platforms that reset TSC during suspend or hibernate actions, but 6513 * platforms that reset TSC during suspend or hibernate actions, but
6514 * maintain synchronization. We must compensate. Fortunately, we can 6514 * maintain synchronization. We must compensate. Fortunately, we can
6515 * detect that condition here, which happens early in CPU bringup, 6515 * detect that condition here, which happens early in CPU bringup,
6516 * before any KVM threads can be running. Unfortunately, we can't 6516 * before any KVM threads can be running. Unfortunately, we can't
6517 * bring the TSCs fully up to date with real time, as we aren't yet far 6517 * bring the TSCs fully up to date with real time, as we aren't yet far
6518 * enough into CPU bringup that we know how much real time has actually 6518 * enough into CPU bringup that we know how much real time has actually
6519 * elapsed; our helper function, get_kernel_ns() will be using boot 6519 * elapsed; our helper function, get_kernel_ns() will be using boot
6520 * variables that haven't been updated yet. 6520 * variables that haven't been updated yet.
6521 * 6521 *
6522 * So we simply find the maximum observed TSC above, then record the 6522 * So we simply find the maximum observed TSC above, then record the
6523 * adjustment to TSC in each VCPU. When the VCPU later gets loaded, 6523 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
6524 * the adjustment will be applied. Note that we accumulate 6524 * the adjustment will be applied. Note that we accumulate
6525 * adjustments, in case multiple suspend cycles happen before some VCPU 6525 * adjustments, in case multiple suspend cycles happen before some VCPU
6526 * gets a chance to run again. In the event that no KVM threads get a 6526 * gets a chance to run again. In the event that no KVM threads get a
6527 * chance to run, we will miss the entire elapsed period, as we'll have 6527 * chance to run, we will miss the entire elapsed period, as we'll have
6528 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may 6528 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6529 * lose cycle time. This isn't too big a deal, since the loss will be 6529 * lose cycle time. This isn't too big a deal, since the loss will be
6530 * uniform across all VCPUs (not to mention the scenario is extremely 6530 * uniform across all VCPUs (not to mention the scenario is extremely
6531 * unlikely). It is possible that a second hibernate recovery happens 6531 * unlikely). It is possible that a second hibernate recovery happens
6532 * much faster than a first, causing the observed TSC here to be 6532 * much faster than a first, causing the observed TSC here to be
6533 * smaller; this would require additional padding adjustment, which is 6533 * smaller; this would require additional padding adjustment, which is
6534 * why we set last_host_tsc to the local tsc observed here. 6534 * why we set last_host_tsc to the local tsc observed here.
6535 * 6535 *
6536 * N.B. - this code below runs only on platforms with reliable TSC, 6536 * N.B. - this code below runs only on platforms with reliable TSC,
6537 * as that is the only way backwards_tsc is set above. Also note 6537 * as that is the only way backwards_tsc is set above. Also note
6538 * that this runs for ALL vcpus, which is not a bug; all VCPUs should 6538 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6539 * have the same delta_cyc adjustment applied if backwards_tsc 6539 * have the same delta_cyc adjustment applied if backwards_tsc
6540 * is detected. Note further, this adjustment is only done once, 6540 * is detected. Note further, this adjustment is only done once,
6541 * as we reset last_host_tsc on all VCPUs to stop this from being 6541 * as we reset last_host_tsc on all VCPUs to stop this from being
6542 * called multiple times (one for each physical CPU bringup). 6542 * called multiple times (one for each physical CPU bringup).
6543 * 6543 *
6544 * Platforms with unreliable TSCs don't have to deal with this, they 6544 * Platforms with unreliable TSCs don't have to deal with this, they
6545 * will be compensated by the logic in vcpu_load, which sets the TSC to 6545 * will be compensated by the logic in vcpu_load, which sets the TSC to
6546 * catchup mode. This will catch up all VCPUs to real time, but cannot 6546 * catchup mode. This will catch up all VCPUs to real time, but cannot
6547 * guarantee that they stay in perfect synchronization. 6547 * guarantee that they stay in perfect synchronization.
6548 */ 6548 */
6549 if (backwards_tsc) { 6549 if (backwards_tsc) {
6550 u64 delta_cyc = max_tsc - local_tsc; 6550 u64 delta_cyc = max_tsc - local_tsc;
6551 list_for_each_entry(kvm, &vm_list, vm_list) { 6551 list_for_each_entry(kvm, &vm_list, vm_list) {
6552 kvm_for_each_vcpu(i, vcpu, kvm) { 6552 kvm_for_each_vcpu(i, vcpu, kvm) {
6553 vcpu->arch.tsc_offset_adjustment += delta_cyc; 6553 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6554 vcpu->arch.last_host_tsc = local_tsc; 6554 vcpu->arch.last_host_tsc = local_tsc;
6555 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, 6555 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
6556 &vcpu->requests); 6556 &vcpu->requests);
6557 } 6557 }
6558 6558
6559 /* 6559 /*
6560 * We have to disable TSC offset matching.. if you were 6560 * We have to disable TSC offset matching.. if you were
6561 * booting a VM while issuing an S4 host suspend.... 6561 * booting a VM while issuing an S4 host suspend....
6562 * you may have some problem. Solving this issue is 6562 * you may have some problem. Solving this issue is
6563 * left as an exercise to the reader. 6563 * left as an exercise to the reader.
6564 */ 6564 */
6565 kvm->arch.last_tsc_nsec = 0; 6565 kvm->arch.last_tsc_nsec = 0;
6566 kvm->arch.last_tsc_write = 0; 6566 kvm->arch.last_tsc_write = 0;
6567 } 6567 }
6568 6568
6569 } 6569 }
6570 return 0; 6570 return 0;
6571 } 6571 }
6572 6572
6573 void kvm_arch_hardware_disable(void *garbage) 6573 void kvm_arch_hardware_disable(void *garbage)
6574 { 6574 {
6575 kvm_x86_ops->hardware_disable(garbage); 6575 kvm_x86_ops->hardware_disable(garbage);
6576 drop_user_return_notifiers(garbage); 6576 drop_user_return_notifiers(garbage);
6577 } 6577 }
6578 6578
6579 int kvm_arch_hardware_setup(void) 6579 int kvm_arch_hardware_setup(void)
6580 { 6580 {
6581 return kvm_x86_ops->hardware_setup(); 6581 return kvm_x86_ops->hardware_setup();
6582 } 6582 }
6583 6583
6584 void kvm_arch_hardware_unsetup(void) 6584 void kvm_arch_hardware_unsetup(void)
6585 { 6585 {
6586 kvm_x86_ops->hardware_unsetup(); 6586 kvm_x86_ops->hardware_unsetup();
6587 } 6587 }
6588 6588
6589 void kvm_arch_check_processor_compat(void *rtn) 6589 void kvm_arch_check_processor_compat(void *rtn)
6590 { 6590 {
6591 kvm_x86_ops->check_processor_compatibility(rtn); 6591 kvm_x86_ops->check_processor_compatibility(rtn);
6592 } 6592 }
6593 6593
6594 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) 6594 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6595 { 6595 {
6596 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 6596 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6597 } 6597 }
6598 6598
6599 struct static_key kvm_no_apic_vcpu __read_mostly; 6599 struct static_key kvm_no_apic_vcpu __read_mostly;
6600 6600
6601 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6601 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6602 { 6602 {
6603 struct page *page; 6603 struct page *page;
6604 struct kvm *kvm; 6604 struct kvm *kvm;
6605 int r; 6605 int r;
6606 6606
6607 BUG_ON(vcpu->kvm == NULL); 6607 BUG_ON(vcpu->kvm == NULL);
6608 kvm = vcpu->kvm; 6608 kvm = vcpu->kvm;
6609 6609
6610 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6610 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6611 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6611 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6612 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6612 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6613 else 6613 else
6614 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 6614 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
6615 6615
6616 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 6616 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
6617 if (!page) { 6617 if (!page) {
6618 r = -ENOMEM; 6618 r = -ENOMEM;
6619 goto fail; 6619 goto fail;
6620 } 6620 }
6621 vcpu->arch.pio_data = page_address(page); 6621 vcpu->arch.pio_data = page_address(page);
6622 6622
6623 kvm_set_tsc_khz(vcpu, max_tsc_khz); 6623 kvm_set_tsc_khz(vcpu, max_tsc_khz);
6624 6624
6625 r = kvm_mmu_create(vcpu); 6625 r = kvm_mmu_create(vcpu);
6626 if (r < 0) 6626 if (r < 0)
6627 goto fail_free_pio_data; 6627 goto fail_free_pio_data;
6628 6628
6629 if (irqchip_in_kernel(kvm)) { 6629 if (irqchip_in_kernel(kvm)) {
6630 r = kvm_create_lapic(vcpu); 6630 r = kvm_create_lapic(vcpu);
6631 if (r < 0) 6631 if (r < 0)
6632 goto fail_mmu_destroy; 6632 goto fail_mmu_destroy;
6633 } else 6633 } else
6634 static_key_slow_inc(&kvm_no_apic_vcpu); 6634 static_key_slow_inc(&kvm_no_apic_vcpu);
6635 6635
6636 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 6636 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6637 GFP_KERNEL); 6637 GFP_KERNEL);
6638 if (!vcpu->arch.mce_banks) { 6638 if (!vcpu->arch.mce_banks) {
6639 r = -ENOMEM; 6639 r = -ENOMEM;
6640 goto fail_free_lapic; 6640 goto fail_free_lapic;
6641 } 6641 }
6642 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 6642 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
6643 6643
6644 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6644 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
6645 goto fail_free_mce_banks; 6645 goto fail_free_mce_banks;
6646 6646
6647 r = fx_init(vcpu); 6647 r = fx_init(vcpu);
6648 if (r) 6648 if (r)
6649 goto fail_free_wbinvd_dirty_mask; 6649 goto fail_free_wbinvd_dirty_mask;
6650 6650
6651 vcpu->arch.ia32_tsc_adjust_msr = 0x0; 6651 vcpu->arch.ia32_tsc_adjust_msr = 0x0;
6652 kvm_async_pf_hash_reset(vcpu); 6652 kvm_async_pf_hash_reset(vcpu);
6653 kvm_pmu_init(vcpu); 6653 kvm_pmu_init(vcpu);
6654 6654
6655 return 0; 6655 return 0;
6656 fail_free_wbinvd_dirty_mask: 6656 fail_free_wbinvd_dirty_mask:
6657 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6657 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6658 fail_free_mce_banks: 6658 fail_free_mce_banks:
6659 kfree(vcpu->arch.mce_banks); 6659 kfree(vcpu->arch.mce_banks);
6660 fail_free_lapic: 6660 fail_free_lapic:
6661 kvm_free_lapic(vcpu); 6661 kvm_free_lapic(vcpu);
6662 fail_mmu_destroy: 6662 fail_mmu_destroy:
6663 kvm_mmu_destroy(vcpu); 6663 kvm_mmu_destroy(vcpu);
6664 fail_free_pio_data: 6664 fail_free_pio_data:
6665 free_page((unsigned long)vcpu->arch.pio_data); 6665 free_page((unsigned long)vcpu->arch.pio_data);
6666 fail: 6666 fail:
6667 return r; 6667 return r;
6668 } 6668 }
6669 6669
6670 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 6670 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6671 { 6671 {
6672 int idx; 6672 int idx;
6673 6673
6674 kvm_pmu_destroy(vcpu); 6674 kvm_pmu_destroy(vcpu);
6675 kfree(vcpu->arch.mce_banks); 6675 kfree(vcpu->arch.mce_banks);
6676 kvm_free_lapic(vcpu); 6676 kvm_free_lapic(vcpu);
6677 idx = srcu_read_lock(&vcpu->kvm->srcu); 6677 idx = srcu_read_lock(&vcpu->kvm->srcu);
6678 kvm_mmu_destroy(vcpu); 6678 kvm_mmu_destroy(vcpu);
6679 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6679 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6680 free_page((unsigned long)vcpu->arch.pio_data); 6680 free_page((unsigned long)vcpu->arch.pio_data);
6681 if (!irqchip_in_kernel(vcpu->kvm)) 6681 if (!irqchip_in_kernel(vcpu->kvm))
6682 static_key_slow_dec(&kvm_no_apic_vcpu); 6682 static_key_slow_dec(&kvm_no_apic_vcpu);
6683 } 6683 }
6684 6684
6685 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 6685 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6686 { 6686 {
6687 if (type) 6687 if (type)
6688 return -EINVAL; 6688 return -EINVAL;
6689 6689
6690 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6690 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6691 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6691 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6692 6692
6693 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6693 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6694 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6694 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6695 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ 6695 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
6696 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 6696 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
6697 &kvm->arch.irq_sources_bitmap); 6697 &kvm->arch.irq_sources_bitmap);
6698 6698
6699 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6699 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6700 mutex_init(&kvm->arch.apic_map_lock); 6700 mutex_init(&kvm->arch.apic_map_lock);
6701 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); 6701 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
6702 6702
6703 pvclock_update_vm_gtod_copy(kvm); 6703 pvclock_update_vm_gtod_copy(kvm);
6704 6704
6705 return 0; 6705 return 0;
6706 } 6706 }
6707 6707
6708 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6708 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
6709 { 6709 {
6710 int r; 6710 int r;
6711 r = vcpu_load(vcpu); 6711 r = vcpu_load(vcpu);
6712 BUG_ON(r); 6712 BUG_ON(r);
6713 kvm_mmu_unload(vcpu); 6713 kvm_mmu_unload(vcpu);
6714 vcpu_put(vcpu); 6714 vcpu_put(vcpu);
6715 } 6715 }
6716 6716
6717 static void kvm_free_vcpus(struct kvm *kvm) 6717 static void kvm_free_vcpus(struct kvm *kvm)
6718 { 6718 {
6719 unsigned int i; 6719 unsigned int i;
6720 struct kvm_vcpu *vcpu; 6720 struct kvm_vcpu *vcpu;
6721 6721
6722 /* 6722 /*
6723 * Unpin any mmu pages first. 6723 * Unpin any mmu pages first.
6724 */ 6724 */
6725 kvm_for_each_vcpu(i, vcpu, kvm) { 6725 kvm_for_each_vcpu(i, vcpu, kvm) {
6726 kvm_clear_async_pf_completion_queue(vcpu); 6726 kvm_clear_async_pf_completion_queue(vcpu);
6727 kvm_unload_vcpu_mmu(vcpu); 6727 kvm_unload_vcpu_mmu(vcpu);
6728 } 6728 }
6729 kvm_for_each_vcpu(i, vcpu, kvm) 6729 kvm_for_each_vcpu(i, vcpu, kvm)
6730 kvm_arch_vcpu_free(vcpu); 6730 kvm_arch_vcpu_free(vcpu);
6731 6731
6732 mutex_lock(&kvm->lock); 6732 mutex_lock(&kvm->lock);
6733 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 6733 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
6734 kvm->vcpus[i] = NULL; 6734 kvm->vcpus[i] = NULL;
6735 6735
6736 atomic_set(&kvm->online_vcpus, 0); 6736 atomic_set(&kvm->online_vcpus, 0);
6737 mutex_unlock(&kvm->lock); 6737 mutex_unlock(&kvm->lock);
6738 } 6738 }
6739 6739
6740 void kvm_arch_sync_events(struct kvm *kvm) 6740 void kvm_arch_sync_events(struct kvm *kvm)
6741 { 6741 {
6742 kvm_free_all_assigned_devices(kvm); 6742 kvm_free_all_assigned_devices(kvm);
6743 kvm_free_pit(kvm); 6743 kvm_free_pit(kvm);
6744 } 6744 }
6745 6745
6746 void kvm_arch_destroy_vm(struct kvm *kvm) 6746 void kvm_arch_destroy_vm(struct kvm *kvm)
6747 { 6747 {
6748 kvm_iommu_unmap_guest(kvm); 6748 kvm_iommu_unmap_guest(kvm);
6749 kfree(kvm->arch.vpic); 6749 kfree(kvm->arch.vpic);
6750 kfree(kvm->arch.vioapic); 6750 kfree(kvm->arch.vioapic);
6751 kvm_free_vcpus(kvm); 6751 kvm_free_vcpus(kvm);
6752 if (kvm->arch.apic_access_page) 6752 if (kvm->arch.apic_access_page)
6753 put_page(kvm->arch.apic_access_page); 6753 put_page(kvm->arch.apic_access_page);
6754 if (kvm->arch.ept_identity_pagetable) 6754 if (kvm->arch.ept_identity_pagetable)
6755 put_page(kvm->arch.ept_identity_pagetable); 6755 put_page(kvm->arch.ept_identity_pagetable);
6756 kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 6756 kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
6757 } 6757 }
6758 6758
6759 void kvm_arch_free_memslot(struct kvm_memory_slot *free, 6759 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6760 struct kvm_memory_slot *dont) 6760 struct kvm_memory_slot *dont)
6761 { 6761 {
6762 int i; 6762 int i;
6763 6763
6764 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6764 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6765 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { 6765 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
6766 kvm_kvfree(free->arch.rmap[i]); 6766 kvm_kvfree(free->arch.rmap[i]);
6767 free->arch.rmap[i] = NULL; 6767 free->arch.rmap[i] = NULL;
6768 } 6768 }
6769 if (i == 0) 6769 if (i == 0)
6770 continue; 6770 continue;
6771 6771
6772 if (!dont || free->arch.lpage_info[i - 1] != 6772 if (!dont || free->arch.lpage_info[i - 1] !=
6773 dont->arch.lpage_info[i - 1]) { 6773 dont->arch.lpage_info[i - 1]) {
6774 kvm_kvfree(free->arch.lpage_info[i - 1]); 6774 kvm_kvfree(free->arch.lpage_info[i - 1]);
6775 free->arch.lpage_info[i - 1] = NULL; 6775 free->arch.lpage_info[i - 1] = NULL;
6776 } 6776 }
6777 } 6777 }
6778 } 6778 }
6779 6779
6780 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 6780 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6781 { 6781 {
6782 int i; 6782 int i;
6783 6783
6784 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6784 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6785 unsigned long ugfn; 6785 unsigned long ugfn;
6786 int lpages; 6786 int lpages;
6787 int level = i + 1; 6787 int level = i + 1;
6788 6788
6789 lpages = gfn_to_index(slot->base_gfn + npages - 1, 6789 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6790 slot->base_gfn, level) + 1; 6790 slot->base_gfn, level) + 1;
6791 6791
6792 slot->arch.rmap[i] = 6792 slot->arch.rmap[i] =
6793 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); 6793 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
6794 if (!slot->arch.rmap[i]) 6794 if (!slot->arch.rmap[i])
6795 goto out_free; 6795 goto out_free;
6796 if (i == 0) 6796 if (i == 0)
6797 continue; 6797 continue;
6798 6798
6799 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * 6799 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
6800 sizeof(*slot->arch.lpage_info[i - 1])); 6800 sizeof(*slot->arch.lpage_info[i - 1]));
6801 if (!slot->arch.lpage_info[i - 1]) 6801 if (!slot->arch.lpage_info[i - 1])
6802 goto out_free; 6802 goto out_free;
6803 6803
6804 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 6804 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6805 slot->arch.lpage_info[i - 1][0].write_count = 1; 6805 slot->arch.lpage_info[i - 1][0].write_count = 1;
6806 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 6806 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6807 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; 6807 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
6808 ugfn = slot->userspace_addr >> PAGE_SHIFT; 6808 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6809 /* 6809 /*
6810 * If the gfn and userspace address are not aligned wrt each 6810 * If the gfn and userspace address are not aligned wrt each
6811 * other, or if explicitly asked to, disable large page 6811 * other, or if explicitly asked to, disable large page
6812 * support for this slot 6812 * support for this slot
6813 */ 6813 */
6814 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 6814 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
6815 !kvm_largepages_enabled()) { 6815 !kvm_largepages_enabled()) {
6816 unsigned long j; 6816 unsigned long j;
6817 6817
6818 for (j = 0; j < lpages; ++j) 6818 for (j = 0; j < lpages; ++j)
6819 slot->arch.lpage_info[i - 1][j].write_count = 1; 6819 slot->arch.lpage_info[i - 1][j].write_count = 1;
6820 } 6820 }
6821 } 6821 }
6822 6822
6823 return 0; 6823 return 0;
6824 6824
6825 out_free: 6825 out_free:
6826 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 6826 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6827 kvm_kvfree(slot->arch.rmap[i]); 6827 kvm_kvfree(slot->arch.rmap[i]);
6828 slot->arch.rmap[i] = NULL; 6828 slot->arch.rmap[i] = NULL;
6829 if (i == 0) 6829 if (i == 0)
6830 continue; 6830 continue;
6831 6831
6832 kvm_kvfree(slot->arch.lpage_info[i - 1]); 6832 kvm_kvfree(slot->arch.lpage_info[i - 1]);
6833 slot->arch.lpage_info[i - 1] = NULL; 6833 slot->arch.lpage_info[i - 1] = NULL;
6834 } 6834 }
6835 return -ENOMEM; 6835 return -ENOMEM;
6836 } 6836 }
6837 6837
6838 int kvm_arch_prepare_memory_region(struct kvm *kvm, 6838 int kvm_arch_prepare_memory_region(struct kvm *kvm,
6839 struct kvm_memory_slot *memslot, 6839 struct kvm_memory_slot *memslot,
6840 struct kvm_memory_slot old, 6840 struct kvm_memory_slot old,
6841 struct kvm_userspace_memory_region *mem, 6841 struct kvm_userspace_memory_region *mem,
6842 bool user_alloc) 6842 bool user_alloc)
6843 { 6843 {
6844 int npages = memslot->npages; 6844 int npages = memslot->npages;
6845 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; 6845 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6846 6846
6847 /* Prevent internal slot pages from being moved by fork()/COW. */ 6847 /* Prevent internal slot pages from being moved by fork()/COW. */
6848 if (memslot->id >= KVM_USER_MEM_SLOTS) 6848 if (memslot->id >= KVM_USER_MEM_SLOTS)
6849 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6849 map_flags = MAP_SHARED | MAP_ANONYMOUS;
6850 6850
6851 /*To keep backward compatibility with older userspace, 6851 /*To keep backward compatibility with older userspace,
6852 *x86 needs to handle !user_alloc case. 6852 *x86 needs to handle !user_alloc case.
6853 */ 6853 */
6854 if (!user_alloc) { 6854 if (!user_alloc) {
6855 if (npages && !old.npages) { 6855 if (npages && !old.npages) {
6856 unsigned long userspace_addr; 6856 unsigned long userspace_addr;
6857 6857
6858 userspace_addr = vm_mmap(NULL, 0, 6858 userspace_addr = vm_mmap(NULL, 0,
6859 npages * PAGE_SIZE, 6859 npages * PAGE_SIZE,
6860 PROT_READ | PROT_WRITE, 6860 PROT_READ | PROT_WRITE,
6861 map_flags, 6861 map_flags,
6862 0); 6862 0);
6863 6863
6864 if (IS_ERR((void *)userspace_addr)) 6864 if (IS_ERR((void *)userspace_addr))
6865 return PTR_ERR((void *)userspace_addr); 6865 return PTR_ERR((void *)userspace_addr);
6866 6866
6867 memslot->userspace_addr = userspace_addr; 6867 memslot->userspace_addr = userspace_addr;
6868 } 6868 }
6869 } 6869 }
6870 6870
6871 6871
6872 return 0; 6872 return 0;
6873 } 6873 }
6874 6874
6875 void kvm_arch_commit_memory_region(struct kvm *kvm, 6875 void kvm_arch_commit_memory_region(struct kvm *kvm,
6876 struct kvm_userspace_memory_region *mem, 6876 struct kvm_userspace_memory_region *mem,
6877 struct kvm_memory_slot old, 6877 struct kvm_memory_slot old,
6878 bool user_alloc) 6878 bool user_alloc)
6879 { 6879 {
6880 6880
6881 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6881 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6882 6882
6883 if (!user_alloc && !old.user_alloc && old.npages && !npages) { 6883 if (!user_alloc && !old.user_alloc && old.npages && !npages) {
6884 int ret; 6884 int ret;
6885 6885
6886 ret = vm_munmap(old.userspace_addr, 6886 ret = vm_munmap(old.userspace_addr,
6887 old.npages * PAGE_SIZE); 6887 old.npages * PAGE_SIZE);
6888 if (ret < 0) 6888 if (ret < 0)
6889 printk(KERN_WARNING 6889 printk(KERN_WARNING
6890 "kvm_vm_ioctl_set_memory_region: " 6890 "kvm_vm_ioctl_set_memory_region: "
6891 "failed to munmap memory\n"); 6891 "failed to munmap memory\n");
6892 } 6892 }
6893 6893
6894 if (!kvm->arch.n_requested_mmu_pages) 6894 if (!kvm->arch.n_requested_mmu_pages)
6895 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 6895 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6896 6896
6897 spin_lock(&kvm->mmu_lock); 6897 spin_lock(&kvm->mmu_lock);
6898 if (nr_mmu_pages) 6898 if (nr_mmu_pages)
6899 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6899 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6900 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6900 /*
6901 * Write protect all pages for dirty logging.
6902 * Existing largepage mappings are destroyed here and new ones will
6903 * not be created until the end of the logging.
6904 */
6905 if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
6906 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6901 spin_unlock(&kvm->mmu_lock); 6907 spin_unlock(&kvm->mmu_lock);
6902 /* 6908 /*
6903 * If memory slot is created, or moved, we need to clear all 6909 * If memory slot is created, or moved, we need to clear all
6904 * mmio sptes. 6910 * mmio sptes.
6905 */ 6911 */
6906 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { 6912 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
6907 kvm_mmu_zap_all(kvm); 6913 kvm_mmu_zap_all(kvm);
6908 kvm_reload_remote_mmus(kvm); 6914 kvm_reload_remote_mmus(kvm);
6909 } 6915 }
6910 } 6916 }
6911 6917
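The hunk above is the heart of this commit: kvm_mmu_slot_remove_write_access() now runs only when the slot still has pages and dirty logging was requested, so a deleted slot no longer triggers a pointless walk. Below is a minimal user-space sketch of that decision; the types and helper are simplified stand-ins, not the kernel structures, and a 4 KiB page size is assumed.

/* Hedged sketch of the new condition in kvm_arch_commit_memory_region(). */
#include <stdio.h>

#define FAKE_PAGE_SHIFT 12                    /* 4 KiB pages assumed */
#define KVM_MEM_LOG_DIRTY_PAGES (1u << 0)     /* dirty-logging flag, bit 0 */

struct fake_region {
    unsigned long memory_size;   /* bytes; 0 means the slot is being deleted */
    unsigned int flags;
};

/* Stand-in for kvm_mmu_slot_remove_write_access(). */
static void write_protect_slot(int slot)
{
    printf("write-protecting slot %d for dirty logging\n", slot);
}

static void commit_region(int slot, const struct fake_region *mem)
{
    unsigned long npages = mem->memory_size >> FAKE_PAGE_SHIFT;

    /* The commit's point: a deleted slot (npages == 0), or one without
     * KVM_MEM_LOG_DIRTY_PAGES, no longer triggers the write-protect walk. */
    if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
        write_protect_slot(slot);
}

int main(void)
{
    struct fake_region deleted = { .memory_size = 0,       .flags = KVM_MEM_LOG_DIRTY_PAGES };
    struct fake_region logged  = { .memory_size = 1 << 20, .flags = KVM_MEM_LOG_DIRTY_PAGES };

    commit_region(3, &deleted);  /* prints nothing: deletion skips the walk */
    commit_region(4, &logged);   /* prints: write-protecting slot 4 ... */
    return 0;
}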
6912 void kvm_arch_flush_shadow_all(struct kvm *kvm) 6918 void kvm_arch_flush_shadow_all(struct kvm *kvm)
6913 { 6919 {
6914 kvm_mmu_zap_all(kvm); 6920 kvm_mmu_zap_all(kvm);
6915 kvm_reload_remote_mmus(kvm); 6921 kvm_reload_remote_mmus(kvm);
6916 } 6922 }
6917 6923
6918 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 6924 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
6919 struct kvm_memory_slot *slot) 6925 struct kvm_memory_slot *slot)
6920 { 6926 {
6921 kvm_arch_flush_shadow_all(kvm); 6927 kvm_arch_flush_shadow_all(kvm);
6922 } 6928 }
6923 6929
6924 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6930 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6925 { 6931 {
6926 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6932 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6927 !vcpu->arch.apf.halted) 6933 !vcpu->arch.apf.halted)
6928 || !list_empty_careful(&vcpu->async_pf.done) 6934 || !list_empty_careful(&vcpu->async_pf.done)
6929 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6935 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6930 || atomic_read(&vcpu->arch.nmi_queued) || 6936 || atomic_read(&vcpu->arch.nmi_queued) ||
6931 (kvm_arch_interrupt_allowed(vcpu) && 6937 (kvm_arch_interrupt_allowed(vcpu) &&
6932 kvm_cpu_has_interrupt(vcpu)); 6938 kvm_cpu_has_interrupt(vcpu));
6933 } 6939 }
6934 6940
6935 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 6941 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
6936 { 6942 {
6937 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; 6943 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
6938 } 6944 }
6939 6945
6940 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6946 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
6941 { 6947 {
6942 return kvm_x86_ops->interrupt_allowed(vcpu); 6948 return kvm_x86_ops->interrupt_allowed(vcpu);
6943 } 6949 }
6944 6950
6945 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) 6951 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
6946 { 6952 {
6947 unsigned long current_rip = kvm_rip_read(vcpu) + 6953 unsigned long current_rip = kvm_rip_read(vcpu) +
6948 get_segment_base(vcpu, VCPU_SREG_CS); 6954 get_segment_base(vcpu, VCPU_SREG_CS);
6949 6955
6950 return current_rip == linear_rip; 6956 return current_rip == linear_rip;
6951 } 6957 }
6952 EXPORT_SYMBOL_GPL(kvm_is_linear_rip); 6958 EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
6953 6959
6954 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 6960 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
6955 { 6961 {
6956 unsigned long rflags; 6962 unsigned long rflags;
6957 6963
6958 rflags = kvm_x86_ops->get_rflags(vcpu); 6964 rflags = kvm_x86_ops->get_rflags(vcpu);
6959 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6965 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6960 rflags &= ~X86_EFLAGS_TF; 6966 rflags &= ~X86_EFLAGS_TF;
6961 return rflags; 6967 return rflags;
6962 } 6968 }
6963 EXPORT_SYMBOL_GPL(kvm_get_rflags); 6969 EXPORT_SYMBOL_GPL(kvm_get_rflags);
6964 6970
6965 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 6971 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6966 { 6972 {
6967 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 6973 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
6968 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6974 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
6969 rflags |= X86_EFLAGS_TF; 6975 rflags |= X86_EFLAGS_TF;
6970 kvm_x86_ops->set_rflags(vcpu, rflags); 6976 kvm_x86_ops->set_rflags(vcpu, rflags);
6971 kvm_make_request(KVM_REQ_EVENT, vcpu); 6977 kvm_make_request(KVM_REQ_EVENT, vcpu);
6972 } 6978 }
6973 EXPORT_SYMBOL_GPL(kvm_set_rflags); 6979 EXPORT_SYMBOL_GPL(kvm_set_rflags);
6974 6980
6975 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) 6981 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6976 { 6982 {
6977 int r; 6983 int r;
6978 6984
6979 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || 6985 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6980 is_error_page(work->page)) 6986 is_error_page(work->page))
6981 return; 6987 return;
6982 6988
6983 r = kvm_mmu_reload(vcpu); 6989 r = kvm_mmu_reload(vcpu);
6984 if (unlikely(r)) 6990 if (unlikely(r))
6985 return; 6991 return;
6986 6992
6987 if (!vcpu->arch.mmu.direct_map && 6993 if (!vcpu->arch.mmu.direct_map &&
6988 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) 6994 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6989 return; 6995 return;
6990 6996
6991 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); 6997 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6992 } 6998 }
6993 6999
6994 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) 7000 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6995 { 7001 {
6996 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); 7002 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6997 } 7003 }
6998 7004
6999 static inline u32 kvm_async_pf_next_probe(u32 key) 7005 static inline u32 kvm_async_pf_next_probe(u32 key)
7000 { 7006 {
7001 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); 7007 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
7002 } 7008 }
7003 7009
7004 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7010 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7005 { 7011 {
7006 u32 key = kvm_async_pf_hash_fn(gfn); 7012 u32 key = kvm_async_pf_hash_fn(gfn);
7007 7013
7008 while (vcpu->arch.apf.gfns[key] != ~0) 7014 while (vcpu->arch.apf.gfns[key] != ~0)
7009 key = kvm_async_pf_next_probe(key); 7015 key = kvm_async_pf_next_probe(key);
7010 7016
7011 vcpu->arch.apf.gfns[key] = gfn; 7017 vcpu->arch.apf.gfns[key] = gfn;
7012 } 7018 }
7013 7019
7014 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) 7020 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
7015 { 7021 {
7016 int i; 7022 int i;
7017 u32 key = kvm_async_pf_hash_fn(gfn); 7023 u32 key = kvm_async_pf_hash_fn(gfn);
7018 7024
7019 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && 7025 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
7020 (vcpu->arch.apf.gfns[key] != gfn && 7026 (vcpu->arch.apf.gfns[key] != gfn &&
7021 vcpu->arch.apf.gfns[key] != ~0); i++) 7027 vcpu->arch.apf.gfns[key] != ~0); i++)
7022 key = kvm_async_pf_next_probe(key); 7028 key = kvm_async_pf_next_probe(key);
7023 7029
7024 return key; 7030 return key;
7025 } 7031 }
7026 7032
7027 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7033 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7028 { 7034 {
7029 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; 7035 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
7030 } 7036 }
7031 7037
7032 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 7038 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7033 { 7039 {
7034 u32 i, j, k; 7040 u32 i, j, k;
7035 7041
7036 i = j = kvm_async_pf_gfn_slot(vcpu, gfn); 7042 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
7037 while (true) { 7043 while (true) {
7038 vcpu->arch.apf.gfns[i] = ~0; 7044 vcpu->arch.apf.gfns[i] = ~0;
7039 do { 7045 do {
7040 j = kvm_async_pf_next_probe(j); 7046 j = kvm_async_pf_next_probe(j);
7041 if (vcpu->arch.apf.gfns[j] == ~0) 7047 if (vcpu->arch.apf.gfns[j] == ~0)
7042 return; 7048 return;
7043 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); 7049 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
7044 /* 7050 /*
7045 * k lies cyclically in ]i,j] 7051 * k lies cyclically in ]i,j]
7046 * | i.k.j | 7052 * | i.k.j |
7047 * |....j i.k.| or |.k..j i...| 7053 * |....j i.k.| or |.k..j i...|
7048 */ 7054 */
7049 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); 7055 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
7050 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; 7056 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
7051 i = j; 7057 i = j;
7052 } 7058 }
7053 } 7059 }
7054 7060
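The deletion loop above is a textbook open-addressing backward shift: the entry at probe position j may only be moved into the hole at i if its home slot k does not lie cyclically in (i, j], and the inner do-while keeps probing while that move is not allowed. A small stand-alone sketch of just the cyclic-interval test, matching the ASCII diagrams in the comment:

/* Sketch of the "k lies cyclically in ]i,j]" test used when deleting an
 * entry from the open-addressed async-PF gfn table above. */
#include <assert.h>
#include <stdbool.h>

/* True iff k falls in the half-open cyclic interval (i, j]. */
static bool in_cyclic_range(unsigned int i, unsigned int k, unsigned int j)
{
    return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
}

int main(void)
{
    assert(in_cyclic_range(1, 3, 5));    /* | i.k.j |      ordinary interval */
    assert(in_cyclic_range(6, 7, 2));    /* |....j i.k.|   interval wraps    */
    assert(!in_cyclic_range(4, 4, 6));   /* k == i is excluded               */
    assert(in_cyclic_range(4, 6, 6));    /* k == j is included               */
    return 0;
}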
7055 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) 7061 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
7056 { 7062 {
7057 7063
7058 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, 7064 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
7059 sizeof(val)); 7065 sizeof(val));
7060 } 7066 }
7061 7067
7062 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 7068 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
7063 struct kvm_async_pf *work) 7069 struct kvm_async_pf *work)
7064 { 7070 {
7065 struct x86_exception fault; 7071 struct x86_exception fault;
7066 7072
7067 trace_kvm_async_pf_not_present(work->arch.token, work->gva); 7073 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
7068 kvm_add_async_pf_gfn(vcpu, work->arch.gfn); 7074 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7069 7075
7070 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || 7076 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
7071 (vcpu->arch.apf.send_user_only && 7077 (vcpu->arch.apf.send_user_only &&
7072 kvm_x86_ops->get_cpl(vcpu) == 0)) 7078 kvm_x86_ops->get_cpl(vcpu) == 0))
7073 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 7079 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7074 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { 7080 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
7075 fault.vector = PF_VECTOR; 7081 fault.vector = PF_VECTOR;
7076 fault.error_code_valid = true; 7082 fault.error_code_valid = true;
7077 fault.error_code = 0; 7083 fault.error_code = 0;
7078 fault.nested_page_fault = false; 7084 fault.nested_page_fault = false;
7079 fault.address = work->arch.token; 7085 fault.address = work->arch.token;
7080 kvm_inject_page_fault(vcpu, &fault); 7086 kvm_inject_page_fault(vcpu, &fault);
7081 } 7087 }
7082 } 7088 }
7083 7089
7084 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, 7090 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7085 struct kvm_async_pf *work) 7091 struct kvm_async_pf *work)
7086 { 7092 {
7087 struct x86_exception fault; 7093 struct x86_exception fault;
7088 7094
7089 trace_kvm_async_pf_ready(work->arch.token, work->gva); 7095 trace_kvm_async_pf_ready(work->arch.token, work->gva);
7090 if (is_error_page(work->page)) 7096 if (is_error_page(work->page))
7091 work->arch.token = ~0; /* broadcast wakeup */ 7097 work->arch.token = ~0; /* broadcast wakeup */
7092 else 7098 else
7093 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 7099 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
7094 7100
7095 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && 7101 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
7096 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { 7102 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
7097 fault.vector = PF_VECTOR; 7103 fault.vector = PF_VECTOR;
7098 fault.error_code_valid = true; 7104 fault.error_code_valid = true;
7099 fault.error_code = 0; 7105 fault.error_code = 0;
7100 fault.nested_page_fault = false; 7106 fault.nested_page_fault = false;
7101 fault.address = work->arch.token; 7107 fault.address = work->arch.token;
7102 kvm_inject_page_fault(vcpu, &fault); 7108 kvm_inject_page_fault(vcpu, &fault);
7103 } 7109 }
7104 vcpu->arch.apf.halted = false; 7110 vcpu->arch.apf.halted = false;
7105 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 7111 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7106 } 7112 }
7107 7113
7108 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) 7114 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7109 { 7115 {
7110 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) 7116 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
7111 return true; 7117 return true;
7112 else 7118 else
7113 return !kvm_event_needs_reinjection(vcpu) && 7119 return !kvm_event_needs_reinjection(vcpu) &&
7114 kvm_x86_ops->interrupt_allowed(vcpu); 7120 kvm_x86_ops->interrupt_allowed(vcpu);
7115 } 7121 }
7116 7122
7117 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 7123 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7118 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 7124 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7119 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 7125 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
7120 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 7126 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
7121 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 7127 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
7122 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 7128 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
7123 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 7129 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
7124 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 7130 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
7125 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 7131 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7126 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 7132 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7127 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 7133 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7128 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7134 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7129 7135
(second changed file: virt/kvm/kvm_main.c, whose diff follows from its line 1)
1 /* 1 /*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * 13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory. 15 * the COPYING file in the top-level directory.
16 * 16 *
17 */ 17 */
18 18
19 #include "iodev.h" 19 #include "iodev.h"
20 20
21 #include <linux/kvm_host.h> 21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h> 22 #include <linux/kvm.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/errno.h> 24 #include <linux/errno.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/miscdevice.h> 27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h> 28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h> 29 #include <linux/reboot.h>
30 #include <linux/debugfs.h> 30 #include <linux/debugfs.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/file.h> 32 #include <linux/file.h>
33 #include <linux/syscore_ops.h> 33 #include <linux/syscore_ops.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <linux/cpumask.h> 36 #include <linux/cpumask.h>
37 #include <linux/smp.h> 37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h> 38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h> 39 #include <linux/profile.h>
40 #include <linux/kvm_para.h> 40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h> 41 #include <linux/pagemap.h>
42 #include <linux/mman.h> 42 #include <linux/mman.h>
43 #include <linux/swap.h> 43 #include <linux/swap.h>
44 #include <linux/bitops.h> 44 #include <linux/bitops.h>
45 #include <linux/spinlock.h> 45 #include <linux/spinlock.h>
46 #include <linux/compat.h> 46 #include <linux/compat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/hugetlb.h> 48 #include <linux/hugetlb.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/sort.h> 50 #include <linux/sort.h>
51 #include <linux/bsearch.h> 51 #include <linux/bsearch.h>
52 52
53 #include <asm/processor.h> 53 #include <asm/processor.h>
54 #include <asm/io.h> 54 #include <asm/io.h>
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/pgtable.h> 56 #include <asm/pgtable.h>
57 57
58 #include "coalesced_mmio.h" 58 #include "coalesced_mmio.h"
59 #include "async_pf.h" 59 #include "async_pf.h"
60 60
61 #define CREATE_TRACE_POINTS 61 #define CREATE_TRACE_POINTS
62 #include <trace/events/kvm.h> 62 #include <trace/events/kvm.h>
63 63
64 MODULE_AUTHOR("Qumranet"); 64 MODULE_AUTHOR("Qumranet");
65 MODULE_LICENSE("GPL"); 65 MODULE_LICENSE("GPL");
66 66
67 /* 67 /*
68 * Ordering of locks: 68 * Ordering of locks:
69 * 69 *
70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
71 */ 71 */
72 72
73 DEFINE_RAW_SPINLOCK(kvm_lock); 73 DEFINE_RAW_SPINLOCK(kvm_lock);
74 LIST_HEAD(vm_list); 74 LIST_HEAD(vm_list);
75 75
76 static cpumask_var_t cpus_hardware_enabled; 76 static cpumask_var_t cpus_hardware_enabled;
77 static int kvm_usage_count = 0; 77 static int kvm_usage_count = 0;
78 static atomic_t hardware_enable_failed; 78 static atomic_t hardware_enable_failed;
79 79
80 struct kmem_cache *kvm_vcpu_cache; 80 struct kmem_cache *kvm_vcpu_cache;
81 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 81 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
82 82
83 static __read_mostly struct preempt_ops kvm_preempt_ops; 83 static __read_mostly struct preempt_ops kvm_preempt_ops;
84 84
85 struct dentry *kvm_debugfs_dir; 85 struct dentry *kvm_debugfs_dir;
86 86
87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
88 unsigned long arg); 88 unsigned long arg);
89 #ifdef CONFIG_COMPAT 89 #ifdef CONFIG_COMPAT
90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
91 unsigned long arg); 91 unsigned long arg);
92 #endif 92 #endif
93 static int hardware_enable_all(void); 93 static int hardware_enable_all(void);
94 static void hardware_disable_all(void); 94 static void hardware_disable_all(void);
95 95
96 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 96 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
97 97
98 bool kvm_rebooting; 98 bool kvm_rebooting;
99 EXPORT_SYMBOL_GPL(kvm_rebooting); 99 EXPORT_SYMBOL_GPL(kvm_rebooting);
100 100
101 static bool largepages_enabled = true; 101 static bool largepages_enabled = true;
102 102
103 bool kvm_is_mmio_pfn(pfn_t pfn) 103 bool kvm_is_mmio_pfn(pfn_t pfn)
104 { 104 {
105 if (pfn_valid(pfn)) { 105 if (pfn_valid(pfn)) {
106 int reserved; 106 int reserved;
107 struct page *tail = pfn_to_page(pfn); 107 struct page *tail = pfn_to_page(pfn);
108 struct page *head = compound_trans_head(tail); 108 struct page *head = compound_trans_head(tail);
109 reserved = PageReserved(head); 109 reserved = PageReserved(head);
110 if (head != tail) { 110 if (head != tail) {
111 /* 111 /*
112 * "head" is not a dangling pointer 112 * "head" is not a dangling pointer
113 * (compound_trans_head takes care of that) 113 * (compound_trans_head takes care of that)
114 * but the hugepage may have been splitted 114 * but the hugepage may have been splitted
115 * from under us (and we may not hold a 115 * from under us (and we may not hold a
116 * reference count on the head page so it can 116 * reference count on the head page so it can
117 * be reused before we run PageReferenced), so 117 * be reused before we run PageReferenced), so
118 * we've to check PageTail before returning 118 * we've to check PageTail before returning
119 * what we just read. 119 * what we just read.
120 */ 120 */
121 smp_rmb(); 121 smp_rmb();
122 if (PageTail(tail)) 122 if (PageTail(tail))
123 return reserved; 123 return reserved;
124 } 124 }
125 return PageReserved(tail); 125 return PageReserved(tail);
126 } 126 }
127 127
128 return true; 128 return true;
129 } 129 }
130 130
131 /* 131 /*
132 * Switches to specified vcpu, until a matching vcpu_put() 132 * Switches to specified vcpu, until a matching vcpu_put()
133 */ 133 */
134 int vcpu_load(struct kvm_vcpu *vcpu) 134 int vcpu_load(struct kvm_vcpu *vcpu)
135 { 135 {
136 int cpu; 136 int cpu;
137 137
138 if (mutex_lock_killable(&vcpu->mutex)) 138 if (mutex_lock_killable(&vcpu->mutex))
139 return -EINTR; 139 return -EINTR;
140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
141 /* The thread running this VCPU changed. */ 141 /* The thread running this VCPU changed. */
142 struct pid *oldpid = vcpu->pid; 142 struct pid *oldpid = vcpu->pid;
143 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 143 struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
144 rcu_assign_pointer(vcpu->pid, newpid); 144 rcu_assign_pointer(vcpu->pid, newpid);
145 synchronize_rcu(); 145 synchronize_rcu();
146 put_pid(oldpid); 146 put_pid(oldpid);
147 } 147 }
148 cpu = get_cpu(); 148 cpu = get_cpu();
149 preempt_notifier_register(&vcpu->preempt_notifier); 149 preempt_notifier_register(&vcpu->preempt_notifier);
150 kvm_arch_vcpu_load(vcpu, cpu); 150 kvm_arch_vcpu_load(vcpu, cpu);
151 put_cpu(); 151 put_cpu();
152 return 0; 152 return 0;
153 } 153 }
154 154
155 void vcpu_put(struct kvm_vcpu *vcpu) 155 void vcpu_put(struct kvm_vcpu *vcpu)
156 { 156 {
157 preempt_disable(); 157 preempt_disable();
158 kvm_arch_vcpu_put(vcpu); 158 kvm_arch_vcpu_put(vcpu);
159 preempt_notifier_unregister(&vcpu->preempt_notifier); 159 preempt_notifier_unregister(&vcpu->preempt_notifier);
160 preempt_enable(); 160 preempt_enable();
161 mutex_unlock(&vcpu->mutex); 161 mutex_unlock(&vcpu->mutex);
162 } 162 }
163 163
164 static void ack_flush(void *_completed) 164 static void ack_flush(void *_completed)
165 { 165 {
166 } 166 }
167 167
168 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 168 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
169 { 169 {
170 int i, cpu, me; 170 int i, cpu, me;
171 cpumask_var_t cpus; 171 cpumask_var_t cpus;
172 bool called = true; 172 bool called = true;
173 struct kvm_vcpu *vcpu; 173 struct kvm_vcpu *vcpu;
174 174
175 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 175 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
176 176
177 me = get_cpu(); 177 me = get_cpu();
178 kvm_for_each_vcpu(i, vcpu, kvm) { 178 kvm_for_each_vcpu(i, vcpu, kvm) {
179 kvm_make_request(req, vcpu); 179 kvm_make_request(req, vcpu);
180 cpu = vcpu->cpu; 180 cpu = vcpu->cpu;
181 181
182 /* Set ->requests bit before we read ->mode */ 182 /* Set ->requests bit before we read ->mode */
183 smp_mb(); 183 smp_mb();
184 184
185 if (cpus != NULL && cpu != -1 && cpu != me && 185 if (cpus != NULL && cpu != -1 && cpu != me &&
186 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 186 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
187 cpumask_set_cpu(cpu, cpus); 187 cpumask_set_cpu(cpu, cpus);
188 } 188 }
189 if (unlikely(cpus == NULL)) 189 if (unlikely(cpus == NULL))
190 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 190 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
191 else if (!cpumask_empty(cpus)) 191 else if (!cpumask_empty(cpus))
192 smp_call_function_many(cpus, ack_flush, NULL, 1); 192 smp_call_function_many(cpus, ack_flush, NULL, 1);
193 else 193 else
194 called = false; 194 called = false;
195 put_cpu(); 195 put_cpu();
196 free_cpumask_var(cpus); 196 free_cpumask_var(cpus);
197 return called; 197 return called;
198 } 198 }
199 199
200 void kvm_flush_remote_tlbs(struct kvm *kvm) 200 void kvm_flush_remote_tlbs(struct kvm *kvm)
201 { 201 {
202 long dirty_count = kvm->tlbs_dirty; 202 long dirty_count = kvm->tlbs_dirty;
203 203
204 smp_mb(); 204 smp_mb();
205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 205 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
206 ++kvm->stat.remote_tlb_flush; 206 ++kvm->stat.remote_tlb_flush;
207 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 207 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
208 } 208 }
209 209
210 void kvm_reload_remote_mmus(struct kvm *kvm) 210 void kvm_reload_remote_mmus(struct kvm *kvm)
211 { 211 {
212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
213 } 213 }
214 214
215 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 215 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
216 { 216 {
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218 } 218 }
219 219
220 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 220 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221 { 221 {
222 struct page *page; 222 struct page *page;
223 int r; 223 int r;
224 224
225 mutex_init(&vcpu->mutex); 225 mutex_init(&vcpu->mutex);
226 vcpu->cpu = -1; 226 vcpu->cpu = -1;
227 vcpu->kvm = kvm; 227 vcpu->kvm = kvm;
228 vcpu->vcpu_id = id; 228 vcpu->vcpu_id = id;
229 vcpu->pid = NULL; 229 vcpu->pid = NULL;
230 init_waitqueue_head(&vcpu->wq); 230 init_waitqueue_head(&vcpu->wq);
231 kvm_async_pf_vcpu_init(vcpu); 231 kvm_async_pf_vcpu_init(vcpu);
232 232
233 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 233 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
234 if (!page) { 234 if (!page) {
235 r = -ENOMEM; 235 r = -ENOMEM;
236 goto fail; 236 goto fail;
237 } 237 }
238 vcpu->run = page_address(page); 238 vcpu->run = page_address(page);
239 239
240 kvm_vcpu_set_in_spin_loop(vcpu, false); 240 kvm_vcpu_set_in_spin_loop(vcpu, false);
241 kvm_vcpu_set_dy_eligible(vcpu, false); 241 kvm_vcpu_set_dy_eligible(vcpu, false);
242 242
243 r = kvm_arch_vcpu_init(vcpu); 243 r = kvm_arch_vcpu_init(vcpu);
244 if (r < 0) 244 if (r < 0)
245 goto fail_free_run; 245 goto fail_free_run;
246 return 0; 246 return 0;
247 247
248 fail_free_run: 248 fail_free_run:
249 free_page((unsigned long)vcpu->run); 249 free_page((unsigned long)vcpu->run);
250 fail: 250 fail:
251 return r; 251 return r;
252 } 252 }
253 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 253 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
254 254
255 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 255 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
256 { 256 {
257 put_pid(vcpu->pid); 257 put_pid(vcpu->pid);
258 kvm_arch_vcpu_uninit(vcpu); 258 kvm_arch_vcpu_uninit(vcpu);
259 free_page((unsigned long)vcpu->run); 259 free_page((unsigned long)vcpu->run);
260 } 260 }
261 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 261 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
262 262
263 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 263 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
264 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 264 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
265 { 265 {
266 return container_of(mn, struct kvm, mmu_notifier); 266 return container_of(mn, struct kvm, mmu_notifier);
267 } 267 }
268 268
269 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 269 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
270 struct mm_struct *mm, 270 struct mm_struct *mm,
271 unsigned long address) 271 unsigned long address)
272 { 272 {
273 struct kvm *kvm = mmu_notifier_to_kvm(mn); 273 struct kvm *kvm = mmu_notifier_to_kvm(mn);
274 int need_tlb_flush, idx; 274 int need_tlb_flush, idx;
275 275
276 /* 276 /*
277 * When ->invalidate_page runs, the linux pte has been zapped 277 * When ->invalidate_page runs, the linux pte has been zapped
278 * already but the page is still allocated until 278 * already but the page is still allocated until
279 * ->invalidate_page returns. So if we increase the sequence 279 * ->invalidate_page returns. So if we increase the sequence
280 * here the kvm page fault will notice if the spte can't be 280 * here the kvm page fault will notice if the spte can't be
281 * established because the page is going to be freed. If 281 * established because the page is going to be freed. If
282 * instead the kvm page fault establishes the spte before 282 * instead the kvm page fault establishes the spte before
283 * ->invalidate_page runs, kvm_unmap_hva will release it 283 * ->invalidate_page runs, kvm_unmap_hva will release it
284 * before returning. 284 * before returning.
285 * 285 *
286 * The sequence increase only need to be seen at spin_unlock 286 * The sequence increase only need to be seen at spin_unlock
287 * time, and not at spin_lock time. 287 * time, and not at spin_lock time.
288 * 288 *
289 * Increasing the sequence after the spin_unlock would be 289 * Increasing the sequence after the spin_unlock would be
290 * unsafe because the kvm page fault could then establish the 290 * unsafe because the kvm page fault could then establish the
291 * pte after kvm_unmap_hva returned, without noticing the page 291 * pte after kvm_unmap_hva returned, without noticing the page
292 * is going to be freed. 292 * is going to be freed.
293 */ 293 */
294 idx = srcu_read_lock(&kvm->srcu); 294 idx = srcu_read_lock(&kvm->srcu);
295 spin_lock(&kvm->mmu_lock); 295 spin_lock(&kvm->mmu_lock);
296 296
297 kvm->mmu_notifier_seq++; 297 kvm->mmu_notifier_seq++;
298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 298 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
299 /* we've to flush the tlb before the pages can be freed */ 299 /* we've to flush the tlb before the pages can be freed */
300 if (need_tlb_flush) 300 if (need_tlb_flush)
301 kvm_flush_remote_tlbs(kvm); 301 kvm_flush_remote_tlbs(kvm);
302 302
303 spin_unlock(&kvm->mmu_lock); 303 spin_unlock(&kvm->mmu_lock);
304 srcu_read_unlock(&kvm->srcu, idx); 304 srcu_read_unlock(&kvm->srcu, idx);
305 } 305 }
306 306
307 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 307 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
308 struct mm_struct *mm, 308 struct mm_struct *mm,
309 unsigned long address, 309 unsigned long address,
310 pte_t pte) 310 pte_t pte)
311 { 311 {
312 struct kvm *kvm = mmu_notifier_to_kvm(mn); 312 struct kvm *kvm = mmu_notifier_to_kvm(mn);
313 int idx; 313 int idx;
314 314
315 idx = srcu_read_lock(&kvm->srcu); 315 idx = srcu_read_lock(&kvm->srcu);
316 spin_lock(&kvm->mmu_lock); 316 spin_lock(&kvm->mmu_lock);
317 kvm->mmu_notifier_seq++; 317 kvm->mmu_notifier_seq++;
318 kvm_set_spte_hva(kvm, address, pte); 318 kvm_set_spte_hva(kvm, address, pte);
319 spin_unlock(&kvm->mmu_lock); 319 spin_unlock(&kvm->mmu_lock);
320 srcu_read_unlock(&kvm->srcu, idx); 320 srcu_read_unlock(&kvm->srcu, idx);
321 } 321 }
322 322
323 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 323 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
324 struct mm_struct *mm, 324 struct mm_struct *mm,
325 unsigned long start, 325 unsigned long start,
326 unsigned long end) 326 unsigned long end)
327 { 327 {
328 struct kvm *kvm = mmu_notifier_to_kvm(mn); 328 struct kvm *kvm = mmu_notifier_to_kvm(mn);
329 int need_tlb_flush = 0, idx; 329 int need_tlb_flush = 0, idx;
330 330
331 idx = srcu_read_lock(&kvm->srcu); 331 idx = srcu_read_lock(&kvm->srcu);
332 spin_lock(&kvm->mmu_lock); 332 spin_lock(&kvm->mmu_lock);
333 /* 333 /*
334 * The count increase must become visible at unlock time as no 334 * The count increase must become visible at unlock time as no
335 * spte can be established without taking the mmu_lock and 335 * spte can be established without taking the mmu_lock and
336 * count is also read inside the mmu_lock critical section. 336 * count is also read inside the mmu_lock critical section.
337 */ 337 */
338 kvm->mmu_notifier_count++; 338 kvm->mmu_notifier_count++;
339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 339 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
340 need_tlb_flush |= kvm->tlbs_dirty; 340 need_tlb_flush |= kvm->tlbs_dirty;
341 /* we've to flush the tlb before the pages can be freed */ 341 /* we've to flush the tlb before the pages can be freed */
342 if (need_tlb_flush) 342 if (need_tlb_flush)
343 kvm_flush_remote_tlbs(kvm); 343 kvm_flush_remote_tlbs(kvm);
344 344
345 spin_unlock(&kvm->mmu_lock); 345 spin_unlock(&kvm->mmu_lock);
346 srcu_read_unlock(&kvm->srcu, idx); 346 srcu_read_unlock(&kvm->srcu, idx);
347 } 347 }
348 348
349 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 349 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
350 struct mm_struct *mm, 350 struct mm_struct *mm,
351 unsigned long start, 351 unsigned long start,
352 unsigned long end) 352 unsigned long end)
353 { 353 {
354 struct kvm *kvm = mmu_notifier_to_kvm(mn); 354 struct kvm *kvm = mmu_notifier_to_kvm(mn);
355 355
356 spin_lock(&kvm->mmu_lock); 356 spin_lock(&kvm->mmu_lock);
357 /* 357 /*
358 * This sequence increase will notify the kvm page fault that 358 * This sequence increase will notify the kvm page fault that
359 * the page that is going to be mapped in the spte could have 359 * the page that is going to be mapped in the spte could have
360 * been freed. 360 * been freed.
361 */ 361 */
362 kvm->mmu_notifier_seq++; 362 kvm->mmu_notifier_seq++;
363 smp_wmb(); 363 smp_wmb();
364 /* 364 /*
365 * The above sequence increase must be visible before the 365 * The above sequence increase must be visible before the
366 * below count decrease, which is ensured by the smp_wmb above 366 * below count decrease, which is ensured by the smp_wmb above
367 * in conjunction with the smp_rmb in mmu_notifier_retry(). 367 * in conjunction with the smp_rmb in mmu_notifier_retry().
368 */ 368 */
369 kvm->mmu_notifier_count--; 369 kvm->mmu_notifier_count--;
370 spin_unlock(&kvm->mmu_lock); 370 spin_unlock(&kvm->mmu_lock);
371 371
372 BUG_ON(kvm->mmu_notifier_count < 0); 372 BUG_ON(kvm->mmu_notifier_count < 0);
373 } 373 }
374 374
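The seq/count comments above and in invalidate_range_start() describe the retry protocol the page-fault path relies on via mmu_notifier_retry(). Below is a simplified, single-threaded user-space sketch of that protocol; the structure and helpers are stand-ins for illustration, not the kernel API.

/* Hedged sketch of the seq/count retry protocol described above. */
#include <stdbool.h>
#include <stdio.h>

struct fake_mmu_state {
    unsigned long notifier_seq;   /* bumped by invalidate_page / range_end */
    long notifier_count;          /* > 0 while a range invalidation is in flight */
};

/* Fault side: sample the sequence before translating the host address... */
static unsigned long fault_begin(const struct fake_mmu_state *s)
{
    return s->notifier_seq;
}

/* ...and retry if an invalidation finished meanwhile (seq moved) or is
 * still in progress (count non-zero). */
static bool fault_must_retry(const struct fake_mmu_state *s, unsigned long seq)
{
    return s->notifier_count > 0 || s->notifier_seq != seq;
}

int main(void)
{
    struct fake_mmu_state s = { 0, 0 };
    unsigned long seq = fault_begin(&s);

    /* A concurrent invalidate_range_start()/..._end() pair, serialized here: */
    s.notifier_count++;   /* range_start: block new sptes */
    s.notifier_seq++;     /* range_end: bump seq first... */
    s.notifier_count--;   /* ...then drop the count */

    printf("retry needed: %s\n", fault_must_retry(&s, seq) ? "yes" : "no");
    return 0;
}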
375 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 375 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
376 struct mm_struct *mm, 376 struct mm_struct *mm,
377 unsigned long address) 377 unsigned long address)
378 { 378 {
379 struct kvm *kvm = mmu_notifier_to_kvm(mn); 379 struct kvm *kvm = mmu_notifier_to_kvm(mn);
380 int young, idx; 380 int young, idx;
381 381
382 idx = srcu_read_lock(&kvm->srcu); 382 idx = srcu_read_lock(&kvm->srcu);
383 spin_lock(&kvm->mmu_lock); 383 spin_lock(&kvm->mmu_lock);
384 384
385 young = kvm_age_hva(kvm, address); 385 young = kvm_age_hva(kvm, address);
386 if (young) 386 if (young)
387 kvm_flush_remote_tlbs(kvm); 387 kvm_flush_remote_tlbs(kvm);
388 388
389 spin_unlock(&kvm->mmu_lock); 389 spin_unlock(&kvm->mmu_lock);
390 srcu_read_unlock(&kvm->srcu, idx); 390 srcu_read_unlock(&kvm->srcu, idx);
391 391
392 return young; 392 return young;
393 } 393 }
394 394
395 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 395 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
396 struct mm_struct *mm, 396 struct mm_struct *mm,
397 unsigned long address) 397 unsigned long address)
398 { 398 {
399 struct kvm *kvm = mmu_notifier_to_kvm(mn); 399 struct kvm *kvm = mmu_notifier_to_kvm(mn);
400 int young, idx; 400 int young, idx;
401 401
402 idx = srcu_read_lock(&kvm->srcu); 402 idx = srcu_read_lock(&kvm->srcu);
403 spin_lock(&kvm->mmu_lock); 403 spin_lock(&kvm->mmu_lock);
404 young = kvm_test_age_hva(kvm, address); 404 young = kvm_test_age_hva(kvm, address);
405 spin_unlock(&kvm->mmu_lock); 405 spin_unlock(&kvm->mmu_lock);
406 srcu_read_unlock(&kvm->srcu, idx); 406 srcu_read_unlock(&kvm->srcu, idx);
407 407
408 return young; 408 return young;
409 } 409 }
410 410
411 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 411 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
412 struct mm_struct *mm) 412 struct mm_struct *mm)
413 { 413 {
414 struct kvm *kvm = mmu_notifier_to_kvm(mn); 414 struct kvm *kvm = mmu_notifier_to_kvm(mn);
415 int idx; 415 int idx;
416 416
417 idx = srcu_read_lock(&kvm->srcu); 417 idx = srcu_read_lock(&kvm->srcu);
418 kvm_arch_flush_shadow_all(kvm); 418 kvm_arch_flush_shadow_all(kvm);
419 srcu_read_unlock(&kvm->srcu, idx); 419 srcu_read_unlock(&kvm->srcu, idx);
420 } 420 }
421 421
422 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 422 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
423 .invalidate_page = kvm_mmu_notifier_invalidate_page, 423 .invalidate_page = kvm_mmu_notifier_invalidate_page,
424 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 424 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
425 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 425 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
426 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 426 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
427 .test_young = kvm_mmu_notifier_test_young, 427 .test_young = kvm_mmu_notifier_test_young,
428 .change_pte = kvm_mmu_notifier_change_pte, 428 .change_pte = kvm_mmu_notifier_change_pte,
429 .release = kvm_mmu_notifier_release, 429 .release = kvm_mmu_notifier_release,
430 }; 430 };
431 431
432 static int kvm_init_mmu_notifier(struct kvm *kvm) 432 static int kvm_init_mmu_notifier(struct kvm *kvm)
433 { 433 {
434 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 434 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
435 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 435 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
436 } 436 }
437 437
438 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 438 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
439 439
440 static int kvm_init_mmu_notifier(struct kvm *kvm) 440 static int kvm_init_mmu_notifier(struct kvm *kvm)
441 { 441 {
442 return 0; 442 return 0;
443 } 443 }
444 444
445 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 445 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
446 446
447 static void kvm_init_memslots_id(struct kvm *kvm) 447 static void kvm_init_memslots_id(struct kvm *kvm)
448 { 448 {
449 int i; 449 int i;
450 struct kvm_memslots *slots = kvm->memslots; 450 struct kvm_memslots *slots = kvm->memslots;
451 451
452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 452 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
453 slots->id_to_index[i] = slots->memslots[i].id = i; 453 slots->id_to_index[i] = slots->memslots[i].id = i;
454 } 454 }
455 455
456 static struct kvm *kvm_create_vm(unsigned long type) 456 static struct kvm *kvm_create_vm(unsigned long type)
457 { 457 {
458 int r, i; 458 int r, i;
459 struct kvm *kvm = kvm_arch_alloc_vm(); 459 struct kvm *kvm = kvm_arch_alloc_vm();
460 460
461 if (!kvm) 461 if (!kvm)
462 return ERR_PTR(-ENOMEM); 462 return ERR_PTR(-ENOMEM);
463 463
464 r = kvm_arch_init_vm(kvm, type); 464 r = kvm_arch_init_vm(kvm, type);
465 if (r) 465 if (r)
466 goto out_err_nodisable; 466 goto out_err_nodisable;
467 467
468 r = hardware_enable_all(); 468 r = hardware_enable_all();
469 if (r) 469 if (r)
470 goto out_err_nodisable; 470 goto out_err_nodisable;
471 471
472 #ifdef CONFIG_HAVE_KVM_IRQCHIP 472 #ifdef CONFIG_HAVE_KVM_IRQCHIP
473 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 473 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
474 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 474 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
475 #endif 475 #endif
476 476
477 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 477 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
478 478
479 r = -ENOMEM; 479 r = -ENOMEM;
480 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 480 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
481 if (!kvm->memslots) 481 if (!kvm->memslots)
482 goto out_err_nosrcu; 482 goto out_err_nosrcu;
483 kvm_init_memslots_id(kvm); 483 kvm_init_memslots_id(kvm);
484 if (init_srcu_struct(&kvm->srcu)) 484 if (init_srcu_struct(&kvm->srcu))
485 goto out_err_nosrcu; 485 goto out_err_nosrcu;
486 for (i = 0; i < KVM_NR_BUSES; i++) { 486 for (i = 0; i < KVM_NR_BUSES; i++) {
487 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 487 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
488 GFP_KERNEL); 488 GFP_KERNEL);
489 if (!kvm->buses[i]) 489 if (!kvm->buses[i])
490 goto out_err; 490 goto out_err;
491 } 491 }
492 492
493 spin_lock_init(&kvm->mmu_lock); 493 spin_lock_init(&kvm->mmu_lock);
494 kvm->mm = current->mm; 494 kvm->mm = current->mm;
495 atomic_inc(&kvm->mm->mm_count); 495 atomic_inc(&kvm->mm->mm_count);
496 kvm_eventfd_init(kvm); 496 kvm_eventfd_init(kvm);
497 mutex_init(&kvm->lock); 497 mutex_init(&kvm->lock);
498 mutex_init(&kvm->irq_lock); 498 mutex_init(&kvm->irq_lock);
499 mutex_init(&kvm->slots_lock); 499 mutex_init(&kvm->slots_lock);
500 atomic_set(&kvm->users_count, 1); 500 atomic_set(&kvm->users_count, 1);
501 501
502 r = kvm_init_mmu_notifier(kvm); 502 r = kvm_init_mmu_notifier(kvm);
503 if (r) 503 if (r)
504 goto out_err; 504 goto out_err;
505 505
506 raw_spin_lock(&kvm_lock); 506 raw_spin_lock(&kvm_lock);
507 list_add(&kvm->vm_list, &vm_list); 507 list_add(&kvm->vm_list, &vm_list);
508 raw_spin_unlock(&kvm_lock); 508 raw_spin_unlock(&kvm_lock);
509 509
510 return kvm; 510 return kvm;
511 511
512 out_err: 512 out_err:
513 cleanup_srcu_struct(&kvm->srcu); 513 cleanup_srcu_struct(&kvm->srcu);
514 out_err_nosrcu: 514 out_err_nosrcu:
515 hardware_disable_all(); 515 hardware_disable_all();
516 out_err_nodisable: 516 out_err_nodisable:
517 for (i = 0; i < KVM_NR_BUSES; i++) 517 for (i = 0; i < KVM_NR_BUSES; i++)
518 kfree(kvm->buses[i]); 518 kfree(kvm->buses[i]);
519 kfree(kvm->memslots); 519 kfree(kvm->memslots);
520 kvm_arch_free_vm(kvm); 520 kvm_arch_free_vm(kvm);
521 return ERR_PTR(r); 521 return ERR_PTR(r);
522 } 522 }
523 523
524 /* 524 /*
525 * Avoid using vmalloc for a small buffer. 525 * Avoid using vmalloc for a small buffer.
526 * Should not be used when the size is statically known. 526 * Should not be used when the size is statically known.
527 */ 527 */
528 void *kvm_kvzalloc(unsigned long size) 528 void *kvm_kvzalloc(unsigned long size)
529 { 529 {
530 if (size > PAGE_SIZE) 530 if (size > PAGE_SIZE)
531 return vzalloc(size); 531 return vzalloc(size);
532 else 532 else
533 return kzalloc(size, GFP_KERNEL); 533 return kzalloc(size, GFP_KERNEL);
534 } 534 }
535 535
536 void kvm_kvfree(const void *addr) 536 void kvm_kvfree(const void *addr)
537 { 537 {
538 if (is_vmalloc_addr(addr)) 538 if (is_vmalloc_addr(addr))
539 vfree(addr); 539 vfree(addr);
540 else 540 else
541 kfree(addr); 541 kfree(addr);
542 } 542 }
543 543
544 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 544 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
545 { 545 {
546 if (!memslot->dirty_bitmap) 546 if (!memslot->dirty_bitmap)
547 return; 547 return;
548 548
549 kvm_kvfree(memslot->dirty_bitmap); 549 kvm_kvfree(memslot->dirty_bitmap);
550 memslot->dirty_bitmap = NULL; 550 memslot->dirty_bitmap = NULL;
551 } 551 }
552 552
553 /* 553 /*
554 * Free any memory in @free but not in @dont. 554 * Free any memory in @free but not in @dont.
555 */ 555 */
556 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 556 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
557 struct kvm_memory_slot *dont) 557 struct kvm_memory_slot *dont)
558 { 558 {
559 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 559 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
560 kvm_destroy_dirty_bitmap(free); 560 kvm_destroy_dirty_bitmap(free);
561 561
562 kvm_arch_free_memslot(free, dont); 562 kvm_arch_free_memslot(free, dont);
563 563
564 free->npages = 0; 564 free->npages = 0;
565 } 565 }
566 566
567 void kvm_free_physmem(struct kvm *kvm) 567 void kvm_free_physmem(struct kvm *kvm)
568 { 568 {
569 struct kvm_memslots *slots = kvm->memslots; 569 struct kvm_memslots *slots = kvm->memslots;
570 struct kvm_memory_slot *memslot; 570 struct kvm_memory_slot *memslot;
571 571
572 kvm_for_each_memslot(memslot, slots) 572 kvm_for_each_memslot(memslot, slots)
573 kvm_free_physmem_slot(memslot, NULL); 573 kvm_free_physmem_slot(memslot, NULL);
574 574
575 kfree(kvm->memslots); 575 kfree(kvm->memslots);
576 } 576 }
577 577
578 static void kvm_destroy_vm(struct kvm *kvm) 578 static void kvm_destroy_vm(struct kvm *kvm)
579 { 579 {
580 int i; 580 int i;
581 struct mm_struct *mm = kvm->mm; 581 struct mm_struct *mm = kvm->mm;
582 582
583 kvm_arch_sync_events(kvm); 583 kvm_arch_sync_events(kvm);
584 raw_spin_lock(&kvm_lock); 584 raw_spin_lock(&kvm_lock);
585 list_del(&kvm->vm_list); 585 list_del(&kvm->vm_list);
586 raw_spin_unlock(&kvm_lock); 586 raw_spin_unlock(&kvm_lock);
587 kvm_free_irq_routing(kvm); 587 kvm_free_irq_routing(kvm);
588 for (i = 0; i < KVM_NR_BUSES; i++) 588 for (i = 0; i < KVM_NR_BUSES; i++)
589 kvm_io_bus_destroy(kvm->buses[i]); 589 kvm_io_bus_destroy(kvm->buses[i]);
590 kvm_coalesced_mmio_free(kvm); 590 kvm_coalesced_mmio_free(kvm);
591 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 591 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
592 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 592 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
593 #else 593 #else
594 kvm_arch_flush_shadow_all(kvm); 594 kvm_arch_flush_shadow_all(kvm);
595 #endif 595 #endif
596 kvm_arch_destroy_vm(kvm); 596 kvm_arch_destroy_vm(kvm);
597 kvm_free_physmem(kvm); 597 kvm_free_physmem(kvm);
598 cleanup_srcu_struct(&kvm->srcu); 598 cleanup_srcu_struct(&kvm->srcu);
599 kvm_arch_free_vm(kvm); 599 kvm_arch_free_vm(kvm);
600 hardware_disable_all(); 600 hardware_disable_all();
601 mmdrop(mm); 601 mmdrop(mm);
602 } 602 }
603 603
604 void kvm_get_kvm(struct kvm *kvm) 604 void kvm_get_kvm(struct kvm *kvm)
605 { 605 {
606 atomic_inc(&kvm->users_count); 606 atomic_inc(&kvm->users_count);
607 } 607 }
608 EXPORT_SYMBOL_GPL(kvm_get_kvm); 608 EXPORT_SYMBOL_GPL(kvm_get_kvm);
609 609
610 void kvm_put_kvm(struct kvm *kvm) 610 void kvm_put_kvm(struct kvm *kvm)
611 { 611 {
612 if (atomic_dec_and_test(&kvm->users_count)) 612 if (atomic_dec_and_test(&kvm->users_count))
613 kvm_destroy_vm(kvm); 613 kvm_destroy_vm(kvm);
614 } 614 }
615 EXPORT_SYMBOL_GPL(kvm_put_kvm); 615 EXPORT_SYMBOL_GPL(kvm_put_kvm);
616 616
617 617
618 static int kvm_vm_release(struct inode *inode, struct file *filp) 618 static int kvm_vm_release(struct inode *inode, struct file *filp)
619 { 619 {
620 struct kvm *kvm = filp->private_data; 620 struct kvm *kvm = filp->private_data;
621 621
622 kvm_irqfd_release(kvm); 622 kvm_irqfd_release(kvm);
623 623
624 kvm_put_kvm(kvm); 624 kvm_put_kvm(kvm);
625 return 0; 625 return 0;
626 } 626 }
627 627
628 /* 628 /*
629 * Allocation size is twice as large as the actual dirty bitmap size. 629 * Allocation size is twice as large as the actual dirty bitmap size.
630 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 630 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
631 */ 631 */
632 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 632 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
633 { 633 {
634 #ifndef CONFIG_S390 634 #ifndef CONFIG_S390
635 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 635 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
636 636
637 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 637 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
638 if (!memslot->dirty_bitmap) 638 if (!memslot->dirty_bitmap)
639 return -ENOMEM; 639 return -ENOMEM;
640 640
641 #endif /* !CONFIG_S390 */ 641 #endif /* !CONFIG_S390 */
642 return 0; 642 return 0;
643 } 643 }
644 644
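On the doubling noted above: one half holds the live dirty bits and, at least in x86's kvm_vm_ioctl_get_dirty_log(), the second half appears to serve as a scratch snapshot while bits are fetched and cleared under mmu_lock; that reading is an inference from that function, not something stated here. A user-space sketch of the sizing arithmetic only:

/* Sketch of the sizing rule: one bit per page, rounded up to whole
 * unsigned longs, with the allocation doubled as the comment above notes. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
    unsigned long longs = (npages + BITS_PER_LONG - 1) / BITS_PER_LONG;
    return longs * sizeof(unsigned long);
}

int main(void)
{
    unsigned long npages = 1000;                  /* arbitrary example slot */
    unsigned long bytes = dirty_bitmap_bytes(npages);

    printf("bitmap: %lu bytes, allocation: %lu bytes\n", bytes, 2 * bytes);
    return 0;
}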
645 static int cmp_memslot(const void *slot1, const void *slot2) 645 static int cmp_memslot(const void *slot1, const void *slot2)
646 { 646 {
647 struct kvm_memory_slot *s1, *s2; 647 struct kvm_memory_slot *s1, *s2;
648 648
649 s1 = (struct kvm_memory_slot *)slot1; 649 s1 = (struct kvm_memory_slot *)slot1;
650 s2 = (struct kvm_memory_slot *)slot2; 650 s2 = (struct kvm_memory_slot *)slot2;
651 651
652 if (s1->npages < s2->npages) 652 if (s1->npages < s2->npages)
653 return 1; 653 return 1;
654 if (s1->npages > s2->npages) 654 if (s1->npages > s2->npages)
655 return -1; 655 return -1;
656 656
657 return 0; 657 return 0;
658 } 658 }
659 659
660 /* 660 /*
661 * Sort the memslots base on its size, so the larger slots 661 * Sort the memslots base on its size, so the larger slots
662 * will get better fit. 662 * will get better fit.
663 */ 663 */
664 static void sort_memslots(struct kvm_memslots *slots) 664 static void sort_memslots(struct kvm_memslots *slots)
665 { 665 {
666 int i; 666 int i;
667 667
668 sort(slots->memslots, KVM_MEM_SLOTS_NUM, 668 sort(slots->memslots, KVM_MEM_SLOTS_NUM,
669 sizeof(struct kvm_memory_slot), cmp_memslot, NULL); 669 sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
670 670
671 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 671 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
672 slots->id_to_index[slots->memslots[i].id] = i; 672 slots->id_to_index[slots->memslots[i].id] = i;
673 } 673 }
674 674
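The sort above keeps the largest slots first so linear lookups over the slot array tend to hit the big RAM slot early. A user-space sketch of the same idea with qsort() and a toy id-to-index map; the structures are simplified and not the kernel layout.

/* Hedged sketch of the sort_memslots() idea: order by size, largest first,
 * then rebuild the id -> array index map. */
#include <stdio.h>
#include <stdlib.h>

struct toy_slot {
    int id;
    unsigned long npages;
};

/* Descending by npages, mirroring cmp_memslot() above. */
static int cmp_toy_slot(const void *a, const void *b)
{
    const struct toy_slot *s1 = a, *s2 = b;

    if (s1->npages < s2->npages)
        return 1;
    if (s1->npages > s2->npages)
        return -1;
    return 0;
}

int main(void)
{
    struct toy_slot slots[] = {
        { .id = 0, .npages = 256 },
        { .id = 1, .npages = 1 << 20 },   /* the big RAM slot */
        { .id = 2, .npages = 16 },
    };
    int id_to_index[3];
    size_t n = sizeof(slots) / sizeof(slots[0]);

    qsort(slots, n, sizeof(slots[0]), cmp_toy_slot);

    /* Rebuild the id -> array index map, as sort_memslots() does. */
    for (size_t i = 0; i < n; i++)
        id_to_index[slots[i].id] = (int)i;

    for (size_t i = 0; i < n; i++)
        printf("index %zu: slot id %d (%lu pages)\n", i, slots[i].id, slots[i].npages);
    printf("id 1 now lives at index %d\n", id_to_index[1]);
    return 0;
}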
675 void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new, 675 void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
676 u64 last_generation) 676 u64 last_generation)
677 { 677 {
678 if (new) { 678 if (new) {
679 int id = new->id; 679 int id = new->id;
680 struct kvm_memory_slot *old = id_to_memslot(slots, id); 680 struct kvm_memory_slot *old = id_to_memslot(slots, id);
681 unsigned long npages = old->npages; 681 unsigned long npages = old->npages;
682 682
683 *old = *new; 683 *old = *new;
684 if (new->npages != npages) 684 if (new->npages != npages)
685 sort_memslots(slots); 685 sort_memslots(slots);
686 } 686 }
687 687
688 slots->generation = last_generation + 1; 688 slots->generation = last_generation + 1;
689 } 689 }
690 690
691 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 691 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
692 { 692 {
693 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 693 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
694 694
695 #ifdef KVM_CAP_READONLY_MEM 695 #ifdef KVM_CAP_READONLY_MEM
696 valid_flags |= KVM_MEM_READONLY; 696 valid_flags |= KVM_MEM_READONLY;
697 #endif 697 #endif
698 698
699 if (mem->flags & ~valid_flags) 699 if (mem->flags & ~valid_flags)
700 return -EINVAL; 700 return -EINVAL;
701 701
702 return 0; 702 return 0;
703 } 703 }
704 704
705 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 705 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
706 struct kvm_memslots *slots, struct kvm_memory_slot *new) 706 struct kvm_memslots *slots, struct kvm_memory_slot *new)
707 { 707 {
708 struct kvm_memslots *old_memslots = kvm->memslots; 708 struct kvm_memslots *old_memslots = kvm->memslots;
709 709
710 update_memslots(slots, new, kvm->memslots->generation); 710 update_memslots(slots, new, kvm->memslots->generation);
711 rcu_assign_pointer(kvm->memslots, slots); 711 rcu_assign_pointer(kvm->memslots, slots);
712 synchronize_srcu_expedited(&kvm->srcu); 712 synchronize_srcu_expedited(&kvm->srcu);
713 return old_memslots; 713 return old_memslots;
714 } 714 }
715 715
716 /* 716 /*
717 * Allocate some memory and give it an address in the guest physical address 717 * Allocate some memory and give it an address in the guest physical address
718 * space. 718 * space.
719 * 719 *
720 * Discontiguous memory is allowed, mostly for framebuffers. 720 * Discontiguous memory is allowed, mostly for framebuffers.
721 * 721 *
722 * Must be called holding mmap_sem for write. 722 * Must be called holding mmap_sem for write.
723 */ 723 */
724 int __kvm_set_memory_region(struct kvm *kvm, 724 int __kvm_set_memory_region(struct kvm *kvm,
725 struct kvm_userspace_memory_region *mem, 725 struct kvm_userspace_memory_region *mem,
726 bool user_alloc) 726 bool user_alloc)
727 { 727 {
728 int r; 728 int r;
729 gfn_t base_gfn; 729 gfn_t base_gfn;
730 unsigned long npages; 730 unsigned long npages;
731 struct kvm_memory_slot *memslot, *slot; 731 struct kvm_memory_slot *memslot, *slot;
732 struct kvm_memory_slot old, new; 732 struct kvm_memory_slot old, new;
733 struct kvm_memslots *slots = NULL, *old_memslots; 733 struct kvm_memslots *slots = NULL, *old_memslots;
734 734
735 r = check_memory_region_flags(mem); 735 r = check_memory_region_flags(mem);
736 if (r) 736 if (r)
737 goto out; 737 goto out;
738 738
739 r = -EINVAL; 739 r = -EINVAL;
740 /* General sanity checks */ 740 /* General sanity checks */
741 if (mem->memory_size & (PAGE_SIZE - 1)) 741 if (mem->memory_size & (PAGE_SIZE - 1))
742 goto out; 742 goto out;
743 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 743 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
744 goto out; 744 goto out;
745 /* We can read the guest memory with __xxx_user() later on. */ 745 /* We can read the guest memory with __xxx_user() later on. */
746 if (user_alloc && 746 if (user_alloc &&
747 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 747 ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
748 !access_ok(VERIFY_WRITE, 748 !access_ok(VERIFY_WRITE,
749 (void __user *)(unsigned long)mem->userspace_addr, 749 (void __user *)(unsigned long)mem->userspace_addr,
750 mem->memory_size))) 750 mem->memory_size)))
751 goto out; 751 goto out;
752 if (mem->slot >= KVM_MEM_SLOTS_NUM) 752 if (mem->slot >= KVM_MEM_SLOTS_NUM)
753 goto out; 753 goto out;
754 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 754 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
755 goto out; 755 goto out;
756 756
757 memslot = id_to_memslot(kvm->memslots, mem->slot); 757 memslot = id_to_memslot(kvm->memslots, mem->slot);
758 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 758 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
759 npages = mem->memory_size >> PAGE_SHIFT; 759 npages = mem->memory_size >> PAGE_SHIFT;
760 760
761 r = -EINVAL; 761 r = -EINVAL;
762 if (npages > KVM_MEM_MAX_NR_PAGES) 762 if (npages > KVM_MEM_MAX_NR_PAGES)
763 goto out; 763 goto out;
764 764
765 if (!npages) 765 if (!npages)
766 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 766 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
767 767
768 new = old = *memslot; 768 new = old = *memslot;
769 769
770 new.id = mem->slot; 770 new.id = mem->slot;
771 new.base_gfn = base_gfn; 771 new.base_gfn = base_gfn;
772 new.npages = npages; 772 new.npages = npages;
773 new.flags = mem->flags; 773 new.flags = mem->flags;
774 774
775 /* 775 /*
776 * Disallow changing a memory slot's size or changing anything about 776 * Disallow changing a memory slot's size or changing anything about
777 * zero sized slots that doesn't involve making them non-zero. 777 * zero sized slots that doesn't involve making them non-zero.
778 */ 778 */
779 r = -EINVAL; 779 r = -EINVAL;
780 if (npages && old.npages && npages != old.npages) 780 if (npages && old.npages && npages != old.npages)
781 goto out_free; 781 goto out_free;
782 if (!npages && !old.npages) 782 if (!npages && !old.npages)
783 goto out_free; 783 goto out_free;
784 784
785 /* Check for overlaps */ 785 /* Check for overlaps */
786 r = -EEXIST; 786 r = -EEXIST;
787 kvm_for_each_memslot(slot, kvm->memslots) { 787 kvm_for_each_memslot(slot, kvm->memslots) {
788 if (slot->id >= KVM_USER_MEM_SLOTS || slot == memslot) 788 if (slot->id >= KVM_USER_MEM_SLOTS || slot == memslot)
789 continue; 789 continue;
790 if (!((base_gfn + npages <= slot->base_gfn) || 790 if (!((base_gfn + npages <= slot->base_gfn) ||
791 (base_gfn >= slot->base_gfn + slot->npages))) 791 (base_gfn >= slot->base_gfn + slot->npages)))
792 goto out_free; 792 goto out_free;
793 } 793 }
794 794
795 /* Free page dirty bitmap if unneeded */ 795 /* Free page dirty bitmap if unneeded */
796 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 796 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
797 new.dirty_bitmap = NULL; 797 new.dirty_bitmap = NULL;
798 798
799 r = -ENOMEM; 799 r = -ENOMEM;
800 800
801 /* 801 /*
802 * Allocate if a slot is being created. If modifying a slot, 802 * Allocate if a slot is being created. If modifying a slot,
803 * the userspace_addr cannot change. 803 * the userspace_addr cannot change.
804 */ 804 */
805 if (!old.npages) { 805 if (!old.npages) {
806 new.user_alloc = user_alloc; 806 new.user_alloc = user_alloc;
807 new.userspace_addr = mem->userspace_addr; 807 new.userspace_addr = mem->userspace_addr;
808 808
809 if (kvm_arch_create_memslot(&new, npages)) 809 if (kvm_arch_create_memslot(&new, npages))
810 goto out_free; 810 goto out_free;
811 } else if (npages && mem->userspace_addr != old.userspace_addr) { 811 } else if (npages && mem->userspace_addr != old.userspace_addr) {
812 r = -EINVAL; 812 r = -EINVAL;
813 goto out_free; 813 goto out_free;
814 } 814 }
815 815
816 /* Allocate page dirty bitmap if needed */ 816 /* Allocate page dirty bitmap if needed */
817 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 817 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
818 if (kvm_create_dirty_bitmap(&new) < 0) 818 if (kvm_create_dirty_bitmap(&new) < 0)
819 goto out_free; 819 goto out_free;
820 /* destroy any largepage mappings for dirty tracking */
821 } 820 }
822 821
823 if (!npages || base_gfn != old.base_gfn) { 822 if (!npages || base_gfn != old.base_gfn) {
824 struct kvm_memory_slot *slot; 823 struct kvm_memory_slot *slot;
825 824
826 r = -ENOMEM; 825 r = -ENOMEM;
827 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 826 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
828 GFP_KERNEL); 827 GFP_KERNEL);
829 if (!slots) 828 if (!slots)
830 goto out_free; 829 goto out_free;
831 slot = id_to_memslot(slots, mem->slot); 830 slot = id_to_memslot(slots, mem->slot);
832 slot->flags |= KVM_MEMSLOT_INVALID; 831 slot->flags |= KVM_MEMSLOT_INVALID;
833 832
834 old_memslots = install_new_memslots(kvm, slots, NULL); 833 old_memslots = install_new_memslots(kvm, slots, NULL);
835 834
836 /* slot was deleted or moved, clear iommu mapping */ 835 /* slot was deleted or moved, clear iommu mapping */
837 kvm_iommu_unmap_pages(kvm, &old); 836 kvm_iommu_unmap_pages(kvm, &old);
838 /* From this point no new shadow pages pointing to a deleted, 837 /* From this point no new shadow pages pointing to a deleted,
839 * or moved, memslot will be created. 838 * or moved, memslot will be created.
840 * 839 *
841 * validation of sp->gfn happens in: 840 * validation of sp->gfn happens in:
842 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 841 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
843 * - kvm_is_visible_gfn (mmu_check_roots) 842 * - kvm_is_visible_gfn (mmu_check_roots)
844 */ 843 */
845 kvm_arch_flush_shadow_memslot(kvm, slot); 844 kvm_arch_flush_shadow_memslot(kvm, slot);
846 slots = old_memslots; 845 slots = old_memslots;
847 } 846 }
848 847
849 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 848 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
850 if (r) 849 if (r)
851 goto out_slots; 850 goto out_slots;
852 851
853 r = -ENOMEM; 852 r = -ENOMEM;
854 /* 853 /*
855 * We can re-use the old_memslots from above, the only difference 854 * We can re-use the old_memslots from above, the only difference
856 * from the currently installed memslots is the invalid flag. This 855 * from the currently installed memslots is the invalid flag. This
857 * will get overwritten by update_memslots anyway. 856 * will get overwritten by update_memslots anyway.
858 */ 857 */
859 if (!slots) { 858 if (!slots) {
860 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 859 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
861 GFP_KERNEL); 860 GFP_KERNEL);
862 if (!slots) 861 if (!slots)
863 goto out_free; 862 goto out_free;
864 } 863 }
865 864
866 /* map new memory slot into the iommu */ 865 /* map new memory slot into the iommu */
867 if (npages) { 866 if (npages) {
868 r = kvm_iommu_map_pages(kvm, &new); 867 r = kvm_iommu_map_pages(kvm, &new);
869 if (r) 868 if (r)
870 goto out_slots; 869 goto out_slots;
871 } 870 }
872 871
873 /* actual memory is freed via old in kvm_free_physmem_slot below */ 872 /* actual memory is freed via old in kvm_free_physmem_slot below */
874 if (!npages) { 873 if (!npages) {
875 new.dirty_bitmap = NULL; 874 new.dirty_bitmap = NULL;
876 memset(&new.arch, 0, sizeof(new.arch)); 875 memset(&new.arch, 0, sizeof(new.arch));
877 } 876 }
878 877
879 old_memslots = install_new_memslots(kvm, slots, &new); 878 old_memslots = install_new_memslots(kvm, slots, &new);
880 879
881 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 880 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
882 881
883 kvm_free_physmem_slot(&old, &new); 882 kvm_free_physmem_slot(&old, &new);
884 kfree(old_memslots); 883 kfree(old_memslots);
885 884
886 return 0; 885 return 0;
887 886
888 out_slots: 887 out_slots:
889 kfree(slots); 888 kfree(slots);
890 out_free: 889 out_free:
891 kvm_free_physmem_slot(&new, &old); 890 kvm_free_physmem_slot(&new, &old);
892 out: 891 out:
893 return r; 892 return r;
894 893
895 } 894 }
896 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 895 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
897 896
898 int kvm_set_memory_region(struct kvm *kvm, 897 int kvm_set_memory_region(struct kvm *kvm,
899 struct kvm_userspace_memory_region *mem, 898 struct kvm_userspace_memory_region *mem,
900 bool user_alloc) 899 bool user_alloc)
901 { 900 {
902 int r; 901 int r;
903 902
904 mutex_lock(&kvm->slots_lock); 903 mutex_lock(&kvm->slots_lock);
905 r = __kvm_set_memory_region(kvm, mem, user_alloc); 904 r = __kvm_set_memory_region(kvm, mem, user_alloc);
906 mutex_unlock(&kvm->slots_lock); 905 mutex_unlock(&kvm->slots_lock);
907 return r; 906 return r;
908 } 907 }
909 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 908 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
910 909
911 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 910 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
912 struct 911 struct
913 kvm_userspace_memory_region *mem, 912 kvm_userspace_memory_region *mem,
914 bool user_alloc) 913 bool user_alloc)
915 { 914 {
916 if (mem->slot >= KVM_USER_MEM_SLOTS) 915 if (mem->slot >= KVM_USER_MEM_SLOTS)
917 return -EINVAL; 916 return -EINVAL;
918 return kvm_set_memory_region(kvm, mem, user_alloc); 917 return kvm_set_memory_region(kvm, mem, user_alloc);
919 } 918 }
920 919
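The checks above are what a userspace caller of the KVM_SET_USER_MEMORY_REGION ioctl runs into: slot number, page alignment, overlap, and the dirty-logging flag. As a minimal, hedged userspace sketch (not part of this commit; vm_fd is assumed to come from KVM_CREATE_VM and error handling is trimmed), registering one anonymous 2 MiB region with dirty logging enabled looks like this:

/* Register slot 0 with dirty logging; deleting it later is the
 * memory_size == 0 path handled above. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>

static int set_slot(int vm_fd)
{
        size_t size = 2 << 20;
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot            = 0,
                .flags           = KVM_MEM_LOG_DIRTY_PAGES,
                .guest_phys_addr = 0x100000,       /* page aligned */
                .memory_size     = size,           /* page aligned */
                .userspace_addr  = (uintptr_t)mem, /* page aligned */
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
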
921 int kvm_get_dirty_log(struct kvm *kvm, 920 int kvm_get_dirty_log(struct kvm *kvm,
922 struct kvm_dirty_log *log, int *is_dirty) 921 struct kvm_dirty_log *log, int *is_dirty)
923 { 922 {
924 struct kvm_memory_slot *memslot; 923 struct kvm_memory_slot *memslot;
925 int r, i; 924 int r, i;
926 unsigned long n; 925 unsigned long n;
927 unsigned long any = 0; 926 unsigned long any = 0;
928 927
929 r = -EINVAL; 928 r = -EINVAL;
930 if (log->slot >= KVM_USER_MEM_SLOTS) 929 if (log->slot >= KVM_USER_MEM_SLOTS)
931 goto out; 930 goto out;
932 931
933 memslot = id_to_memslot(kvm->memslots, log->slot); 932 memslot = id_to_memslot(kvm->memslots, log->slot);
934 r = -ENOENT; 933 r = -ENOENT;
935 if (!memslot->dirty_bitmap) 934 if (!memslot->dirty_bitmap)
936 goto out; 935 goto out;
937 936
938 n = kvm_dirty_bitmap_bytes(memslot); 937 n = kvm_dirty_bitmap_bytes(memslot);
939 938
940 for (i = 0; !any && i < n/sizeof(long); ++i) 939 for (i = 0; !any && i < n/sizeof(long); ++i)
941 any = memslot->dirty_bitmap[i]; 940 any = memslot->dirty_bitmap[i];
942 941
943 r = -EFAULT; 942 r = -EFAULT;
944 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 943 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
945 goto out; 944 goto out;
946 945
947 if (any) 946 if (any)
948 *is_dirty = 1; 947 *is_dirty = 1;
949 948
950 r = 0; 949 r = 0;
951 out: 950 out:
952 return r; 951 return r;
953 } 952 }
954 953
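kvm_get_dirty_log() above is the generic helper behind the KVM_GET_DIRTY_LOG ioctl: the caller supplies a buffer of at least kvm_dirty_bitmap_bytes() bytes, one bit per page of the slot, and KVM sets bits with set_bit_le() (see mark_page_dirty_in_slot() later in this file), so the scan in the following hedged userspace sketch assumes a little-endian host; error handling is trimmed:

/* Fetch and print the dirty pages of slot 0, which covers npages pages. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_dirty(int vm_fd, uint64_t npages)
{
        size_t words = (npages + 63) / 64;      /* round up to whole u64s */
        uint64_t *bitmap = calloc(words, sizeof(*bitmap));
        struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };

        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) == 0)
                for (uint64_t i = 0; i < npages; i++)
                        if (bitmap[i / 64] & (1ULL << (i % 64)))
                                printf("page %llu of the slot is dirty\n",
                                       (unsigned long long)i);
        free(bitmap);
}
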
955 bool kvm_largepages_enabled(void) 954 bool kvm_largepages_enabled(void)
956 { 955 {
957 return largepages_enabled; 956 return largepages_enabled;
958 } 957 }
959 958
960 void kvm_disable_largepages(void) 959 void kvm_disable_largepages(void)
961 { 960 {
962 largepages_enabled = false; 961 largepages_enabled = false;
963 } 962 }
964 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 963 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
965 964
966 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 965 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
967 { 966 {
968 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 967 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
969 } 968 }
970 EXPORT_SYMBOL_GPL(gfn_to_memslot); 969 EXPORT_SYMBOL_GPL(gfn_to_memslot);
971 970
972 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 971 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
973 { 972 {
974 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 973 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
975 974
976 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 975 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
977 memslot->flags & KVM_MEMSLOT_INVALID) 976 memslot->flags & KVM_MEMSLOT_INVALID)
978 return 0; 977 return 0;
979 978
980 return 1; 979 return 1;
981 } 980 }
982 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 981 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
983 982
984 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 983 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
985 { 984 {
986 struct vm_area_struct *vma; 985 struct vm_area_struct *vma;
987 unsigned long addr, size; 986 unsigned long addr, size;
988 987
989 size = PAGE_SIZE; 988 size = PAGE_SIZE;
990 989
991 addr = gfn_to_hva(kvm, gfn); 990 addr = gfn_to_hva(kvm, gfn);
992 if (kvm_is_error_hva(addr)) 991 if (kvm_is_error_hva(addr))
993 return PAGE_SIZE; 992 return PAGE_SIZE;
994 993
995 down_read(&current->mm->mmap_sem); 994 down_read(&current->mm->mmap_sem);
996 vma = find_vma(current->mm, addr); 995 vma = find_vma(current->mm, addr);
997 if (!vma) 996 if (!vma)
998 goto out; 997 goto out;
999 998
1000 size = vma_kernel_pagesize(vma); 999 size = vma_kernel_pagesize(vma);
1001 1000
1002 out: 1001 out:
1003 up_read(&current->mm->mmap_sem); 1002 up_read(&current->mm->mmap_sem);
1004 1003
1005 return size; 1004 return size;
1006 } 1005 }
1007 1006
1008 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1007 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1009 { 1008 {
1010 return slot->flags & KVM_MEM_READONLY; 1009 return slot->flags & KVM_MEM_READONLY;
1011 } 1010 }
1012 1011
1013 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1012 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1014 gfn_t *nr_pages, bool write) 1013 gfn_t *nr_pages, bool write)
1015 { 1014 {
1016 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1015 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1017 return KVM_HVA_ERR_BAD; 1016 return KVM_HVA_ERR_BAD;
1018 1017
1019 if (memslot_is_readonly(slot) && write) 1018 if (memslot_is_readonly(slot) && write)
1020 return KVM_HVA_ERR_RO_BAD; 1019 return KVM_HVA_ERR_RO_BAD;
1021 1020
1022 if (nr_pages) 1021 if (nr_pages)
1023 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1022 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1024 1023
1025 return __gfn_to_hva_memslot(slot, gfn); 1024 return __gfn_to_hva_memslot(slot, gfn);
1026 } 1025 }
1027 1026
1028 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1027 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1029 gfn_t *nr_pages) 1028 gfn_t *nr_pages)
1030 { 1029 {
1031 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1030 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1032 } 1031 }
1033 1032
1034 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1033 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1035 gfn_t gfn) 1034 gfn_t gfn)
1036 { 1035 {
1037 return gfn_to_hva_many(slot, gfn, NULL); 1036 return gfn_to_hva_many(slot, gfn, NULL);
1038 } 1037 }
1039 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1038 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1040 1039
1041 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1040 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1042 { 1041 {
1043 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1042 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1044 } 1043 }
1045 EXPORT_SYMBOL_GPL(gfn_to_hva); 1044 EXPORT_SYMBOL_GPL(gfn_to_hva);
1046 1045
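The translation behind gfn_to_hva() and gfn_to_hva_many() is plain offset arithmetic within the memslot; __gfn_to_hva_memslot() (a helper defined in kvm_host.h, not shown in this hunk) boils down to the following sketch:

/* gfn -> hva within one slot: offset the slot's userspace mapping by the
 * page distance from the slot's first gfn. */
static unsigned long example_gfn_to_hva(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
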
1047 /* 1046 /*
1048 * The hva returned by this function is only allowed to be read. 1047 * The hva returned by this function is only allowed to be read.
1049 * It should pair with kvm_read_hva() or kvm_read_hva_atomic(). 1048 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1050 */ 1049 */
1051 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) 1050 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1052 { 1051 {
1053 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); 1052 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
1054 } 1053 }
1055 1054
1056 static int kvm_read_hva(void *data, void __user *hva, int len) 1055 static int kvm_read_hva(void *data, void __user *hva, int len)
1057 { 1056 {
1058 return __copy_from_user(data, hva, len); 1057 return __copy_from_user(data, hva, len);
1059 } 1058 }
1060 1059
1061 static int kvm_read_hva_atomic(void *data, void __user *hva, int len) 1060 static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1062 { 1061 {
1063 return __copy_from_user_inatomic(data, hva, len); 1062 return __copy_from_user_inatomic(data, hva, len);
1064 } 1063 }
1065 1064
1066 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1065 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1067 unsigned long start, int write, struct page **page) 1066 unsigned long start, int write, struct page **page)
1068 { 1067 {
1069 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1068 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1070 1069
1071 if (write) 1070 if (write)
1072 flags |= FOLL_WRITE; 1071 flags |= FOLL_WRITE;
1073 1072
1074 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1073 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1075 } 1074 }
1076 1075
1077 static inline int check_user_page_hwpoison(unsigned long addr) 1076 static inline int check_user_page_hwpoison(unsigned long addr)
1078 { 1077 {
1079 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1078 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1080 1079
1081 rc = __get_user_pages(current, current->mm, addr, 1, 1080 rc = __get_user_pages(current, current->mm, addr, 1,
1082 flags, NULL, NULL, NULL); 1081 flags, NULL, NULL, NULL);
1083 return rc == -EHWPOISON; 1082 return rc == -EHWPOISON;
1084 } 1083 }
1085 1084
1086 /* 1085 /*
1087 * The atomic path to get the writable pfn which will be stored in @pfn, 1086 * The atomic path to get the writable pfn which will be stored in @pfn,
1088 * true indicates success, otherwise false is returned. 1087 * true indicates success, otherwise false is returned.
1089 */ 1088 */
1090 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1089 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1091 bool write_fault, bool *writable, pfn_t *pfn) 1090 bool write_fault, bool *writable, pfn_t *pfn)
1092 { 1091 {
1093 struct page *page[1]; 1092 struct page *page[1];
1094 int npages; 1093 int npages;
1095 1094
1096 if (!(async || atomic)) 1095 if (!(async || atomic))
1097 return false; 1096 return false;
1098 1097
1099 /* 1098 /*
1100 * Fast pin a writable pfn only if it is a write fault request 1099 * Fast pin a writable pfn only if it is a write fault request
1101 * or the caller allows to map a writable pfn for a read fault 1100 * or the caller allows to map a writable pfn for a read fault
1102 * request. 1101 * request.
1103 */ 1102 */
1104 if (!(write_fault || writable)) 1103 if (!(write_fault || writable))
1105 return false; 1104 return false;
1106 1105
1107 npages = __get_user_pages_fast(addr, 1, 1, page); 1106 npages = __get_user_pages_fast(addr, 1, 1, page);
1108 if (npages == 1) { 1107 if (npages == 1) {
1109 *pfn = page_to_pfn(page[0]); 1108 *pfn = page_to_pfn(page[0]);
1110 1109
1111 if (writable) 1110 if (writable)
1112 *writable = true; 1111 *writable = true;
1113 return true; 1112 return true;
1114 } 1113 }
1115 1114
1116 return false; 1115 return false;
1117 } 1116 }
1118 1117
1119 /* 1118 /*
1120 * The slow path to get the pfn of the specified host virtual address, 1119 * The slow path to get the pfn of the specified host virtual address,
1121 * 1 indicates success, -errno is returned if error is detected. 1120 * 1 indicates success, -errno is returned if error is detected.
1122 */ 1121 */
1123 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1122 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1124 bool *writable, pfn_t *pfn) 1123 bool *writable, pfn_t *pfn)
1125 { 1124 {
1126 struct page *page[1]; 1125 struct page *page[1];
1127 int npages = 0; 1126 int npages = 0;
1128 1127
1129 might_sleep(); 1128 might_sleep();
1130 1129
1131 if (writable) 1130 if (writable)
1132 *writable = write_fault; 1131 *writable = write_fault;
1133 1132
1134 if (async) { 1133 if (async) {
1135 down_read(&current->mm->mmap_sem); 1134 down_read(&current->mm->mmap_sem);
1136 npages = get_user_page_nowait(current, current->mm, 1135 npages = get_user_page_nowait(current, current->mm,
1137 addr, write_fault, page); 1136 addr, write_fault, page);
1138 up_read(&current->mm->mmap_sem); 1137 up_read(&current->mm->mmap_sem);
1139 } else 1138 } else
1140 npages = get_user_pages_fast(addr, 1, write_fault, 1139 npages = get_user_pages_fast(addr, 1, write_fault,
1141 page); 1140 page);
1142 if (npages != 1) 1141 if (npages != 1)
1143 return npages; 1142 return npages;
1144 1143
1145 /* map read fault as writable if possible */ 1144 /* map read fault as writable if possible */
1146 if (unlikely(!write_fault) && writable) { 1145 if (unlikely(!write_fault) && writable) {
1147 struct page *wpage[1]; 1146 struct page *wpage[1];
1148 1147
1149 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1148 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1150 if (npages == 1) { 1149 if (npages == 1) {
1151 *writable = true; 1150 *writable = true;
1152 put_page(page[0]); 1151 put_page(page[0]);
1153 page[0] = wpage[0]; 1152 page[0] = wpage[0];
1154 } 1153 }
1155 1154
1156 npages = 1; 1155 npages = 1;
1157 } 1156 }
1158 *pfn = page_to_pfn(page[0]); 1157 *pfn = page_to_pfn(page[0]);
1159 return npages; 1158 return npages;
1160 } 1159 }
1161 1160
1162 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1161 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1163 { 1162 {
1164 if (unlikely(!(vma->vm_flags & VM_READ))) 1163 if (unlikely(!(vma->vm_flags & VM_READ)))
1165 return false; 1164 return false;
1166 1165
1167 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1166 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1168 return false; 1167 return false;
1169 1168
1170 return true; 1169 return true;
1171 } 1170 }
1172 1171
1173 /* 1172 /*
1174 * Pin guest page in memory and return its pfn. 1173 * Pin guest page in memory and return its pfn.
1175 * @addr: host virtual address which maps memory to the guest 1174 * @addr: host virtual address which maps memory to the guest
1176 * @atomic: whether this function can sleep 1175 * @atomic: whether this function can sleep
1177 * @async: whether this function needs to wait for IO to complete if the 1176 * @async: whether this function needs to wait for IO to complete if the
1178 * host page is not in memory 1177 * host page is not in memory
1179 * @write_fault: whether we should get a writable host page 1178 * @write_fault: whether we should get a writable host page
1180 * @writable: whether mapping a writable host page is allowed for !@write_fault 1179 * @writable: whether mapping a writable host page is allowed for !@write_fault
1181 * 1180 *
1182 * The function will map a writable host page for these two cases: 1181 * The function will map a writable host page for these two cases:
1183 * 1): @write_fault = true 1182 * 1): @write_fault = true
1184 * 2): @write_fault = false && @writable, @writable will tell the caller 1183 * 2): @write_fault = false && @writable, @writable will tell the caller
1185 * whether the mapping is writable. 1184 * whether the mapping is writable.
1186 */ 1185 */
1187 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1186 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1188 bool write_fault, bool *writable) 1187 bool write_fault, bool *writable)
1189 { 1188 {
1190 struct vm_area_struct *vma; 1189 struct vm_area_struct *vma;
1191 pfn_t pfn = 0; 1190 pfn_t pfn = 0;
1192 int npages; 1191 int npages;
1193 1192
1194 /* we can do it either atomically or asynchronously, not both */ 1193 /* we can do it either atomically or asynchronously, not both */
1195 BUG_ON(atomic && async); 1194 BUG_ON(atomic && async);
1196 1195
1197 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1196 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1198 return pfn; 1197 return pfn;
1199 1198
1200 if (atomic) 1199 if (atomic)
1201 return KVM_PFN_ERR_FAULT; 1200 return KVM_PFN_ERR_FAULT;
1202 1201
1203 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1202 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1204 if (npages == 1) 1203 if (npages == 1)
1205 return pfn; 1204 return pfn;
1206 1205
1207 down_read(&current->mm->mmap_sem); 1206 down_read(&current->mm->mmap_sem);
1208 if (npages == -EHWPOISON || 1207 if (npages == -EHWPOISON ||
1209 (!async && check_user_page_hwpoison(addr))) { 1208 (!async && check_user_page_hwpoison(addr))) {
1210 pfn = KVM_PFN_ERR_HWPOISON; 1209 pfn = KVM_PFN_ERR_HWPOISON;
1211 goto exit; 1210 goto exit;
1212 } 1211 }
1213 1212
1214 vma = find_vma_intersection(current->mm, addr, addr + 1); 1213 vma = find_vma_intersection(current->mm, addr, addr + 1);
1215 1214
1216 if (vma == NULL) 1215 if (vma == NULL)
1217 pfn = KVM_PFN_ERR_FAULT; 1216 pfn = KVM_PFN_ERR_FAULT;
1218 else if ((vma->vm_flags & VM_PFNMAP)) { 1217 else if ((vma->vm_flags & VM_PFNMAP)) {
1219 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1218 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1220 vma->vm_pgoff; 1219 vma->vm_pgoff;
1221 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1220 BUG_ON(!kvm_is_mmio_pfn(pfn));
1222 } else { 1221 } else {
1223 if (async && vma_is_valid(vma, write_fault)) 1222 if (async && vma_is_valid(vma, write_fault))
1224 *async = true; 1223 *async = true;
1225 pfn = KVM_PFN_ERR_FAULT; 1224 pfn = KVM_PFN_ERR_FAULT;
1226 } 1225 }
1227 exit: 1226 exit:
1228 up_read(&current->mm->mmap_sem); 1227 up_read(&current->mm->mmap_sem);
1229 return pfn; 1228 return pfn;
1230 } 1229 }
1231 1230
1232 static pfn_t 1231 static pfn_t
1233 __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1232 __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1234 bool *async, bool write_fault, bool *writable) 1233 bool *async, bool write_fault, bool *writable)
1235 { 1234 {
1236 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1235 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1237 1236
1238 if (addr == KVM_HVA_ERR_RO_BAD) 1237 if (addr == KVM_HVA_ERR_RO_BAD)
1239 return KVM_PFN_ERR_RO_FAULT; 1238 return KVM_PFN_ERR_RO_FAULT;
1240 1239
1241 if (kvm_is_error_hva(addr)) 1240 if (kvm_is_error_hva(addr))
1242 return KVM_PFN_NOSLOT; 1241 return KVM_PFN_NOSLOT;
1243 1242
1244 /* Do not map writable pfn in the readonly memslot. */ 1243 /* Do not map writable pfn in the readonly memslot. */
1245 if (writable && memslot_is_readonly(slot)) { 1244 if (writable && memslot_is_readonly(slot)) {
1246 *writable = false; 1245 *writable = false;
1247 writable = NULL; 1246 writable = NULL;
1248 } 1247 }
1249 1248
1250 return hva_to_pfn(addr, atomic, async, write_fault, 1249 return hva_to_pfn(addr, atomic, async, write_fault,
1251 writable); 1250 writable);
1252 } 1251 }
1253 1252
1254 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1253 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1255 bool write_fault, bool *writable) 1254 bool write_fault, bool *writable)
1256 { 1255 {
1257 struct kvm_memory_slot *slot; 1256 struct kvm_memory_slot *slot;
1258 1257
1259 if (async) 1258 if (async)
1260 *async = false; 1259 *async = false;
1261 1260
1262 slot = gfn_to_memslot(kvm, gfn); 1261 slot = gfn_to_memslot(kvm, gfn);
1263 1262
1264 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, 1263 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1265 writable); 1264 writable);
1266 } 1265 }
1267 1266
1268 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1267 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1269 { 1268 {
1270 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); 1269 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1271 } 1270 }
1272 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1271 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1273 1272
1274 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 1273 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1275 bool write_fault, bool *writable) 1274 bool write_fault, bool *writable)
1276 { 1275 {
1277 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); 1276 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1278 } 1277 }
1279 EXPORT_SYMBOL_GPL(gfn_to_pfn_async); 1278 EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1280 1279
1281 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1280 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1282 { 1281 {
1283 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); 1282 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1284 } 1283 }
1285 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1284 EXPORT_SYMBOL_GPL(gfn_to_pfn);
1286 1285
1287 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1286 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1288 bool *writable) 1287 bool *writable)
1289 { 1288 {
1290 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); 1289 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1291 } 1290 }
1292 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1291 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1293 1292
1294 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1293 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1295 { 1294 {
1296 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1295 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1297 } 1296 }
1298 1297
1299 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1298 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1300 { 1299 {
1301 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1300 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1302 } 1301 }
1303 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1302 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1304 1303
1305 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1304 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1306 int nr_pages) 1305 int nr_pages)
1307 { 1306 {
1308 unsigned long addr; 1307 unsigned long addr;
1309 gfn_t entry; 1308 gfn_t entry;
1310 1309
1311 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); 1310 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1312 if (kvm_is_error_hva(addr)) 1311 if (kvm_is_error_hva(addr))
1313 return -1; 1312 return -1;
1314 1313
1315 if (entry < nr_pages) 1314 if (entry < nr_pages)
1316 return 0; 1315 return 0;
1317 1316
1318 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1317 return __get_user_pages_fast(addr, nr_pages, 1, pages);
1319 } 1318 }
1320 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1319 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1321 1320
1322 static struct page *kvm_pfn_to_page(pfn_t pfn) 1321 static struct page *kvm_pfn_to_page(pfn_t pfn)
1323 { 1322 {
1324 if (is_error_noslot_pfn(pfn)) 1323 if (is_error_noslot_pfn(pfn))
1325 return KVM_ERR_PTR_BAD_PAGE; 1324 return KVM_ERR_PTR_BAD_PAGE;
1326 1325
1327 if (kvm_is_mmio_pfn(pfn)) { 1326 if (kvm_is_mmio_pfn(pfn)) {
1328 WARN_ON(1); 1327 WARN_ON(1);
1329 return KVM_ERR_PTR_BAD_PAGE; 1328 return KVM_ERR_PTR_BAD_PAGE;
1330 } 1329 }
1331 1330
1332 return pfn_to_page(pfn); 1331 return pfn_to_page(pfn);
1333 } 1332 }
1334 1333
1335 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1334 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1336 { 1335 {
1337 pfn_t pfn; 1336 pfn_t pfn;
1338 1337
1339 pfn = gfn_to_pfn(kvm, gfn); 1338 pfn = gfn_to_pfn(kvm, gfn);
1340 1339
1341 return kvm_pfn_to_page(pfn); 1340 return kvm_pfn_to_page(pfn);
1342 } 1341 }
1343 1342
1344 EXPORT_SYMBOL_GPL(gfn_to_page); 1343 EXPORT_SYMBOL_GPL(gfn_to_page);
1345 1344
1346 void kvm_release_page_clean(struct page *page) 1345 void kvm_release_page_clean(struct page *page)
1347 { 1346 {
1348 WARN_ON(is_error_page(page)); 1347 WARN_ON(is_error_page(page));
1349 1348
1350 kvm_release_pfn_clean(page_to_pfn(page)); 1349 kvm_release_pfn_clean(page_to_pfn(page));
1351 } 1350 }
1352 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1351 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1353 1352
1354 void kvm_release_pfn_clean(pfn_t pfn) 1353 void kvm_release_pfn_clean(pfn_t pfn)
1355 { 1354 {
1356 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1355 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
1357 put_page(pfn_to_page(pfn)); 1356 put_page(pfn_to_page(pfn));
1358 } 1357 }
1359 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1358 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1360 1359
1361 void kvm_release_page_dirty(struct page *page) 1360 void kvm_release_page_dirty(struct page *page)
1362 { 1361 {
1363 WARN_ON(is_error_page(page)); 1362 WARN_ON(is_error_page(page));
1364 1363
1365 kvm_release_pfn_dirty(page_to_pfn(page)); 1364 kvm_release_pfn_dirty(page_to_pfn(page));
1366 } 1365 }
1367 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1366 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1368 1367
1369 void kvm_release_pfn_dirty(pfn_t pfn) 1368 void kvm_release_pfn_dirty(pfn_t pfn)
1370 { 1369 {
1371 kvm_set_pfn_dirty(pfn); 1370 kvm_set_pfn_dirty(pfn);
1372 kvm_release_pfn_clean(pfn); 1371 kvm_release_pfn_clean(pfn);
1373 } 1372 }
1374 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1373 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1375 1374
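The release helpers above pair with the gfn_to_pfn() family: every successful lookup takes a reference on the page, so the caller must drop it, marking the page dirty only if it actually wrote. A hypothetical caller (sketch only, names invented):

/* Pin a guest page, use it, and release it clean or dirty. */
static int touch_guest_page(struct kvm *kvm, gfn_t gfn, bool wrote)
{
        pfn_t pfn = gfn_to_pfn(kvm, gfn);

        if (is_error_noslot_pfn(pfn))
                return -EFAULT;
        /* ... access the page here, e.g. via pfn_to_page()/kmap() ... */
        if (wrote)
                kvm_release_pfn_dirty(pfn);
        else
                kvm_release_pfn_clean(pfn);
        return 0;
}
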
1376 void kvm_set_page_dirty(struct page *page) 1375 void kvm_set_page_dirty(struct page *page)
1377 { 1376 {
1378 kvm_set_pfn_dirty(page_to_pfn(page)); 1377 kvm_set_pfn_dirty(page_to_pfn(page));
1379 } 1378 }
1380 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1379 EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1381 1380
1382 void kvm_set_pfn_dirty(pfn_t pfn) 1381 void kvm_set_pfn_dirty(pfn_t pfn)
1383 { 1382 {
1384 if (!kvm_is_mmio_pfn(pfn)) { 1383 if (!kvm_is_mmio_pfn(pfn)) {
1385 struct page *page = pfn_to_page(pfn); 1384 struct page *page = pfn_to_page(pfn);
1386 if (!PageReserved(page)) 1385 if (!PageReserved(page))
1387 SetPageDirty(page); 1386 SetPageDirty(page);
1388 } 1387 }
1389 } 1388 }
1390 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1389 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1391 1390
1392 void kvm_set_pfn_accessed(pfn_t pfn) 1391 void kvm_set_pfn_accessed(pfn_t pfn)
1393 { 1392 {
1394 if (!kvm_is_mmio_pfn(pfn)) 1393 if (!kvm_is_mmio_pfn(pfn))
1395 mark_page_accessed(pfn_to_page(pfn)); 1394 mark_page_accessed(pfn_to_page(pfn));
1396 } 1395 }
1397 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1396 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1398 1397
1399 void kvm_get_pfn(pfn_t pfn) 1398 void kvm_get_pfn(pfn_t pfn)
1400 { 1399 {
1401 if (!kvm_is_mmio_pfn(pfn)) 1400 if (!kvm_is_mmio_pfn(pfn))
1402 get_page(pfn_to_page(pfn)); 1401 get_page(pfn_to_page(pfn));
1403 } 1402 }
1404 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1403 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1405 1404
1406 static int next_segment(unsigned long len, int offset) 1405 static int next_segment(unsigned long len, int offset)
1407 { 1406 {
1408 if (len > PAGE_SIZE - offset) 1407 if (len > PAGE_SIZE - offset)
1409 return PAGE_SIZE - offset; 1408 return PAGE_SIZE - offset;
1410 else 1409 else
1411 return len; 1410 return len;
1412 } 1411 }
1413 1412
1414 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1413 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1415 int len) 1414 int len)
1416 { 1415 {
1417 int r; 1416 int r;
1418 unsigned long addr; 1417 unsigned long addr;
1419 1418
1420 addr = gfn_to_hva_read(kvm, gfn); 1419 addr = gfn_to_hva_read(kvm, gfn);
1421 if (kvm_is_error_hva(addr)) 1420 if (kvm_is_error_hva(addr))
1422 return -EFAULT; 1421 return -EFAULT;
1423 r = kvm_read_hva(data, (void __user *)addr + offset, len); 1422 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1424 if (r) 1423 if (r)
1425 return -EFAULT; 1424 return -EFAULT;
1426 return 0; 1425 return 0;
1427 } 1426 }
1428 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1427 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1429 1428
1430 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1429 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1431 { 1430 {
1432 gfn_t gfn = gpa >> PAGE_SHIFT; 1431 gfn_t gfn = gpa >> PAGE_SHIFT;
1433 int seg; 1432 int seg;
1434 int offset = offset_in_page(gpa); 1433 int offset = offset_in_page(gpa);
1435 int ret; 1434 int ret;
1436 1435
1437 while ((seg = next_segment(len, offset)) != 0) { 1436 while ((seg = next_segment(len, offset)) != 0) {
1438 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1437 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1439 if (ret < 0) 1438 if (ret < 0)
1440 return ret; 1439 return ret;
1441 offset = 0; 1440 offset = 0;
1442 len -= seg; 1441 len -= seg;
1443 data += seg; 1442 data += seg;
1444 ++gfn; 1443 ++gfn;
1445 } 1444 }
1446 return 0; 1445 return 0;
1447 } 1446 }
1448 EXPORT_SYMBOL_GPL(kvm_read_guest); 1447 EXPORT_SYMBOL_GPL(kvm_read_guest);
1449 1448
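Worked example of the chunking above, assuming 4 KiB pages: a 5000-byte read starting 3000 bytes into a page is split by next_segment() into a 1096-byte segment that finishes the first page, then a 3904-byte segment at offset 0 of the following gfn, after which next_segment() returns 0 and the loop ends.
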
1450 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1449 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1451 unsigned long len) 1450 unsigned long len)
1452 { 1451 {
1453 int r; 1452 int r;
1454 unsigned long addr; 1453 unsigned long addr;
1455 gfn_t gfn = gpa >> PAGE_SHIFT; 1454 gfn_t gfn = gpa >> PAGE_SHIFT;
1456 int offset = offset_in_page(gpa); 1455 int offset = offset_in_page(gpa);
1457 1456
1458 addr = gfn_to_hva_read(kvm, gfn); 1457 addr = gfn_to_hva_read(kvm, gfn);
1459 if (kvm_is_error_hva(addr)) 1458 if (kvm_is_error_hva(addr))
1460 return -EFAULT; 1459 return -EFAULT;
1461 pagefault_disable(); 1460 pagefault_disable();
1462 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); 1461 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1463 pagefault_enable(); 1462 pagefault_enable();
1464 if (r) 1463 if (r)
1465 return -EFAULT; 1464 return -EFAULT;
1466 return 0; 1465 return 0;
1467 } 1466 }
1468 EXPORT_SYMBOL(kvm_read_guest_atomic); 1467 EXPORT_SYMBOL(kvm_read_guest_atomic);
1469 1468
1470 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1469 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1471 int offset, int len) 1470 int offset, int len)
1472 { 1471 {
1473 int r; 1472 int r;
1474 unsigned long addr; 1473 unsigned long addr;
1475 1474
1476 addr = gfn_to_hva(kvm, gfn); 1475 addr = gfn_to_hva(kvm, gfn);
1477 if (kvm_is_error_hva(addr)) 1476 if (kvm_is_error_hva(addr))
1478 return -EFAULT; 1477 return -EFAULT;
1479 r = __copy_to_user((void __user *)addr + offset, data, len); 1478 r = __copy_to_user((void __user *)addr + offset, data, len);
1480 if (r) 1479 if (r)
1481 return -EFAULT; 1480 return -EFAULT;
1482 mark_page_dirty(kvm, gfn); 1481 mark_page_dirty(kvm, gfn);
1483 return 0; 1482 return 0;
1484 } 1483 }
1485 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1484 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1486 1485
1487 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1486 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1488 unsigned long len) 1487 unsigned long len)
1489 { 1488 {
1490 gfn_t gfn = gpa >> PAGE_SHIFT; 1489 gfn_t gfn = gpa >> PAGE_SHIFT;
1491 int seg; 1490 int seg;
1492 int offset = offset_in_page(gpa); 1491 int offset = offset_in_page(gpa);
1493 int ret; 1492 int ret;
1494 1493
1495 while ((seg = next_segment(len, offset)) != 0) { 1494 while ((seg = next_segment(len, offset)) != 0) {
1496 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1495 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1497 if (ret < 0) 1496 if (ret < 0)
1498 return ret; 1497 return ret;
1499 offset = 0; 1498 offset = 0;
1500 len -= seg; 1499 len -= seg;
1501 data += seg; 1500 data += seg;
1502 ++gfn; 1501 ++gfn;
1503 } 1502 }
1504 return 0; 1503 return 0;
1505 } 1504 }
1506 1505
1507 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1506 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1508 gpa_t gpa) 1507 gpa_t gpa)
1509 { 1508 {
1510 struct kvm_memslots *slots = kvm_memslots(kvm); 1509 struct kvm_memslots *slots = kvm_memslots(kvm);
1511 int offset = offset_in_page(gpa); 1510 int offset = offset_in_page(gpa);
1512 gfn_t gfn = gpa >> PAGE_SHIFT; 1511 gfn_t gfn = gpa >> PAGE_SHIFT;
1513 1512
1514 ghc->gpa = gpa; 1513 ghc->gpa = gpa;
1515 ghc->generation = slots->generation; 1514 ghc->generation = slots->generation;
1516 ghc->memslot = gfn_to_memslot(kvm, gfn); 1515 ghc->memslot = gfn_to_memslot(kvm, gfn);
1517 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1516 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1518 if (!kvm_is_error_hva(ghc->hva)) 1517 if (!kvm_is_error_hva(ghc->hva))
1519 ghc->hva += offset; 1518 ghc->hva += offset;
1520 else 1519 else
1521 return -EFAULT; 1520 return -EFAULT;
1522 1521
1523 return 0; 1522 return 0;
1524 } 1523 }
1525 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1524 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1526 1525
1527 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1526 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1528 void *data, unsigned long len) 1527 void *data, unsigned long len)
1529 { 1528 {
1530 struct kvm_memslots *slots = kvm_memslots(kvm); 1529 struct kvm_memslots *slots = kvm_memslots(kvm);
1531 int r; 1530 int r;
1532 1531
1533 if (slots->generation != ghc->generation) 1532 if (slots->generation != ghc->generation)
1534 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1533 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1535 1534
1536 if (kvm_is_error_hva(ghc->hva)) 1535 if (kvm_is_error_hva(ghc->hva))
1537 return -EFAULT; 1536 return -EFAULT;
1538 1537
1539 r = __copy_to_user((void __user *)ghc->hva, data, len); 1538 r = __copy_to_user((void __user *)ghc->hva, data, len);
1540 if (r) 1539 if (r)
1541 return -EFAULT; 1540 return -EFAULT;
1542 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1541 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1543 1542
1544 return 0; 1543 return 0;
1545 } 1544 }
1546 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1545 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1547 1546
1548 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1547 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1549 void *data, unsigned long len) 1548 void *data, unsigned long len)
1550 { 1549 {
1551 struct kvm_memslots *slots = kvm_memslots(kvm); 1550 struct kvm_memslots *slots = kvm_memslots(kvm);
1552 int r; 1551 int r;
1553 1552
1554 if (slots->generation != ghc->generation) 1553 if (slots->generation != ghc->generation)
1555 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1554 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1556 1555
1557 if (kvm_is_error_hva(ghc->hva)) 1556 if (kvm_is_error_hva(ghc->hva))
1558 return -EFAULT; 1557 return -EFAULT;
1559 1558
1560 r = __copy_from_user(data, (void __user *)ghc->hva, len); 1559 r = __copy_from_user(data, (void __user *)ghc->hva, len);
1561 if (r) 1560 if (r)
1562 return -EFAULT; 1561 return -EFAULT;
1563 1562
1564 return 0; 1563 return 0;
1565 } 1564 }
1566 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 1565 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
1567 1566
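The *_cached variants above let a caller resolve the gpa once and skip gfn_to_memslot() on the hot path; the generation check re-initializes the cache whenever the memslot layout has changed. A hypothetical in-kernel caller (names invented for illustration) would use them like this sketch:

/* Publish an 8-byte status word to the guest without re-resolving the
 * memslot on every update. */
static struct gfn_to_hva_cache status_ghc;

static int status_init(struct kvm *kvm, gpa_t gpa)
{
        return kvm_gfn_to_hva_cache_init(kvm, &status_ghc, gpa);
}

static int status_update(struct kvm *kvm, u64 val)
{
        return kvm_write_guest_cached(kvm, &status_ghc, &val, sizeof(val));
}
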
1568 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1567 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1569 { 1568 {
1570 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, 1569 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1571 offset, len); 1570 offset, len);
1572 } 1571 }
1573 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1572 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1574 1573
1575 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1574 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1576 { 1575 {
1577 gfn_t gfn = gpa >> PAGE_SHIFT; 1576 gfn_t gfn = gpa >> PAGE_SHIFT;
1578 int seg; 1577 int seg;
1579 int offset = offset_in_page(gpa); 1578 int offset = offset_in_page(gpa);
1580 int ret; 1579 int ret;
1581 1580
1582 while ((seg = next_segment(len, offset)) != 0) { 1581 while ((seg = next_segment(len, offset)) != 0) {
1583 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1582 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1584 if (ret < 0) 1583 if (ret < 0)
1585 return ret; 1584 return ret;
1586 offset = 0; 1585 offset = 0;
1587 len -= seg; 1586 len -= seg;
1588 ++gfn; 1587 ++gfn;
1589 } 1588 }
1590 return 0; 1589 return 0;
1591 } 1590 }
1592 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1591 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1593 1592
1594 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, 1593 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1595 gfn_t gfn) 1594 gfn_t gfn)
1596 { 1595 {
1597 if (memslot && memslot->dirty_bitmap) { 1596 if (memslot && memslot->dirty_bitmap) {
1598 unsigned long rel_gfn = gfn - memslot->base_gfn; 1597 unsigned long rel_gfn = gfn - memslot->base_gfn;
1599 1598
1600 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1599 set_bit_le(rel_gfn, memslot->dirty_bitmap);
1601 } 1600 }
1602 } 1601 }
1603 1602
1604 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1603 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1605 { 1604 {
1606 struct kvm_memory_slot *memslot; 1605 struct kvm_memory_slot *memslot;
1607 1606
1608 memslot = gfn_to_memslot(kvm, gfn); 1607 memslot = gfn_to_memslot(kvm, gfn);
1609 mark_page_dirty_in_slot(kvm, memslot, gfn); 1608 mark_page_dirty_in_slot(kvm, memslot, gfn);
1610 } 1609 }
1611 1610
1612 /* 1611 /*
1613 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1612 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1614 */ 1613 */
1615 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1614 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1616 { 1615 {
1617 DEFINE_WAIT(wait); 1616 DEFINE_WAIT(wait);
1618 1617
1619 for (;;) { 1618 for (;;) {
1620 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1619 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1621 1620
1622 if (kvm_arch_vcpu_runnable(vcpu)) { 1621 if (kvm_arch_vcpu_runnable(vcpu)) {
1623 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1622 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1624 break; 1623 break;
1625 } 1624 }
1626 if (kvm_cpu_has_pending_timer(vcpu)) 1625 if (kvm_cpu_has_pending_timer(vcpu))
1627 break; 1626 break;
1628 if (signal_pending(current)) 1627 if (signal_pending(current))
1629 break; 1628 break;
1630 1629
1631 schedule(); 1630 schedule();
1632 } 1631 }
1633 1632
1634 finish_wait(&vcpu->wq, &wait); 1633 finish_wait(&vcpu->wq, &wait);
1635 } 1634 }
1636 1635
1637 #ifndef CONFIG_S390 1636 #ifndef CONFIG_S390
1638 /* 1637 /*
1639 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 1638 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
1640 */ 1639 */
1641 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 1640 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
1642 { 1641 {
1643 int me; 1642 int me;
1644 int cpu = vcpu->cpu; 1643 int cpu = vcpu->cpu;
1645 wait_queue_head_t *wqp; 1644 wait_queue_head_t *wqp;
1646 1645
1647 wqp = kvm_arch_vcpu_wq(vcpu); 1646 wqp = kvm_arch_vcpu_wq(vcpu);
1648 if (waitqueue_active(wqp)) { 1647 if (waitqueue_active(wqp)) {
1649 wake_up_interruptible(wqp); 1648 wake_up_interruptible(wqp);
1650 ++vcpu->stat.halt_wakeup; 1649 ++vcpu->stat.halt_wakeup;
1651 } 1650 }
1652 1651
1653 me = get_cpu(); 1652 me = get_cpu();
1654 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 1653 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
1655 if (kvm_arch_vcpu_should_kick(vcpu)) 1654 if (kvm_arch_vcpu_should_kick(vcpu))
1656 smp_send_reschedule(cpu); 1655 smp_send_reschedule(cpu);
1657 put_cpu(); 1656 put_cpu();
1658 } 1657 }
1659 #endif /* !CONFIG_S390 */ 1658 #endif /* !CONFIG_S390 */
1660 1659
1661 void kvm_resched(struct kvm_vcpu *vcpu) 1660 void kvm_resched(struct kvm_vcpu *vcpu)
1662 { 1661 {
1663 if (!need_resched()) 1662 if (!need_resched())
1664 return; 1663 return;
1665 cond_resched(); 1664 cond_resched();
1666 } 1665 }
1667 EXPORT_SYMBOL_GPL(kvm_resched); 1666 EXPORT_SYMBOL_GPL(kvm_resched);
1668 1667
1669 bool kvm_vcpu_yield_to(struct kvm_vcpu *target) 1668 bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1670 { 1669 {
1671 struct pid *pid; 1670 struct pid *pid;
1672 struct task_struct *task = NULL; 1671 struct task_struct *task = NULL;
1673 1672
1674 rcu_read_lock(); 1673 rcu_read_lock();
1675 pid = rcu_dereference(target->pid); 1674 pid = rcu_dereference(target->pid);
1676 if (pid) 1675 if (pid)
1677 task = get_pid_task(target->pid, PIDTYPE_PID); 1676 task = get_pid_task(target->pid, PIDTYPE_PID);
1678 rcu_read_unlock(); 1677 rcu_read_unlock();
1679 if (!task) 1678 if (!task)
1680 return false; 1679 return false;
1681 if (task->flags & PF_VCPU) { 1680 if (task->flags & PF_VCPU) {
1682 put_task_struct(task); 1681 put_task_struct(task);
1683 return false; 1682 return false;
1684 } 1683 }
1685 if (yield_to(task, 1)) { 1684 if (yield_to(task, 1)) {
1686 put_task_struct(task); 1685 put_task_struct(task);
1687 return true; 1686 return true;
1688 } 1687 }
1689 put_task_struct(task); 1688 put_task_struct(task);
1690 return false; 1689 return false;
1691 } 1690 }
1692 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1691 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1693 1692
1694 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 1693 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1695 /* 1694 /*
1696 * Helper that checks whether a VCPU is eligible for directed yield. 1695 * Helper that checks whether a VCPU is eligible for directed yield.
1697 * The most eligible candidate to yield to is decided by the following heuristics: 1696 * The most eligible candidate to yield to is decided by the following heuristics:
1698 * 1697 *
1699 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 1698 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1700 * (preempted lock holder), indicated by @in_spin_loop. 1699 * (preempted lock holder), indicated by @in_spin_loop.
1701 * Set at the beginning and cleared at the end of the interception/PLE handler. 1700 * Set at the beginning and cleared at the end of the interception/PLE handler.
1702 * 1701 *
1703 * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a 1702 * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a
1704 * chance last time (it has mostly become eligible now since we have probably 1703 * chance last time (it has mostly become eligible now since we have probably
1705 * yielded to the lock holder in the last iteration. This is done by toggling 1704 * yielded to the lock holder in the last iteration. This is done by toggling
1706 * @dy_eligible each time a VCPU is checked for eligibility.) 1705 * @dy_eligible each time a VCPU is checked for eligibility.)
1707 * 1706 *
1708 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 1707 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1709 * to preempted lock-holder could result in wrong VCPU selection and CPU 1708 * to preempted lock-holder could result in wrong VCPU selection and CPU
1710 * burning. Giving priority for a potential lock-holder increases lock 1709 * burning. Giving priority for a potential lock-holder increases lock
1711 * progress. 1710 * progress.
1712 * 1711 *
1713 * Since algorithm is based on heuristics, accessing another VCPU data without 1712 * Since algorithm is based on heuristics, accessing another VCPU data without
1714 * locking does not harm. It may result in trying to yield to same VCPU, fail 1713 * locking does not harm. It may result in trying to yield to same VCPU, fail
1715 * and continue with next VCPU and so on. 1714 * and continue with next VCPU and so on.
1716 */ 1715 */
1717 bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 1716 bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1718 { 1717 {
1719 bool eligible; 1718 bool eligible;
1720 1719
1721 eligible = !vcpu->spin_loop.in_spin_loop || 1720 eligible = !vcpu->spin_loop.in_spin_loop ||
1722 (vcpu->spin_loop.in_spin_loop && 1721 (vcpu->spin_loop.in_spin_loop &&
1723 vcpu->spin_loop.dy_eligible); 1722 vcpu->spin_loop.dy_eligible);
1724 1723
1725 if (vcpu->spin_loop.in_spin_loop) 1724 if (vcpu->spin_loop.in_spin_loop)
1726 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 1725 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1727 1726
1728 return eligible; 1727 return eligible;
1729 } 1728 }
1730 #endif 1729 #endif
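Concretely: a VCPU that is not spinning itself (a potential preempted lock holder) is always eligible, while a spinning VCPU alternates between ineligible and eligible on successive checks because dy_eligible is toggled each time, so it is skipped at most every other round.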
1731 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1730 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1732 { 1731 {
1733 struct kvm *kvm = me->kvm; 1732 struct kvm *kvm = me->kvm;
1734 struct kvm_vcpu *vcpu; 1733 struct kvm_vcpu *vcpu;
1735 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1734 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1736 int yielded = 0; 1735 int yielded = 0;
1737 int pass; 1736 int pass;
1738 int i; 1737 int i;
1739 1738
1740 kvm_vcpu_set_in_spin_loop(me, true); 1739 kvm_vcpu_set_in_spin_loop(me, true);
1741 /* 1740 /*
1742 * We boost the priority of a VCPU that is runnable but not 1741 * We boost the priority of a VCPU that is runnable but not
1743 * currently running, because it got preempted by something 1742 * currently running, because it got preempted by something
1744 * else and called schedule in __vcpu_run. Hopefully that 1743 * else and called schedule in __vcpu_run. Hopefully that
1745 * VCPU is holding the lock that we need and will release it. 1744 * VCPU is holding the lock that we need and will release it.
1746 * We approximate round-robin by starting at the last boosted VCPU. 1745 * We approximate round-robin by starting at the last boosted VCPU.
1747 */ 1746 */
1748 for (pass = 0; pass < 2 && !yielded; pass++) { 1747 for (pass = 0; pass < 2 && !yielded; pass++) {
1749 kvm_for_each_vcpu(i, vcpu, kvm) { 1748 kvm_for_each_vcpu(i, vcpu, kvm) {
1750 if (!pass && i <= last_boosted_vcpu) { 1749 if (!pass && i <= last_boosted_vcpu) {
1751 i = last_boosted_vcpu; 1750 i = last_boosted_vcpu;
1752 continue; 1751 continue;
1753 } else if (pass && i > last_boosted_vcpu) 1752 } else if (pass && i > last_boosted_vcpu)
1754 break; 1753 break;
1755 if (vcpu == me) 1754 if (vcpu == me)
1756 continue; 1755 continue;
1757 if (waitqueue_active(&vcpu->wq)) 1756 if (waitqueue_active(&vcpu->wq))
1758 continue; 1757 continue;
1759 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1758 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1760 continue; 1759 continue;
1761 if (kvm_vcpu_yield_to(vcpu)) { 1760 if (kvm_vcpu_yield_to(vcpu)) {
1762 kvm->last_boosted_vcpu = i; 1761 kvm->last_boosted_vcpu = i;
1763 yielded = 1; 1762 yielded = 1;
1764 break; 1763 break;
1765 } 1764 }
1766 } 1765 }
1767 } 1766 }
1768 kvm_vcpu_set_in_spin_loop(me, false); 1767 kvm_vcpu_set_in_spin_loop(me, false);
1769 1768
1770 /* Ensure vcpu is not eligible during next spinloop */ 1769 /* Ensure vcpu is not eligible during next spinloop */
1771 kvm_vcpu_set_dy_eligible(me, false); 1770 kvm_vcpu_set_dy_eligible(me, false);
1772 } 1771 }
1773 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1772 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1774 1773
1775 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1774 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1776 { 1775 {
1777 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1776 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1778 struct page *page; 1777 struct page *page;
1779 1778
1780 if (vmf->pgoff == 0) 1779 if (vmf->pgoff == 0)
1781 page = virt_to_page(vcpu->run); 1780 page = virt_to_page(vcpu->run);
1782 #ifdef CONFIG_X86 1781 #ifdef CONFIG_X86
1783 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1782 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1784 page = virt_to_page(vcpu->arch.pio_data); 1783 page = virt_to_page(vcpu->arch.pio_data);
1785 #endif 1784 #endif
1786 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1785 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1787 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1786 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1788 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1787 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1789 #endif 1788 #endif
1790 else 1789 else
1791 return kvm_arch_vcpu_fault(vcpu, vmf); 1790 return kvm_arch_vcpu_fault(vcpu, vmf);
1792 get_page(page); 1791 get_page(page);
1793 vmf->page = page; 1792 vmf->page = page;
1794 return 0; 1793 return 0;
1795 } 1794 }
1796 1795
1797 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1796 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1798 .fault = kvm_vcpu_fault, 1797 .fault = kvm_vcpu_fault,
1799 }; 1798 };
1800 1799
1801 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1800 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1802 { 1801 {
1803 vma->vm_ops = &kvm_vcpu_vm_ops; 1802 vma->vm_ops = &kvm_vcpu_vm_ops;
1804 return 0; 1803 return 0;
1805 } 1804 }
1806 1805
1807 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1806 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1808 { 1807 {
1809 struct kvm_vcpu *vcpu = filp->private_data; 1808 struct kvm_vcpu *vcpu = filp->private_data;
1810 1809
1811 kvm_put_kvm(vcpu->kvm); 1810 kvm_put_kvm(vcpu->kvm);
1812 return 0; 1811 return 0;
1813 } 1812 }
1814 1813
1815 static struct file_operations kvm_vcpu_fops = { 1814 static struct file_operations kvm_vcpu_fops = {
1816 .release = kvm_vcpu_release, 1815 .release = kvm_vcpu_release,
1817 .unlocked_ioctl = kvm_vcpu_ioctl, 1816 .unlocked_ioctl = kvm_vcpu_ioctl,
1818 #ifdef CONFIG_COMPAT 1817 #ifdef CONFIG_COMPAT
1819 .compat_ioctl = kvm_vcpu_compat_ioctl, 1818 .compat_ioctl = kvm_vcpu_compat_ioctl,
1820 #endif 1819 #endif
1821 .mmap = kvm_vcpu_mmap, 1820 .mmap = kvm_vcpu_mmap,
1822 .llseek = noop_llseek, 1821 .llseek = noop_llseek,
1823 }; 1822 };
1824 1823
1825 /* 1824 /*
1826 * Allocates an inode for the vcpu. 1825 * Allocates an inode for the vcpu.
1827 */ 1826 */
1828 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1827 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1829 { 1828 {
1830 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1829 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1831 } 1830 }
1832 1831
1833 /* 1832 /*
1834 * Creates some virtual cpus. Good luck creating more than one. 1833 * Creates some virtual cpus. Good luck creating more than one.
1835 */ 1834 */
1836 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1835 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1837 { 1836 {
1838 int r; 1837 int r;
1839 struct kvm_vcpu *vcpu, *v; 1838 struct kvm_vcpu *vcpu, *v;
1840 1839
1841 vcpu = kvm_arch_vcpu_create(kvm, id); 1840 vcpu = kvm_arch_vcpu_create(kvm, id);
1842 if (IS_ERR(vcpu)) 1841 if (IS_ERR(vcpu))
1843 return PTR_ERR(vcpu); 1842 return PTR_ERR(vcpu);
1844 1843
1845 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1844 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1846 1845
1847 r = kvm_arch_vcpu_setup(vcpu); 1846 r = kvm_arch_vcpu_setup(vcpu);
1848 if (r) 1847 if (r)
1849 goto vcpu_destroy; 1848 goto vcpu_destroy;
1850 1849
1851 mutex_lock(&kvm->lock); 1850 mutex_lock(&kvm->lock);
1852 if (!kvm_vcpu_compatible(vcpu)) { 1851 if (!kvm_vcpu_compatible(vcpu)) {
1853 r = -EINVAL; 1852 r = -EINVAL;
1854 goto unlock_vcpu_destroy; 1853 goto unlock_vcpu_destroy;
1855 } 1854 }
1856 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1855 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1857 r = -EINVAL; 1856 r = -EINVAL;
1858 goto unlock_vcpu_destroy; 1857 goto unlock_vcpu_destroy;
1859 } 1858 }
1860 1859
1861 kvm_for_each_vcpu(r, v, kvm) 1860 kvm_for_each_vcpu(r, v, kvm)
1862 if (v->vcpu_id == id) { 1861 if (v->vcpu_id == id) {
1863 r = -EEXIST; 1862 r = -EEXIST;
1864 goto unlock_vcpu_destroy; 1863 goto unlock_vcpu_destroy;
1865 } 1864 }
1866 1865
1867 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1866 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1868 1867
1869 /* Now it's all set up, let userspace reach it */ 1868 /* Now it's all set up, let userspace reach it */
1870 kvm_get_kvm(kvm); 1869 kvm_get_kvm(kvm);
1871 r = create_vcpu_fd(vcpu); 1870 r = create_vcpu_fd(vcpu);
1872 if (r < 0) { 1871 if (r < 0) {
1873 kvm_put_kvm(kvm); 1872 kvm_put_kvm(kvm);
1874 goto unlock_vcpu_destroy; 1873 goto unlock_vcpu_destroy;
1875 } 1874 }
1876 1875
1877 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1876 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1878 smp_wmb(); /* order the vcpus[] store before the online_vcpus increment */ 1877 smp_wmb(); /* order the vcpus[] store before the online_vcpus increment */
1879 atomic_inc(&kvm->online_vcpus); 1878 atomic_inc(&kvm->online_vcpus);
1880 1879
1881 mutex_unlock(&kvm->lock); 1880 mutex_unlock(&kvm->lock);
1882 kvm_arch_vcpu_postcreate(vcpu); 1881 kvm_arch_vcpu_postcreate(vcpu);
1883 return r; 1882 return r;
1884 1883
1885 unlock_vcpu_destroy: 1884 unlock_vcpu_destroy:
1886 mutex_unlock(&kvm->lock); 1885 mutex_unlock(&kvm->lock);
1887 vcpu_destroy: 1886 vcpu_destroy:
1888 kvm_arch_vcpu_destroy(vcpu); 1887 kvm_arch_vcpu_destroy(vcpu);
1889 return r; 1888 return r;
1890 } 1889 }
1891 1890
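kvm_vm_ioctl_create_vcpu() is reached through the KVM_CREATE_VCPU case further down in kvm_vm_ioctl(); it rejects duplicate ids and anything beyond KVM_MAX_VCPUS, and on success hands back a new anonymous-inode vcpu fd via create_vcpu_fd(). A hypothetical userspace sketch of the call (the helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: ask the VM fd for a new vcpu fd. */
static int create_vcpu(int vm_fd, unsigned int id)
{
        return ioctl(vm_fd, KVM_CREATE_VCPU, id);
}
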
1892 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1891 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1893 { 1892 {
1894 if (sigset) { 1893 if (sigset) {
1895 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1894 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1896 vcpu->sigset_active = 1; 1895 vcpu->sigset_active = 1;
1897 vcpu->sigset = *sigset; 1896 vcpu->sigset = *sigset;
1898 } else 1897 } else
1899 vcpu->sigset_active = 0; 1898 vcpu->sigset_active = 0;
1900 return 0; 1899 return 0;
1901 } 1900 }
1902 1901
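kvm_vcpu_ioctl_set_sigmask() always strips SIGKILL and SIGSTOP from the mask; the KVM_SET_SIGNAL_MASK case below copies a variable-length struct kvm_signal_mask whose len must equal the kernel's sigset_t size, which is 8 bytes on x86-64 rather than glibc's 128-byte sigset_t. A hypothetical userspace sketch with that 8-byte size hard-coded as an assumption:

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper; 8 is assumed to be the kernel sigset_t
 * size (true on x86-64). */
static int set_vcpu_sigmask(int vcpu_fd, const sigset_t *mask)
{
        const size_t klen = 8;
        struct kvm_signal_mask *arg;
        int r;

        arg = malloc(sizeof(*arg) + klen);
        if (!arg)
                return -1;
        arg->len = klen;
        memcpy(arg->sigset, mask, klen);        /* low 64 signal bits only */
        r = ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, arg);
        free(arg);
        return r;
}
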
1903 static long kvm_vcpu_ioctl(struct file *filp, 1902 static long kvm_vcpu_ioctl(struct file *filp,
1904 unsigned int ioctl, unsigned long arg) 1903 unsigned int ioctl, unsigned long arg)
1905 { 1904 {
1906 struct kvm_vcpu *vcpu = filp->private_data; 1905 struct kvm_vcpu *vcpu = filp->private_data;
1907 void __user *argp = (void __user *)arg; 1906 void __user *argp = (void __user *)arg;
1908 int r; 1907 int r;
1909 struct kvm_fpu *fpu = NULL; 1908 struct kvm_fpu *fpu = NULL;
1910 struct kvm_sregs *kvm_sregs = NULL; 1909 struct kvm_sregs *kvm_sregs = NULL;
1911 1910
1912 if (vcpu->kvm->mm != current->mm) 1911 if (vcpu->kvm->mm != current->mm)
1913 return -EIO; 1912 return -EIO;
1914 1913
1915 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1914 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1916 /* 1915 /*
1917 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1916 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1918 * so vcpu_load() would break them. 1917 * so vcpu_load() would break them.
1919 */ 1918 */
1920 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1919 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1921 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1920 return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1922 #endif 1921 #endif
1923 1922
1924 1923
1925 r = vcpu_load(vcpu); 1924 r = vcpu_load(vcpu);
1926 if (r) 1925 if (r)
1927 return r; 1926 return r;
1928 switch (ioctl) { 1927 switch (ioctl) {
1929 case KVM_RUN: 1928 case KVM_RUN:
1930 r = -EINVAL; 1929 r = -EINVAL;
1931 if (arg) 1930 if (arg)
1932 goto out; 1931 goto out;
1933 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1932 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1934 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 1933 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1935 break; 1934 break;
1936 case KVM_GET_REGS: { 1935 case KVM_GET_REGS: {
1937 struct kvm_regs *kvm_regs; 1936 struct kvm_regs *kvm_regs;
1938 1937
1939 r = -ENOMEM; 1938 r = -ENOMEM;
1940 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1939 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1941 if (!kvm_regs) 1940 if (!kvm_regs)
1942 goto out; 1941 goto out;
1943 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1942 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1944 if (r) 1943 if (r)
1945 goto out_free1; 1944 goto out_free1;
1946 r = -EFAULT; 1945 r = -EFAULT;
1947 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1946 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1948 goto out_free1; 1947 goto out_free1;
1949 r = 0; 1948 r = 0;
1950 out_free1: 1949 out_free1:
1951 kfree(kvm_regs); 1950 kfree(kvm_regs);
1952 break; 1951 break;
1953 } 1952 }
1954 case KVM_SET_REGS: { 1953 case KVM_SET_REGS: {
1955 struct kvm_regs *kvm_regs; 1954 struct kvm_regs *kvm_regs;
1956 1955
1957 r = -ENOMEM; 1956 r = -ENOMEM;
1958 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 1957 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
1959 if (IS_ERR(kvm_regs)) { 1958 if (IS_ERR(kvm_regs)) {
1960 r = PTR_ERR(kvm_regs); 1959 r = PTR_ERR(kvm_regs);
1961 goto out; 1960 goto out;
1962 } 1961 }
1963 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1962 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1964 kfree(kvm_regs); 1963 kfree(kvm_regs);
1965 break; 1964 break;
1966 } 1965 }
1967 case KVM_GET_SREGS: { 1966 case KVM_GET_SREGS: {
1968 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1967 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1969 r = -ENOMEM; 1968 r = -ENOMEM;
1970 if (!kvm_sregs) 1969 if (!kvm_sregs)
1971 goto out; 1970 goto out;
1972 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1971 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1973 if (r) 1972 if (r)
1974 goto out; 1973 goto out;
1975 r = -EFAULT; 1974 r = -EFAULT;
1976 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1975 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1977 goto out; 1976 goto out;
1978 r = 0; 1977 r = 0;
1979 break; 1978 break;
1980 } 1979 }
1981 case KVM_SET_SREGS: { 1980 case KVM_SET_SREGS: {
1982 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 1981 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
1983 if (IS_ERR(kvm_sregs)) { 1982 if (IS_ERR(kvm_sregs)) {
1984 r = PTR_ERR(kvm_sregs); 1983 r = PTR_ERR(kvm_sregs);
1985 kvm_sregs = NULL; 1984 kvm_sregs = NULL;
1986 goto out; 1985 goto out;
1987 } 1986 }
1988 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1987 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1989 break; 1988 break;
1990 } 1989 }
1991 case KVM_GET_MP_STATE: { 1990 case KVM_GET_MP_STATE: {
1992 struct kvm_mp_state mp_state; 1991 struct kvm_mp_state mp_state;
1993 1992
1994 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1993 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1995 if (r) 1994 if (r)
1996 goto out; 1995 goto out;
1997 r = -EFAULT; 1996 r = -EFAULT;
1998 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1997 if (copy_to_user(argp, &mp_state, sizeof mp_state))
1999 goto out; 1998 goto out;
2000 r = 0; 1999 r = 0;
2001 break; 2000 break;
2002 } 2001 }
2003 case KVM_SET_MP_STATE: { 2002 case KVM_SET_MP_STATE: {
2004 struct kvm_mp_state mp_state; 2003 struct kvm_mp_state mp_state;
2005 2004
2006 r = -EFAULT; 2005 r = -EFAULT;
2007 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 2006 if (copy_from_user(&mp_state, argp, sizeof mp_state))
2008 goto out; 2007 goto out;
2009 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2008 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
2010 break; 2009 break;
2011 } 2010 }
2012 case KVM_TRANSLATE: { 2011 case KVM_TRANSLATE: {
2013 struct kvm_translation tr; 2012 struct kvm_translation tr;
2014 2013
2015 r = -EFAULT; 2014 r = -EFAULT;
2016 if (copy_from_user(&tr, argp, sizeof tr)) 2015 if (copy_from_user(&tr, argp, sizeof tr))
2017 goto out; 2016 goto out;
2018 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2017 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
2019 if (r) 2018 if (r)
2020 goto out; 2019 goto out;
2021 r = -EFAULT; 2020 r = -EFAULT;
2022 if (copy_to_user(argp, &tr, sizeof tr)) 2021 if (copy_to_user(argp, &tr, sizeof tr))
2023 goto out; 2022 goto out;
2024 r = 0; 2023 r = 0;
2025 break; 2024 break;
2026 } 2025 }
2027 case KVM_SET_GUEST_DEBUG: { 2026 case KVM_SET_GUEST_DEBUG: {
2028 struct kvm_guest_debug dbg; 2027 struct kvm_guest_debug dbg;
2029 2028
2030 r = -EFAULT; 2029 r = -EFAULT;
2031 if (copy_from_user(&dbg, argp, sizeof dbg)) 2030 if (copy_from_user(&dbg, argp, sizeof dbg))
2032 goto out; 2031 goto out;
2033 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2032 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
2034 break; 2033 break;
2035 } 2034 }
2036 case KVM_SET_SIGNAL_MASK: { 2035 case KVM_SET_SIGNAL_MASK: {
2037 struct kvm_signal_mask __user *sigmask_arg = argp; 2036 struct kvm_signal_mask __user *sigmask_arg = argp;
2038 struct kvm_signal_mask kvm_sigmask; 2037 struct kvm_signal_mask kvm_sigmask;
2039 sigset_t sigset, *p; 2038 sigset_t sigset, *p;
2040 2039
2041 p = NULL; 2040 p = NULL;
2042 if (argp) { 2041 if (argp) {
2043 r = -EFAULT; 2042 r = -EFAULT;
2044 if (copy_from_user(&kvm_sigmask, argp, 2043 if (copy_from_user(&kvm_sigmask, argp,
2045 sizeof kvm_sigmask)) 2044 sizeof kvm_sigmask))
2046 goto out; 2045 goto out;
2047 r = -EINVAL; 2046 r = -EINVAL;
2048 if (kvm_sigmask.len != sizeof sigset) 2047 if (kvm_sigmask.len != sizeof sigset)
2049 goto out; 2048 goto out;
2050 r = -EFAULT; 2049 r = -EFAULT;
2051 if (copy_from_user(&sigset, sigmask_arg->sigset, 2050 if (copy_from_user(&sigset, sigmask_arg->sigset,
2052 sizeof sigset)) 2051 sizeof sigset))
2053 goto out; 2052 goto out;
2054 p = &sigset; 2053 p = &sigset;
2055 } 2054 }
2056 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2055 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2057 break; 2056 break;
2058 } 2057 }
2059 case KVM_GET_FPU: { 2058 case KVM_GET_FPU: {
2060 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2059 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
2061 r = -ENOMEM; 2060 r = -ENOMEM;
2062 if (!fpu) 2061 if (!fpu)
2063 goto out; 2062 goto out;
2064 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2063 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
2065 if (r) 2064 if (r)
2066 goto out; 2065 goto out;
2067 r = -EFAULT; 2066 r = -EFAULT;
2068 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2067 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
2069 goto out; 2068 goto out;
2070 r = 0; 2069 r = 0;
2071 break; 2070 break;
2072 } 2071 }
2073 case KVM_SET_FPU: { 2072 case KVM_SET_FPU: {
2074 fpu = memdup_user(argp, sizeof(*fpu)); 2073 fpu = memdup_user(argp, sizeof(*fpu));
2075 if (IS_ERR(fpu)) { 2074 if (IS_ERR(fpu)) {
2076 r = PTR_ERR(fpu); 2075 r = PTR_ERR(fpu);
2077 fpu = NULL; 2076 fpu = NULL;
2078 goto out; 2077 goto out;
2079 } 2078 }
2080 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2079 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
2081 break; 2080 break;
2082 } 2081 }
2083 default: 2082 default:
2084 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2083 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
2085 } 2084 }
2086 out: 2085 out:
2087 vcpu_put(vcpu); 2086 vcpu_put(vcpu);
2088 kfree(fpu); 2087 kfree(fpu);
2089 kfree(kvm_sregs); 2088 kfree(kvm_sregs);
2090 return r; 2089 return r;
2091 } 2090 }
2092 2091
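Of the cases above, KVM_RUN is the hot path: it re-enters the guest until something needs userspace attention, and the reason comes back through the mmap'ed struct kvm_run. A hypothetical userspace sketch of the usual dispatch loop; the exit reasons handled here are just examples:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace run loop; 'run' is the area mapped from the vcpu fd. */
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;
                switch (run->exit_reason) {
                case KVM_EXIT_HLT:
                        return 0;
                case KVM_EXIT_IO:
                        /* decode run->io and emulate the port access here */
                        break;
                default:
                        fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
                        return -1;
                }
        }
}
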
2093 #ifdef CONFIG_COMPAT 2092 #ifdef CONFIG_COMPAT
2094 static long kvm_vcpu_compat_ioctl(struct file *filp, 2093 static long kvm_vcpu_compat_ioctl(struct file *filp,
2095 unsigned int ioctl, unsigned long arg) 2094 unsigned int ioctl, unsigned long arg)
2096 { 2095 {
2097 struct kvm_vcpu *vcpu = filp->private_data; 2096 struct kvm_vcpu *vcpu = filp->private_data;
2098 void __user *argp = compat_ptr(arg); 2097 void __user *argp = compat_ptr(arg);
2099 int r; 2098 int r;
2100 2099
2101 if (vcpu->kvm->mm != current->mm) 2100 if (vcpu->kvm->mm != current->mm)
2102 return -EIO; 2101 return -EIO;
2103 2102
2104 switch (ioctl) { 2103 switch (ioctl) {
2105 case KVM_SET_SIGNAL_MASK: { 2104 case KVM_SET_SIGNAL_MASK: {
2106 struct kvm_signal_mask __user *sigmask_arg = argp; 2105 struct kvm_signal_mask __user *sigmask_arg = argp;
2107 struct kvm_signal_mask kvm_sigmask; 2106 struct kvm_signal_mask kvm_sigmask;
2108 compat_sigset_t csigset; 2107 compat_sigset_t csigset;
2109 sigset_t sigset; 2108 sigset_t sigset;
2110 2109
2111 if (argp) { 2110 if (argp) {
2112 r = -EFAULT; 2111 r = -EFAULT;
2113 if (copy_from_user(&kvm_sigmask, argp, 2112 if (copy_from_user(&kvm_sigmask, argp,
2114 sizeof kvm_sigmask)) 2113 sizeof kvm_sigmask))
2115 goto out; 2114 goto out;
2116 r = -EINVAL; 2115 r = -EINVAL;
2117 if (kvm_sigmask.len != sizeof csigset) 2116 if (kvm_sigmask.len != sizeof csigset)
2118 goto out; 2117 goto out;
2119 r = -EFAULT; 2118 r = -EFAULT;
2120 if (copy_from_user(&csigset, sigmask_arg->sigset, 2119 if (copy_from_user(&csigset, sigmask_arg->sigset,
2121 sizeof csigset)) 2120 sizeof csigset))
2122 goto out; 2121 goto out;
2123 sigset_from_compat(&sigset, &csigset); 2122 sigset_from_compat(&sigset, &csigset);
2124 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2123 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2125 } else 2124 } else
2126 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2125 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
2127 break; 2126 break;
2128 } 2127 }
2129 default: 2128 default:
2130 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2129 r = kvm_vcpu_ioctl(filp, ioctl, arg);
2131 } 2130 }
2132 2131
2133 out: 2132 out:
2134 return r; 2133 return r;
2135 } 2134 }
2136 #endif 2135 #endif
2137 2136
2138 static long kvm_vm_ioctl(struct file *filp, 2137 static long kvm_vm_ioctl(struct file *filp,
2139 unsigned int ioctl, unsigned long arg) 2138 unsigned int ioctl, unsigned long arg)
2140 { 2139 {
2141 struct kvm *kvm = filp->private_data; 2140 struct kvm *kvm = filp->private_data;
2142 void __user *argp = (void __user *)arg; 2141 void __user *argp = (void __user *)arg;
2143 int r; 2142 int r;
2144 2143
2145 if (kvm->mm != current->mm) 2144 if (kvm->mm != current->mm)
2146 return -EIO; 2145 return -EIO;
2147 switch (ioctl) { 2146 switch (ioctl) {
2148 case KVM_CREATE_VCPU: 2147 case KVM_CREATE_VCPU:
2149 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2148 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2150 break; 2149 break;
2151 case KVM_SET_USER_MEMORY_REGION: { 2150 case KVM_SET_USER_MEMORY_REGION: {
2152 struct kvm_userspace_memory_region kvm_userspace_mem; 2151 struct kvm_userspace_memory_region kvm_userspace_mem;
2153 2152
2154 r = -EFAULT; 2153 r = -EFAULT;
2155 if (copy_from_user(&kvm_userspace_mem, argp, 2154 if (copy_from_user(&kvm_userspace_mem, argp,
2156 sizeof kvm_userspace_mem)) 2155 sizeof kvm_userspace_mem))
2157 goto out; 2156 goto out;
2158 2157
2159 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true); 2158 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
2160 break; 2159 break;
2161 } 2160 }
2162 case KVM_GET_DIRTY_LOG: { 2161 case KVM_GET_DIRTY_LOG: {
2163 struct kvm_dirty_log log; 2162 struct kvm_dirty_log log;
2164 2163
2165 r = -EFAULT; 2164 r = -EFAULT;
2166 if (copy_from_user(&log, argp, sizeof log)) 2165 if (copy_from_user(&log, argp, sizeof log))
2167 goto out; 2166 goto out;
2168 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2167 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2169 break; 2168 break;
2170 } 2169 }
2171 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2170 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2172 case KVM_REGISTER_COALESCED_MMIO: { 2171 case KVM_REGISTER_COALESCED_MMIO: {
2173 struct kvm_coalesced_mmio_zone zone; 2172 struct kvm_coalesced_mmio_zone zone;
2174 r = -EFAULT; 2173 r = -EFAULT;
2175 if (copy_from_user(&zone, argp, sizeof zone)) 2174 if (copy_from_user(&zone, argp, sizeof zone))
2176 goto out; 2175 goto out;
2177 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2176 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2178 break; 2177 break;
2179 } 2178 }
2180 case KVM_UNREGISTER_COALESCED_MMIO: { 2179 case KVM_UNREGISTER_COALESCED_MMIO: {
2181 struct kvm_coalesced_mmio_zone zone; 2180 struct kvm_coalesced_mmio_zone zone;
2182 r = -EFAULT; 2181 r = -EFAULT;
2183 if (copy_from_user(&zone, argp, sizeof zone)) 2182 if (copy_from_user(&zone, argp, sizeof zone))
2184 goto out; 2183 goto out;
2185 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2184 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2186 break; 2185 break;
2187 } 2186 }
2188 #endif 2187 #endif
2189 case KVM_IRQFD: { 2188 case KVM_IRQFD: {
2190 struct kvm_irqfd data; 2189 struct kvm_irqfd data;
2191 2190
2192 r = -EFAULT; 2191 r = -EFAULT;
2193 if (copy_from_user(&data, argp, sizeof data)) 2192 if (copy_from_user(&data, argp, sizeof data))
2194 goto out; 2193 goto out;
2195 r = kvm_irqfd(kvm, &data); 2194 r = kvm_irqfd(kvm, &data);
2196 break; 2195 break;
2197 } 2196 }
2198 case KVM_IOEVENTFD: { 2197 case KVM_IOEVENTFD: {
2199 struct kvm_ioeventfd data; 2198 struct kvm_ioeventfd data;
2200 2199
2201 r = -EFAULT; 2200 r = -EFAULT;
2202 if (copy_from_user(&data, argp, sizeof data)) 2201 if (copy_from_user(&data, argp, sizeof data))
2203 goto out; 2202 goto out;
2204 r = kvm_ioeventfd(kvm, &data); 2203 r = kvm_ioeventfd(kvm, &data);
2205 break; 2204 break;
2206 } 2205 }
2207 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2206 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2208 case KVM_SET_BOOT_CPU_ID: 2207 case KVM_SET_BOOT_CPU_ID:
2209 r = 0; 2208 r = 0;
2210 mutex_lock(&kvm->lock); 2209 mutex_lock(&kvm->lock);
2211 if (atomic_read(&kvm->online_vcpus) != 0) 2210 if (atomic_read(&kvm->online_vcpus) != 0)
2212 r = -EBUSY; 2211 r = -EBUSY;
2213 else 2212 else
2214 kvm->bsp_vcpu_id = arg; 2213 kvm->bsp_vcpu_id = arg;
2215 mutex_unlock(&kvm->lock); 2214 mutex_unlock(&kvm->lock);
2216 break; 2215 break;
2217 #endif 2216 #endif
2218 #ifdef CONFIG_HAVE_KVM_MSI 2217 #ifdef CONFIG_HAVE_KVM_MSI
2219 case KVM_SIGNAL_MSI: { 2218 case KVM_SIGNAL_MSI: {
2220 struct kvm_msi msi; 2219 struct kvm_msi msi;
2221 2220
2222 r = -EFAULT; 2221 r = -EFAULT;
2223 if (copy_from_user(&msi, argp, sizeof msi)) 2222 if (copy_from_user(&msi, argp, sizeof msi))
2224 goto out; 2223 goto out;
2225 r = kvm_send_userspace_msi(kvm, &msi); 2224 r = kvm_send_userspace_msi(kvm, &msi);
2226 break; 2225 break;
2227 } 2226 }
2228 #endif 2227 #endif
2229 #ifdef __KVM_HAVE_IRQ_LINE 2228 #ifdef __KVM_HAVE_IRQ_LINE
2230 case KVM_IRQ_LINE_STATUS: 2229 case KVM_IRQ_LINE_STATUS:
2231 case KVM_IRQ_LINE: { 2230 case KVM_IRQ_LINE: {
2232 struct kvm_irq_level irq_event; 2231 struct kvm_irq_level irq_event;
2233 2232
2234 r = -EFAULT; 2233 r = -EFAULT;
2235 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2234 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2236 goto out; 2235 goto out;
2237 2236
2238 r = kvm_vm_ioctl_irq_line(kvm, &irq_event); 2237 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2239 if (r) 2238 if (r)
2240 goto out; 2239 goto out;
2241 2240
2242 r = -EFAULT; 2241 r = -EFAULT;
2243 if (ioctl == KVM_IRQ_LINE_STATUS) { 2242 if (ioctl == KVM_IRQ_LINE_STATUS) {
2244 if (copy_to_user(argp, &irq_event, sizeof irq_event)) 2243 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2245 goto out; 2244 goto out;
2246 } 2245 }
2247 2246
2248 r = 0; 2247 r = 0;
2249 break; 2248 break;
2250 } 2249 }
2251 #endif 2250 #endif
2252 default: 2251 default:
2253 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2252 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2254 if (r == -ENOTTY) 2253 if (r == -ENOTTY)
2255 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 2254 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2256 } 2255 }
2257 out: 2256 out:
2258 return r; 2257 return r;
2259 } 2258 }
2260 2259
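The KVM_SET_USER_MEMORY_REGION case above installs, moves, or deletes a memory slot: a memory_size of 0 deletes the slot, and KVM_MEM_LOG_DIRTY_PAGES turns on dirty tracking for it. A hypothetical userspace sketch; the slot number and flags are illustrative:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: install (or, with size 0, delete) a slot. */
static int set_mem_region(int vm_fd, uint32_t slot, uint64_t gpa,
                          uint64_t size, void *host_va, int log_dirty)
{
        struct kvm_userspace_memory_region mr = {
                .slot = slot,
                .flags = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0,
                .guest_phys_addr = gpa,
                .memory_size = size,
                .userspace_addr = (uintptr_t)host_va,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mr);
}
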
2261 #ifdef CONFIG_COMPAT 2260 #ifdef CONFIG_COMPAT
2262 struct compat_kvm_dirty_log { 2261 struct compat_kvm_dirty_log {
2263 __u32 slot; 2262 __u32 slot;
2264 __u32 padding1; 2263 __u32 padding1;
2265 union { 2264 union {
2266 compat_uptr_t dirty_bitmap; /* one bit per page */ 2265 compat_uptr_t dirty_bitmap; /* one bit per page */
2267 __u64 padding2; 2266 __u64 padding2;
2268 }; 2267 };
2269 }; 2268 };
2270 2269
2271 static long kvm_vm_compat_ioctl(struct file *filp, 2270 static long kvm_vm_compat_ioctl(struct file *filp,
2272 unsigned int ioctl, unsigned long arg) 2271 unsigned int ioctl, unsigned long arg)
2273 { 2272 {
2274 struct kvm *kvm = filp->private_data; 2273 struct kvm *kvm = filp->private_data;
2275 int r; 2274 int r;
2276 2275
2277 if (kvm->mm != current->mm) 2276 if (kvm->mm != current->mm)
2278 return -EIO; 2277 return -EIO;
2279 switch (ioctl) { 2278 switch (ioctl) {
2280 case KVM_GET_DIRTY_LOG: { 2279 case KVM_GET_DIRTY_LOG: {
2281 struct compat_kvm_dirty_log compat_log; 2280 struct compat_kvm_dirty_log compat_log;
2282 struct kvm_dirty_log log; 2281 struct kvm_dirty_log log;
2283 2282
2284 r = -EFAULT; 2283 r = -EFAULT;
2285 if (copy_from_user(&compat_log, (void __user *)arg, 2284 if (copy_from_user(&compat_log, (void __user *)arg,
2286 sizeof(compat_log))) 2285 sizeof(compat_log)))
2287 goto out; 2286 goto out;
2288 log.slot = compat_log.slot; 2287 log.slot = compat_log.slot;
2289 log.padding1 = compat_log.padding1; 2288 log.padding1 = compat_log.padding1;
2290 log.padding2 = compat_log.padding2; 2289 log.padding2 = compat_log.padding2;
2291 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2290 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2292 2291
2293 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2292 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2294 break; 2293 break;
2295 } 2294 }
2296 default: 2295 default:
2297 r = kvm_vm_ioctl(filp, ioctl, arg); 2296 r = kvm_vm_ioctl(filp, ioctl, arg);
2298 } 2297 }
2299 2298
2300 out: 2299 out:
2301 return r; 2300 return r;
2302 } 2301 }
2303 #endif 2302 #endif
2304 2303
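Both kvm_vm_ioctl() and the compat wrapper above funnel KVM_GET_DIRTY_LOG into kvm_vm_ioctl_get_dirty_log(), which copies one bit per page of the slot into a user-supplied buffer. A hypothetical userspace sketch; the bitmap is rounded up to a multiple of 64 bits to match the size the kernel copies out, and a 4 KiB page size is assumed:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: fetch and then discard one dirty bitmap. */
static int get_dirty_log(int vm_fd, uint32_t slot, uint64_t slot_bytes)
{
        uint64_t pages = slot_bytes / 4096;            /* assumes 4 KiB pages */
        size_t bitmap_bytes = ((pages + 63) / 64) * 8; /* one bit per page */
        struct kvm_dirty_log log = { .slot = slot };
        int r;

        log.dirty_bitmap = calloc(1, bitmap_bytes);
        if (!log.dirty_bitmap)
                return -1;
        r = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        /* ... walk log.dirty_bitmap here ... */
        free(log.dirty_bitmap);
        return r;
}
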
2305 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2304 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2306 { 2305 {
2307 struct page *page[1]; 2306 struct page *page[1];
2308 unsigned long addr; 2307 unsigned long addr;
2309 int npages; 2308 int npages;
2310 gfn_t gfn = vmf->pgoff; 2309 gfn_t gfn = vmf->pgoff;
2311 struct kvm *kvm = vma->vm_file->private_data; 2310 struct kvm *kvm = vma->vm_file->private_data;
2312 2311
2313 addr = gfn_to_hva(kvm, gfn); 2312 addr = gfn_to_hva(kvm, gfn);
2314 if (kvm_is_error_hva(addr)) 2313 if (kvm_is_error_hva(addr))
2315 return VM_FAULT_SIGBUS; 2314 return VM_FAULT_SIGBUS;
2316 2315
2317 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2316 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2318 NULL); 2317 NULL);
2319 if (unlikely(npages != 1)) 2318 if (unlikely(npages != 1))
2320 return VM_FAULT_SIGBUS; 2319 return VM_FAULT_SIGBUS;
2321 2320
2322 vmf->page = page[0]; 2321 vmf->page = page[0];
2323 return 0; 2322 return 0;
2324 } 2323 }
2325 2324
2326 static const struct vm_operations_struct kvm_vm_vm_ops = { 2325 static const struct vm_operations_struct kvm_vm_vm_ops = {
2327 .fault = kvm_vm_fault, 2326 .fault = kvm_vm_fault,
2328 }; 2327 };
2329 2328
2330 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2329 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2331 { 2330 {
2332 vma->vm_ops = &kvm_vm_vm_ops; 2331 vma->vm_ops = &kvm_vm_vm_ops;
2333 return 0; 2332 return 0;
2334 } 2333 }
2335 2334
2336 static struct file_operations kvm_vm_fops = { 2335 static struct file_operations kvm_vm_fops = {
2337 .release = kvm_vm_release, 2336 .release = kvm_vm_release,
2338 .unlocked_ioctl = kvm_vm_ioctl, 2337 .unlocked_ioctl = kvm_vm_ioctl,
2339 #ifdef CONFIG_COMPAT 2338 #ifdef CONFIG_COMPAT
2340 .compat_ioctl = kvm_vm_compat_ioctl, 2339 .compat_ioctl = kvm_vm_compat_ioctl,
2341 #endif 2340 #endif
2342 .mmap = kvm_vm_mmap, 2341 .mmap = kvm_vm_mmap,
2343 .llseek = noop_llseek, 2342 .llseek = noop_llseek,
2344 }; 2343 };
2345 2344
2346 static int kvm_dev_ioctl_create_vm(unsigned long type) 2345 static int kvm_dev_ioctl_create_vm(unsigned long type)
2347 { 2346 {
2348 int r; 2347 int r;
2349 struct kvm *kvm; 2348 struct kvm *kvm;
2350 2349
2351 kvm = kvm_create_vm(type); 2350 kvm = kvm_create_vm(type);
2352 if (IS_ERR(kvm)) 2351 if (IS_ERR(kvm))
2353 return PTR_ERR(kvm); 2352 return PTR_ERR(kvm);
2354 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2353 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2355 r = kvm_coalesced_mmio_init(kvm); 2354 r = kvm_coalesced_mmio_init(kvm);
2356 if (r < 0) { 2355 if (r < 0) {
2357 kvm_put_kvm(kvm); 2356 kvm_put_kvm(kvm);
2358 return r; 2357 return r;
2359 } 2358 }
2360 #endif 2359 #endif
2361 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 2360 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2362 if (r < 0) 2361 if (r < 0)
2363 kvm_put_kvm(kvm); 2362 kvm_put_kvm(kvm);
2364 2363
2365 return r; 2364 return r;
2366 } 2365 }
2367 2366
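kvm_dev_ioctl_create_vm() returns an anonymous-inode VM fd that carries its own reference, so the /dev/kvm fd can be closed afterwards. A hypothetical userspace sketch of the bring-up sequence handled by kvm_dev_ioctl() just below, checking the API version before creating the VM:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: open /dev/kvm, sanity-check the API
 * version, and return a fresh VM fd (or -1). */
static int open_vm(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        int vm_fd;

        if (kvm_fd < 0)
                return -1;
        if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION) {
                close(kvm_fd);
                return -1;
        }
        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);        /* type 0: default VM */
        close(kvm_fd);          /* the VM fd holds its own reference */
        return vm_fd;
}
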
2368 static long kvm_dev_ioctl_check_extension_generic(long arg) 2367 static long kvm_dev_ioctl_check_extension_generic(long arg)
2369 { 2368 {
2370 switch (arg) { 2369 switch (arg) {
2371 case KVM_CAP_USER_MEMORY: 2370 case KVM_CAP_USER_MEMORY:
2372 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2371 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2373 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2372 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2374 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2373 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2375 case KVM_CAP_SET_BOOT_CPU_ID: 2374 case KVM_CAP_SET_BOOT_CPU_ID:
2376 #endif 2375 #endif
2377 case KVM_CAP_INTERNAL_ERROR_DATA: 2376 case KVM_CAP_INTERNAL_ERROR_DATA:
2378 #ifdef CONFIG_HAVE_KVM_MSI 2377 #ifdef CONFIG_HAVE_KVM_MSI
2379 case KVM_CAP_SIGNAL_MSI: 2378 case KVM_CAP_SIGNAL_MSI:
2380 #endif 2379 #endif
2381 return 1; 2380 return 1;
2382 #ifdef KVM_CAP_IRQ_ROUTING 2381 #ifdef KVM_CAP_IRQ_ROUTING
2383 case KVM_CAP_IRQ_ROUTING: 2382 case KVM_CAP_IRQ_ROUTING:
2384 return KVM_MAX_IRQ_ROUTES; 2383 return KVM_MAX_IRQ_ROUTES;
2385 #endif 2384 #endif
2386 default: 2385 default:
2387 break; 2386 break;
2388 } 2387 }
2389 return kvm_dev_ioctl_check_extension(arg); 2388 return kvm_dev_ioctl_check_extension(arg);
2390 } 2389 }
2391 2390
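kvm_dev_ioctl_check_extension_generic() answers the generic capabilities itself and defers everything else to the architecture code; a return of 0 means the capability is absent, and some capabilities return extra data (KVM_CAP_IRQ_ROUTING reports the route limit). A hypothetical userspace sketch:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: treat any positive return as "present". */
static int has_cap(int kvm_fd, long cap)
{
        return ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap) > 0;
}
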
2392 static long kvm_dev_ioctl(struct file *filp, 2391 static long kvm_dev_ioctl(struct file *filp,
2393 unsigned int ioctl, unsigned long arg) 2392 unsigned int ioctl, unsigned long arg)
2394 { 2393 {
2395 long r = -EINVAL; 2394 long r = -EINVAL;
2396 2395
2397 switch (ioctl) { 2396 switch (ioctl) {
2398 case KVM_GET_API_VERSION: 2397 case KVM_GET_API_VERSION:
2399 r = -EINVAL; 2398 r = -EINVAL;
2400 if (arg) 2399 if (arg)
2401 goto out; 2400 goto out;
2402 r = KVM_API_VERSION; 2401 r = KVM_API_VERSION;
2403 break; 2402 break;
2404 case KVM_CREATE_VM: 2403 case KVM_CREATE_VM:
2405 r = kvm_dev_ioctl_create_vm(arg); 2404 r = kvm_dev_ioctl_create_vm(arg);
2406 break; 2405 break;
2407 case KVM_CHECK_EXTENSION: 2406 case KVM_CHECK_EXTENSION:
2408 r = kvm_dev_ioctl_check_extension_generic(arg); 2407 r = kvm_dev_ioctl_check_extension_generic(arg);
2409 break; 2408 break;
2410 case KVM_GET_VCPU_MMAP_SIZE: 2409 case KVM_GET_VCPU_MMAP_SIZE:
2411 r = -EINVAL; 2410 r = -EINVAL;
2412 if (arg) 2411 if (arg)
2413 goto out; 2412 goto out;
2414 r = PAGE_SIZE; /* struct kvm_run */ 2413 r = PAGE_SIZE; /* struct kvm_run */
2415 #ifdef CONFIG_X86 2414 #ifdef CONFIG_X86
2416 r += PAGE_SIZE; /* pio data page */ 2415 r += PAGE_SIZE; /* pio data page */
2417 #endif 2416 #endif
2418 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2417 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2419 r += PAGE_SIZE; /* coalesced mmio ring page */ 2418 r += PAGE_SIZE; /* coalesced mmio ring page */
2420 #endif 2419 #endif
2421 break; 2420 break;
2422 case KVM_TRACE_ENABLE: 2421 case KVM_TRACE_ENABLE:
2423 case KVM_TRACE_PAUSE: 2422 case KVM_TRACE_PAUSE:
2424 case KVM_TRACE_DISABLE: 2423 case KVM_TRACE_DISABLE:
2425 r = -EOPNOTSUPP; 2424 r = -EOPNOTSUPP;
2426 break; 2425 break;
2427 default: 2426 default:
2428 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2427 return kvm_arch_dev_ioctl(filp, ioctl, arg);
2429 } 2428 }
2430 out: 2429 out:
2431 return r; 2430 return r;
2432 } 2431 }
2433 2432
2434 static struct file_operations kvm_chardev_ops = { 2433 static struct file_operations kvm_chardev_ops = {
2435 .unlocked_ioctl = kvm_dev_ioctl, 2434 .unlocked_ioctl = kvm_dev_ioctl,
2436 .compat_ioctl = kvm_dev_ioctl, 2435 .compat_ioctl = kvm_dev_ioctl,
2437 .llseek = noop_llseek, 2436 .llseek = noop_llseek,
2438 }; 2437 };
2439 2438
2440 static struct miscdevice kvm_dev = { 2439 static struct miscdevice kvm_dev = {
2441 KVM_MINOR, 2440 KVM_MINOR,
2442 "kvm", 2441 "kvm",
2443 &kvm_chardev_ops, 2442 &kvm_chardev_ops,
2444 }; 2443 };
2445 2444
2446 static void hardware_enable_nolock(void *junk) 2445 static void hardware_enable_nolock(void *junk)
2447 { 2446 {
2448 int cpu = raw_smp_processor_id(); 2447 int cpu = raw_smp_processor_id();
2449 int r; 2448 int r;
2450 2449
2451 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2450 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2452 return; 2451 return;
2453 2452
2454 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2453 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2455 2454
2456 r = kvm_arch_hardware_enable(NULL); 2455 r = kvm_arch_hardware_enable(NULL);
2457 2456
2458 if (r) { 2457 if (r) {
2459 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2458 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2460 atomic_inc(&hardware_enable_failed); 2459 atomic_inc(&hardware_enable_failed);
2461 printk(KERN_INFO "kvm: enabling virtualization on " 2460 printk(KERN_INFO "kvm: enabling virtualization on "
2462 "CPU%d failed\n", cpu); 2461 "CPU%d failed\n", cpu);
2463 } 2462 }
2464 } 2463 }
2465 2464
2466 static void hardware_enable(void *junk) 2465 static void hardware_enable(void *junk)
2467 { 2466 {
2468 raw_spin_lock(&kvm_lock); 2467 raw_spin_lock(&kvm_lock);
2469 hardware_enable_nolock(junk); 2468 hardware_enable_nolock(junk);
2470 raw_spin_unlock(&kvm_lock); 2469 raw_spin_unlock(&kvm_lock);
2471 } 2470 }
2472 2471
2473 static void hardware_disable_nolock(void *junk) 2472 static void hardware_disable_nolock(void *junk)
2474 { 2473 {
2475 int cpu = raw_smp_processor_id(); 2474 int cpu = raw_smp_processor_id();
2476 2475
2477 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2476 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2478 return; 2477 return;
2479 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2478 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2480 kvm_arch_hardware_disable(NULL); 2479 kvm_arch_hardware_disable(NULL);
2481 } 2480 }
2482 2481
2483 static void hardware_disable(void *junk) 2482 static void hardware_disable(void *junk)
2484 { 2483 {
2485 raw_spin_lock(&kvm_lock); 2484 raw_spin_lock(&kvm_lock);
2486 hardware_disable_nolock(junk); 2485 hardware_disable_nolock(junk);
2487 raw_spin_unlock(&kvm_lock); 2486 raw_spin_unlock(&kvm_lock);
2488 } 2487 }
2489 2488
2490 static void hardware_disable_all_nolock(void) 2489 static void hardware_disable_all_nolock(void)
2491 { 2490 {
2492 BUG_ON(!kvm_usage_count); 2491 BUG_ON(!kvm_usage_count);
2493 2492
2494 kvm_usage_count--; 2493 kvm_usage_count--;
2495 if (!kvm_usage_count) 2494 if (!kvm_usage_count)
2496 on_each_cpu(hardware_disable_nolock, NULL, 1); 2495 on_each_cpu(hardware_disable_nolock, NULL, 1);
2497 } 2496 }
2498 2497
2499 static void hardware_disable_all(void) 2498 static void hardware_disable_all(void)
2500 { 2499 {
2501 raw_spin_lock(&kvm_lock); 2500 raw_spin_lock(&kvm_lock);
2502 hardware_disable_all_nolock(); 2501 hardware_disable_all_nolock();
2503 raw_spin_unlock(&kvm_lock); 2502 raw_spin_unlock(&kvm_lock);
2504 } 2503 }
2505 2504
2506 static int hardware_enable_all(void) 2505 static int hardware_enable_all(void)
2507 { 2506 {
2508 int r = 0; 2507 int r = 0;
2509 2508
2510 raw_spin_lock(&kvm_lock); 2509 raw_spin_lock(&kvm_lock);
2511 2510
2512 kvm_usage_count++; 2511 kvm_usage_count++;
2513 if (kvm_usage_count == 1) { 2512 if (kvm_usage_count == 1) {
2514 atomic_set(&hardware_enable_failed, 0); 2513 atomic_set(&hardware_enable_failed, 0);
2515 on_each_cpu(hardware_enable_nolock, NULL, 1); 2514 on_each_cpu(hardware_enable_nolock, NULL, 1);
2516 2515
2517 if (atomic_read(&hardware_enable_failed)) { 2516 if (atomic_read(&hardware_enable_failed)) {
2518 hardware_disable_all_nolock(); 2517 hardware_disable_all_nolock();
2519 r = -EBUSY; 2518 r = -EBUSY;
2520 } 2519 }
2521 } 2520 }
2522 2521
2523 raw_spin_unlock(&kvm_lock); 2522 raw_spin_unlock(&kvm_lock);
2524 2523
2525 return r; 2524 return r;
2526 } 2525 }
2527 2526
2528 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2527 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2529 void *v) 2528 void *v)
2530 { 2529 {
2531 int cpu = (long)v; 2530 int cpu = (long)v;
2532 2531
2533 if (!kvm_usage_count) 2532 if (!kvm_usage_count)
2534 return NOTIFY_OK; 2533 return NOTIFY_OK;
2535 2534
2536 val &= ~CPU_TASKS_FROZEN; 2535 val &= ~CPU_TASKS_FROZEN;
2537 switch (val) { 2536 switch (val) {
2538 case CPU_DYING: 2537 case CPU_DYING:
2539 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2538 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2540 cpu); 2539 cpu);
2541 hardware_disable(NULL); 2540 hardware_disable(NULL);
2542 break; 2541 break;
2543 case CPU_STARTING: 2542 case CPU_STARTING:
2544 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2543 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2545 cpu); 2544 cpu);
2546 hardware_enable(NULL); 2545 hardware_enable(NULL);
2547 break; 2546 break;
2548 } 2547 }
2549 return NOTIFY_OK; 2548 return NOTIFY_OK;
2550 } 2549 }
2551 2550
2552 2551
2553 asmlinkage void kvm_spurious_fault(void) 2552 asmlinkage void kvm_spurious_fault(void)
2554 { 2553 {
2555 /* Fault while not rebooting. We want the trace. */ 2554 /* Fault while not rebooting. We want the trace. */
2556 BUG(); 2555 BUG();
2557 } 2556 }
2558 EXPORT_SYMBOL_GPL(kvm_spurious_fault); 2557 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2559 2558
2560 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2559 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2561 void *v) 2560 void *v)
2562 { 2561 {
2563 /* 2562 /*
2564 * Some (well, at least mine) BIOSes hang on reboot if 2563 * Some (well, at least mine) BIOSes hang on reboot if
2565 * in vmx root mode. 2564 * in vmx root mode.
2566 * 2565 *
2567 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down. 2566 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down.
2568 */ 2567 */
2569 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2568 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2570 kvm_rebooting = true; 2569 kvm_rebooting = true;
2571 on_each_cpu(hardware_disable_nolock, NULL, 1); 2570 on_each_cpu(hardware_disable_nolock, NULL, 1);
2572 return NOTIFY_OK; 2571 return NOTIFY_OK;
2573 } 2572 }
2574 2573
2575 static struct notifier_block kvm_reboot_notifier = { 2574 static struct notifier_block kvm_reboot_notifier = {
2576 .notifier_call = kvm_reboot, 2575 .notifier_call = kvm_reboot,
2577 .priority = 0, 2576 .priority = 0,
2578 }; 2577 };
2579 2578
2580 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2579 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2581 { 2580 {
2582 int i; 2581 int i;
2583 2582
2584 for (i = 0; i < bus->dev_count; i++) { 2583 for (i = 0; i < bus->dev_count; i++) {
2585 struct kvm_io_device *pos = bus->range[i].dev; 2584 struct kvm_io_device *pos = bus->range[i].dev;
2586 2585
2587 kvm_iodevice_destructor(pos); 2586 kvm_iodevice_destructor(pos);
2588 } 2587 }
2589 kfree(bus); 2588 kfree(bus);
2590 } 2589 }
2591 2590
2592 int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 2591 int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2593 { 2592 {
2594 const struct kvm_io_range *r1 = p1; 2593 const struct kvm_io_range *r1 = p1;
2595 const struct kvm_io_range *r2 = p2; 2594 const struct kvm_io_range *r2 = p2;
2596 2595
2597 if (r1->addr < r2->addr) 2596 if (r1->addr < r2->addr)
2598 return -1; 2597 return -1;
2599 if (r1->addr + r1->len > r2->addr + r2->len) 2598 if (r1->addr + r1->len > r2->addr + r2->len)
2600 return 1; 2599 return 1;
2601 return 0; /* r1 is contained in (or equal to) r2 */ 2600 return 0; /* r1 is contained in (or equal to) r2 */
2602 } 2601 }
2603 2602
2604 int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, 2603 int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2605 gpa_t addr, int len) 2604 gpa_t addr, int len)
2606 { 2605 {
2607 bus->range[bus->dev_count++] = (struct kvm_io_range) { 2606 bus->range[bus->dev_count++] = (struct kvm_io_range) {
2608 .addr = addr, 2607 .addr = addr,
2609 .len = len, 2608 .len = len,
2610 .dev = dev, 2609 .dev = dev,
2611 }; 2610 };
2612 2611
2613 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), 2612 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2614 kvm_io_bus_sort_cmp, NULL); 2613 kvm_io_bus_sort_cmp, NULL);
2615 2614
2616 return 0; 2615 return 0;
2617 } 2616 }
2618 2617
2619 int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 2618 int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2620 gpa_t addr, int len) 2619 gpa_t addr, int len)
2621 { 2620 {
2622 struct kvm_io_range *range, key; 2621 struct kvm_io_range *range, key;
2623 int off; 2622 int off;
2624 2623
2625 key = (struct kvm_io_range) { 2624 key = (struct kvm_io_range) {
2626 .addr = addr, 2625 .addr = addr,
2627 .len = len, 2626 .len = len,
2628 }; 2627 };
2629 2628
2630 range = bsearch(&key, bus->range, bus->dev_count, 2629 range = bsearch(&key, bus->range, bus->dev_count,
2631 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 2630 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2632 if (range == NULL) 2631 if (range == NULL)
2633 return -ENOENT; 2632 return -ENOENT;
2634 2633
2635 off = range - bus->range; 2634 off = range - bus->range;
2636 2635
2637 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) 2636 while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2638 off--; 2637 off--;
2639 2638
2640 return off; 2639 return off;
2641 } 2640 }
2642 2641
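kvm_io_bus_get_first_dev() relies on two details worth spelling out: the comparator above treats a key range that is contained in an entry as equal, and since several registered ranges can compare equal, the bsearch() hit is rewound to the first of them. A small standalone illustration of the same idiom, hypothetical plain userspace C with arbitrary sample ranges:

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long addr, len; };

/* Same shape as kvm_io_bus_sort_cmp(): 0 when r1 fits inside r2. */
static int range_cmp(const void *p1, const void *p2)
{
        const struct range *r1 = p1, *r2 = p2;

        if (r1->addr < r2->addr)
                return -1;
        if (r1->addr + r1->len > r2->addr + r2->len)
                return 1;
        return 0;
}

int main(void)
{
        struct range bus[] = {
                { 0x100, 4 }, { 0x200, 4 }, { 0x200, 4 }, { 0x300, 8 },
        };
        struct range key = { 0x200, 4 };
        struct range *hit = bsearch(&key, bus, sizeof(bus) / sizeof(bus[0]),
                                    sizeof(bus[0]), range_cmp);
        int off = hit ? (int)(hit - bus) : -1;

        while (off > 0 && range_cmp(&key, &bus[off - 1]) == 0)
                off--;  /* rewind to the first matching entry */
        printf("first match at index %d\n", off);
        return 0;
}
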
2643 /* kvm_io_bus_write - called under kvm->slots_lock */ 2642 /* kvm_io_bus_write - called under kvm->slots_lock */
2644 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2643 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2645 int len, const void *val) 2644 int len, const void *val)
2646 { 2645 {
2647 int idx; 2646 int idx;
2648 struct kvm_io_bus *bus; 2647 struct kvm_io_bus *bus;
2649 struct kvm_io_range range; 2648 struct kvm_io_range range;
2650 2649
2651 range = (struct kvm_io_range) { 2650 range = (struct kvm_io_range) {
2652 .addr = addr, 2651 .addr = addr,
2653 .len = len, 2652 .len = len,
2654 }; 2653 };
2655 2654
2656 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2655 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2657 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2656 idx = kvm_io_bus_get_first_dev(bus, addr, len);
2658 if (idx < 0) 2657 if (idx < 0)
2659 return -EOPNOTSUPP; 2658 return -EOPNOTSUPP;
2660 2659
2661 while (idx < bus->dev_count && 2660 while (idx < bus->dev_count &&
2662 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { 2661 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2663 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) 2662 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2664 return 0; 2663 return 0;
2665 idx++; 2664 idx++;
2666 } 2665 }
2667 2666
2668 return -EOPNOTSUPP; 2667 return -EOPNOTSUPP;
2669 } 2668 }
2670 2669
2671 /* kvm_io_bus_read - called under kvm->slots_lock */ 2670 /* kvm_io_bus_read - called under kvm->slots_lock */
2672 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2671 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2673 int len, void *val) 2672 int len, void *val)
2674 { 2673 {
2675 int idx; 2674 int idx;
2676 struct kvm_io_bus *bus; 2675 struct kvm_io_bus *bus;
2677 struct kvm_io_range range; 2676 struct kvm_io_range range;
2678 2677
2679 range = (struct kvm_io_range) { 2678 range = (struct kvm_io_range) {
2680 .addr = addr, 2679 .addr = addr,
2681 .len = len, 2680 .len = len,
2682 }; 2681 };
2683 2682
2684 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 2683 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2685 idx = kvm_io_bus_get_first_dev(bus, addr, len); 2684 idx = kvm_io_bus_get_first_dev(bus, addr, len);
2686 if (idx < 0) 2685 if (idx < 0)
2687 return -EOPNOTSUPP; 2686 return -EOPNOTSUPP;
2688 2687
2689 while (idx < bus->dev_count && 2688 while (idx < bus->dev_count &&
2690 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { 2689 kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2691 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) 2690 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2692 return 0; 2691 return 0;
2693 idx++; 2692 idx++;
2694 } 2693 }
2695 2694
2696 return -EOPNOTSUPP; 2695 return -EOPNOTSUPP;
2697 } 2696 }
2698 2697
2699 /* Caller must hold slots_lock. */ 2698 /* Caller must hold slots_lock. */
2700 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 2699 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2701 int len, struct kvm_io_device *dev) 2700 int len, struct kvm_io_device *dev)
2702 { 2701 {
2703 struct kvm_io_bus *new_bus, *bus; 2702 struct kvm_io_bus *new_bus, *bus;
2704 2703
2705 bus = kvm->buses[bus_idx]; 2704 bus = kvm->buses[bus_idx];
2706 if (bus->dev_count > NR_IOBUS_DEVS - 1) 2705 if (bus->dev_count > NR_IOBUS_DEVS - 1)
2707 return -ENOSPC; 2706 return -ENOSPC;
2708 2707
2709 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 2708 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
2710 sizeof(struct kvm_io_range)), GFP_KERNEL); 2709 sizeof(struct kvm_io_range)), GFP_KERNEL);
2711 if (!new_bus) 2710 if (!new_bus)
2712 return -ENOMEM; 2711 return -ENOMEM;
2713 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 2712 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
2714 sizeof(struct kvm_io_range))); 2713 sizeof(struct kvm_io_range)));
2715 kvm_io_bus_insert_dev(new_bus, dev, addr, len); 2714 kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2716 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2715 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2717 synchronize_srcu_expedited(&kvm->srcu); /* wait out readers of the old bus before freeing it */ 2716 synchronize_srcu_expedited(&kvm->srcu); /* wait out readers of the old bus before freeing it */
2718 kfree(bus); 2717 kfree(bus);
2719 2718
2720 return 0; 2719 return 0;
2721 } 2720 }
2722 2721
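kvm_io_bus_register_dev() is how devices such as ioeventfds and the coalesced-MMIO buffer end up on the bus walked by kvm_io_bus_write(): the bus array is copied, the new range inserted and sorted, the pointer swapped with rcu_assign_pointer(), and SRCU is synchronized before the old copy is freed. A hypothetical userspace sketch of one path that leads here, the KVM_IOEVENTFD ioctl from kvm_vm_ioctl() above; flags of 0 mean an MMIO range with no datamatch:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical userspace helper: returns the eventfd that will be signalled
 * on guest writes to [gpa, gpa + len), or -1 on error. */
static int add_mmio_eventfd(int vm_fd, uint64_t gpa, uint32_t len)
{
        struct kvm_ioeventfd io;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;
        memset(&io, 0, sizeof(io));
        io.addr = gpa;
        io.len = len;           /* 1, 2, 4 or 8 bytes */
        io.fd = efd;
        if (ioctl(vm_fd, KVM_IOEVENTFD, &io) < 0) {
                close(efd);
                return -1;
        }
        return efd;
}
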
2723 /* Caller must hold slots_lock. */ 2722 /* Caller must hold slots_lock. */
2724 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 2723 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2725 struct kvm_io_device *dev) 2724 struct kvm_io_device *dev)
2726 { 2725 {
2727 int i, r; 2726 int i, r;
2728 struct kvm_io_bus *new_bus, *bus; 2727 struct kvm_io_bus *new_bus, *bus;
2729 2728
2730 bus = kvm->buses[bus_idx]; 2729 bus = kvm->buses[bus_idx];
2731 r = -ENOENT; 2730 r = -ENOENT;
2732 for (i = 0; i < bus->dev_count; i++) 2731 for (i = 0; i < bus->dev_count; i++)
2733 if (bus->range[i].dev == dev) { 2732 if (bus->range[i].dev == dev) {
2734 r = 0; 2733 r = 0;
2735 break; 2734 break;
2736 } 2735 }
2737 2736
2738 if (r) 2737 if (r)
2739 return r; 2738 return r;
2740 2739
2741 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * 2740 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2742 sizeof(struct kvm_io_range)), GFP_KERNEL); 2741 sizeof(struct kvm_io_range)), GFP_KERNEL);
2743 if (!new_bus) 2742 if (!new_bus)
2744 return -ENOMEM; 2743 return -ENOMEM;
2745 2744
2746 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 2745 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
2747 new_bus->dev_count--; 2746 new_bus->dev_count--;
2748 memcpy(new_bus->range + i, bus->range + i + 1, 2747 memcpy(new_bus->range + i, bus->range + i + 1,
2749 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 2748 (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
2750 2749
2751 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 2750 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2752 synchronize_srcu_expedited(&kvm->srcu); 2751 synchronize_srcu_expedited(&kvm->srcu);
2753 kfree(bus); 2752 kfree(bus);
2754 return r; 2753 return r;
2755 } 2754 }
2756 2755
2757 static struct notifier_block kvm_cpu_notifier = { 2756 static struct notifier_block kvm_cpu_notifier = {
2758 .notifier_call = kvm_cpu_hotplug, 2757 .notifier_call = kvm_cpu_hotplug,
2759 }; 2758 };
2760 2759
2761 static int vm_stat_get(void *_offset, u64 *val) 2760 static int vm_stat_get(void *_offset, u64 *val)
2762 { 2761 {
2763 unsigned offset = (long)_offset; 2762 unsigned offset = (long)_offset;
2764 struct kvm *kvm; 2763 struct kvm *kvm;
2765 2764
2766 *val = 0; 2765 *val = 0;
2767 raw_spin_lock(&kvm_lock); 2766 raw_spin_lock(&kvm_lock);
2768 list_for_each_entry(kvm, &vm_list, vm_list) 2767 list_for_each_entry(kvm, &vm_list, vm_list)
2769 *val += *(u32 *)((void *)kvm + offset); 2768 *val += *(u32 *)((void *)kvm + offset);
2770 raw_spin_unlock(&kvm_lock); 2769 raw_spin_unlock(&kvm_lock);
2771 return 0; 2770 return 0;
2772 } 2771 }
2773 2772
2774 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 2773 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2775 2774
2776 static int vcpu_stat_get(void *_offset, u64 *val) 2775 static int vcpu_stat_get(void *_offset, u64 *val)
2777 { 2776 {
2778 unsigned offset = (long)_offset; 2777 unsigned offset = (long)_offset;
2779 struct kvm *kvm; 2778 struct kvm *kvm;
2780 struct kvm_vcpu *vcpu; 2779 struct kvm_vcpu *vcpu;
2781 int i; 2780 int i;
2782 2781
2783 *val = 0; 2782 *val = 0;
2784 raw_spin_lock(&kvm_lock); 2783 raw_spin_lock(&kvm_lock);
2785 list_for_each_entry(kvm, &vm_list, vm_list) 2784 list_for_each_entry(kvm, &vm_list, vm_list)
2786 kvm_for_each_vcpu(i, vcpu, kvm) 2785 kvm_for_each_vcpu(i, vcpu, kvm)
2787 *val += *(u32 *)((void *)vcpu + offset); 2786 *val += *(u32 *)((void *)vcpu + offset);
2788 2787
2789 raw_spin_unlock(&kvm_lock); 2788 raw_spin_unlock(&kvm_lock);
2790 return 0; 2789 return 0;
2791 } 2790 }
2792 2791
2793 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 2792 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2794 2793
2795 static const struct file_operations *stat_fops[] = { 2794 static const struct file_operations *stat_fops[] = {
2796 [KVM_STAT_VCPU] = &vcpu_stat_fops, 2795 [KVM_STAT_VCPU] = &vcpu_stat_fops,
2797 [KVM_STAT_VM] = &vm_stat_fops, 2796 [KVM_STAT_VM] = &vm_stat_fops,
2798 }; 2797 };
2799 2798
2800 static int kvm_init_debug(void) 2799 static int kvm_init_debug(void)
2801 { 2800 {
2802 int r = -EFAULT; 2801 int r = -EFAULT;
2803 struct kvm_stats_debugfs_item *p; 2802 struct kvm_stats_debugfs_item *p;
2804 2803
2805 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2804 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2806 if (kvm_debugfs_dir == NULL) 2805 if (kvm_debugfs_dir == NULL)
2807 goto out; 2806 goto out;
2808 2807
2809 for (p = debugfs_entries; p->name; ++p) { 2808 for (p = debugfs_entries; p->name; ++p) {
2810 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2809 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2811 (void *)(long)p->offset, 2810 (void *)(long)p->offset,
2812 stat_fops[p->kind]); 2811 stat_fops[p->kind]);
2813 if (p->dentry == NULL) 2812 if (p->dentry == NULL)
2814 goto out_dir; 2813 goto out_dir;
2815 } 2814 }
2816 2815
2817 return 0; 2816 return 0;
2818 2817
2819 out_dir: 2818 out_dir:
2820 debugfs_remove_recursive(kvm_debugfs_dir); 2819 debugfs_remove_recursive(kvm_debugfs_dir);
2821 out: 2820 out:
2822 return r; 2821 return r;
2823 } 2822 }
2824 2823
2825 static void kvm_exit_debug(void) 2824 static void kvm_exit_debug(void)
2826 { 2825 {
2827 struct kvm_stats_debugfs_item *p; 2826 struct kvm_stats_debugfs_item *p;
2828 2827
2829 for (p = debugfs_entries; p->name; ++p) 2828 for (p = debugfs_entries; p->name; ++p)
2830 debugfs_remove(p->dentry); 2829 debugfs_remove(p->dentry);
2831 debugfs_remove(kvm_debugfs_dir); 2830 debugfs_remove(kvm_debugfs_dir);
2832 } 2831 }
2833 2832
2834 static int kvm_suspend(void) 2833 static int kvm_suspend(void)
2835 { 2834 {
2836 if (kvm_usage_count) 2835 if (kvm_usage_count)
2837 hardware_disable_nolock(NULL); 2836 hardware_disable_nolock(NULL);
2838 return 0; 2837 return 0;
2839 } 2838 }
2840 2839
2841 static void kvm_resume(void) 2840 static void kvm_resume(void)
2842 { 2841 {
2843 if (kvm_usage_count) { 2842 if (kvm_usage_count) {
2844 WARN_ON(raw_spin_is_locked(&kvm_lock)); 2843 WARN_ON(raw_spin_is_locked(&kvm_lock));
2845 hardware_enable_nolock(NULL); 2844 hardware_enable_nolock(NULL);
2846 } 2845 }
2847 } 2846 }
2848 2847
2849 static struct syscore_ops kvm_syscore_ops = { 2848 static struct syscore_ops kvm_syscore_ops = {
2850 .suspend = kvm_suspend, 2849 .suspend = kvm_suspend,
2851 .resume = kvm_resume, 2850 .resume = kvm_resume,
2852 }; 2851 };
2853 2852
2854 static inline 2853 static inline
2855 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2854 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2856 { 2855 {
2857 return container_of(pn, struct kvm_vcpu, preempt_notifier); 2856 return container_of(pn, struct kvm_vcpu, preempt_notifier);
2858 } 2857 }
2859 2858
2860 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 2859 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2861 { 2860 {
2862 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2861 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2863 2862
2864 kvm_arch_vcpu_load(vcpu, cpu); 2863 kvm_arch_vcpu_load(vcpu, cpu);
2865 } 2864 }
2866 2865
2867 static void kvm_sched_out(struct preempt_notifier *pn, 2866 static void kvm_sched_out(struct preempt_notifier *pn,
2868 struct task_struct *next) 2867 struct task_struct *next)
2869 { 2868 {
2870 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2869 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2871 2870
2872 kvm_arch_vcpu_put(vcpu); 2871 kvm_arch_vcpu_put(vcpu);
2873 } 2872 }
2874 2873
2875 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 2874 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2876 struct module *module) 2875 struct module *module)
2877 { 2876 {
2878 int r; 2877 int r;
2879 int cpu; 2878 int cpu;
2880 2879
2881 r = kvm_arch_init(opaque); 2880 r = kvm_arch_init(opaque);
2882 if (r) 2881 if (r)
2883 goto out_fail; 2882 goto out_fail;
2884 2883
2885 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2884 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2886 r = -ENOMEM; 2885 r = -ENOMEM;
2887 goto out_free_0; 2886 goto out_free_0;
2888 } 2887 }
2889 2888
2890 r = kvm_arch_hardware_setup(); 2889 r = kvm_arch_hardware_setup();
2891 if (r < 0) 2890 if (r < 0)
2892 goto out_free_0a; 2891 goto out_free_0a;
2893 2892
2894 for_each_online_cpu(cpu) { 2893 for_each_online_cpu(cpu) {
2895 smp_call_function_single(cpu, 2894 smp_call_function_single(cpu,
2896 kvm_arch_check_processor_compat, 2895 kvm_arch_check_processor_compat,
2897 &r, 1); 2896 &r, 1);
2898 if (r < 0) 2897 if (r < 0)
2899 goto out_free_1; 2898 goto out_free_1;
2900 } 2899 }
2901 2900
2902 r = register_cpu_notifier(&kvm_cpu_notifier); 2901 r = register_cpu_notifier(&kvm_cpu_notifier);
2903 if (r) 2902 if (r)
2904 goto out_free_2; 2903 goto out_free_2;
2905 register_reboot_notifier(&kvm_reboot_notifier); 2904 register_reboot_notifier(&kvm_reboot_notifier);
2906 2905
2907 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 2906 /* A kmem cache lets us meet the alignment requirements of fx_save. */
2908 if (!vcpu_align) 2907 if (!vcpu_align)
2909 vcpu_align = __alignof__(struct kvm_vcpu); 2908 vcpu_align = __alignof__(struct kvm_vcpu);
2910 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 2909 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2911 0, NULL); 2910 0, NULL);
2912 if (!kvm_vcpu_cache) { 2911 if (!kvm_vcpu_cache) {
2913 r = -ENOMEM; 2912 r = -ENOMEM;
2914 goto out_free_3; 2913 goto out_free_3;
2915 } 2914 }
2916 2915
2917 r = kvm_async_pf_init(); 2916 r = kvm_async_pf_init();
2918 if (r) 2917 if (r)
2919 goto out_free; 2918 goto out_free;
2920 2919
2921 kvm_chardev_ops.owner = module; 2920 kvm_chardev_ops.owner = module;
2922 kvm_vm_fops.owner = module; 2921 kvm_vm_fops.owner = module;
2923 kvm_vcpu_fops.owner = module; 2922 kvm_vcpu_fops.owner = module;
2924 2923
2925 r = misc_register(&kvm_dev); 2924 r = misc_register(&kvm_dev);
2926 if (r) { 2925 if (r) {
2927 printk(KERN_ERR "kvm: misc device register failed\n"); 2926 printk(KERN_ERR "kvm: misc device register failed\n");
2928 goto out_unreg; 2927 goto out_unreg;
2929 } 2928 }
2930 2929
2931 register_syscore_ops(&kvm_syscore_ops); 2930 register_syscore_ops(&kvm_syscore_ops);
2932 2931
2933 kvm_preempt_ops.sched_in = kvm_sched_in; 2932 kvm_preempt_ops.sched_in = kvm_sched_in;
2934 kvm_preempt_ops.sched_out = kvm_sched_out; 2933 kvm_preempt_ops.sched_out = kvm_sched_out;
2935 2934
2936 r = kvm_init_debug(); 2935 r = kvm_init_debug();
2937 if (r) { 2936 if (r) {
2938 printk(KERN_ERR "kvm: create debugfs files failed\n"); 2937 printk(KERN_ERR "kvm: create debugfs files failed\n");
2939 goto out_undebugfs; 2938 goto out_undebugfs;
2940 } 2939 }
2941 2940
2942 return 0; 2941 return 0;
2943 2942
2944 out_undebugfs: 2943 out_undebugfs:
2945 unregister_syscore_ops(&kvm_syscore_ops); 2944 unregister_syscore_ops(&kvm_syscore_ops);
2946 out_unreg: 2945 out_unreg:
2947 kvm_async_pf_deinit(); 2946 kvm_async_pf_deinit();
2948 out_free: 2947 out_free:
2949 kmem_cache_destroy(kvm_vcpu_cache); 2948 kmem_cache_destroy(kvm_vcpu_cache);
2950 out_free_3: 2949 out_free_3:
2951 unregister_reboot_notifier(&kvm_reboot_notifier); 2950 unregister_reboot_notifier(&kvm_reboot_notifier);
2952 unregister_cpu_notifier(&kvm_cpu_notifier); 2951 unregister_cpu_notifier(&kvm_cpu_notifier);
2953 out_free_2: 2952 out_free_2:
2954 out_free_1: 2953 out_free_1:
2955 kvm_arch_hardware_unsetup(); 2954 kvm_arch_hardware_unsetup();
2956 out_free_0a: 2955 out_free_0a:
2957 free_cpumask_var(cpus_hardware_enabled); 2956 free_cpumask_var(cpus_hardware_enabled);
2958 out_free_0: 2957 out_free_0:
2959 kvm_arch_exit(); 2958 kvm_arch_exit();
2960 out_fail: 2959 out_fail:
2961 return r; 2960 return r;
2962 } 2961 }
2963 EXPORT_SYMBOL_GPL(kvm_init); 2962 EXPORT_SYMBOL_GPL(kvm_init);
2964 2963
2965 void kvm_exit(void) 2964 void kvm_exit(void)
2966 { 2965 {
2967 kvm_exit_debug(); 2966 kvm_exit_debug();
2968 misc_deregister(&kvm_dev); 2967 misc_deregister(&kvm_dev);
2969 kmem_cache_destroy(kvm_vcpu_cache); 2968 kmem_cache_destroy(kvm_vcpu_cache);
2970 kvm_async_pf_deinit(); 2969 kvm_async_pf_deinit();
2971 unregister_syscore_ops(&kvm_syscore_ops); 2970 unregister_syscore_ops(&kvm_syscore_ops);
2972 unregister_reboot_notifier(&kvm_reboot_notifier); 2971 unregister_reboot_notifier(&kvm_reboot_notifier);
2973 unregister_cpu_notifier(&kvm_cpu_notifier); 2972 unregister_cpu_notifier(&kvm_cpu_notifier);
2974 on_each_cpu(hardware_disable_nolock, NULL, 1); 2973 on_each_cpu(hardware_disable_nolock, NULL, 1);
2975 kvm_arch_hardware_unsetup(); 2974 kvm_arch_hardware_unsetup();
2976 kvm_arch_exit(); 2975 kvm_arch_exit();
2977 free_cpumask_var(cpus_hardware_enabled); 2976 free_cpumask_var(cpus_hardware_enabled);
2978 } 2977 }
2979 EXPORT_SYMBOL_GPL(kvm_exit); 2978 EXPORT_SYMBOL_GPL(kvm_exit);
2980 2979