Commit 2f333bcb4edd8daef99dabe4e7df8277af73cff1

Authored by Marcelo Tosatti
Committed by Avi Kivity
1 parent 9f81128591

KVM: MMU: hypercall based pte updates and TLB flushes

Hypercall based pte updates are faster than faults, and also allow use
of the lazy MMU mode to batch operations.

Don't report the feature if two dimensional paging is enabled.

[avi:
 - one mmu_op hypercall instead of one per op
 - allow 64-bit gpa on hypercall
 - don't pass host errors (-ENOMEM) to guest]

[akpm: warning fix on i386]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Showing 6 changed files with 190 additions and 3 deletions (Side-by-side Diff view)

arch/x86/kvm/mmu.c
... ... @@ -28,6 +28,7 @@
28 28 #include <linux/module.h>
29 29 #include <linux/swap.h>
30 30 #include <linux/hugetlb.h>
  31 +#include <linux/compiler.h>
31 32  
32 33 #include <asm/page.h>
33 34 #include <asm/cmpxchg.h>
... ... @@ -40,7 +41,7 @@
40 41 * 2. while doing 1. it walks guest-physical to host-physical
41 42 * If the hardware supports that we don't need to do shadow paging.
42 43 */
43   -static bool tdp_enabled = false;
  44 +bool tdp_enabled = false;
44 45  
45 46 #undef MMU_DEBUG
46 47  
... ... @@ -167,6 +168,13 @@
167 168 #define ACC_USER_MASK PT_USER_MASK
168 169 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
169 170  
  171 +struct kvm_pv_mmu_op_buffer {
  172 + void *ptr;
  173 + unsigned len;
  174 + unsigned processed;
  175 + char buf[512] __aligned(sizeof(long));
  176 +};
  177 +
170 178 struct kvm_rmap_desc {
171 179 u64 *shadow_ptes[RMAP_EXT];
172 180 struct kvm_rmap_desc *more;
... ... @@ -2001,6 +2009,132 @@
2001 2009 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
2002 2010  
2003 2011 return nr_mmu_pages;
  2012 +}
  2013 +
  2014 +static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
  2015 + unsigned len)
  2016 +{
  2017 + if (len > buffer->len)
  2018 + return NULL;
  2019 + return buffer->ptr;
  2020 +}
  2021 +
  2022 +static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
  2023 + unsigned len)
  2024 +{
  2025 + void *ret;
  2026 +
  2027 + ret = pv_mmu_peek_buffer(buffer, len);
  2028 + if (!ret)
  2029 + return ret;
  2030 + buffer->ptr += len;
  2031 + buffer->len -= len;
  2032 + buffer->processed += len;
  2033 + return ret;
  2034 +}
  2035 +
  2036 +static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
  2037 + gpa_t addr, gpa_t value)
  2038 +{
  2039 + int bytes = 8;
  2040 + int r;
  2041 +
  2042 + if (!is_long_mode(vcpu) && !is_pae(vcpu))
  2043 + bytes = 4;
  2044 +
  2045 + r = mmu_topup_memory_caches(vcpu);
  2046 + if (r)
  2047 + return r;
  2048 +
  2049 + if (!__emulator_write_phys(vcpu, addr, &value, bytes))
  2050 + return -EFAULT;
  2051 +
  2052 + return 1;
  2053 +}
  2054 +
  2055 +static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
  2056 +{
  2057 + kvm_x86_ops->tlb_flush(vcpu);
  2058 + return 1;
  2059 +}
  2060 +
  2061 +static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
  2062 +{
  2063 + spin_lock(&vcpu->kvm->mmu_lock);
  2064 + mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
  2065 + spin_unlock(&vcpu->kvm->mmu_lock);
  2066 + return 1;
  2067 +}
  2068 +
  2069 +static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
  2070 + struct kvm_pv_mmu_op_buffer *buffer)
  2071 +{
  2072 + struct kvm_mmu_op_header *header;
  2073 +
  2074 + header = pv_mmu_peek_buffer(buffer, sizeof *header);
  2075 + if (!header)
  2076 + return 0;
  2077 + switch (header->op) {
  2078 + case KVM_MMU_OP_WRITE_PTE: {
  2079 + struct kvm_mmu_op_write_pte *wpte;
  2080 +
  2081 + wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
  2082 + if (!wpte)
  2083 + return 0;
  2084 + return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
  2085 + wpte->pte_val);
  2086 + }
  2087 + case KVM_MMU_OP_FLUSH_TLB: {
  2088 + struct kvm_mmu_op_flush_tlb *ftlb;
  2089 +
  2090 + ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
  2091 + if (!ftlb)
  2092 + return 0;
  2093 + return kvm_pv_mmu_flush_tlb(vcpu);
  2094 + }
  2095 + case KVM_MMU_OP_RELEASE_PT: {
  2096 + struct kvm_mmu_op_release_pt *rpt;
  2097 +
  2098 + rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
  2099 + if (!rpt)
  2100 + return 0;
  2101 + return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
  2102 + }
  2103 + default: return 0;
  2104 + }
  2105 +}
  2106 +
  2107 +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
  2108 + gpa_t addr, unsigned long *ret)
  2109 +{
  2110 + int r;
  2111 + struct kvm_pv_mmu_op_buffer buffer;
  2112 +
  2113 + down_read(&vcpu->kvm->slots_lock);
  2114 + down_read(&current->mm->mmap_sem);
  2115 +
  2116 + buffer.ptr = buffer.buf;
  2117 + buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
  2118 + buffer.processed = 0;
  2119 +
  2120 + r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
  2121 + if (r)
  2122 + goto out;
  2123 +
  2124 + while (buffer.len) {
  2125 + r = kvm_pv_mmu_op_one(vcpu, &buffer);
  2126 + if (r < 0)
  2127 + goto out;
  2128 + if (r == 0)
  2129 + break;
  2130 + }
  2131 +
  2132 + r = 1;
  2133 +out:
  2134 + *ret = buffer.processed;
  2135 + up_read(&current->mm->mmap_sem);
  2136 + up_read(&vcpu->kvm->slots_lock);
  2137 + return r;
2004 2138 }
2005 2139  
2006 2140 #ifdef AUDIT
arch/x86/kvm/x86.c
... ... @@ -832,6 +832,9 @@
832 832 case KVM_CAP_NR_MEMSLOTS:
833 833 r = KVM_MEMORY_SLOTS;
834 834 break;
  835 + case KVM_CAP_PV_MMU:
  836 + r = !tdp_enabled;
  837 + break;
835 838 default:
836 839 r = 0;
837 840 break;
838 841  
... ... @@ -2452,9 +2455,19 @@
2452 2455 }
2453 2456 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2454 2457  
  2458 +static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
  2459 + unsigned long a1)
  2460 +{
  2461 + if (is_long_mode(vcpu))
  2462 + return a0;
  2463 + else
  2464 + return a0 | ((gpa_t)a1 << 32);
  2465 +}
  2466 +
2455 2467 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2456 2468 {
2457 2469 unsigned long nr, a0, a1, a2, a3, ret;
  2470 + int r = 1;
2458 2471  
2459 2472 kvm_x86_ops->cache_regs(vcpu);
2460 2473  
... ... @@ -2476,6 +2489,9 @@
2476 2489 case KVM_HC_VAPIC_POLL_IRQ:
2477 2490 ret = 0;
2478 2491 break;
  2492 + case KVM_HC_MMU_OP:
  2493 + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
  2494 + break;
2479 2495 default:
2480 2496 ret = -KVM_ENOSYS;
2481 2497 break;
... ... @@ -2483,7 +2499,7 @@
2483 2499 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2484 2500 kvm_x86_ops->decache_regs(vcpu);
2485 2501 ++vcpu->stat.hypercalls;
2486   - return 0;
  2502 + return r;
2487 2503 }
2488 2504 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2489 2505  
include/asm-x86/kvm_host.h
... ... @@ -434,6 +434,10 @@
434 434  
435 435 int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
436 436 const void *val, int bytes);
  437 +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
  438 + gpa_t addr, unsigned long *ret);
  439 +
  440 +extern bool tdp_enabled;
437 441  
438 442 enum emulation_result {
439 443 EMULATE_DONE, /* no further processing */
include/asm-x86/kvm_para.h
... ... @@ -12,9 +12,38 @@
12 12 #define KVM_CPUID_FEATURES 0x40000001
13 13 #define KVM_FEATURE_CLOCKSOURCE 0
14 14 #define KVM_FEATURE_NOP_IO_DELAY 1
  15 +#define KVM_FEATURE_MMU_OP 2
15 16  
16 17 #define MSR_KVM_WALL_CLOCK 0x11
17 18 #define MSR_KVM_SYSTEM_TIME 0x12
  19 +
  20 +#define KVM_MAX_MMU_OP_BATCH 32
  21 +
  22 +/* Operations for KVM_HC_MMU_OP */
  23 +#define KVM_MMU_OP_WRITE_PTE 1
  24 +#define KVM_MMU_OP_FLUSH_TLB 2
  25 +#define KVM_MMU_OP_RELEASE_PT 3
  26 +
  27 +/* Payload for KVM_HC_MMU_OP */
  28 +struct kvm_mmu_op_header {
  29 + __u32 op;
  30 + __u32 pad;
  31 +};
  32 +
  33 +struct kvm_mmu_op_write_pte {
  34 + struct kvm_mmu_op_header header;
  35 + __u64 pte_phys;
  36 + __u64 pte_val;
  37 +};
  38 +
  39 +struct kvm_mmu_op_flush_tlb {
  40 + struct kvm_mmu_op_header header;
  41 +};
  42 +
  43 +struct kvm_mmu_op_release_pt {
  44 + struct kvm_mmu_op_header header;
  45 + __u64 pt_phys;
  46 +};
18 47  
19 48 #ifdef __KERNEL__
20 49 #include <asm/processor.h>
include/linux/kvm.h
... ... @@ -238,6 +238,7 @@
238 238 #define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */
239 239 #define KVM_CAP_PIT 11
240 240 #define KVM_CAP_NOP_IO_DELAY 12
  241 +#define KVM_CAP_PV_MMU 13
241 242  
242 243 /*
243 244 * ioctls for VM fds
include/linux/kvm_para.h
... ... @@ -11,8 +11,11 @@
11 11  
12 12 /* Return values for hypercalls */
13 13 #define KVM_ENOSYS 1000
  14 +#define KVM_EFAULT EFAULT
  15 +#define KVM_E2BIG E2BIG
14 16  
15   -#define KVM_HC_VAPIC_POLL_IRQ 1
  17 +#define KVM_HC_VAPIC_POLL_IRQ 1
  18 +#define KVM_HC_MMU_OP 2
16 19  
17 20 /*
18 21 * hypercalls use architecture specific