Commit c9aaa8957f203bd6df83b002fb40b98390bed078

Authored by Glauber Costa
Committed by Avi Kivity
1 parent 9ddabbe72e

KVM: Steal time implementation

To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes
outside the VM, while the vcpu had meaningful work to do - halt
time does not count.

This information is acquired through the run_delay field of
delayacct/schedstats infrastructure, that counts time spent in a
runqueue but not running.

Steal time is per-cpu information, so the traditional MSR-based
infrastructure is used. A new MSR, MSR_KVM_STEAL_TIME, holds the
memory area address containing information about steal time.

This patch contains the hypervisor part of the steal time infrastructure,
and can be backported independently of the guest portion.

[avi, yongjie: export delayacct_on, to avoid build failures in some configs]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Yongjie Ren <yongjie.ren@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

Showing 6 changed files with 89 additions and 2 deletions Side-by-side Diff

arch/x86/include/asm/kvm_host.h
... ... @@ -389,6 +389,15 @@
389 389 unsigned int hw_tsc_khz;
390 390 unsigned int time_offset;
391 391 struct page *time_page;
  392 +
  393 + struct {
  394 + u64 msr_val;
  395 + u64 last_steal;
  396 + u64 accum_steal;
  397 + struct gfn_to_hva_cache stime;
  398 + struct kvm_steal_time steal;
  399 + } st;
  400 +
392 401 u64 last_guest_tsc;
393 402 u64 last_kernel_ns;
394 403 u64 last_tsc_nsec;
arch/x86/include/asm/kvm_para.h
... ... @@ -45,6 +45,10 @@
45 45 __u32 pad[12];
46 46 };
47 47  
  48 +#define KVM_STEAL_ALIGNMENT_BITS 5
  49 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
  50 +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
  51 +
48 52 #define KVM_MAX_MMU_OP_BATCH 32
49 53  
50 54 #define KVM_ASYNC_PF_ENABLED (1 << 0)
arch/x86/kvm/Kconfig
... ... @@ -31,6 +31,7 @@
31 31 select KVM_ASYNC_PF
32 32 select USER_RETURN_NOTIFIER
33 33 select KVM_MMIO
  34 + select TASK_DELAY_ACCT
34 35 ---help---
35 36 Support hosting fully virtualized guest machines using hardware
36 37 virtualization extensions. You will need a fairly recent
... ... @@ -808,12 +808,12 @@
808 808 * kvm-specific. Those are put in the beginning of the list.
809 809 */
810 810  
811   -#define KVM_SAVE_MSRS_BEGIN 8
  811 +#define KVM_SAVE_MSRS_BEGIN 9
812 812 static u32 msrs_to_save[] = {
813 813 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
814 814 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
815 815 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
816   - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
  816 + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
817 817 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
818 818 MSR_STAR,
819 819 #ifdef CONFIG_X86_64
... ... @@ -1488,6 +1488,35 @@
1488 1488 }
1489 1489 }
1490 1490  
  1491 +static void accumulate_steal_time(struct kvm_vcpu *vcpu)
  1492 +{
  1493 + u64 delta;
  1494 +
  1495 + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
  1496 + return;
  1497 +
  1498 + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
  1499 + vcpu->arch.st.last_steal = current->sched_info.run_delay;
  1500 + vcpu->arch.st.accum_steal = delta;
  1501 +}
  1502 +
  1503 +static void record_steal_time(struct kvm_vcpu *vcpu)
  1504 +{
  1505 + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
  1506 + return;
  1507 +
  1508 + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
  1509 + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
  1510 + return;
  1511 +
  1512 + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
  1513 + vcpu->arch.st.steal.version += 2;
  1514 + vcpu->arch.st.accum_steal = 0;
  1515 +
  1516 + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
  1517 + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
  1518 +}
  1519 +
1491 1520 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1492 1521 {
1493 1522 switch (msr) {
... ... @@ -1570,6 +1599,33 @@
1570 1599 if (kvm_pv_enable_async_pf(vcpu, data))
1571 1600 return 1;
1572 1601 break;
  1602 + case MSR_KVM_STEAL_TIME:
  1603 +
  1604 + if (unlikely(!sched_info_on()))
  1605 + return 1;
  1606 +
  1607 + if (data & KVM_STEAL_RESERVED_MASK)
  1608 + return 1;
  1609 +
  1610 + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
  1611 + data & KVM_STEAL_VALID_BITS))
  1612 + return 1;
  1613 +
  1614 + vcpu->arch.st.msr_val = data;
  1615 +
  1616 + if (!(data & KVM_MSR_ENABLED))
  1617 + break;
  1618 +
  1619 + vcpu->arch.st.last_steal = current->sched_info.run_delay;
  1620 +
  1621 + preempt_disable();
  1622 + accumulate_steal_time(vcpu);
  1623 + preempt_enable();
  1624 +
  1625 + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
  1626 +
  1627 + break;
  1628 +
1573 1629 case MSR_IA32_MCG_CTL:
1574 1630 case MSR_IA32_MCG_STATUS:
1575 1631 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
... ... @@ -1855,6 +1911,9 @@
1855 1911 case MSR_KVM_ASYNC_PF_EN:
1856 1912 data = vcpu->arch.apf.msr_val;
1857 1913 break;
  1914 + case MSR_KVM_STEAL_TIME:
  1915 + data = vcpu->arch.st.msr_val;
  1916 + break;
1858 1917 case MSR_IA32_P5_MC_ADDR:
1859 1918 case MSR_IA32_P5_MC_TYPE:
1860 1919 case MSR_IA32_MCG_CAP:
... ... @@ -2166,6 +2225,9 @@
2166 2225 kvm_migrate_timers(vcpu);
2167 2226 vcpu->cpu = cpu;
2168 2227 }
  2228 +
  2229 + accumulate_steal_time(vcpu);
  2230 + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2169 2231 }
2170 2232  
2171 2233 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
... ... @@ -2487,6 +2549,10 @@
2487 2549 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2488 2550 (1 << KVM_FEATURE_ASYNC_PF) |
2489 2551 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
  2552 +
  2553 + if (sched_info_on())
  2554 + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
  2555 +
2490 2556 entry->ebx = 0;
2491 2557 entry->ecx = 0;
2492 2558 entry->edx = 0;
... ... @@ -5470,6 +5536,9 @@
5470 5536 r = 1;
5471 5537 goto out;
5472 5538 }
  5539 + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
  5540 + record_steal_time(vcpu);
  5541 +
5473 5542 }
5474 5543  
5475 5544 r = kvm_mmu_reload(vcpu);
... ... @@ -6206,6 +6275,7 @@
6206 6275  
6207 6276 kvm_make_request(KVM_REQ_EVENT, vcpu);
6208 6277 vcpu->arch.apf.msr_val = 0;
  6278 + vcpu->arch.st.msr_val = 0;
6209 6279  
6210 6280 kvmclock_reset(vcpu);
6211 6281  
include/linux/kvm_host.h
... ... @@ -47,6 +47,7 @@
47 47 #define KVM_REQ_DEACTIVATE_FPU 10
48 48 #define KVM_REQ_EVENT 11
49 49 #define KVM_REQ_APF_HALT 12
  50 +#define KVM_REQ_STEAL_UPDATE 13
50 51  
51 52 #define KVM_USERSPACE_IRQ_SOURCE_ID 0
52 53  
... ... @@ -19,8 +19,10 @@
19 19 #include <linux/time.h>
20 20 #include <linux/sysctl.h>
21 21 #include <linux/delayacct.h>
  22 +#include <linux/module.h>
22 23  
23 24 int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
  25 +EXPORT_SYMBOL_GPL(delayacct_on);
24 26 struct kmem_cache *delayacct_cache;
25 27  
26 28 static int __init delayacct_setup_disable(char *str)