Commit 790c73f6289a204f858ffdcbe4a2b38e91657ec6

Authored by Glauber de Oliveira Costa
Committed by Avi Kivity
1 parent 18068523d3

x86: KVM guest: paravirtualized clocksource

This is the guest part of the kvm clock implementation.
It does not do tsc-only timing, as the tsc can have deltas
between cpus, and it did not seem worthwhile to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

[randy dunlap: add missing include]
[randy dunlap: disallow on Voyager or Visual WS]

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Showing 5 changed files with 182 additions and 0 deletions Side-by-side Diff

... ... @@ -373,6 +373,17 @@
373 373 at the moment), by linking the kernel to a GPL-ed ROM module
374 374 provided by the hypervisor.
375 375  
  376 +config KVM_CLOCK
  377 + bool "KVM paravirtualized clock"
  378 + select PARAVIRT
  379 + depends on !(X86_VISWS || X86_VOYAGER)
  380 + help
  381 + Turning on this option will allow you to run a paravirtualized clock
  382 + when running over the KVM hypervisor. Instead of relying on a PIT
  383 + (or some other device) emulation by the underlying device model, the host
  384 + provides the guest with timing infrastructure such as time of day and
  385 + system time.
  386 +
376 387 source "arch/x86/lguest/Kconfig"
377 388  
378 389 config PARAVIRT
arch/x86/kernel/Makefile
... ... @@ -80,6 +80,7 @@
80 80 obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
81 81  
82 82 obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
  83 +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
83 84 obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
84 85  
85 86 ifdef CONFIG_INPUT_PCSPKR
arch/x86/kernel/kvmclock.c
  1 +/* KVM paravirtual clock driver. A clocksource implementation
  2 + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
  3 +
  4 + This program is free software; you can redistribute it and/or modify
  5 + it under the terms of the GNU General Public License as published by
  6 + the Free Software Foundation; either version 2 of the License, or
  7 + (at your option) any later version.
  8 +
  9 + This program is distributed in the hope that it will be useful,
  10 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12 + GNU General Public License for more details.
  13 +
  14 + You should have received a copy of the GNU General Public License
  15 + along with this program; if not, write to the Free Software
  16 + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17 +*/
  18 +
  19 +#include <linux/clocksource.h>
  20 +#include <linux/kvm_para.h>
  21 +#include <asm/arch_hooks.h>
  22 +#include <asm/msr.h>
  23 +#include <asm/apic.h>
  24 +#include <linux/percpu.h>
  25 +
  26 +#define KVM_SCALE 22
  27 +
  28 +static int kvmclock = 1;
  29 +
  30 +static int parse_no_kvmclock(char *arg)
  31 +{
  32 + kvmclock = 0;
  33 + return 0;
  34 +}
  35 +early_param("no-kvmclock", parse_no_kvmclock);
  36 +
  37 +/* The hypervisor will put information about time periodically here */
  38 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
  39 +#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
  40 +
  41 +static inline u64 kvm_get_delta(u64 last_tsc)
  42 +{
  43 + int cpu = smp_processor_id();
  44 + u64 delta = native_read_tsc() - last_tsc;
  45 + return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
  46 +}
  47 +
  48 +static struct kvm_wall_clock wall_clock;
  49 +static cycle_t kvm_clock_read(void);
  50 +/*
  51 + * The wallclock is the time of day when we booted. Since then, some time may
  52 + * have elapsed since the hypervisor wrote the data. So we try to account for
  53 + * that with system time
  54 + */
  55 +unsigned long kvm_get_wallclock(void)
  56 +{
  57 + u32 wc_sec, wc_nsec;
  58 + u64 delta;
  59 + struct timespec ts;
  60 + int version, nsec;
  61 + int low, high;
  62 +
  63 + low = (int)__pa(&wall_clock);
  64 + high = ((u64)__pa(&wall_clock) >> 32);
  65 +
  66 + delta = kvm_clock_read();
  67 +
  68 + native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
  69 + do {
  70 + version = wall_clock.wc_version;
  71 + rmb();
  72 + wc_sec = wall_clock.wc_sec;
  73 + wc_nsec = wall_clock.wc_nsec;
  74 + rmb();
  75 + } while ((wall_clock.wc_version != version) || (version & 1));
  76 +
  77 + delta = kvm_clock_read() - delta;
  78 + delta += wc_nsec;
  79 + nsec = do_div(delta, NSEC_PER_SEC);
  80 + set_normalized_timespec(&ts, wc_sec + delta, nsec);
  81 + /*
  82 + * Of all mechanisms of time adjustment I've tested, this one
  83 + * was the champion!
  84 + */
  85 + return ts.tv_sec + 1;
  86 +}
  87 +
/*
 * Wall-clock setter installed in pv_time_ops by kvmclock_init().
 *
 * Nothing is stored: @now is ignored and 0 is returned unconditionally.
 */
int kvm_set_wallclock(unsigned long now)
{
	return 0;
}
  92 +
  93 +/*
  94 + * This is our read_clock function. The host puts an tsc timestamp each time
  95 + * it updates a new time. Without the tsc adjustment, we can have a situation
  96 + * in which a vcpu starts to run earlier (smaller system_time), but probes
  97 + * time later (compared to another vcpu), leading to backwards time
  98 + */
  99 +static cycle_t kvm_clock_read(void)
  100 +{
  101 + u64 last_tsc, now;
  102 + int cpu;
  103 +
  104 + preempt_disable();
  105 + cpu = smp_processor_id();
  106 +
  107 + last_tsc = get_clock(cpu, tsc_timestamp);
  108 + now = get_clock(cpu, system_time);
  109 +
  110 + now += kvm_get_delta(last_tsc);
  111 + preempt_enable();
  112 +
  113 + return now;
  114 +}
  115 +static struct clocksource kvm_clock = {
  116 + .name = "kvm-clock",
  117 + .read = kvm_clock_read,
  118 + .rating = 400,
  119 + .mask = CLOCKSOURCE_MASK(64),
  120 + .mult = 1 << KVM_SCALE,
  121 + .shift = KVM_SCALE,
  122 + .flags = CLOCK_SOURCE_IS_CONTINUOUS,
  123 +};
  124 +
  125 +static int kvm_register_clock(void)
  126 +{
  127 + int cpu = smp_processor_id();
  128 + int low, high;
  129 + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
  130 + high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
  131 +
  132 + return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
  133 +}
  134 +
/*
 * Replacement for pv_apic_ops.setup_secondary_clock: register this cpu's
 * hv_clock area with the hypervisor, then do the native APIC clock setup.
 */
static void kvm_setup_secondary_clock(void)
{
	/*
	 * The first cpu already initialized this clocksource successfully,
	 * so registration is not expected to fail here; warn if it does.
	 */
	int err = kvm_register_clock();

	WARN_ON(err);

	/* Our part is done, hand over to the native implementation. */
	setup_secondary_APIC_clock();
}
  145 +
  146 +void __init kvmclock_init(void)
  147 +{
  148 + if (!kvm_para_available())
  149 + return;
  150 +
  151 + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
  152 + if (kvm_register_clock())
  153 + return;
  154 + pv_time_ops.get_wallclock = kvm_get_wallclock;
  155 + pv_time_ops.set_wallclock = kvm_set_wallclock;
  156 + pv_time_ops.sched_clock = kvm_clock_read;
  157 + pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
  158 + clocksource_register(&kvm_clock);
  159 + }
  160 +}
arch/x86/kernel/setup_32.c
... ... @@ -47,6 +47,7 @@
47 47 #include <linux/pfn.h>
48 48 #include <linux/pci.h>
49 49 #include <linux/init_ohci1394_dma.h>
  50 +#include <linux/kvm_para.h>
50 51  
51 52 #include <video/edid.h>
52 53  
... ... @@ -819,6 +820,10 @@
819 820 propagate_e820_map();
820 821  
821 822 max_low_pfn = setup_memory();
  823 +
  824 +#ifdef CONFIG_KVM_CLOCK
  825 + kvmclock_init();
  826 +#endif
822 827  
823 828 #ifdef CONFIG_VMI
824 829 /*
arch/x86/kernel/setup_64.c
... ... @@ -42,6 +42,7 @@
42 42 #include <linux/ctype.h>
43 43 #include <linux/uaccess.h>
44 44 #include <linux/init_ohci1394_dma.h>
  45 +#include <linux/kvm_para.h>
45 46  
46 47 #include <asm/mtrr.h>
47 48 #include <asm/uaccess.h>
... ... @@ -383,6 +384,10 @@
383 384 dmi_scan_machine();
384 385  
385 386 io_delay_init();
  387 +
  388 +#ifdef CONFIG_KVM_CLOCK
  389 + kvmclock_init();
  390 +#endif
386 391  
387 392 #ifdef CONFIG_SMP
388 393 /* setup to use the early static init tables during kernel startup */