Commit f91a8b447b9af64f589f6e13fec7f09b5927563d

Authored by Jeremy Fitzhardinge
Committed by Jeremy Fitzhardinge
1 parent 9a4029fd34

xen: Account for stolen time

This patch accounts for the time stolen from our VCPUs.  Stolen time is
time during which a VCPU is runnable and could be running, but all
available physical CPUs are being used for something else.

This accounting is run on each timer interrupt, simply as a way to run
it relatively often, and at times when interesting things are going on.
Stolen time is not used for much in the kernel; it is reported in
/proc/stat, and that's about it.
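(For reference, with illustrative numbers rather than output from this
patch: steal is the eighth value on the per-cpu lines of /proc/stat,
after user, nice, system, idle, iowait, irq and softirq, e.g.

    cpu0  4705 356 584 3699176 23 23 0 4212

where the final field is the accumulated stolen ticks.)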

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>

Showing 1 changed file with 150 additions and 9 deletions

arch/i386/xen/time.c
... ... @@ -11,6 +11,7 @@
11 11 #include <linux/interrupt.h>
12 12 #include <linux/clocksource.h>
13 13 #include <linux/clockchips.h>
  14 +#include <linux/kernel_stat.h>
14 15  
15 16 #include <asm/xen/hypervisor.h>
16 17 #include <asm/xen/hypercall.h>
... ... @@ -25,6 +26,7 @@
25 26  
26 27 /* Xen may fire a timer up to this many ns early */
27 28 #define TIMER_SLOP 100000
  29 +#define NS_PER_TICK (1000000000LL / HZ)
28 30  
29 31 /* These are periodically updated in shared_info, and then copied here. */
30 32 struct shadow_time_info {
... ... @@ -37,6 +39,139 @@
37 39  
38 40 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
39 41  
  42 +/* runstate info updated by Xen */
  43 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
  44 +
  45 +/* snapshots of runstate info */
  46 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
  47 +
  48 +/* residual ns of stolen and blocked time, not yet a full tick */
  49 +static DEFINE_PER_CPU(u64, residual_stolen);
  50 +static DEFINE_PER_CPU(u64, residual_blocked);
  51 +
  52 +/* return a consistent snapshot of a 64-bit time/counter value */
  53 +static u64 get64(const u64 *p)
  54 +{
  55 + u64 ret;
  56 +
  57 + if (BITS_PER_LONG < 64) {
  58 + u32 *p32 = (u32 *)p;
  59 + u32 h, l;
  60 +
  61 + /*
  62 + * Read high then low, and then make sure high is
  63 + * still the same; this will only loop if low wraps
  64 + * and carries into high.
  65 + * XXX some clean way to make this endian-proof?
  66 + */
  67 + do {
  68 + h = p32[1];
  69 + barrier();
  70 + l = p32[0];
  71 + barrier();
  72 + } while (p32[1] != h);
  73 +
  74 + ret = (((u64)h) << 32) | l;
  75 + } else
  76 + ret = *p;
  77 +
  78 + return ret;
  79 +}
  80 +
  81 +/*
  82 + * Runstate accounting
  83 + */
  84 +static void get_runstate_snapshot(struct vcpu_runstate_info *res)
  85 +{
  86 + u64 state_time;
  87 + struct vcpu_runstate_info *state;
  88 +
  89 + preempt_disable();
  90 +
  91 + state = &__get_cpu_var(runstate);
  92 +
  93 + /*
  94 + * The runstate info is always updated by the hypervisor on
  95 + * the current CPU, so there's no need to use anything
  96 + * stronger than a compiler barrier when fetching it.
  97 + */
  98 + do {
  99 + state_time = get64(&state->state_entry_time);
  100 + barrier();
  101 + *res = *state;
  102 + barrier();
  103 + } while (get64(&state->state_entry_time) != state_time);
  104 +
  105 + preempt_enable();
  106 +}
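
Both get64() and the snapshot loop above are the read side of the same
lock-free pattern as a seqlock: copy the data between two reads of a
version (here, state_entry_time), and retry if the writer moved it in
between.  A minimal user-space sketch of the idea (the struct and all
names below are hypothetical, not part of this patch):

    #include <stdint.h>

    struct sample {
            uint64_t version;   /* advanced by the writer around each update */
            uint64_t value;
    };

    static uint64_t read_consistent(const volatile struct sample *s)
    {
            uint64_t v, ret;

            do {
                    v = s->version;
                    __asm__ __volatile__("" ::: "memory"); /* compiler barrier, like barrier() */
                    ret = s->value;
                    __asm__ __volatile__("" ::: "memory");
            } while (s->version != v);

            return ret;
    }

As the comment in get_runstate_snapshot() notes, a compiler barrier is
enough here only because Xen updates the runstate info from the vcpu's
own (current) CPU; a cross-CPU writer would need real memory barriers.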
  107 +
  108 +static void setup_runstate_info(int cpu)
  109 +{
  110 + struct vcpu_register_runstate_memory_area area;
  111 +
  112 + area.addr.v = &per_cpu(runstate, cpu);
  113 +
  114 + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
  115 + cpu, &area))
  116 + BUG();
  117 +}
  118 +
  119 +static void do_stolen_accounting(void)
  120 +{
  121 + struct vcpu_runstate_info state;
  122 + struct vcpu_runstate_info *snap;
  123 + s64 blocked, runnable, offline, stolen;
  124 + cputime_t ticks;
  125 +
  126 + get_runstate_snapshot(&state);
  127 +
  128 + WARN_ON(state.state != RUNSTATE_running);
  129 +
  130 + snap = &__get_cpu_var(runstate_snapshot);
  131 +
  132 + /* work out how much time the VCPU has not been runn*ing* */
  133 + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
  134 + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
  135 + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
  136 +
  137 + *snap = state;
  138 +
  139 + /* Add the appropriate number of ticks of stolen time,
  140 + including any left-overs from last time. Passing NULL to
  141 + account_steal_time accounts the time as stolen. */
  142 + stolen = runnable + offline + __get_cpu_var(residual_stolen);
  143 +
  144 + if (stolen < 0)
  145 + stolen = 0;
  146 +
  147 + ticks = 0;
  148 + while (stolen >= NS_PER_TICK) {
  149 + ticks++;
  150 + stolen -= NS_PER_TICK;
  151 + }
  152 + __get_cpu_var(residual_stolen) = stolen;
  153 + account_steal_time(NULL, ticks);
  154 +
  155 + /* Add the appropriate number of ticks of blocked time,
  156 + including any left-overs from last time. Passing idle to
  157 + account_steal_time accounts the time as idle/wait. */
  158 + blocked += __get_cpu_var(residual_blocked);
  159 +
  160 + if (blocked < 0)
  161 + blocked = 0;
  162 +
  163 + ticks = 0;
  164 + while (blocked >= NS_PER_TICK) {
  165 + ticks++;
  166 + blocked -= NS_PER_TICK;
  167 + }
  168 + __get_cpu_var(residual_blocked) = blocked;
  169 + account_steal_time(idle_task(smp_processor_id()), ticks);
  170 +}
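
To make the residual handling concrete, with illustrative numbers: at
HZ=250, NS_PER_TICK is 4,000,000.  If runnable+offline advanced by
9,500,000 ns since the last snapshot and residual_stolen held 0, the
loop above credits 2 stolen ticks and carries the remaining
1,500,000 ns in residual_stolen, so sub-tick amounts are not lost
between timer interrupts.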
  171 +
  172 +
  173 +
  174 +/* Get the CPU speed from Xen */
40 175 unsigned long xen_cpu_khz(void)
41 176 {
42 177 u64 cpu_khz = 1000000ULL << 32;
43 178  
... ... @@ -56,13 +191,11 @@
56 191 * Reads a consistent set of time-base values from Xen, into a shadow data
57 192 * area.
58 193 */
59   -static void get_time_values_from_xen(void)
  194 +static unsigned get_time_values_from_xen(void)
60 195 {
61 196 struct vcpu_time_info *src;
62 197 struct shadow_time_info *dst;
63 198  
64   - preempt_disable();
65   -
66 199 /* src is shared memory with the hypervisor, so we need to
67 200 make sure we get a consistent snapshot, even in the face of
68 201 being preempted. */
... ... @@ -79,7 +212,7 @@
79 212 rmb(); /* test version after fetching data */
80 213 } while ((src->version & 1) | (dst->version ^ src->version));
81 214  
82   - preempt_enable();
  215 + return dst->version;
83 216 }
84 217  
85 218 /*
... ... @@ -123,7 +256,7 @@
123 256 static u64 get_nsec_offset(struct shadow_time_info *shadow)
124 257 {
125 258 u64 now, delta;
126   - rdtscll(now);
  259 + now = native_read_tsc();
127 260 delta = now - shadow->tsc_timestamp;
128 261 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
129 262 }
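
For reference, scale_delta() applies the fixed-point conversion the Xen
time ABI defines for these fields: the TSC delta is shifted by
tsc_shift (negative means shift right), then multiplied by the 32.32
fixed-point tsc_to_nsec_mul, keeping the upper 64 bits of the product;
roughly ns = ((delta << tsc_shift) * tsc_to_nsec_mul) >> 32.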
130 263  
131 264  
... ... @@ -132,11 +265,15 @@
132 265 {
133 266 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
134 267 cycle_t ret;
  268 + unsigned version;
135 269  
136   - get_time_values_from_xen();
  270 + do {
  271 + version = get_time_values_from_xen();
  272 + barrier();
  273 + ret = shadow->system_timestamp + get_nsec_offset(shadow);
  274 + barrier();
  275 + } while (version != __get_cpu_var(xen_vcpu)->time.version);
137 276  
138   - ret = shadow->system_timestamp + get_nsec_offset(shadow);
139   -
140 277 put_cpu_var(shadow_time);
141 278  
142 279 return ret;
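
The shape of this change: get_time_values_from_xen() no longer needs
its own preempt_disable()/preempt_enable() pair, because the caller is
already pinned to the CPU by get_cpu_var(shadow_time).  Instead it
returns the version of the snapshot it took, and xen_clocksource_read()
recomputes the timestamp until that version still matches the live vcpu
time info, so an update by the hypervisor mid-read cannot yield a torn
value.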
... ... @@ -352,6 +489,8 @@
352 489 ret = IRQ_HANDLED;
353 490 }
354 491  
  492 + do_stolen_accounting();
  493 +
355 494 return ret;
356 495 }
357 496  
... ... @@ -378,6 +517,8 @@
378 517 evt->irq = irq;
379 518 clockevents_register_device(evt);
380 519  
  520 + setup_runstate_info(cpu);
  521 +
381 522 put_cpu_var(xen_clock_events);
382 523 }
383 524  
... ... @@ -390,7 +531,7 @@
390 531 clocksource_register(&xen_clocksource);
391 532  
392 533 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
393   - /* Successfully turned off 100hz tick, so we have the
  534 + /* Successfully turned off 100Hz tick, so we have the
394 535 vcpuop-based timer interface */
395 536 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
396 537 xen_clockevent = &xen_vcpuop_clockevent;