Commit f91a8b447b9af64f589f6e13fec7f09b5927563d
Committed by
Jeremy Fitzhardinge
1 parent
9a4029fd34
xen: Account for stolen time
This patch accounts for the time stolen from our VCPUs. Stolen time is time where a vcpu is runnable and could be running, but all available physical CPUs are being used for something else. This accounting gets run on each timer interrupt, just as a way to get it run relatively often, and when interesting things are going on. Stolen time is not really used by much in the kernel; it is reported in /proc/stat, and that's about it. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Acked-by: Chris Wright <chrisw@sous-sol.org> Cc: john stultz <johnstul@us.ibm.com> Cc: Rik van Riel <riel@redhat.com>
Showing 1 changed file with 150 additions and 9 deletions Side-by-side Diff
arch/i386/xen/time.c
... | ... | @@ -11,6 +11,7 @@ |
11 | 11 | #include <linux/interrupt.h> |
12 | 12 | #include <linux/clocksource.h> |
13 | 13 | #include <linux/clockchips.h> |
14 | +#include <linux/kernel_stat.h> | |
14 | 15 | |
15 | 16 | #include <asm/xen/hypervisor.h> |
16 | 17 | #include <asm/xen/hypercall.h> |
... | ... | @@ -25,6 +26,7 @@ |
25 | 26 | |
26 | 27 | /* Xen may fire a timer up to this many ns early */ |
27 | 28 | #define TIMER_SLOP 100000 |
29 | +#define NS_PER_TICK (1000000000LL / HZ) | |
28 | 30 | |
29 | 31 | /* These are perodically updated in shared_info, and then copied here. */ |
30 | 32 | struct shadow_time_info { |
... | ... | @@ -37,6 +39,139 @@ |
37 | 39 | |
38 | 40 | static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); |
39 | 41 | |
42 | +/* runstate info updated by Xen */ | |
43 | +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | |
44 | + | |
45 | +/* snapshots of runstate info */ | |
46 | +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); | |
47 | + | |
48 | +/* unused ns of stolen and blocked time */ | |
49 | +static DEFINE_PER_CPU(u64, residual_stolen); | |
50 | +static DEFINE_PER_CPU(u64, residual_blocked); | |
51 | + | |
52 | +/* return an consistent snapshot of 64-bit time/counter value */ | |
53 | +static u64 get64(const u64 *p) | |
54 | +{ | |
55 | + u64 ret; | |
56 | + | |
57 | + if (BITS_PER_LONG < 64) { | |
58 | + u32 *p32 = (u32 *)p; | |
59 | + u32 h, l; | |
60 | + | |
61 | + /* | |
62 | + * Read high then low, and then make sure high is | |
63 | + * still the same; this will only loop if low wraps | |
64 | + * and carries into high. | |
65 | + * XXX some clean way to make this endian-proof? | |
66 | + */ | |
67 | + do { | |
68 | + h = p32[1]; | |
69 | + barrier(); | |
70 | + l = p32[0]; | |
71 | + barrier(); | |
72 | + } while (p32[1] != h); | |
73 | + | |
74 | + ret = (((u64)h) << 32) | l; | |
75 | + } else | |
76 | + ret = *p; | |
77 | + | |
78 | + return ret; | |
79 | +} | |
80 | + | |
81 | +/* | |
82 | + * Runstate accounting | |
83 | + */ | |
84 | +static void get_runstate_snapshot(struct vcpu_runstate_info *res) | |
85 | +{ | |
86 | + u64 state_time; | |
87 | + struct vcpu_runstate_info *state; | |
88 | + | |
89 | + preempt_disable(); | |
90 | + | |
91 | + state = &__get_cpu_var(runstate); | |
92 | + | |
93 | + /* | |
94 | + * The runstate info is always updated by the hypervisor on | |
95 | + * the current CPU, so there's no need to use anything | |
96 | + * stronger than a compiler barrier when fetching it. | |
97 | + */ | |
98 | + do { | |
99 | + state_time = get64(&state->state_entry_time); | |
100 | + barrier(); | |
101 | + *res = *state; | |
102 | + barrier(); | |
103 | + } while (get64(&state->state_entry_time) != state_time); | |
104 | + | |
105 | + preempt_enable(); | |
106 | +} | |
107 | + | |
108 | +static void setup_runstate_info(int cpu) | |
109 | +{ | |
110 | + struct vcpu_register_runstate_memory_area area; | |
111 | + | |
112 | + area.addr.v = &per_cpu(runstate, cpu); | |
113 | + | |
114 | + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, | |
115 | + cpu, &area)) | |
116 | + BUG(); | |
117 | +} | |
118 | + | |
119 | +static void do_stolen_accounting(void) | |
120 | +{ | |
121 | + struct vcpu_runstate_info state; | |
122 | + struct vcpu_runstate_info *snap; | |
123 | + s64 blocked, runnable, offline, stolen; | |
124 | + cputime_t ticks; | |
125 | + | |
126 | + get_runstate_snapshot(&state); | |
127 | + | |
128 | + WARN_ON(state.state != RUNSTATE_running); | |
129 | + | |
130 | + snap = &__get_cpu_var(runstate_snapshot); | |
131 | + | |
132 | + /* work out how much time the VCPU has not been runn*ing* */ | |
133 | + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; | |
134 | + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; | |
135 | + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; | |
136 | + | |
137 | + *snap = state; | |
138 | + | |
139 | + /* Add the appropriate number of ticks of stolen time, | |
140 | + including any left-overs from last time. Passing NULL to | |
141 | + account_steal_time accounts the time as stolen. */ | |
142 | + stolen = runnable + offline + __get_cpu_var(residual_stolen); | |
143 | + | |
144 | + if (stolen < 0) | |
145 | + stolen = 0; | |
146 | + | |
147 | + ticks = 0; | |
148 | + while (stolen >= NS_PER_TICK) { | |
149 | + ticks++; | |
150 | + stolen -= NS_PER_TICK; | |
151 | + } | |
152 | + __get_cpu_var(residual_stolen) = stolen; | |
153 | + account_steal_time(NULL, ticks); | |
154 | + | |
155 | + /* Add the appropriate number of ticks of blocked time, | |
156 | + including any left-overs from last time. Passing idle to | |
157 | + account_steal_time accounts the time as idle/wait. */ | |
158 | + blocked += __get_cpu_var(residual_blocked); | |
159 | + | |
160 | + if (blocked < 0) | |
161 | + blocked = 0; | |
162 | + | |
163 | + ticks = 0; | |
164 | + while (blocked >= NS_PER_TICK) { | |
165 | + ticks++; | |
166 | + blocked -= NS_PER_TICK; | |
167 | + } | |
168 | + __get_cpu_var(residual_blocked) = blocked; | |
169 | + account_steal_time(idle_task(smp_processor_id()), ticks); | |
170 | +} | |
171 | + | |
172 | + | |
173 | + | |
174 | +/* Get the CPU speed from Xen */ | |
40 | 175 | unsigned long xen_cpu_khz(void) |
41 | 176 | { |
42 | 177 | u64 cpu_khz = 1000000ULL << 32; |
43 | 178 | |
... | ... | @@ -56,13 +191,11 @@ |
56 | 191 | * Reads a consistent set of time-base values from Xen, into a shadow data |
57 | 192 | * area. |
58 | 193 | */ |
59 | -static void get_time_values_from_xen(void) | |
194 | +static unsigned get_time_values_from_xen(void) | |
60 | 195 | { |
61 | 196 | struct vcpu_time_info *src; |
62 | 197 | struct shadow_time_info *dst; |
63 | 198 | |
64 | - preempt_disable(); | |
65 | - | |
66 | 199 | /* src is shared memory with the hypervisor, so we need to |
67 | 200 | make sure we get a consistent snapshot, even in the face of |
68 | 201 | being preempted. */ |
... | ... | @@ -79,7 +212,7 @@ |
79 | 212 | rmb(); /* test version after fetching data */ |
80 | 213 | } while ((src->version & 1) | (dst->version ^ src->version)); |
81 | 214 | |
82 | - preempt_enable(); | |
215 | + return dst->version; | |
83 | 216 | } |
84 | 217 | |
85 | 218 | /* |
... | ... | @@ -123,7 +256,7 @@ |
123 | 256 | static u64 get_nsec_offset(struct shadow_time_info *shadow) |
124 | 257 | { |
125 | 258 | u64 now, delta; |
126 | - rdtscll(now); | |
259 | + now = native_read_tsc(); | |
127 | 260 | delta = now - shadow->tsc_timestamp; |
128 | 261 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); |
129 | 262 | } |
130 | 263 | |
131 | 264 | |
... | ... | @@ -132,11 +265,15 @@ |
132 | 265 | { |
133 | 266 | struct shadow_time_info *shadow = &get_cpu_var(shadow_time); |
134 | 267 | cycle_t ret; |
268 | + unsigned version; | |
135 | 269 | |
136 | - get_time_values_from_xen(); | |
270 | + do { | |
271 | + version = get_time_values_from_xen(); | |
272 | + barrier(); | |
273 | + ret = shadow->system_timestamp + get_nsec_offset(shadow); | |
274 | + barrier(); | |
275 | + } while (version != __get_cpu_var(xen_vcpu)->time.version); | |
137 | 276 | |
138 | - ret = shadow->system_timestamp + get_nsec_offset(shadow); | |
139 | - | |
140 | 277 | put_cpu_var(shadow_time); |
141 | 278 | |
142 | 279 | return ret; |
... | ... | @@ -352,6 +489,8 @@ |
352 | 489 | ret = IRQ_HANDLED; |
353 | 490 | } |
354 | 491 | |
492 | + do_stolen_accounting(); | |
493 | + | |
355 | 494 | return ret; |
356 | 495 | } |
357 | 496 | |
... | ... | @@ -378,6 +517,8 @@ |
378 | 517 | evt->irq = irq; |
379 | 518 | clockevents_register_device(evt); |
380 | 519 | |
520 | + setup_runstate_info(cpu); | |
521 | + | |
381 | 522 | put_cpu_var(xen_clock_events); |
382 | 523 | } |
383 | 524 | |
... | ... | @@ -390,7 +531,7 @@ |
390 | 531 | clocksource_register(&xen_clocksource); |
391 | 532 | |
392 | 533 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { |
393 | - /* Successfully turned off 100hz tick, so we have the | |
534 | + /* Successfully turned off 100Hz tick, so we have the | |
394 | 535 | vcpuop-based timer interface */ |
395 | 536 | printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); |
396 | 537 | xen_clockevent = &xen_vcpuop_clockevent; |