Commit 2dfc818b35cbea59188cc86e86e0a0efce2b0dbe
Committed by
Dominik Brodowski
1 parent
75f25bd31d
Exists in
master
and in
4 other branches
cpupower: mperf monitor - Use TSC to calculate max frequency if possible
Which makes the implementation independent from cpufreq drivers. Therefore this would also work on a Xen kernel where the hypervisor is doing frequency switching and idle entering. Signed-off-by: Thomas Renninger <trenn@suse.de> Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Showing 2 changed files with 131 additions and 48 deletions Side-by-side Diff
tools/power/cpupower/Makefile
... | ... | @@ -24,7 +24,7 @@ |
24 | 24 | |
25 | 25 | # Set the following to `true' to make a unstripped, unoptimized |
26 | 26 | # binary. Leave this set to `false' for production use. |
27 | -DEBUG ?= false | |
27 | +DEBUG ?= true | |
28 | 28 | |
29 | 29 | # make the build silent. Set this to something else to make it noisy again. |
30 | 30 | V ?= false |
tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
... | ... | @@ -22,12 +22,15 @@ |
22 | 22 | |
23 | 23 | #define MSR_TSC 0x10 |
24 | 24 | |
25 | +#define MSR_AMD_HWCR 0xc0010015 | |
26 | + | |
25 | 27 | enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; |
26 | 28 | |
27 | 29 | static int mperf_get_count_percent(unsigned int self_id, double *percent, |
28 | 30 | unsigned int cpu); |
29 | 31 | static int mperf_get_count_freq(unsigned int id, unsigned long long *count, |
30 | 32 | unsigned int cpu); |
33 | +static struct timespec time_start, time_end; | |
31 | 34 | |
32 | 35 | static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { |
33 | 36 | { |
34 | 37 | |
35 | 38 | |
36 | 39 | |
... | ... | @@ -54,19 +57,33 @@ |
54 | 57 | }, |
55 | 58 | }; |
56 | 59 | |
60 | +enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; | |
61 | +static int max_freq_mode; | |
62 | +/* | |
63 | + * The max frequency mperf is ticking at (in C0), either retrieved via: | |
64 | + * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency | |
65 | + * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time | |
66 | + * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) | |
67 | + */ | |
68 | +static unsigned long max_frequency; | |
69 | + | |
57 | 70 | static unsigned long long tsc_at_measure_start; |
58 | 71 | static unsigned long long tsc_at_measure_end; |
59 | -static unsigned long max_frequency; | |
60 | 72 | static unsigned long long *mperf_previous_count; |
61 | 73 | static unsigned long long *aperf_previous_count; |
62 | 74 | static unsigned long long *mperf_current_count; |
63 | 75 | static unsigned long long *aperf_current_count; |
76 | + | |
64 | 77 | /* valid flag for all CPUs. If a MSR read failed it will be zero */ |
65 | 78 | static int *is_valid; |
66 | 79 | |
67 | 80 | static int mperf_get_tsc(unsigned long long *tsc) |
68 | 81 | { |
69 | - return read_msr(0, MSR_TSC, tsc); | |
82 | + int ret; | |
83 | + ret = read_msr(0, MSR_TSC, tsc); | |
84 | + if (ret) | |
85 | + dprint("Reading TSC MSR failed, returning %llu\n", *tsc); | |
86 | + return ret; | |
70 | 87 | } |
71 | 88 | |
72 | 89 | static int mperf_init_stats(unsigned int cpu) |
73 | 90 | |
... | ... | @@ -97,36 +114,11 @@ |
97 | 114 | return 0; |
98 | 115 | } |
99 | 116 | |
100 | -/* | |
101 | - * get_average_perf() | |
102 | - * | |
103 | - * Returns the average performance (also considers boosted frequencies) | |
104 | - * | |
105 | - * Input: | |
106 | - * aperf_diff: Difference of the aperf register over a time period | |
107 | - * mperf_diff: Difference of the mperf register over the same time period | |
108 | - * max_freq: Maximum frequency (P0) | |
109 | - * | |
110 | - * Returns: | |
111 | - * Average performance over the time period | |
112 | - */ | |
113 | -static unsigned long get_average_perf(unsigned long long aperf_diff, | |
114 | - unsigned long long mperf_diff) | |
115 | -{ | |
116 | - unsigned int perf_percent = 0; | |
117 | - if (((unsigned long)(-1) / 100) < aperf_diff) { | |
118 | - int shift_count = 7; | |
119 | - aperf_diff >>= shift_count; | |
120 | - mperf_diff >>= shift_count; | |
121 | - } | |
122 | - perf_percent = (aperf_diff * 100) / mperf_diff; | |
123 | - return (max_frequency * perf_percent) / 100; | |
124 | -} | |
125 | - | |
126 | 117 | static int mperf_get_count_percent(unsigned int id, double *percent, |
127 | 118 | unsigned int cpu) |
128 | 119 | { |
129 | 120 | unsigned long long aperf_diff, mperf_diff, tsc_diff; |
121 | + unsigned long long timediff; | |
130 | 122 | |
131 | 123 | if (!is_valid[cpu]) |
132 | 124 | return -1; |
133 | 125 | |
... | ... | @@ -136,11 +128,19 @@ |
136 | 128 | |
137 | 129 | mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; |
138 | 130 | aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; |
139 | - tsc_diff = tsc_at_measure_end - tsc_at_measure_start; | |
140 | 131 | |
141 | - *percent = 100.0 * mperf_diff / tsc_diff; | |
142 | - dprint("%s: mperf_diff: %llu, tsc_diff: %llu\n", | |
143 | - mperf_cstates[id].name, mperf_diff, tsc_diff); | |
132 | + if (max_freq_mode == MAX_FREQ_TSC_REF) { | |
133 | + tsc_diff = tsc_at_measure_end - tsc_at_measure_start; | |
134 | + *percent = 100.0 * mperf_diff / tsc_diff; | |
135 | + dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", | |
136 | + mperf_cstates[id].name, mperf_diff, tsc_diff); | |
137 | + } else if (max_freq_mode == MAX_FREQ_SYSFS) { | |
138 | + timediff = timespec_diff_us(time_start, time_end); | |
139 | + *percent = 100.0 * mperf_diff / timediff; | |
140 | + dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", | |
141 | + mperf_cstates[id].name, mperf_diff, timediff); | |
142 | + } else | |
143 | + return -1; | |
144 | 144 | |
145 | 145 | if (id == Cx) |
146 | 146 | *percent = 100.0 - *percent; |
... | ... | @@ -154,7 +154,7 @@ |
154 | 154 | static int mperf_get_count_freq(unsigned int id, unsigned long long *count, |
155 | 155 | unsigned int cpu) |
156 | 156 | { |
157 | - unsigned long long aperf_diff, mperf_diff; | |
157 | + unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; | |
158 | 158 | |
159 | 159 | if (id != AVG_FREQ) |
160 | 160 | return 1; |
161 | 161 | |
... | ... | @@ -165,11 +165,21 @@ |
165 | 165 | mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; |
166 | 166 | aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; |
167 | 167 | |
168 | - /* Return MHz for now, might want to return KHz if column width is more | |
169 | - generic */ | |
170 | - *count = get_average_perf(aperf_diff, mperf_diff) / 1000; | |
171 | - dprint("%s: %llu\n", mperf_cstates[id].name, *count); | |
168 | + if (max_freq_mode == MAX_FREQ_TSC_REF) { | |
169 | + /* Calculate max_freq from TSC count */ | |
170 | + tsc_diff = tsc_at_measure_end - tsc_at_measure_start; | |
171 | + time_diff = timespec_diff_us(time_start, time_end); | |
172 | + max_frequency = tsc_diff / time_diff; | |
173 | + } | |
172 | 174 | |
175 | + *count = max_frequency * ((double)aperf_diff / mperf_diff); | |
176 | + dprint("%s: Average freq based on %s maximum frequency:\n", | |
177 | + mperf_cstates[id].name, | |
178 | + (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); | |
179 | +	dprint("max_frequency: %lu\n", max_frequency); | 
180 | + dprint("aperf_diff: %llu\n", aperf_diff); | |
181 | + dprint("mperf_diff: %llu\n", mperf_diff); | |
182 | + dprint("avg freq: %llu\n", *count); | |
173 | 183 | return 0; |
174 | 184 | } |
175 | 185 | |
... | ... | @@ -178,6 +188,7 @@ |
178 | 188 | int cpu; |
179 | 189 | unsigned long long dbg; |
180 | 190 | |
191 | + clock_gettime(CLOCK_REALTIME, &time_start); | |
181 | 192 | mperf_get_tsc(&tsc_at_measure_start); |
182 | 193 | |
183 | 194 | for (cpu = 0; cpu < cpu_count; cpu++) |
184 | 195 | |
185 | 196 | |
186 | 197 | |
187 | 198 | |
188 | 199 | |
189 | 200 | |
190 | 201 | |
... | ... | @@ -193,32 +204,104 @@ |
193 | 204 | unsigned long long dbg; |
194 | 205 | int cpu; |
195 | 206 | |
196 | - mperf_get_tsc(&tsc_at_measure_end); | |
197 | - | |
198 | 207 | for (cpu = 0; cpu < cpu_count; cpu++) |
199 | 208 | mperf_measure_stats(cpu); |
200 | 209 | |
210 | + mperf_get_tsc(&tsc_at_measure_end); | |
211 | + clock_gettime(CLOCK_REALTIME, &time_end); | |
212 | + | |
201 | 213 | mperf_get_tsc(&dbg); |
202 | 214 | dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); |
203 | 215 | |
204 | 216 | return 0; |
205 | 217 | } |
206 | 218 | |
207 | -struct cpuidle_monitor mperf_monitor; | |
208 | - | |
209 | -struct cpuidle_monitor *mperf_register(void) | |
219 | +/* | |
220 | + * Mperf register is defined to tick at P0 (maximum) frequency | |
221 | + * | |
222 | + * Instead of reading out P0 which can be tricky to read out from HW, | |
223 | + * we use TSC counter if it reliably ticks at P0/mperf frequency. | |
224 | + * | |
225 | + * Still try to fall back to: | |
226 | + * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq | |
227 | + * on older Intel HW without invariant TSC feature. | |
228 | + * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but | |
229 | + * it's still double checked (MSR_AMD_HWCR)). | |
230 | + * | |
231 | + * On these machines the user would still get useful mperf | |
232 | + * stats when acpi-cpufreq driver is loaded. | |
233 | + */ | |
234 | +static int init_maxfreq_mode(void) | |
210 | 235 | { |
236 | + int ret; | |
237 | + unsigned long long hwcr; | |
211 | 238 | unsigned long min; |
212 | 239 | |
213 | - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) | |
214 | - return NULL; | |
240 | +	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) | 
241 | + goto use_sysfs; | |
215 | 242 | |
216 | - /* Assume min/max all the same on all cores */ | |
243 | + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) { | |
244 | + /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf | |
245 | + * freq. | |
246 | +	 * A test whether hwcr is accessible/available would be: | 
247 | + * (cpupower_cpu_info.family > 0x10 || | |
248 | + * cpupower_cpu_info.family == 0x10 && | |
249 | + * cpupower_cpu_info.model >= 0x2)) | |
250 | + * This should be the case for all aperf/mperf | |
251 | + * capable AMD machines and is therefore safe to test here. | |
252 | + * Compare with Linus kernel git commit: acf01734b1747b1ec4 | |
253 | + */ | |
254 | + ret = read_msr(0, MSR_AMD_HWCR, &hwcr); | |
255 | + /* | |
256 | + * If the MSR read failed, assume a Xen system that did | |
257 | + * not explicitly provide access to it and assume TSC works | |
258 | + */ | |
259 | + if (ret != 0) { | |
260 | +			dprint("MSR_AMD_HWCR read 0x%x failed - assume TSC working\n", | 
261 | +				MSR_AMD_HWCR); | 
262 | + return 0; | |
263 | + } else if (1 & (hwcr >> 24)) { | |
264 | + max_freq_mode = MAX_FREQ_TSC_REF; | |
265 | + return 0; | |
266 | + } else { /* Use sysfs max frequency if available */ } | |
267 | + } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { | |
268 | + /* | |
269 | +		 * On Intel we assume mperf (in C0) is ticking at the same | 
270 | +		 * rate as TSC | 
271 | + */ | |
272 | + max_freq_mode = MAX_FREQ_TSC_REF; | |
273 | + return 0; | |
274 | + } | |
275 | +use_sysfs: | |
217 | 276 | if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { |
218 | 277 | dprint("Cannot retrieve max freq from cpufreq kernel " |
219 | 278 | "subsystem\n"); |
220 | - return NULL; | |
279 | + return -1; | |
221 | 280 | } |
281 | + max_freq_mode = MAX_FREQ_SYSFS; | |
282 | + return 0; | |
283 | +} | |
284 | + | |
285 | +/* | |
286 | + * This monitor provides: | |
287 | + * | |
288 | + * 1) Average frequency a CPU resided in | |
289 | + * This always works if the CPU has aperf/mperf capabilities | |
290 | + * | |
291 | + * 2) C0 and Cx (any sleep state) time a CPU resided in | |
292 | + * Works if mperf timer stops ticking in sleep states which | |
293 | + * seem to be the case on all current HW. | |
294 | + * Both are directly retrieved from HW registers and are independent | 
295 | + * from kernel statistics. | |
296 | + */ | |
297 | +struct cpuidle_monitor mperf_monitor; | |
298 | +struct cpuidle_monitor *mperf_register(void) | |
299 | +{ | |
300 | + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) | |
301 | + return NULL; | |
302 | + | |
303 | + if (init_maxfreq_mode()) | |
304 | + return NULL; | |
222 | 305 | |
223 | 306 | /* Free this at program termination */ |
224 | 307 | is_valid = calloc(cpu_count, sizeof(int)); |