Commit 4bb9374e0bd40d8fe97860ea0d61a0330b7c3925

Authored by Linus Torvalds

Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull NOHZ update from Thomas Gleixner:
 "Remove the call into the nohz idle code from the fake 'idle' thread in
  the powerclamp driver along with the export of those functions which
  was smuggled in via the thermal tree.  People have tried to hack
  around it in the nohz core code, but it just violates all rightful
  assumptions of that code about the only valid calling context (i.e.
  the proper idle task).

  The powerclamp trainwreck will still work, it just won't get the
  benefit of long idle sleeps"

* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  tick/powerclamp: Remove tick_nohz_idle abuse

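For context, tick_nohz_idle_enter()/tick_nohz_idle_exit() are only meant to be reached from the per-cpu idle task, where is_idle_task(current) holds. A simplified, hand-written sketch of that calling context (illustrative only; it omits RCU, cpuidle and architecture details and is not the exact kernel source):

/*
 * Sketch of the only valid caller of the nohz idle entry points: the idle
 * task's loop (roughly what the kernel's generic idle loop does).  The
 * powerclamp kthreads are ordinary SCHED_FIFO tasks, so this assumption
 * never holds for them.
 */
static void idle_loop_sketch(void)
{
	while (1) {
		tick_nohz_idle_enter();		/* stop the periodic tick on this cpu */

		while (!need_resched()) {
			local_irq_disable();
			arch_cpu_idle();	/* e.g. mwait/hlt; returns with irqs enabled */
		}

		tick_nohz_idle_exit();		/* restart the tick, account idle time */
		schedule_preempt_disabled();
	}
}
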
Showing 2 changed files:

drivers/thermal/intel_powerclamp.c
(full file shown; the two lines prefixed with "-" are removed by this commit)

/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *	Arjan van de Ven <arjan@linux.intel.com>
 *	Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 * TODO:
 *	1. better handle wakeup from external interrupts, currently a fixed
 *	   compensation is added to clamping duration when excessive amount
 *	   of wakeups are observed during idle time. the reason is that in
 *	   case of external interrupts without need for ack, clamping down
 *	   cpu in non-irq context does not reduce irq. for majority of the
 *	   cases, clamping down cpu does help reduce irq as well, we should
 *	   be able to differenciate the two cases and give a quantitative
 *	   solution for the irqs that we can control. perhaps based on
 *	   get_cpu_iowait_time_us()
 *
 *	2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration, driver adjust sleep time to meet target
 * idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping thread
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * gets incremented each time a clamping
				    * period is completed without extra wakeups
				    * once that counter is reached given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

static bool has_pkg_state_counter(void)
{
	u64 tmp;
	return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached or
	 * there are too many wakeups during the last idle injection period, we
	 * cannot trust the data for compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measure later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (true == clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure user selected ratio does not take effect until
		 * the next round. adjust target_ratio if user has changed
		 * target such that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different ability to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only elected controlling cpu can collect stats and update
		 * control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
					guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop tick sched during idle time, interrupts are still
		 * allowed. thus jiffies are updated properly.
		 */
		preempt_disable();
-		tick_nohz_idle_enter();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
-		tick_nohz_idle_exit();
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* check if pkg cstate counter is completely 0, abort in this case */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give per cpu clamping threads
	 * sometime to exit, or gets killed later.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (false == clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else /* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device*/
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
kernel/time/tick-sched.c
(only unchanged context is visible in the portion reproduced below; the listing is truncated)

/*
 *  linux/kernel/time/tick-sched.c
 *
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  No idle tick implementation for low and high resolution timers
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Distribute under GPLv2.
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per cpu nohz control structure
 */
DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/*
 * The time, when the last jiffy update happened. Protected by jiffies_lock.
 */
static ktime_t last_jiffies_update;

struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 0;
	ktime_t delta;

	/*
	 * Do a quick check without holding jiffies_lock:
	 */
	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 < tick_period.tv64)
		return;

	/* Reevalute with jiffies_lock held */
	write_seqlock(&jiffies_lock);

	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 >= tick_period.tv64) {

		delta = ktime_sub(delta, tick_period);
		last_jiffies_update = ktime_add(last_jiffies_update,
						tick_period);

		/* Slow path for long timeouts */
		if (unlikely(delta.tv64 >= tick_period.tv64)) {
			s64 incr = ktime_to_ns(tick_period);

			ticks = ktime_divns(delta, incr);

			last_jiffies_update = ktime_add_ns(last_jiffies_update,
							   incr * ticks);
		}
		do_timer(++ticks);

		/* Keep the tick_next_period variable up to date */
		tick_next_period = ktime_add(last_jiffies_update, tick_period);
	} else {
		write_sequnlock(&jiffies_lock);
		return;
	}
	write_sequnlock(&jiffies_lock);
	update_wall_time();
}

/*
 * Initialize and return retrieve the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	write_seqlock(&jiffies_lock);
	/* Did we start the jiffies update yet ? */
	if (last_jiffies_update.tv64 == 0)
		last_jiffies_update = tick_next_period;
	period = last_jiffies_update;
	write_sequnlock(&jiffies_lock);
	return period;
}


static void tick_sched_do_timer(ktime_t now)
{
	int cpu = smp_processor_id();

#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * Check if the do_timer duty was dropped. We don't care about
	 * concurrency: This happens only when the cpu in charge went
	 * into a long sleep. If two cpus happen to assign themself to
	 * this duty, then the jiffies update is still serialized by
	 * jiffies_lock.
	 */
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
	    && !tick_nohz_full_cpu(cpu))
		tick_do_timer_cpu = cpu;
#endif

	/* Check, if the jiffies need an update */
	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on complete idle SMP systems while
	 * waiting on the login prompt. We also increment the "start of
	 * idle" jiffy stamp so the idle accounting adjustment we do
	 * when we go busy again does not account too much ticks.
	 */
	if (ts->tick_stopped) {
		touch_softlockup_watchdog();
		if (is_idle_task(current))
			ts->idle_jiffies++;
	}
#endif
	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);
}

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;

static bool can_stop_full_tick(void)
{
	WARN_ON_ONCE(!irqs_disabled());

	if (!sched_can_stop_tick()) {
		trace_tick_stop(0, "more than 1 task in runqueue\n");
		return false;
	}

	if (!posix_cpu_timers_can_stop_tick(current)) {
		trace_tick_stop(0, "posix timers running\n");
		return false;
	}

	if (!perf_event_can_stop_tick()) {
		trace_tick_stop(0, "perf events running\n");
		return false;
	}

	/* sched_clock_tick() needs us? */
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
	/*
	 * TODO: kick full dynticks CPUs when
	 * sched_clock_stable is set.
	 */
	if (!sched_clock_stable()) {
		trace_tick_stop(0, "unstable sched clock\n");
		/*
		 * Don't allow the user to think they can get
		 * full NO_HZ with this machine.
		 */
		WARN_ONCE(tick_nohz_full_running,
			  "NO_HZ FULL will not work with unstable sched clock");
		return false;
	}
#endif

	return true;
}

static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);

/*
 * Re-evaluate the need for the tick on the current CPU
 * and restart it if necessary.
 */
void __tick_nohz_full_check(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (tick_nohz_full_cpu(smp_processor_id())) {
		if (ts->tick_stopped && !is_idle_task(current)) {
			if (!can_stop_full_tick())
				tick_nohz_restart_sched_tick(ts, ktime_get());
		}
	}
}

static void nohz_full_kick_work_func(struct irq_work *work)
{
	__tick_nohz_full_check();
}

static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
	.func = nohz_full_kick_work_func,
};

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
231 * is NMI safe. 231 * is NMI safe.
232 */ 232 */
233 void tick_nohz_full_kick(void) 233 void tick_nohz_full_kick(void)
234 { 234 {
235 if (!tick_nohz_full_cpu(smp_processor_id())) 235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return; 236 return;
237 237
238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); 238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
239 } 239 }
240 240
241 /* 241 /*
242 * Kick the CPU if it's full dynticks in order to force it to 242 * Kick the CPU if it's full dynticks in order to force it to
243 * re-evaluate its dependency on the tick and restart it if necessary. 243 * re-evaluate its dependency on the tick and restart it if necessary.
244 */ 244 */
245 void tick_nohz_full_kick_cpu(int cpu) 245 void tick_nohz_full_kick_cpu(int cpu)
246 { 246 {
247 if (!tick_nohz_full_cpu(cpu)) 247 if (!tick_nohz_full_cpu(cpu))
248 return; 248 return;
249 249
250 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); 250 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
251 } 251 }
252 252
253 static void nohz_full_kick_ipi(void *info) 253 static void nohz_full_kick_ipi(void *info)
254 { 254 {
255 __tick_nohz_full_check(); 255 __tick_nohz_full_check();
256 } 256 }
257 257
258 /* 258 /*
259 * Kick all full dynticks CPUs in order to force these to re-evaluate 259 * Kick all full dynticks CPUs in order to force these to re-evaluate
260 * their dependency on the tick and restart it if necessary. 260 * their dependency on the tick and restart it if necessary.
261 */ 261 */
262 void tick_nohz_full_kick_all(void) 262 void tick_nohz_full_kick_all(void)
263 { 263 {
264 if (!tick_nohz_full_running) 264 if (!tick_nohz_full_running)
265 return; 265 return;
266 266
267 preempt_disable(); 267 preempt_disable();
268 smp_call_function_many(tick_nohz_full_mask, 268 smp_call_function_many(tick_nohz_full_mask,
269 nohz_full_kick_ipi, NULL, false); 269 nohz_full_kick_ipi, NULL, false);
270 tick_nohz_full_kick(); 270 tick_nohz_full_kick();
271 preempt_enable(); 271 preempt_enable();
272 } 272 }
273 273
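As a usage illustration (a hedged sketch, not code from this file): a subsystem that has just installed a per-CPU reason to keep the tick alive, i.e. something that will now make can_stop_full_tick() return false on that CPU, pokes the CPU so it re-evaluates. The helper name below is invented for the example; only tick_nohz_full_kick_cpu() is a real interface.

    /* Illustrative only: arm some per-CPU tick dependency, then force the
     * target CPU to re-evaluate whether it may keep its tick stopped. */
    static void example_arm_tick_dependency(int cpu)
    {
            /* ... record state that can_stop_full_tick() will check ... */
            tick_nohz_full_kick_cpu(cpu);   /* no-op unless cpu is nohz_full */
    }
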
274 /* 274 /*
275 * Re-evaluate the need for the tick as we switch the current task. 275 * Re-evaluate the need for the tick as we switch the current task.
276 * It might need the tick due to per task/process properties: 276 * It might need the tick due to per task/process properties:
277 * perf events, posix cpu timers, ... 277 * perf events, posix cpu timers, ...
278 */ 278 */
279 void __tick_nohz_task_switch(struct task_struct *tsk) 279 void __tick_nohz_task_switch(struct task_struct *tsk)
280 { 280 {
281 unsigned long flags; 281 unsigned long flags;
282 282
283 local_irq_save(flags); 283 local_irq_save(flags);
284 284
285 if (!tick_nohz_full_cpu(smp_processor_id())) 285 if (!tick_nohz_full_cpu(smp_processor_id()))
286 goto out; 286 goto out;
287 287
288 if (tick_nohz_tick_stopped() && !can_stop_full_tick()) 288 if (tick_nohz_tick_stopped() && !can_stop_full_tick())
289 tick_nohz_full_kick(); 289 tick_nohz_full_kick();
290 290
291 out: 291 out:
292 local_irq_restore(flags); 292 local_irq_restore(flags);
293 } 293 }
294 294
295 /* Parse the boot-time nohz CPU list from the kernel parameters. */ 295 /* Parse the boot-time nohz CPU list from the kernel parameters. */
296 static int __init tick_nohz_full_setup(char *str) 296 static int __init tick_nohz_full_setup(char *str)
297 { 297 {
298 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 298 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
299 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 299 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
300 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 300 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
301 free_bootmem_cpumask_var(tick_nohz_full_mask); 301 free_bootmem_cpumask_var(tick_nohz_full_mask);
302 return 1; 302 return 1;
303 } 303 }
304 tick_nohz_full_running = true; 304 tick_nohz_full_running = true;
305 305
306 return 1; 306 return 1;
307 } 307 }
308 __setup("nohz_full=", tick_nohz_full_setup); 308 __setup("nohz_full=", tick_nohz_full_setup);
309 309
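For reference, the parser above is attached to the kernel command line: booting with, say, nohz_full=1-7 hands CPUs 1-7 to full dynticks mode (cpulist_parse() accepts the usual cpulist syntax, so forms like 1,3,5-7 work as well), while the remaining CPUs end up in housekeeping_mask when tick_nohz_init() runs later. The example value is illustrative; the only requirements are CONFIG_NO_HZ_FULL and a mask that leaves at least one housekeeping CPU (the boot CPU is cleared from the mask in tick_nohz_init() anyway).
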
310 static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, 310 static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
311 unsigned long action, 311 unsigned long action,
312 void *hcpu) 312 void *hcpu)
313 { 313 {
314 unsigned int cpu = (unsigned long)hcpu; 314 unsigned int cpu = (unsigned long)hcpu;
315 315
316 switch (action & ~CPU_TASKS_FROZEN) { 316 switch (action & ~CPU_TASKS_FROZEN) {
317 case CPU_DOWN_PREPARE: 317 case CPU_DOWN_PREPARE:
318 /* 318 /*
319 * If we handle the timekeeping duty for full dynticks CPUs, 319 * If we handle the timekeeping duty for full dynticks CPUs,
320 * we can't safely shutdown that CPU. 320 * we can't safely shutdown that CPU.
321 */ 321 */
322 if (tick_nohz_full_running && tick_do_timer_cpu == cpu) 322 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
323 return NOTIFY_BAD; 323 return NOTIFY_BAD;
324 break; 324 break;
325 } 325 }
326 return NOTIFY_OK; 326 return NOTIFY_OK;
327 } 327 }
328 328
329 /* 329 /*
330 * The worst-case string length occurs when the CPU ranges come in 330 * The worst-case string length occurs when the CPU ranges come in
331 * two-step separations: 0,2,4,6,... 331 * two-step separations: 0,2,4,6,...
332 * That takes NR_CPUS characters plus a terminating '\0'. 332 * That takes NR_CPUS characters plus a terminating '\0'.
333 */ 333 */
334 static char __initdata nohz_full_buf[NR_CPUS + 1]; 334 static char __initdata nohz_full_buf[NR_CPUS + 1];
335 335
336 static int tick_nohz_init_all(void) 336 static int tick_nohz_init_all(void)
337 { 337 {
338 int err = -1; 338 int err = -1;
339 339
340 #ifdef CONFIG_NO_HZ_FULL_ALL 340 #ifdef CONFIG_NO_HZ_FULL_ALL
341 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { 341 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
342 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); 342 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
343 return err; 343 return err;
344 } 344 }
345 err = 0; 345 err = 0;
346 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
347 tick_nohz_full_running = true; 347 tick_nohz_full_running = true;
348 #endif 348 #endif
349 return err; 349 return err;
350 } 350 }
351 351
352 void __init tick_nohz_init(void) 352 void __init tick_nohz_init(void)
353 { 353 {
354 int cpu; 354 int cpu;
355 355
356 if (!tick_nohz_full_running) { 356 if (!tick_nohz_full_running) {
357 if (tick_nohz_init_all() < 0) 357 if (tick_nohz_init_all() < 0)
358 return; 358 return;
359 } 359 }
360 360
361 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { 361 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
362 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); 362 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
363 cpumask_clear(tick_nohz_full_mask); 363 cpumask_clear(tick_nohz_full_mask);
364 tick_nohz_full_running = false; 364 tick_nohz_full_running = false;
365 return; 365 return;
366 } 366 }
367 367
368 /* 368 /*
369 * Full dynticks uses irq work to drive the tick rescheduling on safe 369 * Full dynticks uses irq work to drive the tick rescheduling on safe
370 * locking contexts. But then we need irq work to raise its own 370 * locking contexts. But then we need irq work to raise its own
371 * interrupts to avoid circular dependency on the tick 371 * interrupts to avoid circular dependency on the tick
372 */ 372 */
373 if (!arch_irq_work_has_interrupt()) { 373 if (!arch_irq_work_has_interrupt()) {
374 pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " 374 pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
375 "support irq work self-IPIs\n"); 375 "support irq work self-IPIs\n");
376 cpumask_clear(tick_nohz_full_mask); 376 cpumask_clear(tick_nohz_full_mask);
377 cpumask_copy(housekeeping_mask, cpu_possible_mask); 377 cpumask_copy(housekeeping_mask, cpu_possible_mask);
378 tick_nohz_full_running = false; 378 tick_nohz_full_running = false;
379 return; 379 return;
380 } 380 }
381 381
382 cpu = smp_processor_id(); 382 cpu = smp_processor_id();
383 383
384 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { 384 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
385 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 385 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
386 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 386 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
387 } 387 }
388 388
389 cpumask_andnot(housekeeping_mask, 389 cpumask_andnot(housekeeping_mask,
390 cpu_possible_mask, tick_nohz_full_mask); 390 cpu_possible_mask, tick_nohz_full_mask);
391 391
392 for_each_cpu(cpu, tick_nohz_full_mask) 392 for_each_cpu(cpu, tick_nohz_full_mask)
393 context_tracking_cpu_set(cpu); 393 context_tracking_cpu_set(cpu);
394 394
395 cpu_notifier(tick_nohz_cpu_down_callback, 0); 395 cpu_notifier(tick_nohz_cpu_down_callback, 0);
396 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); 396 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
397 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 397 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
398 } 398 }
399 #endif 399 #endif
400 400
401 /* 401 /*
402 * NOHZ - aka dynamic tick functionality 402 * NOHZ - aka dynamic tick functionality
403 */ 403 */
404 #ifdef CONFIG_NO_HZ_COMMON 404 #ifdef CONFIG_NO_HZ_COMMON
405 /* 405 /*
406 * NO HZ enabled ? 406 * NO HZ enabled ?
407 */ 407 */
408 static int tick_nohz_enabled __read_mostly = 1; 408 static int tick_nohz_enabled __read_mostly = 1;
409 int tick_nohz_active __read_mostly; 409 int tick_nohz_active __read_mostly;
410 /* 410 /*
411 * Enable / Disable tickless mode 411 * Enable / Disable tickless mode
412 */ 412 */
413 static int __init setup_tick_nohz(char *str) 413 static int __init setup_tick_nohz(char *str)
414 { 414 {
415 if (!strcmp(str, "off")) 415 if (!strcmp(str, "off"))
416 tick_nohz_enabled = 0; 416 tick_nohz_enabled = 0;
417 else if (!strcmp(str, "on")) 417 else if (!strcmp(str, "on"))
418 tick_nohz_enabled = 1; 418 tick_nohz_enabled = 1;
419 else 419 else
420 return 0; 420 return 0;
421 return 1; 421 return 1;
422 } 422 }
423 423
424 __setup("nohz=", setup_tick_nohz); 424 __setup("nohz=", setup_tick_nohz);
425 425
426 /** 426 /**
427 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 427 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
428 * 428 *
429 * Called from interrupt entry when the CPU was idle 429 * Called from interrupt entry when the CPU was idle
430 * 430 *
431 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 431 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
432 * must be updated. Otherwise an interrupt handler could use a stale jiffy 432 * must be updated. Otherwise an interrupt handler could use a stale jiffy
433 * value. We do this unconditionally on any cpu, as we don't know whether the 433 * value. We do this unconditionally on any cpu, as we don't know whether the
434 * cpu which has the update task assigned is in a long sleep. 434 * cpu which has the update task assigned is in a long sleep.
435 */ 435 */
436 static void tick_nohz_update_jiffies(ktime_t now) 436 static void tick_nohz_update_jiffies(ktime_t now)
437 { 437 {
438 unsigned long flags; 438 unsigned long flags;
439 439
440 __this_cpu_write(tick_cpu_sched.idle_waketime, now); 440 __this_cpu_write(tick_cpu_sched.idle_waketime, now);
441 441
442 local_irq_save(flags); 442 local_irq_save(flags);
443 tick_do_update_jiffies64(now); 443 tick_do_update_jiffies64(now);
444 local_irq_restore(flags); 444 local_irq_restore(flags);
445 445
446 touch_softlockup_watchdog(); 446 touch_softlockup_watchdog();
447 } 447 }
448 448
449 /* 449 /*
450 * Updates the per cpu time idle statistics counters 450 * Updates the per cpu time idle statistics counters
451 */ 451 */
452 static void 452 static void
453 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) 453 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
454 { 454 {
455 ktime_t delta; 455 ktime_t delta;
456 456
457 if (ts->idle_active) { 457 if (ts->idle_active) {
458 delta = ktime_sub(now, ts->idle_entrytime); 458 delta = ktime_sub(now, ts->idle_entrytime);
459 if (nr_iowait_cpu(cpu) > 0) 459 if (nr_iowait_cpu(cpu) > 0)
460 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 460 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
461 else 461 else
462 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 462 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
463 ts->idle_entrytime = now; 463 ts->idle_entrytime = now;
464 } 464 }
465 465
466 if (last_update_time) 466 if (last_update_time)
467 *last_update_time = ktime_to_us(now); 467 *last_update_time = ktime_to_us(now);
468 468
469 } 469 }
470 470
471 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) 471 static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
472 { 472 {
473 update_ts_time_stats(smp_processor_id(), ts, now, NULL); 473 update_ts_time_stats(smp_processor_id(), ts, now, NULL);
474 ts->idle_active = 0; 474 ts->idle_active = 0;
475 475
476 sched_clock_idle_wakeup_event(0); 476 sched_clock_idle_wakeup_event(0);
477 } 477 }
478 478
479 static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 479 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
480 { 480 {
481 ktime_t now = ktime_get(); 481 ktime_t now = ktime_get();
482 482
483 ts->idle_entrytime = now; 483 ts->idle_entrytime = now;
484 ts->idle_active = 1; 484 ts->idle_active = 1;
485 sched_clock_idle_sleep_event(); 485 sched_clock_idle_sleep_event();
486 return now; 486 return now;
487 } 487 }
488 488
489 /** 489 /**
490 * get_cpu_idle_time_us - get the total idle time of a cpu 490 * get_cpu_idle_time_us - get the total idle time of a cpu
491 * @cpu: CPU number to query 491 * @cpu: CPU number to query
492 * @last_update_time: variable to store update time in. Do not update 492 * @last_update_time: variable to store update time in. Do not update
493 * counters if NULL. 493 * counters if NULL.
494 * 494 *
495 * Return the cumulative idle time (since boot) for a given 495 * Return the cumulative idle time (since boot) for a given
496 * CPU, in microseconds. 496 * CPU, in microseconds.
497 * 497 *
498 * This time is measured via accounting rather than sampling, 498 * This time is measured via accounting rather than sampling,
499 * and is as accurate as ktime_get() is. 499 * and is as accurate as ktime_get() is.
500 * 500 *
501 * This function returns -1 if NOHZ is not enabled. 501 * This function returns -1 if NOHZ is not enabled.
502 */ 502 */
503 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 503 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
504 { 504 {
505 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 505 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
506 ktime_t now, idle; 506 ktime_t now, idle;
507 507
508 if (!tick_nohz_active) 508 if (!tick_nohz_active)
509 return -1; 509 return -1;
510 510
511 now = ktime_get(); 511 now = ktime_get();
512 if (last_update_time) { 512 if (last_update_time) {
513 update_ts_time_stats(cpu, ts, now, last_update_time); 513 update_ts_time_stats(cpu, ts, now, last_update_time);
514 idle = ts->idle_sleeptime; 514 idle = ts->idle_sleeptime;
515 } else { 515 } else {
516 if (ts->idle_active && !nr_iowait_cpu(cpu)) { 516 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
517 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 517 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
518 518
519 idle = ktime_add(ts->idle_sleeptime, delta); 519 idle = ktime_add(ts->idle_sleeptime, delta);
520 } else { 520 } else {
521 idle = ts->idle_sleeptime; 521 idle = ts->idle_sleeptime;
522 } 522 }
523 } 523 }
524 524
525 return ktime_to_us(idle); 525 return ktime_to_us(idle);
526 526
527 } 527 }
528 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 528 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
529 529
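A minimal sketch of a consumer of this interface; the helper name is illustrative, not a kernel API (the /proc/stat idle accounting and the cpufreq governors are typical real callers). Passing NULL as last_update_time keeps the call read-only, per the kernel-doc above.

    #include <linux/tick.h>

    /* Hedged example: read the NOHZ idle time of a CPU without updating
     * the per-CPU counters. */
    static u64 example_read_idle_us(int cpu)
    {
            u64 idle_us = get_cpu_idle_time_us(cpu, NULL);

            /* (u64)-1 means NOHZ is not active on this boot; a real
             * caller would fall back to jiffy-based accounting here. */
            if (idle_us == (u64)-1)
                    return 0;

            return idle_us;
    }
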
530 /** 530 /**
531 * get_cpu_iowait_time_us - get the total iowait time of a cpu 531 * get_cpu_iowait_time_us - get the total iowait time of a cpu
532 * @cpu: CPU number to query 532 * @cpu: CPU number to query
533 * @last_update_time: variable to store update time in. Do not update 533 * @last_update_time: variable to store update time in. Do not update
534 * counters if NULL. 534 * counters if NULL.
535 * 535 *
536 * Return the cumulative iowait time (since boot) for a given 536 * Return the cumulative iowait time (since boot) for a given
537 * CPU, in microseconds. 537 * CPU, in microseconds.
538 * 538 *
539 * This time is measured via accounting rather than sampling, 539 * This time is measured via accounting rather than sampling,
540 * and is as accurate as ktime_get() is. 540 * and is as accurate as ktime_get() is.
541 * 541 *
542 * This function returns -1 if NOHZ is not enabled. 542 * This function returns -1 if NOHZ is not enabled.
543 */ 543 */
544 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 544 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
545 { 545 {
546 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 546 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
547 ktime_t now, iowait; 547 ktime_t now, iowait;
548 548
549 if (!tick_nohz_active) 549 if (!tick_nohz_active)
550 return -1; 550 return -1;
551 551
552 now = ktime_get(); 552 now = ktime_get();
553 if (last_update_time) { 553 if (last_update_time) {
554 update_ts_time_stats(cpu, ts, now, last_update_time); 554 update_ts_time_stats(cpu, ts, now, last_update_time);
555 iowait = ts->iowait_sleeptime; 555 iowait = ts->iowait_sleeptime;
556 } else { 556 } else {
557 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { 557 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
558 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 558 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
559 559
560 iowait = ktime_add(ts->iowait_sleeptime, delta); 560 iowait = ktime_add(ts->iowait_sleeptime, delta);
561 } else { 561 } else {
562 iowait = ts->iowait_sleeptime; 562 iowait = ts->iowait_sleeptime;
563 } 563 }
564 } 564 }
565 565
566 return ktime_to_us(iowait); 566 return ktime_to_us(iowait);
567 } 567 }
568 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 568 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
569 569
570 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 570 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
571 ktime_t now, int cpu) 571 ktime_t now, int cpu)
572 { 572 {
573 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 573 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
574 ktime_t last_update, expires, ret = { .tv64 = 0 }; 574 ktime_t last_update, expires, ret = { .tv64 = 0 };
575 unsigned long rcu_delta_jiffies; 575 unsigned long rcu_delta_jiffies;
576 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 576 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
577 u64 time_delta; 577 u64 time_delta;
578 578
579 time_delta = timekeeping_max_deferment(); 579 time_delta = timekeeping_max_deferment();
580 580
581 /* Read jiffies and the time when jiffies were updated last */ 581 /* Read jiffies and the time when jiffies were updated last */
582 do { 582 do {
583 seq = read_seqbegin(&jiffies_lock); 583 seq = read_seqbegin(&jiffies_lock);
584 last_update = last_jiffies_update; 584 last_update = last_jiffies_update;
585 last_jiffies = jiffies; 585 last_jiffies = jiffies;
586 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
587 587
588 if (rcu_needs_cpu(&rcu_delta_jiffies) || 588 if (rcu_needs_cpu(&rcu_delta_jiffies) ||
589 arch_needs_cpu() || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
590 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
591 delta_jiffies = 1; 591 delta_jiffies = 1;
592 } else { 592 } else {
593 /* Get the next timer wheel timer */ 593 /* Get the next timer wheel timer */
594 next_jiffies = get_next_timer_interrupt(last_jiffies); 594 next_jiffies = get_next_timer_interrupt(last_jiffies);
595 delta_jiffies = next_jiffies - last_jiffies; 595 delta_jiffies = next_jiffies - last_jiffies;
596 if (rcu_delta_jiffies < delta_jiffies) { 596 if (rcu_delta_jiffies < delta_jiffies) {
597 next_jiffies = last_jiffies + rcu_delta_jiffies; 597 next_jiffies = last_jiffies + rcu_delta_jiffies;
598 delta_jiffies = rcu_delta_jiffies; 598 delta_jiffies = rcu_delta_jiffies;
599 } 599 }
600 } 600 }
601 601
602 /* 602 /*
603 * Do not stop the tick, if we are only one off (or less) 603 * Do not stop the tick, if we are only one off (or less)
604 * or if the cpu is required for RCU: 604 * or if the cpu is required for RCU:
605 */ 605 */
606 if (!ts->tick_stopped && delta_jiffies <= 1) 606 if (!ts->tick_stopped && delta_jiffies <= 1)
607 goto out; 607 goto out;
608 608
609 /* Schedule the tick, if we are at least one jiffie off */ 609 /* Schedule the tick, if we are at least one jiffie off */
610 if ((long)delta_jiffies >= 1) { 610 if ((long)delta_jiffies >= 1) {
611 611
612 /* 612 /*
613 * If this cpu is the one which updates jiffies, then 613 * If this cpu is the one which updates jiffies, then
614 * give up the assignment and let it be taken by the 614 * give up the assignment and let it be taken by the
615 * cpu which runs the tick timer next, which might be 615 * cpu which runs the tick timer next, which might be
616 * this cpu as well. If we don't drop this here the 616 * this cpu as well. If we don't drop this here the
617 * jiffies might be stale and do_timer() never 617 * jiffies might be stale and do_timer() never
618 * invoked. Keep track of the fact that it was the one 618 * invoked. Keep track of the fact that it was the one
619 * which had the do_timer() duty last. If this cpu is 619 * which had the do_timer() duty last. If this cpu is
620 * the one which had the do_timer() duty last, we 620 * the one which had the do_timer() duty last, we
621 * limit the sleep time to the timekeeping 621 * limit the sleep time to the timekeeping
622 * max_deferment value which we retrieved 622 * max_deferment value which we retrieved
623 * above. Otherwise we can sleep as long as we want. 623 * above. Otherwise we can sleep as long as we want.
624 */ 624 */
625 if (cpu == tick_do_timer_cpu) { 625 if (cpu == tick_do_timer_cpu) {
626 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 626 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
627 ts->do_timer_last = 1; 627 ts->do_timer_last = 1;
628 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 628 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
629 time_delta = KTIME_MAX; 629 time_delta = KTIME_MAX;
630 ts->do_timer_last = 0; 630 ts->do_timer_last = 0;
631 } else if (!ts->do_timer_last) { 631 } else if (!ts->do_timer_last) {
632 time_delta = KTIME_MAX; 632 time_delta = KTIME_MAX;
633 } 633 }
634 634
635 #ifdef CONFIG_NO_HZ_FULL 635 #ifdef CONFIG_NO_HZ_FULL
636 if (!ts->inidle) { 636 if (!ts->inidle) {
637 time_delta = min(time_delta, 637 time_delta = min(time_delta,
638 scheduler_tick_max_deferment()); 638 scheduler_tick_max_deferment());
639 } 639 }
640 #endif 640 #endif
641 641
642 /* 642 /*
643 * calculate the expiry time for the next timer wheel 643 * calculate the expiry time for the next timer wheel
644 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 644 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
645 * that there is no timer pending or at least extremely 645 * that there is no timer pending or at least extremely
646 * far into the future (12 days for HZ=1000). In this 646 * far into the future (12 days for HZ=1000). In this
647 * case we set the expiry to the end of time. 647 * case we set the expiry to the end of time.
648 */ 648 */
649 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { 649 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
650 /* 650 /*
651 * Calculate the time delta for the next timer event. 651 * Calculate the time delta for the next timer event.
652 * If the time delta exceeds the maximum time delta 652 * If the time delta exceeds the maximum time delta
653 * permitted by the current clocksource then adjust 653 * permitted by the current clocksource then adjust
654 * the time delta accordingly to ensure the 654 * the time delta accordingly to ensure the
655 * clocksource does not wrap. 655 * clocksource does not wrap.
656 */ 656 */
657 time_delta = min_t(u64, time_delta, 657 time_delta = min_t(u64, time_delta,
658 tick_period.tv64 * delta_jiffies); 658 tick_period.tv64 * delta_jiffies);
659 } 659 }
660 660
661 if (time_delta < KTIME_MAX) 661 if (time_delta < KTIME_MAX)
662 expires = ktime_add_ns(last_update, time_delta); 662 expires = ktime_add_ns(last_update, time_delta);
663 else 663 else
664 expires.tv64 = KTIME_MAX; 664 expires.tv64 = KTIME_MAX;
665 665
666 /* Skip reprogramming the event if it hasn't changed */ 666 /* Skip reprogramming the event if it hasn't changed */
667 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 667 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
668 goto out; 668 goto out;
669 669
670 ret = expires; 670 ret = expires;
671 671
672 /* 672 /*
673 * nohz_stop_sched_tick can be called several times before 673 * nohz_stop_sched_tick can be called several times before
674 * the nohz_restart_sched_tick is called. This happens when 674 * the nohz_restart_sched_tick is called. This happens when
675 * interrupts arrive which do not cause a reschedule. In the 675 * interrupts arrive which do not cause a reschedule. In the
676 * first call we save the current tick time, so we can restart 676 * first call we save the current tick time, so we can restart
677 * the scheduler tick in nohz_restart_sched_tick. 677 * the scheduler tick in nohz_restart_sched_tick.
678 */ 678 */
679 if (!ts->tick_stopped) { 679 if (!ts->tick_stopped) {
680 nohz_balance_enter_idle(cpu); 680 nohz_balance_enter_idle(cpu);
681 calc_load_enter_idle(); 681 calc_load_enter_idle();
682 682
683 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 683 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
684 ts->tick_stopped = 1; 684 ts->tick_stopped = 1;
685 trace_tick_stop(1, " "); 685 trace_tick_stop(1, " ");
686 } 686 }
687 687
688 /* 688 /*
689 * If the expiration time == KTIME_MAX, then 689 * If the expiration time == KTIME_MAX, then
690 * in this case we simply stop the tick timer. 690 * in this case we simply stop the tick timer.
691 */ 691 */
692 if (unlikely(expires.tv64 == KTIME_MAX)) { 692 if (unlikely(expires.tv64 == KTIME_MAX)) {
693 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 693 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
694 hrtimer_cancel(&ts->sched_timer); 694 hrtimer_cancel(&ts->sched_timer);
695 goto out; 695 goto out;
696 } 696 }
697 697
698 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 698 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
699 hrtimer_start(&ts->sched_timer, expires, 699 hrtimer_start(&ts->sched_timer, expires,
700 HRTIMER_MODE_ABS_PINNED); 700 HRTIMER_MODE_ABS_PINNED);
701 /* Check, if the timer was already in the past */ 701 /* Check, if the timer was already in the past */
702 if (hrtimer_active(&ts->sched_timer)) 702 if (hrtimer_active(&ts->sched_timer))
703 goto out; 703 goto out;
704 } else if (!tick_program_event(expires, 0)) 704 } else if (!tick_program_event(expires, 0))
705 goto out; 705 goto out;
706 /* 706 /*
707 * We are past the event already. So we crossed a 707 * We are past the event already. So we crossed a
708 * jiffie boundary. Update jiffies and raise the 708 * jiffie boundary. Update jiffies and raise the
709 * softirq. 709 * softirq.
710 */ 710 */
711 tick_do_update_jiffies64(ktime_get()); 711 tick_do_update_jiffies64(ktime_get());
712 } 712 }
713 raise_softirq_irqoff(TIMER_SOFTIRQ); 713 raise_softirq_irqoff(TIMER_SOFTIRQ);
714 out: 714 out:
715 ts->next_jiffies = next_jiffies; 715 ts->next_jiffies = next_jiffies;
716 ts->last_jiffies = last_jiffies; 716 ts->last_jiffies = last_jiffies;
717 ts->sleep_length = ktime_sub(dev->next_event, now); 717 ts->sleep_length = ktime_sub(dev->next_event, now);
718 718
719 return ret; 719 return ret;
720 } 720 }
721 721
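A worked example of the arithmetic above, assuming HZ=250 (tick_period = 4 ms), no RCU or irq-work demand, and the next timer wheel timer 100 jiffies away: delta_jiffies = 100, so the expiry becomes last_update plus min(time_delta, tick_period.tv64 * 100) = 400 ms for a CPU that is not the jiffies updater (the updater is further limited by timekeeping_max_deferment()). If instead get_next_timer_interrupt() reports nothing before NEXT_TIMER_MAX_DELTA and this CPU is not the jiffies updater, time_delta stays at KTIME_MAX, expires is set to KTIME_MAX, and in NOHZ_MODE_HIGHRES the sched timer is simply cancelled.
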
722 static void tick_nohz_full_stop_tick(struct tick_sched *ts) 722 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
723 { 723 {
724 #ifdef CONFIG_NO_HZ_FULL 724 #ifdef CONFIG_NO_HZ_FULL
725 int cpu = smp_processor_id(); 725 int cpu = smp_processor_id();
726 726
727 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) 727 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
728 return; 728 return;
729 729
730 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 730 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
731 return; 731 return;
732 732
733 if (!can_stop_full_tick()) 733 if (!can_stop_full_tick())
734 return; 734 return;
735 735
736 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 736 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
737 #endif 737 #endif
738 } 738 }
739 739
740 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 740 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
741 { 741 {
742 /* 742 /*
743 * If this cpu is offline and it is the one which updates 743 * If this cpu is offline and it is the one which updates
744 * jiffies, then give up the assignment and let it be taken by 744 * jiffies, then give up the assignment and let it be taken by
745 * the cpu which runs the tick timer next. If we don't drop 745 * the cpu which runs the tick timer next. If we don't drop
746 * this here the jiffies might be stale and do_timer() never 746 * this here the jiffies might be stale and do_timer() never
747 * invoked. 747 * invoked.
748 */ 748 */
749 if (unlikely(!cpu_online(cpu))) { 749 if (unlikely(!cpu_online(cpu))) {
750 if (cpu == tick_do_timer_cpu) 750 if (cpu == tick_do_timer_cpu)
751 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 751 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
752 return false; 752 return false;
753 } 753 }
754 754
755 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { 755 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
756 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; 756 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
757 return false; 757 return false;
758 } 758 }
759 759
760 if (need_resched()) 760 if (need_resched())
761 return false; 761 return false;
762 762
763 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 763 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
764 static int ratelimit; 764 static int ratelimit;
765 765
766 if (ratelimit < 10 && 766 if (ratelimit < 10 &&
767 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 767 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
768 pr_warn("NOHZ: local_softirq_pending %02x\n", 768 pr_warn("NOHZ: local_softirq_pending %02x\n",
769 (unsigned int) local_softirq_pending()); 769 (unsigned int) local_softirq_pending());
770 ratelimit++; 770 ratelimit++;
771 } 771 }
772 return false; 772 return false;
773 } 773 }
774 774
775 if (tick_nohz_full_enabled()) { 775 if (tick_nohz_full_enabled()) {
776 /* 776 /*
777 * Keep the tick alive to guarantee timekeeping progression 777 * Keep the tick alive to guarantee timekeeping progression
778 * if there are full dynticks CPUs around 778 * if there are full dynticks CPUs around
779 */ 779 */
780 if (tick_do_timer_cpu == cpu) 780 if (tick_do_timer_cpu == cpu)
781 return false; 781 return false;
782 /* 782 /*
783 * Boot safety: make sure the timekeeping duty has been 783 * Boot safety: make sure the timekeeping duty has been
784 * assigned before entering dyntick-idle mode, 784 * assigned before entering dyntick-idle mode,
785 */ 785 */
786 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) 786 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
787 return false; 787 return false;
788 } 788 }
789 789
790 return true; 790 return true;
791 } 791 }
792 792
793 static void __tick_nohz_idle_enter(struct tick_sched *ts) 793 static void __tick_nohz_idle_enter(struct tick_sched *ts)
794 { 794 {
795 ktime_t now, expires; 795 ktime_t now, expires;
796 int cpu = smp_processor_id(); 796 int cpu = smp_processor_id();
797 797
798 now = tick_nohz_start_idle(ts); 798 now = tick_nohz_start_idle(ts);
799 799
800 if (can_stop_idle_tick(cpu, ts)) { 800 if (can_stop_idle_tick(cpu, ts)) {
801 int was_stopped = ts->tick_stopped; 801 int was_stopped = ts->tick_stopped;
802 802
803 ts->idle_calls++; 803 ts->idle_calls++;
804 804
805 expires = tick_nohz_stop_sched_tick(ts, now, cpu); 805 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
806 if (expires.tv64 > 0LL) { 806 if (expires.tv64 > 0LL) {
807 ts->idle_sleeps++; 807 ts->idle_sleeps++;
808 ts->idle_expires = expires; 808 ts->idle_expires = expires;
809 } 809 }
810 810
811 if (!was_stopped && ts->tick_stopped) 811 if (!was_stopped && ts->tick_stopped)
812 ts->idle_jiffies = ts->last_jiffies; 812 ts->idle_jiffies = ts->last_jiffies;
813 } 813 }
814 } 814 }
815 815
816 /** 816 /**
817 * tick_nohz_idle_enter - stop the idle tick from the idle task 817 * tick_nohz_idle_enter - stop the idle tick from the idle task
818 * 818 *
819 * When the next event is more than a tick into the future, stop the idle tick. 819 * When the next event is more than a tick into the future, stop the idle tick.
820 * Called when we start the idle loop. 820 * Called when we start the idle loop.
821 * 821 *
822 * The arch is responsible for calling: 822 * The arch is responsible for calling:
823 * 823 *
824 * - rcu_idle_enter() after its last use of RCU before the CPU is put 824 * - rcu_idle_enter() after its last use of RCU before the CPU is put
825 * to sleep. 825 * to sleep.
826 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. 826 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
827 */ 827 */
828 void tick_nohz_idle_enter(void) 828 void tick_nohz_idle_enter(void)
829 { 829 {
830 struct tick_sched *ts; 830 struct tick_sched *ts;
831 831
832 WARN_ON_ONCE(irqs_disabled()); 832 WARN_ON_ONCE(irqs_disabled());
833 833
834 /* 834 /*
835 * Update the idle state in the scheduler domain hierarchy 835 * Update the idle state in the scheduler domain hierarchy
836 * when tick_nohz_stop_sched_tick() is called from the idle loop. 836 * when tick_nohz_stop_sched_tick() is called from the idle loop.
837 * State will be updated to busy during the first busy tick after 837 * State will be updated to busy during the first busy tick after
838 * exiting idle. 838 * exiting idle.
839 */ 839 */
840 set_cpu_sd_state_idle(); 840 set_cpu_sd_state_idle();
841 841
842 local_irq_disable(); 842 local_irq_disable();
843 843
844 ts = this_cpu_ptr(&tick_cpu_sched); 844 ts = this_cpu_ptr(&tick_cpu_sched);
845 ts->inidle = 1; 845 ts->inidle = 1;
846 __tick_nohz_idle_enter(ts); 846 __tick_nohz_idle_enter(ts);
847 847
848 local_irq_enable(); 848 local_irq_enable();
849 } 849 }
850 EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
851 850
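The EXPORT_SYMBOL_GPL() removed above (and the matching one removed below for tick_nohz_idle_exit()) is the point of this merge: these entry points are again reserved for the real idle task. A condensed, illustrative sketch of the intended caller, loosely modelled on the idle loop of this kernel generation; the functions named are real, but the control flow is heavily abbreviated.

    /* Simplified outline only; not the literal idle loop. */
    static void idle_loop_sketch(void)
    {
            while (1) {
                    tick_nohz_idle_enter();         /* may stop the tick */

                    while (!need_resched())
                            arch_cpu_idle();        /* enter a low-power state */

                    tick_nohz_idle_exit();          /* restart the tick */
                    schedule_preempt_disabled();
            }
    }
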
852 /** 851 /**
853 * tick_nohz_irq_exit - update next tick event from interrupt exit 852 * tick_nohz_irq_exit - update next tick event from interrupt exit
854 * 853 *
855 * When an interrupt fires while we are idle and it doesn't cause 854 * When an interrupt fires while we are idle and it doesn't cause
856 * a reschedule, it may still add, modify or delete a timer, enqueue 855 * a reschedule, it may still add, modify or delete a timer, enqueue
857 * an RCU callback, etc... 856 * an RCU callback, etc...
858 * So we need to re-calculate and reprogram the next tick event. 857 * So we need to re-calculate and reprogram the next tick event.
859 */ 858 */
860 void tick_nohz_irq_exit(void) 859 void tick_nohz_irq_exit(void)
861 { 860 {
862 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 861 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
863 862
864 if (ts->inidle) 863 if (ts->inidle)
865 __tick_nohz_idle_enter(ts); 864 __tick_nohz_idle_enter(ts);
866 else 865 else
867 tick_nohz_full_stop_tick(ts); 866 tick_nohz_full_stop_tick(ts);
868 } 867 }
869 868
870 /** 869 /**
871 * tick_nohz_get_sleep_length - return the length of the current sleep 870 * tick_nohz_get_sleep_length - return the length of the current sleep
872 * 871 *
873 * Called from power state control code with interrupts disabled 872 * Called from power state control code with interrupts disabled
874 */ 873 */
875 ktime_t tick_nohz_get_sleep_length(void) 874 ktime_t tick_nohz_get_sleep_length(void)
876 { 875 {
877 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 876 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
878 877
879 return ts->sleep_length; 878 return ts->sleep_length;
880 } 879 }
881 880
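As a usage illustration (a hedged sketch; the cpuidle menu governor is the real consumer, and pick_state() below is an invented name): the returned sleep length bounds how deep an idle state is worth entering.

    #include <linux/cpuidle.h>
    #include <linux/tick.h>

    /* Illustrative state selection: deepest state whose target residency
     * still fits into the predicted sleep length. */
    static int pick_state(struct cpuidle_driver *drv)
    {
            s64 sleep_us = ktime_to_us(tick_nohz_get_sleep_length());
            int i;

            for (i = drv->state_count - 1; i > 0; i--)
                    if (drv->states[i].target_residency <= sleep_us)
                            return i;

            return 0;
    }
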
882 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 881 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
883 { 882 {
884 hrtimer_cancel(&ts->sched_timer); 883 hrtimer_cancel(&ts->sched_timer);
885 hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 884 hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
886 885
887 while (1) { 886 while (1) {
888 /* Forward the time to expire in the future */ 887 /* Forward the time to expire in the future */
889 hrtimer_forward(&ts->sched_timer, now, tick_period); 888 hrtimer_forward(&ts->sched_timer, now, tick_period);
890 889
891 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 890 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
892 hrtimer_start_expires(&ts->sched_timer, 891 hrtimer_start_expires(&ts->sched_timer,
893 HRTIMER_MODE_ABS_PINNED); 892 HRTIMER_MODE_ABS_PINNED);
894 /* Check, if the timer was already in the past */ 893 /* Check, if the timer was already in the past */
895 if (hrtimer_active(&ts->sched_timer)) 894 if (hrtimer_active(&ts->sched_timer))
896 break; 895 break;
897 } else { 896 } else {
898 if (!tick_program_event( 897 if (!tick_program_event(
899 hrtimer_get_expires(&ts->sched_timer), 0)) 898 hrtimer_get_expires(&ts->sched_timer), 0))
900 break; 899 break;
901 } 900 }
902 /* Reread time and update jiffies */ 901 /* Reread time and update jiffies */
903 now = ktime_get(); 902 now = ktime_get();
904 tick_do_update_jiffies64(now); 903 tick_do_update_jiffies64(now);
905 } 904 }
906 } 905 }
907 906
908 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 907 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
909 { 908 {
910 /* Update jiffies first */ 909 /* Update jiffies first */
911 tick_do_update_jiffies64(now); 910 tick_do_update_jiffies64(now);
912 update_cpu_load_nohz(); 911 update_cpu_load_nohz();
913 912
914 calc_load_exit_idle(); 913 calc_load_exit_idle();
915 touch_softlockup_watchdog(); 914 touch_softlockup_watchdog();
916 /* 915 /*
917 * Cancel the scheduled timer and restore the tick 916 * Cancel the scheduled timer and restore the tick
918 */ 917 */
919 ts->tick_stopped = 0; 918 ts->tick_stopped = 0;
920 ts->idle_exittime = now; 919 ts->idle_exittime = now;
921 920
922 tick_nohz_restart(ts, now); 921 tick_nohz_restart(ts, now);
923 } 922 }
924 923
925 static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 924 static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
926 { 925 {
927 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 926 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
928 unsigned long ticks; 927 unsigned long ticks;
929 928
930 if (vtime_accounting_enabled()) 929 if (vtime_accounting_enabled())
931 return; 930 return;
932 /* 931 /*
933 * We stopped the tick in idle. update_process_times() would miss the 932 * We stopped the tick in idle. update_process_times() would miss the
934 * time we slept, as it only does a single tick's worth of 933 * time we slept, as it only does a single tick's worth of
935 * accounting. Enforce that this is accounted to idle! 934 * accounting. Enforce that this is accounted to idle!
936 */ 935 */
937 ticks = jiffies - ts->idle_jiffies; 936 ticks = jiffies - ts->idle_jiffies;
938 /* 937 /*
939 * We might be one off. Do not randomly account a huge number of ticks! 938 * We might be one off. Do not randomly account a huge number of ticks!
940 */ 939 */
941 if (ticks && ticks < LONG_MAX) 940 if (ticks && ticks < LONG_MAX)
942 account_idle_ticks(ticks); 941 account_idle_ticks(ticks);
943 #endif 942 #endif
944 } 943 }
945 944
946 /** 945 /**
947 * tick_nohz_idle_exit - restart the idle tick from the idle task 946 * tick_nohz_idle_exit - restart the idle tick from the idle task
948 * 947 *
949 * Restart the idle tick when the CPU is woken up from idle. 948 * Restart the idle tick when the CPU is woken up from idle.
950 * This also exits the RCU extended quiescent state. The CPU 949 * This also exits the RCU extended quiescent state. The CPU
951 * can use RCU again after this function is called. 950 * can use RCU again after this function is called.
952 */ 951 */
953 void tick_nohz_idle_exit(void) 952 void tick_nohz_idle_exit(void)
954 { 953 {
955 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 954 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
956 ktime_t now; 955 ktime_t now;
957 956
958 local_irq_disable(); 957 local_irq_disable();
959 958
960 WARN_ON_ONCE(!ts->inidle); 959 WARN_ON_ONCE(!ts->inidle);
961 960
962 ts->inidle = 0; 961 ts->inidle = 0;
963 962
964 if (ts->idle_active || ts->tick_stopped) 963 if (ts->idle_active || ts->tick_stopped)
965 now = ktime_get(); 964 now = ktime_get();
966 965
967 if (ts->idle_active) 966 if (ts->idle_active)
968 tick_nohz_stop_idle(ts, now); 967 tick_nohz_stop_idle(ts, now);
969 968
970 if (ts->tick_stopped) { 969 if (ts->tick_stopped) {
971 tick_nohz_restart_sched_tick(ts, now); 970 tick_nohz_restart_sched_tick(ts, now);
972 tick_nohz_account_idle_ticks(ts); 971 tick_nohz_account_idle_ticks(ts);
973 } 972 }
974 973
975 local_irq_enable(); 974 local_irq_enable();
976 } 975 }
977 EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
978 976
979 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 977 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
980 { 978 {
981 hrtimer_forward(&ts->sched_timer, now, tick_period); 979 hrtimer_forward(&ts->sched_timer, now, tick_period);
982 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); 980 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
983 } 981 }
984 982
985 /* 983 /*
986 * The nohz low res interrupt handler 984 * The nohz low res interrupt handler
987 */ 985 */
988 static void tick_nohz_handler(struct clock_event_device *dev) 986 static void tick_nohz_handler(struct clock_event_device *dev)
989 { 987 {
990 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 988 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
991 struct pt_regs *regs = get_irq_regs(); 989 struct pt_regs *regs = get_irq_regs();
992 ktime_t now = ktime_get(); 990 ktime_t now = ktime_get();
993 991
994 dev->next_event.tv64 = KTIME_MAX; 992 dev->next_event.tv64 = KTIME_MAX;
995 993
996 tick_sched_do_timer(now); 994 tick_sched_do_timer(now);
997 tick_sched_handle(ts, regs); 995 tick_sched_handle(ts, regs);
998 996
999 /* No need to reprogram if we are running tickless */ 997 /* No need to reprogram if we are running tickless */
1000 if (unlikely(ts->tick_stopped)) 998 if (unlikely(ts->tick_stopped))
1001 return; 999 return;
1002 1000
1003 while (tick_nohz_reprogram(ts, now)) { 1001 while (tick_nohz_reprogram(ts, now)) {
1004 now = ktime_get(); 1002 now = ktime_get();
1005 tick_do_update_jiffies64(now); 1003 tick_do_update_jiffies64(now);
1006 } 1004 }
1007 } 1005 }
1008 1006
1009 /** 1007 /**
1010 * tick_nohz_switch_to_nohz - switch to nohz mode 1008 * tick_nohz_switch_to_nohz - switch to nohz mode
1011 */ 1009 */
1012 static void tick_nohz_switch_to_nohz(void) 1010 static void tick_nohz_switch_to_nohz(void)
1013 { 1011 {
1014 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1012 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1015 ktime_t next; 1013 ktime_t next;
1016 1014
1017 if (!tick_nohz_enabled) 1015 if (!tick_nohz_enabled)
1018 return; 1016 return;
1019 1017
1020 local_irq_disable(); 1018 local_irq_disable();
1021 if (tick_switch_to_oneshot(tick_nohz_handler)) { 1019 if (tick_switch_to_oneshot(tick_nohz_handler)) {
1022 local_irq_enable(); 1020 local_irq_enable();
1023 return; 1021 return;
1024 } 1022 }
1025 tick_nohz_active = 1; 1023 tick_nohz_active = 1;
1026 ts->nohz_mode = NOHZ_MODE_LOWRES; 1024 ts->nohz_mode = NOHZ_MODE_LOWRES;
1027 1025
1028 /* 1026 /*
1029 * Recycle the hrtimer in ts, so we can share the 1027 * Recycle the hrtimer in ts, so we can share the
1030 * hrtimer_forward with the highres code. 1028 * hrtimer_forward with the highres code.
1031 */ 1029 */
1032 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1030 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1033 /* Get the next period */ 1031 /* Get the next period */
1034 next = tick_init_jiffy_update(); 1032 next = tick_init_jiffy_update();
1035 1033
1036 for (;;) { 1034 for (;;) {
1037 hrtimer_set_expires(&ts->sched_timer, next); 1035 hrtimer_set_expires(&ts->sched_timer, next);
1038 if (!tick_program_event(next, 0)) 1036 if (!tick_program_event(next, 0))
1039 break; 1037 break;
1040 next = ktime_add(next, tick_period); 1038 next = ktime_add(next, tick_period);
1041 } 1039 }
1042 local_irq_enable(); 1040 local_irq_enable();
1043 } 1041 }
1044 1042
1045 /* 1043 /*
1046 * When NOHZ is enabled and the tick is stopped, we need to kick the 1044 * When NOHZ is enabled and the tick is stopped, we need to kick the
1047 * tick timer from irq_enter() so that the jiffies update is kept 1045 * tick timer from irq_enter() so that the jiffies update is kept
1048 * alive during long running softirqs. That's ugly as hell, but 1046 * alive during long running softirqs. That's ugly as hell, but
1049 * correctness is key even if we need to fix the offending softirq in 1047 * correctness is key even if we need to fix the offending softirq in
1050 * the first place. 1048 * the first place.
1051 * 1049 *
1052 * Note, this is different to tick_nohz_restart. We just kick the 1050 * Note, this is different to tick_nohz_restart. We just kick the
1053 * timer and do not touch the other magic bits which need to be done 1051 * timer and do not touch the other magic bits which need to be done
1054 * when idle is left. 1052 * when idle is left.
1055 */ 1053 */
1056 static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) 1054 static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1057 { 1055 {
1058 #if 0 1056 #if 0
1059 /* Switch back to 2.6.27 behaviour */ 1057 /* Switch back to 2.6.27 behaviour */
1060 ktime_t delta; 1058 ktime_t delta;
1061 1059
1062 /* 1060 /*
1063 * Do not touch the tick device, when the next expiry is either 1061 * Do not touch the tick device, when the next expiry is either
1064 * already reached or less/equal than the tick period. 1062 * already reached or less/equal than the tick period.
1065 */ 1063 */
1066 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 1064 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1067 if (delta.tv64 <= tick_period.tv64) 1065 if (delta.tv64 <= tick_period.tv64)
1068 return; 1066 return;
1069 1067
1070 tick_nohz_restart(ts, now); 1068 tick_nohz_restart(ts, now);
1071 #endif 1069 #endif
1072 } 1070 }
1073 1071
1074 static inline void tick_nohz_irq_enter(void) 1072 static inline void tick_nohz_irq_enter(void)
1075 { 1073 {
1076 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1074 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1077 ktime_t now; 1075 ktime_t now;
1078 1076
1079 if (!ts->idle_active && !ts->tick_stopped) 1077 if (!ts->idle_active && !ts->tick_stopped)
1080 return; 1078 return;
1081 now = ktime_get(); 1079 now = ktime_get();
1082 if (ts->idle_active) 1080 if (ts->idle_active)
1083 tick_nohz_stop_idle(ts, now); 1081 tick_nohz_stop_idle(ts, now);
1084 if (ts->tick_stopped) { 1082 if (ts->tick_stopped) {
1085 tick_nohz_update_jiffies(now); 1083 tick_nohz_update_jiffies(now);
1086 tick_nohz_kick_tick(ts, now); 1084 tick_nohz_kick_tick(ts, now);
1087 } 1085 }
1088 } 1086 }
1089 1087
1090 #else 1088 #else
1091 1089
1092 static inline void tick_nohz_switch_to_nohz(void) { } 1090 static inline void tick_nohz_switch_to_nohz(void) { }
1093 static inline void tick_nohz_irq_enter(void) { } 1091 static inline void tick_nohz_irq_enter(void) { }
1094 1092
1095 #endif /* CONFIG_NO_HZ_COMMON */ 1093 #endif /* CONFIG_NO_HZ_COMMON */
1096 1094
1097 /* 1095 /*
1098 * Called from irq_enter to notify about the possible interruption of idle() 1096 * Called from irq_enter to notify about the possible interruption of idle()
1099 */ 1097 */
1100 void tick_irq_enter(void) 1098 void tick_irq_enter(void)
1101 { 1099 {
1102 tick_check_oneshot_broadcast_this_cpu(); 1100 tick_check_oneshot_broadcast_this_cpu();
1103 tick_nohz_irq_enter(); 1101 tick_nohz_irq_enter();
1104 } 1102 }
1105 1103
1106 /* 1104 /*
1107 * High resolution timer specific code 1105 * High resolution timer specific code
1108 */ 1106 */
1109 #ifdef CONFIG_HIGH_RES_TIMERS 1107 #ifdef CONFIG_HIGH_RES_TIMERS
1110 /* 1108 /*
1111 * We rearm the timer until we get disabled by the idle code. 1109 * We rearm the timer until we get disabled by the idle code.
1112 * Called with interrupts disabled. 1110 * Called with interrupts disabled.
1113 */ 1111 */
1114 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 1112 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1115 { 1113 {
1116 struct tick_sched *ts = 1114 struct tick_sched *ts =
1117 container_of(timer, struct tick_sched, sched_timer); 1115 container_of(timer, struct tick_sched, sched_timer);
1118 struct pt_regs *regs = get_irq_regs(); 1116 struct pt_regs *regs = get_irq_regs();
1119 ktime_t now = ktime_get(); 1117 ktime_t now = ktime_get();
1120 1118
1121 tick_sched_do_timer(now); 1119 tick_sched_do_timer(now);
1122 1120
1123 /* 1121 /*
1124 * Do not call, when we are not in irq context and have 1122 * Do not call, when we are not in irq context and have
1125 * no valid regs pointer 1123 * no valid regs pointer
1126 */ 1124 */
1127 if (regs) 1125 if (regs)
1128 tick_sched_handle(ts, regs); 1126 tick_sched_handle(ts, regs);
1129 1127
1130 /* No need to reprogram if we are in idle or full dynticks mode */ 1128 /* No need to reprogram if we are in idle or full dynticks mode */
1131 if (unlikely(ts->tick_stopped)) 1129 if (unlikely(ts->tick_stopped))
1132 return HRTIMER_NORESTART; 1130 return HRTIMER_NORESTART;
1133 1131
1134 hrtimer_forward(timer, now, tick_period); 1132 hrtimer_forward(timer, now, tick_period);
1135 1133
1136 return HRTIMER_RESTART; 1134 return HRTIMER_RESTART;
1137 } 1135 }
1138 1136
1139 static int sched_skew_tick; 1137 static int sched_skew_tick;
1140 1138
1141 static int __init skew_tick(char *str) 1139 static int __init skew_tick(char *str)
1142 { 1140 {
1143 get_option(&str, &sched_skew_tick); 1141 get_option(&str, &sched_skew_tick);
1144 1142
1145 return 0; 1143 return 0;
1146 } 1144 }
1147 early_param("skew_tick", skew_tick); 1145 early_param("skew_tick", skew_tick);
1148 1146
1149 /** 1147 /**
1150 * tick_setup_sched_timer - setup the tick emulation timer 1148 * tick_setup_sched_timer - setup the tick emulation timer
1151 */ 1149 */
1152 void tick_setup_sched_timer(void) 1150 void tick_setup_sched_timer(void)
1153 { 1151 {
1154 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1152 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1155 ktime_t now = ktime_get(); 1153 ktime_t now = ktime_get();
1156 1154
1157 /* 1155 /*
1158 * Emulate tick processing via per-CPU hrtimers: 1156 * Emulate tick processing via per-CPU hrtimers:
1159 */ 1157 */
1160 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1158 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1161 ts->sched_timer.function = tick_sched_timer; 1159 ts->sched_timer.function = tick_sched_timer;
1162 1160
1163 /* Get the next period (per cpu) */ 1161 /* Get the next period (per cpu) */
1164 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 1162 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1165 1163
1166 /* Offset the tick to avert jiffies_lock contention. */ 1164 /* Offset the tick to avert jiffies_lock contention. */
1167 if (sched_skew_tick) { 1165 if (sched_skew_tick) {
1168 u64 offset = ktime_to_ns(tick_period) >> 1; 1166 u64 offset = ktime_to_ns(tick_period) >> 1;
1169 do_div(offset, num_possible_cpus()); 1167 do_div(offset, num_possible_cpus());
1170 offset *= smp_processor_id(); 1168 offset *= smp_processor_id();
1171 hrtimer_add_expires_ns(&ts->sched_timer, offset); 1169 hrtimer_add_expires_ns(&ts->sched_timer, offset);
1172 } 1170 }
1173 1171
1174 for (;;) { 1172 for (;;) {
1175 hrtimer_forward(&ts->sched_timer, now, tick_period); 1173 hrtimer_forward(&ts->sched_timer, now, tick_period);
1176 hrtimer_start_expires(&ts->sched_timer, 1174 hrtimer_start_expires(&ts->sched_timer,
1177 HRTIMER_MODE_ABS_PINNED); 1175 HRTIMER_MODE_ABS_PINNED);
1178 /* Check, if the timer was already in the past */ 1176 /* Check, if the timer was already in the past */
1179 if (hrtimer_active(&ts->sched_timer)) 1177 if (hrtimer_active(&ts->sched_timer))
1180 break; 1178 break;
1181 now = ktime_get(); 1179 now = ktime_get();
1182 } 1180 }
1183 1181
1184 #ifdef CONFIG_NO_HZ_COMMON 1182 #ifdef CONFIG_NO_HZ_COMMON
1185 if (tick_nohz_enabled) { 1183 if (tick_nohz_enabled) {
1186 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1184 ts->nohz_mode = NOHZ_MODE_HIGHRES;
1187 tick_nohz_active = 1; 1185 tick_nohz_active = 1;
1188 } 1186 }
1189 #endif 1187 #endif
1190 } 1188 }
1191 #endif /* HIGH_RES_TIMERS */ 1189 #endif /* HIGH_RES_TIMERS */
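To make the skew concrete, here is a worked example of the offset computed in tick_setup_sched_timer() above; the HZ and CPU-count figures are illustrative, not taken from this commit:

/*
 * Illustrative numbers for the sched_skew_tick offset:
 *
 *   HZ = 250                 => tick_period       = 4,000,000 ns
 *   offset = period >> 1     => 2,000,000 ns
 *   num_possible_cpus() = 8  => per-CPU step      =   250,000 ns
 *   smp_processor_id() = 3   => this CPU's expiry +   750,000 ns
 *
 * CPU 0 keeps the unskewed expiry and CPU 7 is pushed 1.75 ms later,
 * so the per-CPU tick handlers reach jiffies_lock at staggered times
 * instead of all on the same edge.
 */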
1192 1190
1193 #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS 1191 #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
1194 void tick_cancel_sched_timer(int cpu) 1192 void tick_cancel_sched_timer(int cpu)
1195 { 1193 {
1196 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1194 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1197 1195
1198 # ifdef CONFIG_HIGH_RES_TIMERS 1196 # ifdef CONFIG_HIGH_RES_TIMERS
1199 if (ts->sched_timer.base) 1197 if (ts->sched_timer.base)
1200 hrtimer_cancel(&ts->sched_timer); 1198 hrtimer_cancel(&ts->sched_timer);
1201 # endif 1199 # endif
1202 1200
1203 memset(ts, 0, sizeof(*ts)); 1201 memset(ts, 0, sizeof(*ts));
1204 } 1202 }
1205 #endif 1203 #endif
1206 1204
1207 /** 1205 /**
1208 * Async notification about clocksource changes 1206 * Async notification about clocksource changes
1209 */ 1207 */
1210 void tick_clock_notify(void) 1208 void tick_clock_notify(void)
1211 { 1209 {
1212 int cpu; 1210 int cpu;
1213 1211
1214 for_each_possible_cpu(cpu) 1212 for_each_possible_cpu(cpu)
1215 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); 1213 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1216 } 1214 }
1217 1215
1218 /* 1216 /*
1219 * Async notification about clock event changes 1217 * Async notification about clock event changes
1220 */ 1218 */
1221 void tick_oneshot_notify(void) 1219 void tick_oneshot_notify(void)
1222 { 1220 {
1223 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1221 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1224 1222
1225 set_bit(0, &ts->check_clocks); 1223 set_bit(0, &ts->check_clocks);
1226 } 1224 }
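tick_clock_notify() and tick_oneshot_notify() are the producer side of a simple lock-free handshake: they set bit 0 of check_clocks (on every possible CPU, or only the local one, respectively), and tick_check_oneshot_change() below consumes the bit with test_and_clear_bit() so each notification is acted on at most once. A minimal generic sketch of that idiom, with made-up demo_* names for illustration:

#include <linux/bitops.h>
#include <linux/types.h>

static unsigned long pending_flags;     /* stand-in for ts->check_clocks */

/* Producer: may run in any context; atomically marks pending work. */
static void demo_notify_change(void)
{
        set_bit(0, &pending_flags);
}

/* Consumer: returns true for exactly one caller per notification. */
static bool demo_consume_change(void)
{
        return test_and_clear_bit(0, &pending_flags);
}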
1227 1225
1228 /** 1226 /**
1229 * Check whether a change happened which makes oneshot mode possible. 1227 * Check whether a change happened which makes oneshot mode possible.
1230 * 1228 *
1231 * Called cyclically from the hrtimer softirq (driven by the timer 1229 * Called cyclically from the hrtimer softirq (driven by the timer
1232 * softirq). allow_nohz signals that we can switch into low-res nohz 1230 * softirq). allow_nohz signals that we can switch into low-res nohz
1233 * mode, because high resolution timers are disabled (either at 1231 * mode, because high resolution timers are disabled (either at
1234 * compile time or at runtime). 1232 * compile time or at runtime).
1235 */ 1233 */
1236 int tick_check_oneshot_change(int allow_nohz) 1234 int tick_check_oneshot_change(int allow_nohz)
1237 { 1235 {
1238 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1236 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1239 1237
1240 if (!test_and_clear_bit(0, &ts->check_clocks)) 1238 if (!test_and_clear_bit(0, &ts->check_clocks))
1241 return 0; 1239 return 0;
1242 1240
1243 if (ts->nohz_mode != NOHZ_MODE_INACTIVE) 1241 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
1244 return 0; 1242 return 0;
1245 1243
1246 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) 1244 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1247 return 0; 1245 return 0;
1248 1246
1249 if (!allow_nohz) 1247 if (!allow_nohz)
1250 return 1; 1248 return 1;
1251 1249
1252 tick_nohz_switch_to_nohz(); 1250 tick_nohz_switch_to_nohz();
1253 return 0; 1251 return 0;
1254 } 1252 }
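The return value of tick_check_oneshot_change() encodes a hand-off: 0 means there is nothing for the caller to do (no pending change, oneshot not possible, or the function already switched this CPU to low-res nohz itself), while 1 means oneshot is possible but nohz alone was not allowed, so the caller should switch to high resolution mode instead. A hedged sketch of the caller side, modelled on the hrtimer softirq path of this era; the exact call site may differ between kernel versions:

/*
 * Caller-side contract, roughly as used from the hrtimer code:
 *
 *      if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
 *              hrtimer_switch_to_hres();
 *
 * - allow_nohz == 1: a valid change makes tick_nohz_switch_to_nohz()
 *   run here and 0 is returned.
 * - allow_nohz == 0: 1 is returned and the caller performs the switch
 *   to high resolution mode itself.
 */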
1255 1253