Commit 4bb9374e0bd40d8fe97860ea0d61a0330b7c3925
Exists in ti-lsk-linux-4.1.y and in 10 other branches
Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull NOHZ update from Thomas Gleixner:

 "Remove the call into the nohz idle code from the fake 'idle' thread in
  the powerclamp driver along with the export of those functions, which
  was smuggled in via the thermal tree. People have tried to hack around
  it in the nohz core code, but it just violates all rightful assumptions
  of that code about the only valid calling context (i.e. the proper idle
  task). The powerclamp trainwreck will still work, it just won't get the
  benefit of long idle sleeps."

* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  tick/powerclamp: Remove tick_nohz_idle abuse
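For reference, the invalid pattern this merge removes looks like this (condensed from clamp_thread() in the powerclamp diff below; not a complete function, just the shape of the abuse):

	/* a SCHED_FIFO kthread pretending to be the idle task */
	preempt_disable();
	tick_nohz_idle_enter();		/* invalid: current is not the idle task */
	/* mwait until target jiffies is reached */
	while (time_before(jiffies, target_jiffies))
		mwait_idle_with_hints(eax, ecx);
	tick_nohz_idle_exit();
	preempt_enable();

The nohz core assumes it is only ever entered from the per-cpu idle task; a SCHED_FIFO kthread is not that context, which is why the calls (and the exports backing them) are removed.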
Showing 2 changed files, inline diff (lines removed by this commit are prefixed with '-'):
drivers/thermal/intel_powerclamp.c
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *	Arjan van de Ven <arjan@linux.intel.com>
 *	Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 * TODO:
 *	1. better handle wakeup from external interrupts, currently a fixed
 *	   compensation is added to clamping duration when excessive amount
 *	   of wakeups are observed during idle time. the reason is that in
 *	   case of external interrupts without need for ack, clamping down
 *	   cpu in non-irq context does not reduce irq. for majority of the
 *	   cases, clamping down cpu does help reduce irq as well, we should
 *	   be able to differenciate the two cases and give a quantitative
 *	   solution for the irqs that we can control. perhaps based on
 *	   get_cpu_iowait_time_us()
 *
 *	2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration, driver adjust sleep time to meet target
 * idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping thread
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * gets incremented each time a clamping
				    * period is completed without extra wakeups
				    * once that counter is reached given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

static bool has_pkg_state_counter(void)
{
	u64 tmp;
	return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached or
	 * there are too many wakeups during the last idle injection period, we
	 * cannot trust the data for compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measure later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (true == clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure user selected ratio does not take effect until
		 * the next round. adjust target_ratio if user has changed
		 * target such that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different ability to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only elected controlling cpu can collect stats and update
		 * control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
						guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop tick sched during idle time, interrupts are still
		 * allowed. thus jiffies are updated properly.
		 */
		preempt_disable();
-		tick_nohz_idle_enter();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
-		tick_nohz_idle_exit();
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* check if pkg cstate counter is completely 0, abort in this case */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give per cpu clamping threads
	 * sometime to exit, or gets killed later.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (false == clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device*/
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
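For contrast, the one valid calling context the commit message refers to is the per-cpu idle task itself. Roughly, for kernels of this vintage (condensed from kernel/sched/idle.c; illustrative, not verbatim):

	static void cpu_idle_loop(void)
	{
		while (1) {
			tick_nohz_idle_enter();	/* valid: current IS the idle task */
			while (!need_resched())
				arch_cpu_idle();	/* or cpuidle_idle_call() */
			tick_nohz_idle_exit();
			schedule_preempt_disabled();
		}
	}

Here the tick is stopped only while the real idle task owns the cpu, so the nohz bookkeeping stays consistent with the scheduler's view.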
kernel/time/tick-sched.c
1 | /* | 1 | /* |
2 | * linux/kernel/time/tick-sched.c | 2 | * linux/kernel/time/tick-sched.c |
3 | * | 3 | * |
4 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | 5 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar |
6 | * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner | 6 | * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner |
7 | * | 7 | * |
8 | * No idle tick implementation for low and high resolution timers | 8 | * No idle tick implementation for low and high resolution timers |
9 | * | 9 | * |
10 | * Started by: Thomas Gleixner and Ingo Molnar | 10 | * Started by: Thomas Gleixner and Ingo Molnar |
11 | * | 11 | * |
12 | * Distribute under GPLv2. | 12 | * Distribute under GPLv2. |
13 | */ | 13 | */ |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
17 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/kernel_stat.h> | 18 | #include <linux/kernel_stat.h> |
19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
20 | #include <linux/profile.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | 24 | #include <linux/posix-timers.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/context_tracking.h> | 26 | #include <linux/context_tracking.h> |
27 | 27 | ||
28 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
29 | 29 | ||
30 | #include "tick-internal.h" | 30 | #include "tick-internal.h" |
31 | 31 | ||
32 | #include <trace/events/timer.h> | 32 | #include <trace/events/timer.h> |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure |
36 | */ | 36 | */ |
37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
41 | */ | 41 | */ |
42 | static ktime_t last_jiffies_update; | 42 | static ktime_t last_jiffies_update; |
43 | 43 | ||
44 | struct tick_sched *tick_get_tick_sched(int cpu) | 44 | struct tick_sched *tick_get_tick_sched(int cpu) |
45 | { | 45 | { |
46 | return &per_cpu(tick_cpu_sched, cpu); | 46 | return &per_cpu(tick_cpu_sched, cpu); |
47 | } | 47 | } |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * Must be called with interrupts disabled! | 50 | * Must be called with interrupts disabled! |
51 | */ | 51 | */ |
52 | static void tick_do_update_jiffies64(ktime_t now) | 52 | static void tick_do_update_jiffies64(ktime_t now) |
53 | { | 53 | { |
54 | unsigned long ticks = 0; | 54 | unsigned long ticks = 0; |
55 | ktime_t delta; | 55 | ktime_t delta; |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * Do a quick check without holding jiffies_lock: | 58 | * Do a quick check without holding jiffies_lock: |
59 | */ | 59 | */ |
60 | delta = ktime_sub(now, last_jiffies_update); | 60 | delta = ktime_sub(now, last_jiffies_update); |
61 | if (delta.tv64 < tick_period.tv64) | 61 | if (delta.tv64 < tick_period.tv64) |
62 | return; | 62 | return; |
63 | 63 | ||
64 | /* Re-evaluate with jiffies_lock held */ | 64 | /* Re-evaluate with jiffies_lock held */ |
65 | write_seqlock(&jiffies_lock); | 65 | write_seqlock(&jiffies_lock); |
66 | 66 | ||
67 | delta = ktime_sub(now, last_jiffies_update); | 67 | delta = ktime_sub(now, last_jiffies_update); |
68 | if (delta.tv64 >= tick_period.tv64) { | 68 | if (delta.tv64 >= tick_period.tv64) { |
69 | 69 | ||
70 | delta = ktime_sub(delta, tick_period); | 70 | delta = ktime_sub(delta, tick_period); |
71 | last_jiffies_update = ktime_add(last_jiffies_update, | 71 | last_jiffies_update = ktime_add(last_jiffies_update, |
72 | tick_period); | 72 | tick_period); |
73 | 73 | ||
74 | /* Slow path for long timeouts */ | 74 | /* Slow path for long timeouts */ |
75 | if (unlikely(delta.tv64 >= tick_period.tv64)) { | 75 | if (unlikely(delta.tv64 >= tick_period.tv64)) { |
76 | s64 incr = ktime_to_ns(tick_period); | 76 | s64 incr = ktime_to_ns(tick_period); |
77 | 77 | ||
78 | ticks = ktime_divns(delta, incr); | 78 | ticks = ktime_divns(delta, incr); |
79 | 79 | ||
80 | last_jiffies_update = ktime_add_ns(last_jiffies_update, | 80 | last_jiffies_update = ktime_add_ns(last_jiffies_update, |
81 | incr * ticks); | 81 | incr * ticks); |
82 | } | 82 | } |
83 | do_timer(++ticks); | 83 | do_timer(++ticks); |
84 | 84 | ||
85 | /* Keep the tick_next_period variable up to date */ | 85 | /* Keep the tick_next_period variable up to date */ |
86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
87 | } else { | 87 | } else { |
88 | write_sequnlock(&jiffies_lock); | 88 | write_sequnlock(&jiffies_lock); |
89 | return; | 89 | return; |
90 | } | 90 | } |
91 | write_sequnlock(&jiffies_lock); | 91 | write_sequnlock(&jiffies_lock); |
92 | update_wall_time(); | 92 | update_wall_time(); |
93 | } | 93 | } |
94 | 94 | ||
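tick_do_update_jiffies64() above is a textbook optimistic check/re-check: a lockless fast-path test filters out callers for whom no full tick period has elapsed, and only the survivors take jiffies_lock and re-evaluate, since another CPU may have done the update in the meantime. A minimal sketch of that skeleton (illustrative names only, not part of this file):

	static void maybe_update(ktime_t now)
	{
		/* Lockless fast path: nothing to do within one period */
		if (ktime_sub(now, last_jiffies_update).tv64 < tick_period.tv64)
			return;

		/* Another CPU may have raced us; re-check under the lock */
		write_seqlock(&jiffies_lock);
		if (ktime_sub(now, last_jiffies_update).tv64 >= tick_period.tv64) {
			/* ... advance jiffies here ... */
		}
		write_sequnlock(&jiffies_lock);
	}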
95 | /* | 95 | /* |
96 | * Initialize and return the jiffies update. | 96 | * Initialize and return the jiffies update. |
97 | */ | 97 | */ |
98 | static ktime_t tick_init_jiffy_update(void) | 98 | static ktime_t tick_init_jiffy_update(void) |
99 | { | 99 | { |
100 | ktime_t period; | 100 | ktime_t period; |
101 | 101 | ||
102 | write_seqlock(&jiffies_lock); | 102 | write_seqlock(&jiffies_lock); |
103 | /* Did we start the jiffies update yet? */ | 103 | /* Did we start the jiffies update yet? */ |
104 | if (last_jiffies_update.tv64 == 0) | 104 | if (last_jiffies_update.tv64 == 0) |
105 | last_jiffies_update = tick_next_period; | 105 | last_jiffies_update = tick_next_period; |
106 | period = last_jiffies_update; | 106 | period = last_jiffies_update; |
107 | write_sequnlock(&jiffies_lock); | 107 | write_sequnlock(&jiffies_lock); |
108 | return period; | 108 | return period; |
109 | } | 109 | } |
110 | 110 | ||
111 | 111 | ||
112 | static void tick_sched_do_timer(ktime_t now) | 112 | static void tick_sched_do_timer(ktime_t now) |
113 | { | 113 | { |
114 | int cpu = smp_processor_id(); | 114 | int cpu = smp_processor_id(); |
115 | 115 | ||
116 | #ifdef CONFIG_NO_HZ_COMMON | 116 | #ifdef CONFIG_NO_HZ_COMMON |
117 | /* | 117 | /* |
118 | * Check if the do_timer duty was dropped. We don't care about | 118 | * Check if the do_timer duty was dropped. We don't care about |
119 | * concurrency: This happens only when the cpu in charge went | 119 | * concurrency: This happens only when the cpu in charge went |
120 | * into a long sleep. If two cpus happen to assign themselves to | 120 | * into a long sleep. If two cpus happen to assign themselves to |
121 | * this duty, then the jiffies update is still serialized by | 121 | * this duty, then the jiffies update is still serialized by |
122 | * jiffies_lock. | 122 | * jiffies_lock. |
123 | */ | 123 | */ |
124 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) | 124 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) |
125 | && !tick_nohz_full_cpu(cpu)) | 125 | && !tick_nohz_full_cpu(cpu)) |
126 | tick_do_timer_cpu = cpu; | 126 | tick_do_timer_cpu = cpu; |
127 | #endif | 127 | #endif |
128 | 128 | ||
129 | /* Check if the jiffies need an update */ | 129 | /* Check if the jiffies need an update */ |
130 | if (tick_do_timer_cpu == cpu) | 130 | if (tick_do_timer_cpu == cpu) |
131 | tick_do_update_jiffies64(now); | 131 | tick_do_update_jiffies64(now); |
132 | } | 132 | } |
133 | 133 | ||
134 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | 134 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) |
135 | { | 135 | { |
136 | #ifdef CONFIG_NO_HZ_COMMON | 136 | #ifdef CONFIG_NO_HZ_COMMON |
137 | /* | 137 | /* |
138 | * When we are idle and the tick is stopped, we have to touch | 138 | * When we are idle and the tick is stopped, we have to touch |
139 | * the watchdog as we might not schedule for a really long | 139 | * the watchdog as we might not schedule for a really long |
140 | * time. This happens on complete idle SMP systems while | 140 | * time. This happens on complete idle SMP systems while |
141 | * waiting on the login prompt. We also increment the "start of | 141 | * waiting on the login prompt. We also increment the "start of |
142 | * idle" jiffy stamp so the idle accounting adjustment we do | 142 | * idle" jiffy stamp so the idle accounting adjustment we do |
143 | * when we go busy again does not account too many ticks. | 143 | * when we go busy again does not account too many ticks. |
144 | */ | 144 | */ |
145 | if (ts->tick_stopped) { | 145 | if (ts->tick_stopped) { |
146 | touch_softlockup_watchdog(); | 146 | touch_softlockup_watchdog(); |
147 | if (is_idle_task(current)) | 147 | if (is_idle_task(current)) |
148 | ts->idle_jiffies++; | 148 | ts->idle_jiffies++; |
149 | } | 149 | } |
150 | #endif | 150 | #endif |
151 | update_process_times(user_mode(regs)); | 151 | update_process_times(user_mode(regs)); |
152 | profile_tick(CPU_PROFILING); | 152 | profile_tick(CPU_PROFILING); |
153 | } | 153 | } |
154 | 154 | ||
155 | #ifdef CONFIG_NO_HZ_FULL | 155 | #ifdef CONFIG_NO_HZ_FULL |
156 | cpumask_var_t tick_nohz_full_mask; | 156 | cpumask_var_t tick_nohz_full_mask; |
157 | cpumask_var_t housekeeping_mask; | 157 | cpumask_var_t housekeeping_mask; |
158 | bool tick_nohz_full_running; | 158 | bool tick_nohz_full_running; |
159 | 159 | ||
160 | static bool can_stop_full_tick(void) | 160 | static bool can_stop_full_tick(void) |
161 | { | 161 | { |
162 | WARN_ON_ONCE(!irqs_disabled()); | 162 | WARN_ON_ONCE(!irqs_disabled()); |
163 | 163 | ||
164 | if (!sched_can_stop_tick()) { | 164 | if (!sched_can_stop_tick()) { |
165 | trace_tick_stop(0, "more than 1 task in runqueue\n"); | 165 | trace_tick_stop(0, "more than 1 task in runqueue\n"); |
166 | return false; | 166 | return false; |
167 | } | 167 | } |
168 | 168 | ||
169 | if (!posix_cpu_timers_can_stop_tick(current)) { | 169 | if (!posix_cpu_timers_can_stop_tick(current)) { |
170 | trace_tick_stop(0, "posix timers running\n"); | 170 | trace_tick_stop(0, "posix timers running\n"); |
171 | return false; | 171 | return false; |
172 | } | 172 | } |
173 | 173 | ||
174 | if (!perf_event_can_stop_tick()) { | 174 | if (!perf_event_can_stop_tick()) { |
175 | trace_tick_stop(0, "perf events running\n"); | 175 | trace_tick_stop(0, "perf events running\n"); |
176 | return false; | 176 | return false; |
177 | } | 177 | } |
178 | 178 | ||
179 | /* sched_clock_tick() needs us? */ | 179 | /* sched_clock_tick() needs us? */ |
180 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 180 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
181 | /* | 181 | /* |
182 | * TODO: kick full dynticks CPUs when | 182 | * TODO: kick full dynticks CPUs when |
183 | * sched_clock_stable is set. | 183 | * sched_clock_stable is set. |
184 | */ | 184 | */ |
185 | if (!sched_clock_stable()) { | 185 | if (!sched_clock_stable()) { |
186 | trace_tick_stop(0, "unstable sched clock\n"); | 186 | trace_tick_stop(0, "unstable sched clock\n"); |
187 | /* | 187 | /* |
188 | * Don't allow the user to think they can get | 188 | * Don't allow the user to think they can get |
189 | * full NO_HZ with this machine. | 189 | * full NO_HZ with this machine. |
190 | */ | 190 | */ |
191 | WARN_ONCE(tick_nohz_full_running, | 191 | WARN_ONCE(tick_nohz_full_running, |
192 | "NO_HZ FULL will not work with unstable sched clock"); | 192 | "NO_HZ FULL will not work with unstable sched clock"); |
193 | return false; | 193 | return false; |
194 | } | 194 | } |
195 | #endif | 195 | #endif |
196 | 196 | ||
197 | return true; | 197 | return true; |
198 | } | 198 | } |
199 | 199 | ||
200 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | 200 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * Re-evaluate the need for the tick on the current CPU | 203 | * Re-evaluate the need for the tick on the current CPU |
204 | * and restart it if necessary. | 204 | * and restart it if necessary. |
205 | */ | 205 | */ |
206 | void __tick_nohz_full_check(void) | 206 | void __tick_nohz_full_check(void) |
207 | { | 207 | { |
208 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | 208 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
209 | 209 | ||
210 | if (tick_nohz_full_cpu(smp_processor_id())) { | 210 | if (tick_nohz_full_cpu(smp_processor_id())) { |
211 | if (ts->tick_stopped && !is_idle_task(current)) { | 211 | if (ts->tick_stopped && !is_idle_task(current)) { |
212 | if (!can_stop_full_tick()) | 212 | if (!can_stop_full_tick()) |
213 | tick_nohz_restart_sched_tick(ts, ktime_get()); | 213 | tick_nohz_restart_sched_tick(ts, ktime_get()); |
214 | } | 214 | } |
215 | } | 215 | } |
216 | } | 216 | } |
217 | 217 | ||
218 | static void nohz_full_kick_work_func(struct irq_work *work) | 218 | static void nohz_full_kick_work_func(struct irq_work *work) |
219 | { | 219 | { |
220 | __tick_nohz_full_check(); | 220 | __tick_nohz_full_check(); |
221 | } | 221 | } |
222 | 222 | ||
223 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 223 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
224 | .func = nohz_full_kick_work_func, | 224 | .func = nohz_full_kick_work_func, |
225 | }; | 225 | }; |
226 | 226 | ||
227 | /* | 227 | /* |
228 | * Kick this CPU if it's full dynticks in order to force it to | 228 | * Kick this CPU if it's full dynticks in order to force it to |
229 | * re-evaluate its dependency on the tick and restart it if necessary. | 229 | * re-evaluate its dependency on the tick and restart it if necessary. |
230 | * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), | 230 | * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), |
231 | * is NMI safe. | 231 | * is NMI safe. |
232 | */ | 232 | */ |
233 | void tick_nohz_full_kick(void) | 233 | void tick_nohz_full_kick(void) |
234 | { | 234 | { |
235 | if (!tick_nohz_full_cpu(smp_processor_id())) | 235 | if (!tick_nohz_full_cpu(smp_processor_id())) |
236 | return; | 236 | return; |
237 | 237 | ||
238 | irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); | 238 | irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); |
239 | } | 239 | } |
240 | 240 | ||
241 | /* | 241 | /* |
242 | * Kick the CPU if it's full dynticks in order to force it to | 242 | * Kick the CPU if it's full dynticks in order to force it to |
243 | * re-evaluate its dependency on the tick and restart it if necessary. | 243 | * re-evaluate its dependency on the tick and restart it if necessary. |
244 | */ | 244 | */ |
245 | void tick_nohz_full_kick_cpu(int cpu) | 245 | void tick_nohz_full_kick_cpu(int cpu) |
246 | { | 246 | { |
247 | if (!tick_nohz_full_cpu(cpu)) | 247 | if (!tick_nohz_full_cpu(cpu)) |
248 | return; | 248 | return; |
249 | 249 | ||
250 | irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); | 250 | irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); |
251 | } | 251 | } |
252 | 252 | ||
253 | static void nohz_full_kick_ipi(void *info) | 253 | static void nohz_full_kick_ipi(void *info) |
254 | { | 254 | { |
255 | __tick_nohz_full_check(); | 255 | __tick_nohz_full_check(); |
256 | } | 256 | } |
257 | 257 | ||
258 | /* | 258 | /* |
259 | * Kick all full dynticks CPUs in order to force these to re-evaluate | 259 | * Kick all full dynticks CPUs in order to force these to re-evaluate |
260 | * their dependency on the tick and restart it if necessary. | 260 | * their dependency on the tick and restart it if necessary. |
261 | */ | 261 | */ |
262 | void tick_nohz_full_kick_all(void) | 262 | void tick_nohz_full_kick_all(void) |
263 | { | 263 | { |
264 | if (!tick_nohz_full_running) | 264 | if (!tick_nohz_full_running) |
265 | return; | 265 | return; |
266 | 266 | ||
267 | preempt_disable(); | 267 | preempt_disable(); |
268 | smp_call_function_many(tick_nohz_full_mask, | 268 | smp_call_function_many(tick_nohz_full_mask, |
269 | nohz_full_kick_ipi, NULL, false); | 269 | nohz_full_kick_ipi, NULL, false); |
270 | tick_nohz_full_kick(); | 270 | tick_nohz_full_kick(); |
271 | preempt_enable(); | 271 | preempt_enable(); |
272 | } | 272 | } |
273 | 273 | ||
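The three kick variants above all funnel into the same deferral mechanism: instead of re-evaluating the tick in whatever (possibly NMI) context the caller happens to be in, they queue an irq_work, which raises a self-IPI and runs the callback from a sane hard-interrupt context. A hedged sketch of that pattern with hypothetical names:

	#include <linux/irq_work.h>

	static void my_reeval_fn(struct irq_work *work)
	{
		/* Runs later, in hard interrupt context on this CPU */
	}

	static DEFINE_PER_CPU(struct irq_work, my_reeval) = {
		.func = my_reeval_fn,
	};

	static void my_kick(void)
	{
		/* Safe even from NMI: only queues the work and self-IPIs */
		irq_work_queue(this_cpu_ptr(&my_reeval));
	}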
274 | /* | 274 | /* |
275 | * Re-evaluate the need for the tick as we switch the current task. | 275 | * Re-evaluate the need for the tick as we switch the current task. |
276 | * It might need the tick due to per task/process properties: | 276 | * It might need the tick due to per task/process properties: |
277 | * perf events, posix cpu timers, ... | 277 | * perf events, posix cpu timers, ... |
278 | */ | 278 | */ |
279 | void __tick_nohz_task_switch(struct task_struct *tsk) | 279 | void __tick_nohz_task_switch(struct task_struct *tsk) |
280 | { | 280 | { |
281 | unsigned long flags; | 281 | unsigned long flags; |
282 | 282 | ||
283 | local_irq_save(flags); | 283 | local_irq_save(flags); |
284 | 284 | ||
285 | if (!tick_nohz_full_cpu(smp_processor_id())) | 285 | if (!tick_nohz_full_cpu(smp_processor_id())) |
286 | goto out; | 286 | goto out; |
287 | 287 | ||
288 | if (tick_nohz_tick_stopped() && !can_stop_full_tick()) | 288 | if (tick_nohz_tick_stopped() && !can_stop_full_tick()) |
289 | tick_nohz_full_kick(); | 289 | tick_nohz_full_kick(); |
290 | 290 | ||
291 | out: | 291 | out: |
292 | local_irq_restore(flags); | 292 | local_irq_restore(flags); |
293 | } | 293 | } |
294 | 294 | ||
295 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 295 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
296 | static int __init tick_nohz_full_setup(char *str) | 296 | static int __init tick_nohz_full_setup(char *str) |
297 | { | 297 | { |
298 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); | 298 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
299 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { | 299 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
300 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 300 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
301 | free_bootmem_cpumask_var(tick_nohz_full_mask); | 301 | free_bootmem_cpumask_var(tick_nohz_full_mask); |
302 | return 1; | 302 | return 1; |
303 | } | 303 | } |
304 | tick_nohz_full_running = true; | 304 | tick_nohz_full_running = true; |
305 | 305 | ||
306 | return 1; | 306 | return 1; |
307 | } | 307 | } |
308 | __setup("nohz_full=", tick_nohz_full_setup); | 308 | __setup("nohz_full=", tick_nohz_full_setup); |
309 | 309 | ||
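Since the mask goes through cpulist_parse(), the parameter accepts the usual CPU-list syntax. For example (an illustrative command line, assuming an 8-CPU box):

	nohz_full=1-7

would request full dynticks on CPUs 1-7; note that tick_nohz_init() below may still clear the boot CPU from the range so that one CPU keeps the timekeeping duty.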
310 | static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, | 310 | static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, |
311 | unsigned long action, | 311 | unsigned long action, |
312 | void *hcpu) | 312 | void *hcpu) |
313 | { | 313 | { |
314 | unsigned int cpu = (unsigned long)hcpu; | 314 | unsigned int cpu = (unsigned long)hcpu; |
315 | 315 | ||
316 | switch (action & ~CPU_TASKS_FROZEN) { | 316 | switch (action & ~CPU_TASKS_FROZEN) { |
317 | case CPU_DOWN_PREPARE: | 317 | case CPU_DOWN_PREPARE: |
318 | /* | 318 | /* |
319 | * If we handle the timekeeping duty for full dynticks CPUs, | 319 | * If we handle the timekeeping duty for full dynticks CPUs, |
320 | * we can't safely shutdown that CPU. | 320 | * we can't safely shutdown that CPU. |
321 | */ | 321 | */ |
322 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) | 322 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
323 | return NOTIFY_BAD; | 323 | return NOTIFY_BAD; |
324 | break; | 324 | break; |
325 | } | 325 | } |
326 | return NOTIFY_OK; | 326 | return NOTIFY_OK; |
327 | } | 327 | } |
328 | 328 | ||
329 | /* | 329 | /* |
330 | * The worst case string is a CPU list with 2-step separations | 330 | * The worst case string is a CPU list with 2-step separations |
331 | * (0,2,4,6,...), which amounts to roughly one character per CPU: | 331 | * (0,2,4,6,...), which amounts to roughly one character per CPU: |
332 | * NR_CPUS + sizeof('\0') bytes. | 332 | * NR_CPUS + sizeof('\0') bytes. |
333 | */ | 333 | */ |
334 | static char __initdata nohz_full_buf[NR_CPUS + 1]; | 334 | static char __initdata nohz_full_buf[NR_CPUS + 1]; |
335 | 335 | ||
336 | static int tick_nohz_init_all(void) | 336 | static int tick_nohz_init_all(void) |
337 | { | 337 | { |
338 | int err = -1; | 338 | int err = -1; |
339 | 339 | ||
340 | #ifdef CONFIG_NO_HZ_FULL_ALL | 340 | #ifdef CONFIG_NO_HZ_FULL_ALL |
341 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { | 341 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
342 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); | 342 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); |
343 | return err; | 343 | return err; |
344 | } | 344 | } |
345 | err = 0; | 345 | err = 0; |
346 | cpumask_setall(tick_nohz_full_mask); | 346 | cpumask_setall(tick_nohz_full_mask); |
347 | tick_nohz_full_running = true; | 347 | tick_nohz_full_running = true; |
348 | #endif | 348 | #endif |
349 | return err; | 349 | return err; |
350 | } | 350 | } |
351 | 351 | ||
352 | void __init tick_nohz_init(void) | 352 | void __init tick_nohz_init(void) |
353 | { | 353 | { |
354 | int cpu; | 354 | int cpu; |
355 | 355 | ||
356 | if (!tick_nohz_full_running) { | 356 | if (!tick_nohz_full_running) { |
357 | if (tick_nohz_init_all() < 0) | 357 | if (tick_nohz_init_all() < 0) |
358 | return; | 358 | return; |
359 | } | 359 | } |
360 | 360 | ||
361 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | 361 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { |
362 | WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); | 362 | WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); |
363 | cpumask_clear(tick_nohz_full_mask); | 363 | cpumask_clear(tick_nohz_full_mask); |
364 | tick_nohz_full_running = false; | 364 | tick_nohz_full_running = false; |
365 | return; | 365 | return; |
366 | } | 366 | } |
367 | 367 | ||
368 | /* | 368 | /* |
369 | * Full dynticks uses irq work to drive the tick rescheduling on safe | 369 | * Full dynticks uses irq work to drive the tick rescheduling on safe |
370 | * locking contexts. But then we need irq work to raise its own | 370 | * locking contexts. But then we need irq work to raise its own |
371 | * interrupts to avoid circular dependency on the tick | 371 | * interrupts to avoid circular dependency on the tick |
372 | */ | 372 | */ |
373 | if (!arch_irq_work_has_interrupt()) { | 373 | if (!arch_irq_work_has_interrupt()) { |
374 | pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " | 374 | pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " |
375 | "support irq work self-IPIs\n"); | 375 | "support irq work self-IPIs\n"); |
376 | cpumask_clear(tick_nohz_full_mask); | 376 | cpumask_clear(tick_nohz_full_mask); |
377 | cpumask_copy(housekeeping_mask, cpu_possible_mask); | 377 | cpumask_copy(housekeeping_mask, cpu_possible_mask); |
378 | tick_nohz_full_running = false; | 378 | tick_nohz_full_running = false; |
379 | return; | 379 | return; |
380 | } | 380 | } |
381 | 381 | ||
382 | cpu = smp_processor_id(); | 382 | cpu = smp_processor_id(); |
383 | 383 | ||
384 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | 384 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
385 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 385 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
386 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | 386 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
387 | } | 387 | } |
388 | 388 | ||
389 | cpumask_andnot(housekeeping_mask, | 389 | cpumask_andnot(housekeeping_mask, |
390 | cpu_possible_mask, tick_nohz_full_mask); | 390 | cpu_possible_mask, tick_nohz_full_mask); |
391 | 391 | ||
392 | for_each_cpu(cpu, tick_nohz_full_mask) | 392 | for_each_cpu(cpu, tick_nohz_full_mask) |
393 | context_tracking_cpu_set(cpu); | 393 | context_tracking_cpu_set(cpu); |
394 | 394 | ||
395 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 395 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
396 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); | 396 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); |
397 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 397 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); |
398 | } | 398 | } |
399 | #endif | 399 | #endif |
400 | 400 | ||
401 | /* | 401 | /* |
402 | * NOHZ - aka dynamic tick functionality | 402 | * NOHZ - aka dynamic tick functionality |
403 | */ | 403 | */ |
404 | #ifdef CONFIG_NO_HZ_COMMON | 404 | #ifdef CONFIG_NO_HZ_COMMON |
405 | /* | 405 | /* |
406 | * NO HZ enabled? | 406 | * NO HZ enabled? |
407 | */ | 407 | */ |
408 | static int tick_nohz_enabled __read_mostly = 1; | 408 | static int tick_nohz_enabled __read_mostly = 1; |
409 | int tick_nohz_active __read_mostly; | 409 | int tick_nohz_active __read_mostly; |
410 | /* | 410 | /* |
411 | * Enable / Disable tickless mode | 411 | * Enable / Disable tickless mode |
412 | */ | 412 | */ |
413 | static int __init setup_tick_nohz(char *str) | 413 | static int __init setup_tick_nohz(char *str) |
414 | { | 414 | { |
415 | if (!strcmp(str, "off")) | 415 | if (!strcmp(str, "off")) |
416 | tick_nohz_enabled = 0; | 416 | tick_nohz_enabled = 0; |
417 | else if (!strcmp(str, "on")) | 417 | else if (!strcmp(str, "on")) |
418 | tick_nohz_enabled = 1; | 418 | tick_nohz_enabled = 1; |
419 | else | 419 | else |
420 | return 0; | 420 | return 0; |
421 | return 1; | 421 | return 1; |
422 | } | 422 | } |
423 | 423 | ||
424 | __setup("nohz=", setup_tick_nohz); | 424 | __setup("nohz=", setup_tick_nohz); |
425 | 425 | ||
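This is the classic idle-dynticks switch, distinct from nohz_full= above. For example, booting with:

	nohz=off

keeps the periodic tick running even in idle; anything other than "on" or "off" makes the handler return 0, so the parameter is reported as unrecognized.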
426 | /** | 426 | /** |
427 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 427 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
428 | * | 428 | * |
429 | * Called from interrupt entry when the CPU was idle | 429 | * Called from interrupt entry when the CPU was idle |
430 | * | 430 | * |
431 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies | 431 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies |
432 | * must be updated. Otherwise an interrupt handler could use a stale jiffy | 432 | * must be updated. Otherwise an interrupt handler could use a stale jiffy |
433 | * value. We do this unconditionally on any cpu, as we don't know whether the | 433 | * value. We do this unconditionally on any cpu, as we don't know whether the |
434 | * cpu which has the update task assigned is in a long sleep. | 434 | * cpu which has the update task assigned is in a long sleep. |
435 | */ | 435 | */ |
436 | static void tick_nohz_update_jiffies(ktime_t now) | 436 | static void tick_nohz_update_jiffies(ktime_t now) |
437 | { | 437 | { |
438 | unsigned long flags; | 438 | unsigned long flags; |
439 | 439 | ||
440 | __this_cpu_write(tick_cpu_sched.idle_waketime, now); | 440 | __this_cpu_write(tick_cpu_sched.idle_waketime, now); |
441 | 441 | ||
442 | local_irq_save(flags); | 442 | local_irq_save(flags); |
443 | tick_do_update_jiffies64(now); | 443 | tick_do_update_jiffies64(now); |
444 | local_irq_restore(flags); | 444 | local_irq_restore(flags); |
445 | 445 | ||
446 | touch_softlockup_watchdog(); | 446 | touch_softlockup_watchdog(); |
447 | } | 447 | } |
448 | 448 | ||
449 | /* | 449 | /* |
450 | * Updates the per cpu idle time statistics counters | 450 | * Updates the per cpu idle time statistics counters |
451 | */ | 451 | */ |
452 | static void | 452 | static void |
453 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) | 453 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) |
454 | { | 454 | { |
455 | ktime_t delta; | 455 | ktime_t delta; |
456 | 456 | ||
457 | if (ts->idle_active) { | 457 | if (ts->idle_active) { |
458 | delta = ktime_sub(now, ts->idle_entrytime); | 458 | delta = ktime_sub(now, ts->idle_entrytime); |
459 | if (nr_iowait_cpu(cpu) > 0) | 459 | if (nr_iowait_cpu(cpu) > 0) |
460 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); | 460 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); |
461 | else | 461 | else |
462 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | 462 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); |
463 | ts->idle_entrytime = now; | 463 | ts->idle_entrytime = now; |
464 | } | 464 | } |
465 | 465 | ||
466 | if (last_update_time) | 466 | if (last_update_time) |
467 | *last_update_time = ktime_to_us(now); | 467 | *last_update_time = ktime_to_us(now); |
468 | 468 | ||
469 | } | 469 | } |
470 | 470 | ||
471 | static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) | 471 | static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) |
472 | { | 472 | { |
473 | update_ts_time_stats(smp_processor_id(), ts, now, NULL); | 473 | update_ts_time_stats(smp_processor_id(), ts, now, NULL); |
474 | ts->idle_active = 0; | 474 | ts->idle_active = 0; |
475 | 475 | ||
476 | sched_clock_idle_wakeup_event(0); | 476 | sched_clock_idle_wakeup_event(0); |
477 | } | 477 | } |
478 | 478 | ||
479 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) | 479 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) |
480 | { | 480 | { |
481 | ktime_t now = ktime_get(); | 481 | ktime_t now = ktime_get(); |
482 | 482 | ||
483 | ts->idle_entrytime = now; | 483 | ts->idle_entrytime = now; |
484 | ts->idle_active = 1; | 484 | ts->idle_active = 1; |
485 | sched_clock_idle_sleep_event(); | 485 | sched_clock_idle_sleep_event(); |
486 | return now; | 486 | return now; |
487 | } | 487 | } |
488 | 488 | ||
489 | /** | 489 | /** |
490 | * get_cpu_idle_time_us - get the total idle time of a cpu | 490 | * get_cpu_idle_time_us - get the total idle time of a cpu |
491 | * @cpu: CPU number to query | 491 | * @cpu: CPU number to query |
492 | * @last_update_time: variable to store update time in. Do not update | 492 | * @last_update_time: variable to store update time in. Do not update |
493 | * counters if NULL. | 493 | * counters if NULL. |
494 | * | 494 | * |
495 | * Return the cumulative idle time (since boot) for a given | 495 | * Return the cumulative idle time (since boot) for a given |
496 | * CPU, in microseconds. | 496 | * CPU, in microseconds. |
497 | * | 497 | * |
498 | * This time is measured via accounting rather than sampling, | 498 | * This time is measured via accounting rather than sampling, |
499 | * and is as accurate as ktime_get() is. | 499 | * and is as accurate as ktime_get() is. |
500 | * | 500 | * |
501 | * This function returns -1 if NOHZ is not enabled. | 501 | * This function returns -1 if NOHZ is not enabled. |
502 | */ | 502 | */ |
503 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | 503 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) |
504 | { | 504 | { |
505 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 505 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
506 | ktime_t now, idle; | 506 | ktime_t now, idle; |
507 | 507 | ||
508 | if (!tick_nohz_active) | 508 | if (!tick_nohz_active) |
509 | return -1; | 509 | return -1; |
510 | 510 | ||
511 | now = ktime_get(); | 511 | now = ktime_get(); |
512 | if (last_update_time) { | 512 | if (last_update_time) { |
513 | update_ts_time_stats(cpu, ts, now, last_update_time); | 513 | update_ts_time_stats(cpu, ts, now, last_update_time); |
514 | idle = ts->idle_sleeptime; | 514 | idle = ts->idle_sleeptime; |
515 | } else { | 515 | } else { |
516 | if (ts->idle_active && !nr_iowait_cpu(cpu)) { | 516 | if (ts->idle_active && !nr_iowait_cpu(cpu)) { |
517 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | 517 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); |
518 | 518 | ||
519 | idle = ktime_add(ts->idle_sleeptime, delta); | 519 | idle = ktime_add(ts->idle_sleeptime, delta); |
520 | } else { | 520 | } else { |
521 | idle = ts->idle_sleeptime; | 521 | idle = ts->idle_sleeptime; |
522 | } | 522 | } |
523 | } | 523 | } |
524 | 524 | ||
525 | return ktime_to_us(idle); | 525 | return ktime_to_us(idle); |
526 | 526 | ||
527 | } | 527 | } |
528 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | 528 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); |
529 | 529 | ||
530 | /** | 530 | /** |
531 | * get_cpu_iowait_time_us - get the total iowait time of a cpu | 531 | * get_cpu_iowait_time_us - get the total iowait time of a cpu |
532 | * @cpu: CPU number to query | 532 | * @cpu: CPU number to query |
533 | * @last_update_time: variable to store update time in. Do not update | 533 | * @last_update_time: variable to store update time in. Do not update |
534 | * counters if NULL. | 534 | * counters if NULL. |
535 | * | 535 | * |
536 | * Return the cumulative iowait time (since boot) for a given | 536 | * Return the cumulative iowait time (since boot) for a given |
537 | * CPU, in microseconds. | 537 | * CPU, in microseconds. |
538 | * | 538 | * |
539 | * This time is measured via accounting rather than sampling, | 539 | * This time is measured via accounting rather than sampling, |
540 | * and is as accurate as ktime_get() is. | 540 | * and is as accurate as ktime_get() is. |
541 | * | 541 | * |
542 | * This function returns -1 if NOHZ is not enabled. | 542 | * This function returns -1 if NOHZ is not enabled. |
543 | */ | 543 | */ |
544 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | 544 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) |
545 | { | 545 | { |
546 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 546 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
547 | ktime_t now, iowait; | 547 | ktime_t now, iowait; |
548 | 548 | ||
549 | if (!tick_nohz_active) | 549 | if (!tick_nohz_active) |
550 | return -1; | 550 | return -1; |
551 | 551 | ||
552 | now = ktime_get(); | 552 | now = ktime_get(); |
553 | if (last_update_time) { | 553 | if (last_update_time) { |
554 | update_ts_time_stats(cpu, ts, now, last_update_time); | 554 | update_ts_time_stats(cpu, ts, now, last_update_time); |
555 | iowait = ts->iowait_sleeptime; | 555 | iowait = ts->iowait_sleeptime; |
556 | } else { | 556 | } else { |
557 | if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { | 557 | if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { |
558 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | 558 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); |
559 | 559 | ||
560 | iowait = ktime_add(ts->iowait_sleeptime, delta); | 560 | iowait = ktime_add(ts->iowait_sleeptime, delta); |
561 | } else { | 561 | } else { |
562 | iowait = ts->iowait_sleeptime; | 562 | iowait = ts->iowait_sleeptime; |
563 | } | 563 | } |
564 | } | 564 | } |
565 | 565 | ||
566 | return ktime_to_us(iowait); | 566 | return ktime_to_us(iowait); |
567 | } | 567 | } |
568 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 568 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
569 | 569 | ||
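Both accessors return microseconds and -1 when NOHZ is inactive, and passing a non-NULL last_update_time additionally folds the currently running idle period into the counters. A hedged sketch of a consumer (hypothetical function, kernel context assumed):

	static void sample_cpu(int cpu)
	{
		u64 wall_us, idle_us, iowait_us;

		idle_us = get_cpu_idle_time_us(cpu, &wall_us);
		iowait_us = get_cpu_iowait_time_us(cpu, NULL);
		if (idle_us == (u64)-1 || iowait_us == (u64)-1)
			return;	/* NOHZ not active, counters unusable */

		pr_info("cpu%d: idle %llu us, iowait %llu us at %llu\n",
			cpu, idle_us, iowait_us, wall_us);
	}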
570 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | 570 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, |
571 | ktime_t now, int cpu) | 571 | ktime_t now, int cpu) |
572 | { | 572 | { |
573 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 573 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
574 | ktime_t last_update, expires, ret = { .tv64 = 0 }; | 574 | ktime_t last_update, expires, ret = { .tv64 = 0 }; |
575 | unsigned long rcu_delta_jiffies; | 575 | unsigned long rcu_delta_jiffies; |
576 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 576 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
577 | u64 time_delta; | 577 | u64 time_delta; |
578 | 578 | ||
579 | time_delta = timekeeping_max_deferment(); | 579 | time_delta = timekeeping_max_deferment(); |
580 | 580 | ||
581 | /* Read jiffies and the time when jiffies were updated last */ | 581 | /* Read jiffies and the time when jiffies were updated last */ |
582 | do { | 582 | do { |
583 | seq = read_seqbegin(&jiffies_lock); | 583 | seq = read_seqbegin(&jiffies_lock); |
584 | last_update = last_jiffies_update; | 584 | last_update = last_jiffies_update; |
585 | last_jiffies = jiffies; | 585 | last_jiffies = jiffies; |
586 | } while (read_seqretry(&jiffies_lock, seq)); | 586 | } while (read_seqretry(&jiffies_lock, seq)); |
587 | 587 | ||
588 | if (rcu_needs_cpu(&rcu_delta_jiffies) || | 588 | if (rcu_needs_cpu(&rcu_delta_jiffies) || |
589 | arch_needs_cpu() || irq_work_needs_cpu()) { | 589 | arch_needs_cpu() || irq_work_needs_cpu()) { |
590 | next_jiffies = last_jiffies + 1; | 590 | next_jiffies = last_jiffies + 1; |
591 | delta_jiffies = 1; | 591 | delta_jiffies = 1; |
592 | } else { | 592 | } else { |
593 | /* Get the next timer wheel timer */ | 593 | /* Get the next timer wheel timer */ |
594 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 594 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
595 | delta_jiffies = next_jiffies - last_jiffies; | 595 | delta_jiffies = next_jiffies - last_jiffies; |
596 | if (rcu_delta_jiffies < delta_jiffies) { | 596 | if (rcu_delta_jiffies < delta_jiffies) { |
597 | next_jiffies = last_jiffies + rcu_delta_jiffies; | 597 | next_jiffies = last_jiffies + rcu_delta_jiffies; |
598 | delta_jiffies = rcu_delta_jiffies; | 598 | delta_jiffies = rcu_delta_jiffies; |
599 | } | 599 | } |
600 | } | 600 | } |
601 | 601 | ||
602 | /* | 602 | /* |
603 | * Do not stop the tick if we are only one off (or less) | 603 | * Do not stop the tick if we are only one off (or less) |
604 | * or if the cpu is required for RCU: | 604 | * or if the cpu is required for RCU: |
605 | */ | 605 | */ |
606 | if (!ts->tick_stopped && delta_jiffies <= 1) | 606 | if (!ts->tick_stopped && delta_jiffies <= 1) |
607 | goto out; | 607 | goto out; |
608 | 608 | ||
609 | /* Schedule the tick if we are at least one jiffy off */ | 609 | /* Schedule the tick if we are at least one jiffy off */ |
610 | if ((long)delta_jiffies >= 1) { | 610 | if ((long)delta_jiffies >= 1) { |
611 | 611 | ||
612 | /* | 612 | /* |
613 | * If this cpu is the one which updates jiffies, then | 613 | * If this cpu is the one which updates jiffies, then |
614 | * give up the assignment and let it be taken by the | 614 | * give up the assignment and let it be taken by the |
615 | * cpu which runs the tick timer next, which might be | 615 | * cpu which runs the tick timer next, which might be |
616 | * this cpu as well. If we don't drop this here the | 616 | * this cpu as well. If we don't drop this here the |
617 | * jiffies might be stale and do_timer() never | 617 | * jiffies might be stale and do_timer() never |
618 | * invoked. Keep track of the fact that it was the one | 618 | * invoked. Keep track of the fact that it was the one |
619 | * which had the do_timer() duty last. If this cpu is | 619 | * which had the do_timer() duty last. If this cpu is |
620 | * the one which had the do_timer() duty last, we | 620 | * the one which had the do_timer() duty last, we |
621 | * limit the sleep time to the timekeeping | 621 | * limit the sleep time to the timekeeping |
622 | * max_deferment value which we retrieved | 622 | * max_deferment value which we retrieved |
623 | * above. Otherwise we can sleep as long as we want. | 623 | * above. Otherwise we can sleep as long as we want. |
624 | */ | 624 | */ |
625 | if (cpu == tick_do_timer_cpu) { | 625 | if (cpu == tick_do_timer_cpu) { |
626 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | 626 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; |
627 | ts->do_timer_last = 1; | 627 | ts->do_timer_last = 1; |
628 | } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { | 628 | } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { |
629 | time_delta = KTIME_MAX; | 629 | time_delta = KTIME_MAX; |
630 | ts->do_timer_last = 0; | 630 | ts->do_timer_last = 0; |
631 | } else if (!ts->do_timer_last) { | 631 | } else if (!ts->do_timer_last) { |
632 | time_delta = KTIME_MAX; | 632 | time_delta = KTIME_MAX; |
633 | } | 633 | } |
634 | 634 | ||
635 | #ifdef CONFIG_NO_HZ_FULL | 635 | #ifdef CONFIG_NO_HZ_FULL |
636 | if (!ts->inidle) { | 636 | if (!ts->inidle) { |
637 | time_delta = min(time_delta, | 637 | time_delta = min(time_delta, |
638 | scheduler_tick_max_deferment()); | 638 | scheduler_tick_max_deferment()); |
639 | } | 639 | } |
640 | #endif | 640 | #endif |
641 | 641 | ||
642 | /* | 642 | /* |
643 | * calculate the expiry time for the next timer wheel | 643 | * calculate the expiry time for the next timer wheel |
644 | * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals | 644 | * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals |
645 | * that there is no timer pending or at least extremely | 645 | * that there is no timer pending or at least extremely |
646 | * far into the future (12 days for HZ=1000). In this | 646 | * far into the future (12 days for HZ=1000). In this |
647 | * case we set the expiry to the end of time. | 647 | * case we set the expiry to the end of time. |
648 | */ | 648 | */ |
649 | if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { | 649 | if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { |
650 | /* | 650 | /* |
651 | * Calculate the time delta for the next timer event. | 651 | * Calculate the time delta for the next timer event. |
652 | * If the time delta exceeds the maximum time delta | 652 | * If the time delta exceeds the maximum time delta |
653 | * permitted by the current clocksource then adjust | 653 | * permitted by the current clocksource then adjust |
654 | * the time delta accordingly to ensure the | 654 | * the time delta accordingly to ensure the |
655 | * clocksource does not wrap. | 655 | * clocksource does not wrap. |
656 | */ | 656 | */ |
657 | time_delta = min_t(u64, time_delta, | 657 | time_delta = min_t(u64, time_delta, |
658 | tick_period.tv64 * delta_jiffies); | 658 | tick_period.tv64 * delta_jiffies); |
659 | } | 659 | } |
660 | 660 | ||
661 | if (time_delta < KTIME_MAX) | 661 | if (time_delta < KTIME_MAX) |
662 | expires = ktime_add_ns(last_update, time_delta); | 662 | expires = ktime_add_ns(last_update, time_delta); |
663 | else | 663 | else |
664 | expires.tv64 = KTIME_MAX; | 664 | expires.tv64 = KTIME_MAX; |
665 | 665 | ||
666 | /* Skip reprogram of event if it's not changed */ | 666 | /* Skip reprogram of event if it's not changed */ |
667 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) | 667 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) |
668 | goto out; | 668 | goto out; |
669 | 669 | ||
670 | ret = expires; | 670 | ret = expires; |
671 | 671 | ||
672 | /* | 672 | /* |
673 | * nohz_stop_sched_tick can be called several times before | 673 | * nohz_stop_sched_tick can be called several times before |
674 | * the nohz_restart_sched_tick is called. This happens when | 674 | * the nohz_restart_sched_tick is called. This happens when |
675 | * interrupts arrive which do not cause a reschedule. In the | 675 | * interrupts arrive which do not cause a reschedule. In the |
676 | * first call we save the current tick time, so we can restart | 676 | * first call we save the current tick time, so we can restart |
677 | * the scheduler tick in nohz_restart_sched_tick. | 677 | * the scheduler tick in nohz_restart_sched_tick. |
678 | */ | 678 | */ |
679 | if (!ts->tick_stopped) { | 679 | if (!ts->tick_stopped) { |
680 | nohz_balance_enter_idle(cpu); | 680 | nohz_balance_enter_idle(cpu); |
681 | calc_load_enter_idle(); | 681 | calc_load_enter_idle(); |
682 | 682 | ||
683 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 683 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
684 | ts->tick_stopped = 1; | 684 | ts->tick_stopped = 1; |
685 | trace_tick_stop(1, " "); | 685 | trace_tick_stop(1, " "); |
686 | } | 686 | } |
687 | 687 | ||
688 | /* | 688 | /* |
689 | * If the expiration time == KTIME_MAX, then | 689 | * If the expiration time == KTIME_MAX, then |
690 | * in this case we simply stop the tick timer. | 690 | * in this case we simply stop the tick timer. |
691 | */ | 691 | */ |
692 | if (unlikely(expires.tv64 == KTIME_MAX)) { | 692 | if (unlikely(expires.tv64 == KTIME_MAX)) { |
693 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) | 693 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) |
694 | hrtimer_cancel(&ts->sched_timer); | 694 | hrtimer_cancel(&ts->sched_timer); |
695 | goto out; | 695 | goto out; |
696 | } | 696 | } |
697 | 697 | ||
698 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | 698 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { |
699 | hrtimer_start(&ts->sched_timer, expires, | 699 | hrtimer_start(&ts->sched_timer, expires, |
700 | HRTIMER_MODE_ABS_PINNED); | 700 | HRTIMER_MODE_ABS_PINNED); |
701 | /* Check if the timer was already in the past */ | 701 | /* Check if the timer was already in the past */ |
702 | if (hrtimer_active(&ts->sched_timer)) | 702 | if (hrtimer_active(&ts->sched_timer)) |
703 | goto out; | 703 | goto out; |
704 | } else if (!tick_program_event(expires, 0)) | 704 | } else if (!tick_program_event(expires, 0)) |
705 | goto out; | 705 | goto out; |
706 | /* | 706 | /* |
707 | * We are past the event already. So we crossed a | 707 | * We are past the event already. So we crossed a |
708 | * jiffy boundary. Update jiffies and raise the | 708 | * jiffy boundary. Update jiffies and raise the |
709 | * softirq. | 709 | * softirq. |
710 | */ | 710 | */ |
711 | tick_do_update_jiffies64(ktime_get()); | 711 | tick_do_update_jiffies64(ktime_get()); |
712 | } | 712 | } |
713 | raise_softirq_irqoff(TIMER_SOFTIRQ); | 713 | raise_softirq_irqoff(TIMER_SOFTIRQ); |
714 | out: | 714 | out: |
715 | ts->next_jiffies = next_jiffies; | 715 | ts->next_jiffies = next_jiffies; |
716 | ts->last_jiffies = last_jiffies; | 716 | ts->last_jiffies = last_jiffies; |
717 | ts->sleep_length = ktime_sub(dev->next_event, now); | 717 | ts->sleep_length = ktime_sub(dev->next_event, now); |
718 | 718 | ||
719 | return ret; | 719 | return ret; |
720 | } | 720 | } |
721 | 721 | ||
722 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) | 722 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) |
723 | { | 723 | { |
724 | #ifdef CONFIG_NO_HZ_FULL | 724 | #ifdef CONFIG_NO_HZ_FULL |
725 | int cpu = smp_processor_id(); | 725 | int cpu = smp_processor_id(); |
726 | 726 | ||
727 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) | 727 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) |
728 | return; | 728 | return; |
729 | 729 | ||
730 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 730 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
731 | return; | 731 | return; |
732 | 732 | ||
733 | if (!can_stop_full_tick()) | 733 | if (!can_stop_full_tick()) |
734 | return; | 734 | return; |
735 | 735 | ||
736 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 736 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
737 | #endif | 737 | #endif |
738 | } | 738 | } |
739 | 739 | ||
740 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | 740 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) |
741 | { | 741 | { |
742 | /* | 742 | /* |
743 | * If this cpu is offline and it is the one which updates | 743 | * If this cpu is offline and it is the one which updates |
744 | * jiffies, then give up the assignment and let it be taken by | 744 | * jiffies, then give up the assignment and let it be taken by |
745 | * the cpu which runs the tick timer next. If we don't drop | 745 | * the cpu which runs the tick timer next. If we don't drop |
746 | * this here the jiffies might be stale and do_timer() never | 746 | * this here the jiffies might be stale and do_timer() never |
747 | * invoked. | 747 | * invoked. |
748 | */ | 748 | */ |
749 | if (unlikely(!cpu_online(cpu))) { | 749 | if (unlikely(!cpu_online(cpu))) { |
750 | if (cpu == tick_do_timer_cpu) | 750 | if (cpu == tick_do_timer_cpu) |
751 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | 751 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; |
752 | return false; | 752 | return false; |
753 | } | 753 | } |
754 | 754 | ||
755 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { | 755 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { |
756 | ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; | 756 | ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; |
757 | return false; | 757 | return false; |
758 | } | 758 | } |
759 | 759 | ||
760 | if (need_resched()) | 760 | if (need_resched()) |
761 | return false; | 761 | return false; |
762 | 762 | ||
763 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 763 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
764 | static int ratelimit; | 764 | static int ratelimit; |
765 | 765 | ||
766 | if (ratelimit < 10 && | 766 | if (ratelimit < 10 && |
767 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { | 767 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { |
768 | pr_warn("NOHZ: local_softirq_pending %02x\n", | 768 | pr_warn("NOHZ: local_softirq_pending %02x\n", |
769 | (unsigned int) local_softirq_pending()); | 769 | (unsigned int) local_softirq_pending()); |
770 | ratelimit++; | 770 | ratelimit++; |
771 | } | 771 | } |
772 | return false; | 772 | return false; |
773 | } | 773 | } |
774 | 774 | ||
775 | if (tick_nohz_full_enabled()) { | 775 | if (tick_nohz_full_enabled()) { |
776 | /* | 776 | /* |
777 | * Keep the tick alive to guarantee timekeeping progression | 777 | * Keep the tick alive to guarantee timekeeping progression |
778 | * if there are full dynticks CPUs around | 778 | * if there are full dynticks CPUs around |
779 | */ | 779 | */ |
780 | if (tick_do_timer_cpu == cpu) | 780 | if (tick_do_timer_cpu == cpu) |
781 | return false; | 781 | return false; |
782 | /* | 782 | /* |
783 | * Boot safety: make sure the timekeeping duty has been | 783 | * Boot safety: make sure the timekeeping duty has been |
784 | * assigned before entering dyntick-idle mode, | 784 | * assigned before entering dyntick-idle mode, |
785 | */ | 785 | */ |
786 | if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) | 786 | if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) |
787 | return false; | 787 | return false; |
788 | } | 788 | } |
789 | 789 | ||
790 | return true; | 790 | return true; |
791 | } | 791 | } |
792 | 792 | ||
793 | static void __tick_nohz_idle_enter(struct tick_sched *ts) | 793 | static void __tick_nohz_idle_enter(struct tick_sched *ts) |
794 | { | 794 | { |
795 | ktime_t now, expires; | 795 | ktime_t now, expires; |
796 | int cpu = smp_processor_id(); | 796 | int cpu = smp_processor_id(); |
797 | 797 | ||
798 | now = tick_nohz_start_idle(ts); | 798 | now = tick_nohz_start_idle(ts); |
799 | 799 | ||
800 | if (can_stop_idle_tick(cpu, ts)) { | 800 | if (can_stop_idle_tick(cpu, ts)) { |
801 | int was_stopped = ts->tick_stopped; | 801 | int was_stopped = ts->tick_stopped; |
802 | 802 | ||
803 | ts->idle_calls++; | 803 | ts->idle_calls++; |
804 | 804 | ||
805 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | 805 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); |
806 | if (expires.tv64 > 0LL) { | 806 | if (expires.tv64 > 0LL) { |
807 | ts->idle_sleeps++; | 807 | ts->idle_sleeps++; |
808 | ts->idle_expires = expires; | 808 | ts->idle_expires = expires; |
809 | } | 809 | } |
810 | 810 | ||
811 | if (!was_stopped && ts->tick_stopped) | 811 | if (!was_stopped && ts->tick_stopped) |
812 | ts->idle_jiffies = ts->last_jiffies; | 812 | ts->idle_jiffies = ts->last_jiffies; |
813 | } | 813 | } |
814 | } | 814 | } |
815 | 815 | ||
816 | /** | 816 | /** |
817 | * tick_nohz_idle_enter - stop the idle tick from the idle task | 817 | * tick_nohz_idle_enter - stop the idle tick from the idle task |
818 | * | 818 | * |
819 | * When the next event is more than a tick into the future, stop the idle tick | 819 | * When the next event is more than a tick into the future, stop the idle tick |
820 | * Called when we start the idle loop. | 820 | * Called when we start the idle loop. |
821 | * | 821 | * |
822 | * The arch is responsible for calling: | 822 | * The arch is responsible for calling: |
823 | * | 823 | * |
824 | * - rcu_idle_enter() after its last use of RCU before the CPU is put | 824 | * - rcu_idle_enter() after its last use of RCU before the CPU is put |
825 | * to sleep. | 825 | * to sleep. |
826 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. | 826 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. |
827 | */ | 827 | */ |
828 | void tick_nohz_idle_enter(void) | 828 | void tick_nohz_idle_enter(void) |
829 | { | 829 | { |
830 | struct tick_sched *ts; | 830 | struct tick_sched *ts; |
831 | 831 | ||
832 | WARN_ON_ONCE(irqs_disabled()); | 832 | WARN_ON_ONCE(irqs_disabled()); |
833 | 833 | ||
834 | /* | 834 | /* |
835 | * Update the idle state in the scheduler domain hierarchy | 835 | * Update the idle state in the scheduler domain hierarchy |
836 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | 836 | * when tick_nohz_stop_sched_tick() is called from the idle loop. |
837 | * State will be updated to busy during the first busy tick after | 837 | * State will be updated to busy during the first busy tick after |
838 | * exiting idle. | 838 | * exiting idle. |
839 | */ | 839 | */ |
840 | set_cpu_sd_state_idle(); | 840 | set_cpu_sd_state_idle(); |
841 | 841 | ||
842 | local_irq_disable(); | 842 | local_irq_disable(); |
843 | 843 | ||
844 | ts = this_cpu_ptr(&tick_cpu_sched); | 844 | ts = this_cpu_ptr(&tick_cpu_sched); |
845 | ts->inidle = 1; | 845 | ts->inidle = 1; |
846 | __tick_nohz_idle_enter(ts); | 846 | __tick_nohz_idle_enter(ts); |
847 | 847 | ||
848 | local_irq_enable(); | 848 | local_irq_enable(); |
849 | } | 849 | } |
850 | EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | ||
851 | 850 | ||
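The doc comment above, together with the EXPORT_SYMBOL_GPL() dropped right above in this diff, pins down the only valid caller: the idle task itself. A hedged, simplified sketch of the expected call ordering in a generic idle loop (real arch/idle code has more steps):

	tick_nohz_idle_enter();		/* may stop the tick; IRQs still on */
	while (!need_resched()) {
		rcu_idle_enter();	/* last RCU use before sleeping */
		arch_cpu_idle();	/* wait for an interrupt */
		rcu_idle_exit();	/* RCU is usable again */
	}
	tick_nohz_idle_exit();		/* restart tick, account idle time */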
852 | /** | 851 | /** |
853 | * tick_nohz_irq_exit - update next tick event from interrupt exit | 852 | * tick_nohz_irq_exit - update next tick event from interrupt exit |
854 | * | 853 | * |
855 | * When an interrupt fires while we are idle and it doesn't cause | 854 | * When an interrupt fires while we are idle and it doesn't cause |
856 | * a reschedule, it may still add, modify or delete a timer, enqueue | 855 | * a reschedule, it may still add, modify or delete a timer, enqueue |
857 | * an RCU callback, etc... | 856 | * an RCU callback, etc... |
858 | * So we need to re-calculate and reprogram the next tick event. | 857 | * So we need to re-calculate and reprogram the next tick event. |
859 | */ | 858 | */ |
860 | void tick_nohz_irq_exit(void) | 859 | void tick_nohz_irq_exit(void) |
861 | { | 860 | { |
862 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | 861 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
863 | 862 | ||
864 | if (ts->inidle) | 863 | if (ts->inidle) |
865 | __tick_nohz_idle_enter(ts); | 864 | __tick_nohz_idle_enter(ts); |
866 | else | 865 | else |
867 | tick_nohz_full_stop_tick(ts); | 866 | tick_nohz_full_stop_tick(ts); |
868 | } | 867 | } |
869 | 868 | ||
870 | /** | 869 | /** |
871 | * tick_nohz_get_sleep_length - return the length of the current sleep | 870 | * tick_nohz_get_sleep_length - return the length of the current sleep |
872 | * | 871 | * |
873 | * Called from power state control code with interrupts disabled | 872 | * Called from power state control code with interrupts disabled |
874 | */ | 873 | */ |
875 | ktime_t tick_nohz_get_sleep_length(void) | 874 | ktime_t tick_nohz_get_sleep_length(void) |
876 | { | 875 | { |
877 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | 876 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
878 | 877 | ||
879 | return ts->sleep_length; | 878 | return ts->sleep_length; |
880 | } | 879 | } |
881 | 880 | ||
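This is the hook cpuidle-style governors use while choosing a sleep state: the longer the projected sleep, the deeper the state worth entering. A hedged sketch (MY_DEEP_NS and the state constants are hypothetical):

	static int pick_state(void)
	{
		/* Interrupts are disabled here, as the API requires */
		s64 sleep_ns = ktime_to_ns(tick_nohz_get_sleep_length());

		return sleep_ns > MY_DEEP_NS ? MY_DEEP_STATE : MY_SHALLOW_STATE;
	}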
882 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | 881 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) |
883 | { | 882 | { |
884 | hrtimer_cancel(&ts->sched_timer); | 883 | hrtimer_cancel(&ts->sched_timer); |
885 | hrtimer_set_expires(&ts->sched_timer, ts->last_tick); | 884 | hrtimer_set_expires(&ts->sched_timer, ts->last_tick); |
886 | 885 | ||
887 | while (1) { | 886 | while (1) { |
888 | /* Forward the time to expire in the future */ | 887 | /* Forward the time to expire in the future */ |
889 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 888 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
890 | 889 | ||
891 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | 890 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { |
892 | hrtimer_start_expires(&ts->sched_timer, | 891 | hrtimer_start_expires(&ts->sched_timer, |
893 | HRTIMER_MODE_ABS_PINNED); | 892 | HRTIMER_MODE_ABS_PINNED); |
894 | /* Check if the timer was already in the past */ | 893 | /* Check if the timer was already in the past */ |
895 | if (hrtimer_active(&ts->sched_timer)) | 894 | if (hrtimer_active(&ts->sched_timer)) |
896 | break; | 895 | break; |
897 | } else { | 896 | } else { |
898 | if (!tick_program_event( | 897 | if (!tick_program_event( |
899 | hrtimer_get_expires(&ts->sched_timer), 0)) | 898 | hrtimer_get_expires(&ts->sched_timer), 0)) |
900 | break; | 899 | break; |
901 | } | 900 | } |
902 | /* Reread time and update jiffies */ | 901 | /* Reread time and update jiffies */ |
903 | now = ktime_get(); | 902 | now = ktime_get(); |
904 | tick_do_update_jiffies64(now); | 903 | tick_do_update_jiffies64(now); |
905 | } | 904 | } |
906 | } | 905 | } |
907 | 906 | ||
908 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 907 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
909 | { | 908 | { |
910 | /* Update jiffies first */ | 909 | /* Update jiffies first */ |
911 | tick_do_update_jiffies64(now); | 910 | tick_do_update_jiffies64(now); |
912 | update_cpu_load_nohz(); | 911 | update_cpu_load_nohz(); |
913 | 912 | ||
914 | calc_load_exit_idle(); | 913 | calc_load_exit_idle(); |
915 | touch_softlockup_watchdog(); | 914 | touch_softlockup_watchdog(); |
916 | /* | 915 | /* |
917 | * Cancel the scheduled timer and restore the tick | 916 | * Cancel the scheduled timer and restore the tick |
918 | */ | 917 | */ |
919 | ts->tick_stopped = 0; | 918 | ts->tick_stopped = 0; |
920 | ts->idle_exittime = now; | 919 | ts->idle_exittime = now; |
921 | 920 | ||
922 | tick_nohz_restart(ts, now); | 921 | tick_nohz_restart(ts, now); |
923 | } | 922 | } |
924 | 923 | ||
925 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | 924 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) |
926 | { | 925 | { |
927 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 926 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
928 | unsigned long ticks; | 927 | unsigned long ticks; |
929 | 928 | ||
930 | if (vtime_accounting_enabled()) | 929 | if (vtime_accounting_enabled()) |
931 | return; | 930 | return; |
932 | /* | 931 | /* |
933 | * We stopped the tick in idle. update_process_times() would miss the | 932 | * We stopped the tick in idle. update_process_times() would miss the |
934 | * time we slept, as it does only a single tick of | 933 | * time we slept, as it does only a single tick of |
935 | * accounting. Enforce that this is accounted to idle! | 934 | * accounting. Enforce that this is accounted to idle! |
936 | */ | 935 | */ |
937 | ticks = jiffies - ts->idle_jiffies; | 936 | ticks = jiffies - ts->idle_jiffies; |
938 | /* | 937 | /* |
939 | * We might be one off. Do not randomly account a huge number of ticks! | 938 | * We might be one off. Do not randomly account a huge number of ticks! |
940 | */ | 939 | */ |
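	/*
	 * ticks == 0 means we were idle for less than a full tick. A
	 * value at or above LONG_MAX indicates the unsigned subtraction
	 * wrapped (jiffies moved backwards relative to idle_jiffies),
	 * so such deltas are ignored rather than accounted.
	 */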
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
#endif
}

/**
 * tick_nohz_idle_exit - restart the idle tick from the idle task
 *
 * Restart the idle tick when the CPU is woken up from idle.
 * This also exits the RCU extended quiescent state, so the CPU
 * can use RCU again afterwards.
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	local_irq_disable();

	WARN_ON_ONCE(!ts->inidle);

	ts->inidle = 0;

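	/*
	 * A timestamp is only needed when there is an open idle-time
	 * sample to close or a stopped tick to restart; avoid the
	 * ktime_get() otherwise.
	 */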
	if (ts->idle_active || ts->tick_stopped)
		now = ktime_get();

	if (ts->idle_active)
		tick_nohz_stop_idle(ts, now);

	if (ts->tick_stopped) {
		tick_nohz_restart_sched_tick(ts, now);
		tick_nohz_account_idle_ticks(ts);
	}

	local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);	/* line removed by this commit */

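/*
 * Forward the tick timer by one period and try to program the event
 * device. Returns nonzero when the requested expiry already lies in
 * the past and could not be programmed.
 */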
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
	hrtimer_forward(&ts->sched_timer, now, tick_period);
	return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
}

/*
 * The nohz low res interrupt handler
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	dev->next_event.tv64 = KTIME_MAX;

	tick_sched_do_timer(now);
	tick_sched_handle(ts, regs);

	/* No need to reprogram if we are running tickless */
	if (unlikely(ts->tick_stopped))
		return;

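	/*
	 * If programming the next period fails because that time has
	 * already passed, fold the elapsed time into jiffies and try
	 * the following period.
	 */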
	while (tick_nohz_reprogram(ts, now)) {
		now = ktime_get();
		tick_do_update_jiffies64(now);
	}
}

/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t next;

	if (!tick_nohz_enabled)
		return;

	local_irq_disable();
	if (tick_switch_to_oneshot(tick_nohz_handler)) {
		local_irq_enable();
		return;
	}
	tick_nohz_active = 1;
	ts->nohz_mode = NOHZ_MODE_LOWRES;

	/*
	 * Recycle the hrtimer in ts, so we can share the
	 * hrtimer_forward with the highres code.
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	/* Get the next period */
	next = tick_init_jiffy_update();

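	/*
	 * Program the first event. If the chosen time has already gone
	 * by, step forward one tick period at a time until programming
	 * succeeds.
	 */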
	for (;;) {
		hrtimer_set_expires(&ts->sched_timer, next);
		if (!tick_program_event(next, 0))
			break;
		next = ktime_add(next, tick_period);
	}
	local_irq_enable();
}

/*
 * When NOHZ is enabled and the tick is stopped, we need to kick the
 * tick timer from irq_enter() so that the jiffies update is kept
 * alive during long running softirqs. That's ugly as hell, but
 * correctness is key even if we need to fix the offending softirq in
 * the first place.
 *
 * Note that this is different from tick_nohz_restart(). We just kick
 * the timer and do not touch the other magic bits which need to be
 * done when idle is left.
 */
static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
{
#if 0
	/* Switch back to 2.6.27 behaviour */
	ktime_t delta;

	/*
	 * Do not touch the tick device, when the next expiry is either
	 * already reached or less/equal than the tick period.
	 */
	delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
	if (delta.tv64 <= tick_period.tv64)
		return;

	tick_nohz_restart(ts, now);
#endif
}
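
/*
 * Called on irq entry while the CPU is or was idle: close the running
 * idle-time sample and, if the tick is stopped, bring jiffies up to
 * date so the interrupt handler sees current time.
 */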
static inline void tick_nohz_irq_enter(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	if (!ts->idle_active && !ts->tick_stopped)
		return;
	now = ktime_get();
	if (ts->idle_active)
		tick_nohz_stop_idle(ts, now);
	if (ts->tick_stopped) {
		tick_nohz_update_jiffies(now);
		tick_nohz_kick_tick(ts, now);
	}
}

#else

static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}

/*
 * High resolution timer specific code
 */
#ifdef CONFIG_HIGH_RES_TIMERS
/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts =
		container_of(timer, struct tick_sched, sched_timer);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(now);

	/*
	 * Do not call tick_sched_handle() when we are not in irq context
	 * and have no valid regs pointer.
	 */
	if (regs)
		tick_sched_handle(ts, regs);

	/* No need to reprogram if we are in idle or full dynticks mode */
	if (unlikely(ts->tick_stopped))
		return HRTIMER_NORESTART;

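	/*
	 * Re-arm for the next period. Returning HRTIMER_RESTART makes
	 * the hrtimer core requeue this timer at the forwarded expiry.
	 */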
	hrtimer_forward(timer, now, tick_period);

	return HRTIMER_RESTART;
}

static int sched_skew_tick;

static int __init skew_tick(char *str)
{
	get_option(&str, &sched_skew_tick);

	return 0;
}
early_param("skew_tick", skew_tick);
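
/*
 * Enabled from the kernel command line, e.g. "skew_tick=1". See the
 * use in tick_setup_sched_timer() below, which staggers the per-cpu
 * tick timers instead of letting them all expire simultaneously.
 */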

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ts->sched_timer.function = tick_sched_timer;

	/* Get the next period (per cpu) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert jiffies_lock contention. */
	if (sched_skew_tick) {
		u64 offset = ktime_to_ns(tick_period) >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}
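	/*
	 * Worked example: with HZ=1000 (tick_period = 1 ms) and 4
	 * possible CPUs, offset = 500000 ns / 4 = 125000 ns per CPU
	 * index, so CPU 3 ticks 375 us after CPU 0. The skew thus
	 * spreads the ticks across half a tick period.
	 */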

	for (;;) {
		hrtimer_forward(&ts->sched_timer, now, tick_period);
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED);
		/* Check if the timer was already in the past */
		if (hrtimer_active(&ts->sched_timer))
			break;
		now = ktime_get();
	}

#ifdef CONFIG_NO_HZ_COMMON
	if (tick_nohz_enabled) {
		ts->nohz_mode = NOHZ_MODE_HIGHRES;
		tick_nohz_active = 1;
	}
#endif
}
#endif /* HIGH_RES_TIMERS */

#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

# ifdef CONFIG_HIGH_RES_TIMERS
	if (ts->sched_timer.base)
		hrtimer_cancel(&ts->sched_timer);
# endif

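	/*
	 * Wipe the whole per-cpu state so that a later CPU-up starts
	 * from a pristine tick_sched, with nohz_mode reset to
	 * NOHZ_MODE_INACTIVE.
	 */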
	memset(ts, 0, sizeof(*ts));
}
#endif

/**
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/**
 * Check if a change happened which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
 * mode, because high resolution timers are disabled (either at
 * compile time or at runtime).
 */
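/*
 * Returns 1 when the caller itself should switch the clock event
 * device over to oneshot/highres mode; 0 when nothing needs doing or
 * the switch to low-res nohz was already performed here.
 */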
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}