Commit a382bf934449ddeb625167537ae81daa0211b477

Authored by Frederic Weisbecker
1 parent a831881be2

nohz: Assign timekeeping duty to a CPU outside the full dynticks range

This way, the full nohz CPUs can safely run with the tick
stopped, with a guarantee that somebody else is taking
care of jiffies and GTOD progression.

Once the duty is assigned to a CPU, it won't change. Also, that
CPU can't enter dynticks idle mode or be hot-unplugged.

This may later be improved from a power consumption point of
view. At least we should be able to share the duty amongst all
CPUs outside the full dynticks range. Then the duty could even be
shared with full dynticks CPUs when those can't stop their
tick for any reason.

But let's start with that very simple approach first.
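
For illustration, here is a minimal userspace sketch of that rule, not
the kernel code itself: the CPU range, NR_CPUS value and the helper
names (cpu_is_full_dynticks(), maybe_take_timer_duty()) are invented
for the example, standing in for tick_nohz_extended_cpu() and the
tick_do_timer_cpu assignment shown in the diffs below.

    /* Sketch only: model of the duty-assignment policy. */
    #include <stdio.h>
    #include <stdbool.h>

    #define NR_CPUS		8
    #define TIMER_DUTY_NONE	(-1)

    /* Pretend CPUs 4-7 were booted into the full dynticks range. */
    static bool cpu_is_full_dynticks(int cpu)
    {
    	return cpu >= 4;
    }

    static int timer_duty_cpu = TIMER_DUTY_NONE;

    /*
     * A CPU offers to take the jiffies/GTOD duty: only a CPU outside
     * the full dynticks range may take it, and once taken it never
     * moves.
     */
    static void maybe_take_timer_duty(int cpu)
    {
    	if (timer_duty_cpu == TIMER_DUTY_NONE && !cpu_is_full_dynticks(cpu))
    		timer_duty_cpu = cpu;
    }

    int main(void)
    {
    	for (int cpu = 0; cpu < NR_CPUS; cpu++)
    		maybe_take_timer_duty(cpu);

    	printf("timekeeping duty stays on CPU %d\n", timer_duty_cpu);
    	return 0;
    }

On a boot where every CPU sat in the full dynticks range, the duty
would stay unassigned, which is why the tick-common.c hunk below falls
back to TICK_DO_TIMER_NONE in that case.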

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Gilad Ben Yossef <gilad@benyossef.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
[fix have_nohz_full_mask offcase]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

Showing 3 changed files with 51 additions and 4 deletions

kernel/time/tick-broadcast.c
/*
 * linux/kernel/time/tick-broadcast.c
 *
 * This file contains functions which emulate a local clock-event
 * device via a broadcast event source.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 *
 * This code is licenced under the GPL version 2. For details see
 * kernel-base/COPYING.
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/smp.h>

#include "tick-internal.h"

/*
 * Broadcast support for broken x86 hardware, where the local apic
 * timer stops in C3 state.
 */

static struct tick_device tick_broadcast_device;
/* FIXME: Use cpumask_var_t. */
static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
static DECLARE_BITMAP(tmpmask, NR_CPUS);
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static int tick_broadcast_force;

#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
#else
static inline void tick_broadcast_clear_oneshot(int cpu) { }
#endif

/*
 * Debugging: see timer_list.c
 */
struct tick_device *tick_get_broadcast_device(void)
{
	return &tick_broadcast_device;
}

struct cpumask *tick_get_broadcast_mask(void)
{
	return to_cpumask(tick_broadcast_mask);
}

/*
 * Start the device in periodic mode
 */
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
	if (bc)
		tick_setup_periodic(bc, 1);
}

/*
 * Check, if the device can be utilized as broadcast device:
 */
int tick_check_broadcast_device(struct clock_event_device *dev)
{
	if ((tick_broadcast_device.evtdev &&
	     tick_broadcast_device.evtdev->rating >= dev->rating) ||
	    (dev->features & CLOCK_EVT_FEAT_C3STOP))
		return 0;

	clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
	tick_broadcast_device.evtdev = dev;
	if (!cpumask_empty(tick_get_broadcast_mask()))
		tick_broadcast_start_periodic(dev);
	return 1;
}

/*
 * Check, if the device is the broadcast device
 */
int tick_is_broadcast_device(struct clock_event_device *dev)
{
	return (dev && tick_broadcast_device.evtdev == dev);
}

static void err_broadcast(const struct cpumask *mask)
{
	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
}

static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
{
	if (!dev->broadcast)
		dev->broadcast = tick_broadcast;
	if (!dev->broadcast) {
		pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
			     dev->name);
		dev->broadcast = err_broadcast;
	}
}

/*
 * Check, if the device is disfunctional and a place holder, which
 * needs to be handled by the broadcast device.
 */
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
	unsigned long flags;
	int ret = 0;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	/*
	 * Devices might be registered with both periodic and oneshot
	 * mode disabled. This signals, that the device needs to be
	 * operated from the broadcast device and is a placeholder for
	 * the cpu local device.
	 */
	if (!tick_device_is_functional(dev)) {
		dev->event_handler = tick_handle_periodic;
		tick_device_setup_broadcast_func(dev);
		cpumask_set_cpu(cpu, tick_get_broadcast_mask());
		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
		ret = 1;
	} else {
		/*
		 * When the new device is not affected by the stop
		 * feature and the cpu is marked in the broadcast mask
		 * then clear the broadcast bit.
		 */
		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
			int cpu = smp_processor_id();
			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
			tick_broadcast_clear_oneshot(cpu);
		} else {
			tick_device_setup_broadcast_func(dev);
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
	return ret;
}

#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
int tick_receive_broadcast(void)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	struct clock_event_device *evt = td->evtdev;

	if (!evt)
		return -ENODEV;

	if (!evt->event_handler)
		return -EINVAL;

	evt->event_handler(evt);
	return 0;
}
#endif

/*
 * Broadcast the event to the cpus, which are set in the mask (mangled).
 */
static void tick_do_broadcast(struct cpumask *mask)
{
	int cpu = smp_processor_id();
	struct tick_device *td;

	/*
	 * Check, if the current cpu is in the mask
	 */
	if (cpumask_test_cpu(cpu, mask)) {
		cpumask_clear_cpu(cpu, mask);
		td = &per_cpu(tick_cpu_device, cpu);
		td->evtdev->event_handler(td->evtdev);
	}

	if (!cpumask_empty(mask)) {
		/*
		 * It might be necessary to actually check whether the devices
		 * have different broadcast functions. For now, just use the
		 * one of the first device. This works as long as we have this
		 * misfeature only on x86 (lapic)
		 */
		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
		td->evtdev->broadcast(mask);
	}
}

/*
 * Periodic broadcast:
 * - invoke the broadcast handlers
 */
static void tick_do_periodic_broadcast(void)
{
	raw_spin_lock(&tick_broadcast_lock);

	cpumask_and(to_cpumask(tmpmask),
		    cpu_online_mask, tick_get_broadcast_mask());
	tick_do_broadcast(to_cpumask(tmpmask));

	raw_spin_unlock(&tick_broadcast_lock);
}

/*
 * Event handler for periodic broadcast ticks
 */
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
	ktime_t next;

	tick_do_periodic_broadcast();

	/*
	 * The device is in periodic mode. No reprogramming necessary:
	 */
	if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
		return;

	/*
	 * Setup the next period for devices, which do not have
	 * periodic mode. We read dev->next_event first and add to it
	 * when the event already expired. clockevents_program_event()
	 * sets dev->next_event only when the event is really
	 * programmed to the device.
	 */
	for (next = dev->next_event; ;) {
		next = ktime_add(next, tick_period);

		if (!clockevents_program_event(dev, next, false))
			return;
		tick_do_periodic_broadcast();
	}
}

/*
 * Powerstate information: The system enters/leaves a state, where
 * affected devices might stop
 */
static void tick_do_broadcast_on_off(unsigned long *reason)
{
	struct clock_event_device *bc, *dev;
	struct tick_device *td;
	unsigned long flags;
	int cpu, bc_stopped;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	cpu = smp_processor_id();
	td = &per_cpu(tick_cpu_device, cpu);
	dev = td->evtdev;
	bc = tick_broadcast_device.evtdev;

	/*
	 * Is the device not affected by the powerstate ?
	 */
	if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
		goto out;

	if (!tick_device_is_functional(dev))
		goto out;

	bc_stopped = cpumask_empty(tick_get_broadcast_mask());

	switch (*reason) {
	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
			cpumask_set_cpu(cpu, tick_get_broadcast_mask());
			if (tick_broadcast_device.mode ==
			    TICKDEV_MODE_PERIODIC)
				clockevents_shutdown(dev);
		}
		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
			tick_broadcast_force = 1;
		break;
	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
		if (!tick_broadcast_force &&
		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
			if (tick_broadcast_device.mode ==
			    TICKDEV_MODE_PERIODIC)
				tick_setup_periodic(dev, 0);
		}
		break;
	}

	if (cpumask_empty(tick_get_broadcast_mask())) {
		if (!bc_stopped)
			clockevents_shutdown(bc);
	} else if (bc_stopped) {
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
			tick_broadcast_start_periodic(bc);
		else
			tick_broadcast_setup_oneshot(bc);
	}
out:
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
 * Powerstate information: The system enters/leaves a state, where
 * affected devices might stop.
 */
void tick_broadcast_on_off(unsigned long reason, int *oncpu)
{
	if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
		printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
		       "offline CPU #%d\n", *oncpu);
	else
		tick_do_broadcast_on_off(&reason);
}

/*
 * Set the periodic handler depending on broadcast on/off
 */
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
	if (!broadcast)
		dev->event_handler = tick_handle_periodic;
	else
		dev->event_handler = tick_handle_periodic_broadcast;
}

/*
 * Remove a CPU from broadcasting
 */
void tick_shutdown_broadcast(unsigned int *cpup)
{
	struct clock_event_device *bc;
	unsigned long flags;
	unsigned int cpu = *cpup;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;
	cpumask_clear_cpu(cpu, tick_get_broadcast_mask());

	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
		if (bc && cpumask_empty(tick_get_broadcast_mask()))
			clockevents_shutdown(bc);
	}

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

void tick_suspend_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;
	if (bc)
		clockevents_shutdown(bc);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

int tick_resume_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;
	int broadcast = 0;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;

	if (bc) {
		clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);

		switch (tick_broadcast_device.mode) {
		case TICKDEV_MODE_PERIODIC:
			if (!cpumask_empty(tick_get_broadcast_mask()))
				tick_broadcast_start_periodic(bc);
			broadcast = cpumask_test_cpu(smp_processor_id(),
						     tick_get_broadcast_mask());
			break;
		case TICKDEV_MODE_ONESHOT:
			if (!cpumask_empty(tick_get_broadcast_mask()))
				broadcast = tick_resume_broadcast_oneshot(bc);
			break;
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);

	return broadcast;
}


#ifdef CONFIG_TICK_ONESHOT

/* FIXME: use cpumask_var_t. */
static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);

/*
 * Exposed for debugging: see timer_list.c
 */
struct cpumask *tick_get_broadcast_oneshot_mask(void)
{
	return to_cpumask(tick_broadcast_oneshot_mask);
}

static int tick_broadcast_set_event(ktime_t expires, int force)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);

	return clockevents_program_event(bc, expires, force);
}

int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
	return 0;
}

/*
 * Called from irq_enter() when idle was interrupted to reenable the
 * per cpu device.
 */
void tick_check_oneshot_broadcast(int cpu)
{
	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);

		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
	}
}

/*
 * Handle oneshot mode broadcasting
 */
static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
{
	struct tick_device *td;
	ktime_t now, next_event;
	int cpu;

	raw_spin_lock(&tick_broadcast_lock);
again:
	dev->next_event.tv64 = KTIME_MAX;
	next_event.tv64 = KTIME_MAX;
	cpumask_clear(to_cpumask(tmpmask));
	now = ktime_get();
	/* Find all expired events */
	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
		td = &per_cpu(tick_cpu_device, cpu);
		if (td->evtdev->next_event.tv64 <= now.tv64)
			cpumask_set_cpu(cpu, to_cpumask(tmpmask));
		else if (td->evtdev->next_event.tv64 < next_event.tv64)
			next_event.tv64 = td->evtdev->next_event.tv64;
	}

	/*
	 * Wakeup the cpus which have an expired event.
	 */
	tick_do_broadcast(to_cpumask(tmpmask));

	/*
	 * Two reasons for reprogram:
	 *
	 * - The global event did not expire any CPU local
	 * events. This happens in dyntick mode, as the maximum PIT
	 * delta is quite small.
	 *
	 * - There are pending events on sleeping CPUs which were not
	 * in the event mask
	 */
	if (next_event.tv64 != KTIME_MAX) {
		/*
		 * Rearm the broadcast device. If event expired,
		 * repeat the above
		 */
		if (tick_broadcast_set_event(next_event, 0))
			goto again;
	}
	raw_spin_unlock(&tick_broadcast_lock);
}

/*
 * Powerstate information: The system enters/leaves a state, where
 * affected devices might stop
 */
void tick_broadcast_oneshot_control(unsigned long reason)
{
	struct clock_event_device *bc, *dev;
	struct tick_device *td;
	unsigned long flags;
	int cpu;

	/*
	 * Periodic mode does not care about the enter/exit of power
	 * states
	 */
	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
		return;

	/*
	 * We are called with preemtion disabled from the depth of the
	 * idle code, so we can't be moved away.
	 */
	cpu = smp_processor_id();
	td = &per_cpu(tick_cpu_device, cpu);
	dev = td->evtdev;

	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
		return;

	bc = tick_broadcast_device.evtdev;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
			if (dev->next_event.tv64 < bc->next_event.tv64)
				tick_broadcast_set_event(dev->next_event, 1);
		}
	} else {
		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
			cpumask_clear_cpu(cpu,
					  tick_get_broadcast_oneshot_mask());
			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
			if (dev->next_event.tv64 != KTIME_MAX)
				tick_program_event(dev->next_event, 1);
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
 * Reset the one shot broadcast for a cpu
 *
 * Called with tick_broadcast_lock held
 */
static void tick_broadcast_clear_oneshot(int cpu)
{
	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
}

static void tick_broadcast_init_next_event(struct cpumask *mask,
					   ktime_t expires)
{
	struct tick_device *td;
	int cpu;

	for_each_cpu(cpu, mask) {
		td = &per_cpu(tick_cpu_device, cpu);
		if (td->evtdev)
			td->evtdev->next_event = expires;
	}
}

/**
 * tick_broadcast_setup_oneshot - setup the broadcast device
 */
void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
	int cpu = smp_processor_id();

	/* Set it up only once ! */
	if (bc->event_handler != tick_handle_oneshot_broadcast) {
		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;

		bc->event_handler = tick_handle_oneshot_broadcast;

		/* Take the do_timer update */
-		tick_do_timer_cpu = cpu;
+		if (!tick_nohz_extended_cpu(cpu))
+			tick_do_timer_cpu = cpu;

		/*
		 * We must be careful here. There might be other CPUs
		 * waiting for periodic broadcast. We need to set the
		 * oneshot_mask bits for those and program the
		 * broadcast device to fire.
		 */
		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
		cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
		cpumask_or(tick_get_broadcast_oneshot_mask(),
			   tick_get_broadcast_oneshot_mask(),
			   to_cpumask(tmpmask));

		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
			clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
			tick_broadcast_init_next_event(to_cpumask(tmpmask),
						       tick_next_period);
			tick_broadcast_set_event(tick_next_period, 1);
		} else
			bc->next_event.tv64 = KTIME_MAX;
	} else {
		/*
		 * The first cpu which switches to oneshot mode sets
		 * the bit for all other cpus which are in the general
		 * (periodic) broadcast mask. So the bit is set and
		 * would prevent the first broadcast enter after this
		 * to program the bc device.
		 */
		tick_broadcast_clear_oneshot(cpu);
	}
}

/*
 * Select oneshot operating mode for the broadcast device
 */
void tick_broadcast_switch_to_oneshot(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
	bc = tick_broadcast_device.evtdev;
	if (bc)
		tick_broadcast_setup_oneshot(bc);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}


/*
 * Remove a dead CPU from broadcasting
 */
void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
{
	unsigned long flags;
	unsigned int cpu = *cpup;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	/*
	 * Clear the broadcast mask flag for the dead cpu, but do not
	 * stop the broadcast device!
	 */
	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
 * Check, whether the broadcast device is in one shot mode
 */
int tick_broadcast_oneshot_active(void)
{
	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}

/*
 * Check whether the broadcast device supports oneshot.
 */
bool tick_broadcast_oneshot_available(void)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
}

#endif

kernel/time/tick-common.c
/*
 * linux/kernel/time/tick-common.c
 *
 * This file contains the base functions to manage periodic tick
 * related events.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 *
 * This code is licenced under the GPL version 2. For details see
 * kernel-base/COPYING.
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

/*
 * Tick devices
 */
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
/*
 * Tick next event: keeps track of the tick time
 */
ktime_t tick_next_period;
ktime_t tick_period;
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
static DEFINE_RAW_SPINLOCK(tick_device_lock);

/*
 * Debugging: see timer_list.c
 */
struct tick_device *tick_get_device(int cpu)
{
	return &per_cpu(tick_cpu_device, cpu);
}

/**
 * tick_is_oneshot_available - check for a oneshot capable event device
 */
int tick_is_oneshot_available(void)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);

	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
		return 0;
	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
		return 1;
	return tick_broadcast_oneshot_available();
}

/*
 * Periodic tick
 */
static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {
		write_seqlock(&jiffies_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&jiffies_lock);
	}

	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}

/*
 * Event handler for periodic ticks
 */
void tick_handle_periodic(struct clock_event_device *dev)
{
	int cpu = smp_processor_id();
	ktime_t next;

	tick_periodic(cpu);

	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
		return;
	/*
	 * Setup the next period for devices, which do not have
	 * periodic mode:
	 */
	next = ktime_add(dev->next_event, tick_period);
	for (;;) {
		if (!clockevents_program_event(dev, next, false))
			return;
		/*
		 * Have to be careful here. If we're in oneshot mode,
		 * before we call tick_periodic() in a loop, we need
		 * to be sure we're using a real hardware clocksource.
		 * Otherwise we could get trapped in an infinite
		 * loop, as the tick_periodic() increments jiffies,
		 * when then will increment time, posibly causing
		 * the loop to trigger again and again.
		 */
		if (timekeeping_valid_for_hres())
			tick_periodic(cpu);
		next = ktime_add(next, tick_period);
	}
}

/*
 * Setup the device for a periodic tick
 */
void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
{
	tick_set_periodic_handler(dev, broadcast);

	/* Broadcast setup ? */
	if (!tick_device_is_functional(dev))
		return;

	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
	    !tick_broadcast_oneshot_active()) {
		clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
	} else {
		unsigned long seq;
		ktime_t next;

		do {
			seq = read_seqbegin(&jiffies_lock);
			next = tick_next_period;
		} while (read_seqretry(&jiffies_lock, seq));

		clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

		for (;;) {
			if (!clockevents_program_event(dev, next, false))
				return;
			next = ktime_add(next, tick_period);
		}
	}
}

/*
 * Setup the tick device
 */
static void tick_setup_device(struct tick_device *td,
			      struct clock_event_device *newdev, int cpu,
			      const struct cpumask *cpumask)
{
	ktime_t next_event;
	void (*handler)(struct clock_event_device *) = NULL;

	/*
	 * First device setup ?
	 */
	if (!td->evtdev) {
		/*
		 * If no cpu took the do_timer update, assign it to
		 * this cpu:
		 */
		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
-			tick_do_timer_cpu = cpu;
+			if (!tick_nohz_extended_cpu(cpu))
+				tick_do_timer_cpu = cpu;
+			else
+				tick_do_timer_cpu = TICK_DO_TIMER_NONE;
			tick_next_period = ktime_get();
			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
		}

		/*
		 * Startup in periodic mode first.
		 */
		td->mode = TICKDEV_MODE_PERIODIC;
	} else {
		handler = td->evtdev->event_handler;
		next_event = td->evtdev->next_event;
		td->evtdev->event_handler = clockevents_handle_noop;
	}

	td->evtdev = newdev;

	/*
	 * When the device is not per cpu, pin the interrupt to the
	 * current cpu:
	 */
	if (!cpumask_equal(newdev->cpumask, cpumask))
		irq_set_affinity(newdev->irq, cpumask);

	/*
	 * When global broadcasting is active, check if the current
	 * device is registered as a placeholder for broadcast mode.
	 * This allows us to handle this x86 misfeature in a generic
	 * way.
	 */
	if (tick_device_uses_broadcast(newdev, cpu))
		return;

	if (td->mode == TICKDEV_MODE_PERIODIC)
		tick_setup_periodic(newdev, 0);
	else
		tick_setup_oneshot(newdev, handler, next_event);
}

/*
 * Check, if the new registered device should be used.
 */
static int tick_check_new_device(struct clock_event_device *newdev)
{
	struct clock_event_device *curdev;
	struct tick_device *td;
	int cpu, ret = NOTIFY_OK;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_device_lock, flags);

	cpu = smp_processor_id();
	if (!cpumask_test_cpu(cpu, newdev->cpumask))
		goto out_bc;

	td = &per_cpu(tick_cpu_device, cpu);
	curdev = td->evtdev;

	/* cpu local device ? */
	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {

		/*
		 * If the cpu affinity of the device interrupt can not
		 * be set, ignore it.
		 */
		if (!irq_can_set_affinity(newdev->irq))
			goto out_bc;

		/*
		 * If we have a cpu local device already, do not replace it
		 * by a non cpu local device
		 */
		if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
			goto out_bc;
	}

	/*
	 * If we have an active device, then check the rating and the oneshot
	 * feature.
	 */
	if (curdev) {
		/*
		 * Prefer one shot capable devices !
		 */
		if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
		    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
			goto out_bc;
		/*
		 * Check the rating
		 */
		if (curdev->rating >= newdev->rating)
			goto out_bc;
	}

	/*
	 * Replace the eventually existing device by the new
	 * device. If the current device is the broadcast device, do
	 * not give it back to the clockevents layer !
	 */
	if (tick_is_broadcast_device(curdev)) {
		clockevents_shutdown(curdev);
		curdev = NULL;
	}
	clockevents_exchange_device(curdev, newdev);
	tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
		tick_oneshot_notify();

	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
	return NOTIFY_STOP;

out_bc:
	/*
	 * Can the new device be used as a broadcast device ?
	 */
	if (tick_check_broadcast_device(newdev))
		ret = NOTIFY_STOP;

	raw_spin_unlock_irqrestore(&tick_device_lock, flags);

	return ret;
}

/*
 * Transfer the do_timer job away from a dying cpu.
 *
 * Called with interrupts disabled.
 */
static void tick_handover_do_timer(int *cpup)
{
	if (*cpup == tick_do_timer_cpu) {
		int cpu = cpumask_first(cpu_online_mask);

		tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
			TICK_DO_TIMER_NONE;
	}
}

/*
 * Shutdown an event device on a given cpu:
 *
 * This is called on a life CPU, when a CPU is dead. So we cannot
 * access the hardware device itself.
 * We just set the mode and remove it from the lists.
 */
static void tick_shutdown(unsigned int *cpup)
{
	struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
	struct clock_event_device *dev = td->evtdev;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_device_lock, flags);
	td->mode = TICKDEV_MODE_PERIODIC;
	if (dev) {
		/*
		 * Prevent that the clock events layer tries to call
		 * the set mode function!
		 */
		dev->mode = CLOCK_EVT_MODE_UNUSED;
		clockevents_exchange_device(dev, NULL);
		td->evtdev = NULL;
	}
	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}

static void tick_suspend(void)
{
	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_device_lock, flags);
	clockevents_shutdown(td->evtdev);
	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}

static void tick_resume(void)
{
	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
	unsigned long flags;
	int broadcast = tick_resume_broadcast();

	raw_spin_lock_irqsave(&tick_device_lock, flags);
	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);

	if (!broadcast) {
		if (td->mode == TICKDEV_MODE_PERIODIC)
			tick_setup_periodic(td->evtdev, 0);
		else
			tick_resume_oneshot();
	}
	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}

/*
 * Notification about clock event devices
 */
static int tick_notify(struct notifier_block *nb, unsigned long reason,
		       void *dev)
{
	switch (reason) {

	case CLOCK_EVT_NOTIFY_ADD:
		return tick_check_new_device(dev);

	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
		tick_broadcast_on_off(reason, dev);
		break;

	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
		tick_broadcast_oneshot_control(reason);
		break;

	case CLOCK_EVT_NOTIFY_CPU_DYING:
		tick_handover_do_timer(dev);
		break;

	case CLOCK_EVT_NOTIFY_CPU_DEAD:
		tick_shutdown_broadcast_oneshot(dev);
		tick_shutdown_broadcast(dev);
		tick_shutdown(dev);
		break;

	case CLOCK_EVT_NOTIFY_SUSPEND:
		tick_suspend();
		tick_suspend_broadcast();
		break;

	case CLOCK_EVT_NOTIFY_RESUME:
		tick_resume();
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block tick_notifier = {
	.notifier_call = tick_notify,
};

/**
 * tick_init - initialize the tick control
 *
 * Register the notifier with the clockevents framework
 */
void __init tick_init(void)
{
	clockevents_register_notifier(&tick_notifier);
}

kernel/time/tick-sched.c
1 /* 1 /*
2 * linux/kernel/time/tick-sched.c 2 * linux/kernel/time/tick-sched.c
3 * 3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner 6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 * 7 *
8 * No idle tick implementation for low and high resolution timers 8 * No idle tick implementation for low and high resolution timers
9 * 9 *
10 * Started by: Thomas Gleixner and Ingo Molnar 10 * Started by: Thomas Gleixner and Ingo Molnar
11 * 11 *
12 * Distribute under GPLv2. 12 * Distribute under GPLv2.
13 */ 13 */
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <linux/err.h> 15 #include <linux/err.h>
16 #include <linux/hrtimer.h> 16 #include <linux/hrtimer.h>
17 #include <linux/interrupt.h> 17 #include <linux/interrupt.h>
18 #include <linux/kernel_stat.h> 18 #include <linux/kernel_stat.h>
19 #include <linux/percpu.h> 19 #include <linux/percpu.h>
20 #include <linux/profile.h> 20 #include <linux/profile.h>
21 #include <linux/sched.h> 21 #include <linux/sched.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/irq_work.h> 23 #include <linux/irq_work.h>
24 24
25 #include <asm/irq_regs.h> 25 #include <asm/irq_regs.h>
26 26
27 #include "tick-internal.h" 27 #include "tick-internal.h"
28 28
29 /* 29 /*
30 * Per cpu nohz control structure 30 * Per cpu nohz control structure
31 */ 31 */
32 DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32 DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
33 33
34 /* 34 /*
35 * The time, when the last jiffy update happened. Protected by jiffies_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
36 */ 36 */
37 static ktime_t last_jiffies_update; 37 static ktime_t last_jiffies_update;
38 38
39 struct tick_sched *tick_get_tick_sched(int cpu) 39 struct tick_sched *tick_get_tick_sched(int cpu)
40 { 40 {
41 return &per_cpu(tick_cpu_sched, cpu); 41 return &per_cpu(tick_cpu_sched, cpu);
42 } 42 }
43 43
44 /* 44 /*
45 * Must be called with interrupts disabled ! 45 * Must be called with interrupts disabled !
46 */ 46 */
47 static void tick_do_update_jiffies64(ktime_t now) 47 static void tick_do_update_jiffies64(ktime_t now)
48 { 48 {
49 unsigned long ticks = 0; 49 unsigned long ticks = 0;
50 ktime_t delta; 50 ktime_t delta;
51 51
52 /* 52 /*
53 * Do a quick check without holding jiffies_lock: 53 * Do a quick check without holding jiffies_lock:
54 */ 54 */
55 delta = ktime_sub(now, last_jiffies_update); 55 delta = ktime_sub(now, last_jiffies_update);
56 if (delta.tv64 < tick_period.tv64) 56 if (delta.tv64 < tick_period.tv64)
57 return; 57 return;
58 58
59 /* Reevaluate with jiffies_lock held */ 59 /* Reevaluate with jiffies_lock held */
60 write_seqlock(&jiffies_lock); 60 write_seqlock(&jiffies_lock);
61 61
62 delta = ktime_sub(now, last_jiffies_update); 62 delta = ktime_sub(now, last_jiffies_update);
63 if (delta.tv64 >= tick_period.tv64) { 63 if (delta.tv64 >= tick_period.tv64) {
64 64
65 delta = ktime_sub(delta, tick_period); 65 delta = ktime_sub(delta, tick_period);
66 last_jiffies_update = ktime_add(last_jiffies_update, 66 last_jiffies_update = ktime_add(last_jiffies_update,
67 tick_period); 67 tick_period);
68 68
69 /* Slow path for long timeouts */ 69 /* Slow path for long timeouts */
70 if (unlikely(delta.tv64 >= tick_period.tv64)) { 70 if (unlikely(delta.tv64 >= tick_period.tv64)) {
71 s64 incr = ktime_to_ns(tick_period); 71 s64 incr = ktime_to_ns(tick_period);
72 72
73 ticks = ktime_divns(delta, incr); 73 ticks = ktime_divns(delta, incr);
74 74
75 last_jiffies_update = ktime_add_ns(last_jiffies_update, 75 last_jiffies_update = ktime_add_ns(last_jiffies_update,
76 incr * ticks); 76 incr * ticks);
77 } 77 }
78 do_timer(++ticks); 78 do_timer(++ticks);
79 79
80 /* Keep the tick_next_period variable up to date */ 80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period); 81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
82 } 82 }
83 write_sequnlock(&jiffies_lock); 83 write_sequnlock(&jiffies_lock);
84 } 84 }
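The slow path above folds an arbitrarily long sleep into a single do_timer(++ticks) call. A minimal userspace model of that catch-up arithmetic (plain C with illustrative names; the division stands in for ktime_divns()):

#include <stdio.h>
#include <stdint.h>

/* Given the nanoseconds since the last jiffy update and the tick period,
 * return how many whole ticks elapsed and advance the update stamp. */
static uint64_t catch_up(uint64_t delta_ns, uint64_t period_ns,
			 uint64_t *last_update_ns)
{
	uint64_t ticks = 0;

	if (delta_ns < period_ns)
		return 0;			/* quick check: nothing to do yet */
	delta_ns -= period_ns;			/* consume the first period */
	*last_update_ns += period_ns;
	if (delta_ns >= period_ns) {		/* slow path for long timeouts */
		ticks = delta_ns / period_ns;
		*last_update_ns += ticks * period_ns;
	}
	return ticks + 1;			/* mirrors do_timer(++ticks) */
}

int main(void)
{
	uint64_t last = 0;

	/* 4.5 tick periods of sleep at HZ=1000 (1 ms period) */
	printf("%llu ticks\n", (unsigned long long)catch_up(4500000, 1000000, &last));
	return 0;
}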
85 85
86 /* 86 /*
87 * Initialize and return the jiffies update. 87 * Initialize and return the jiffies update.
88 */ 88 */
89 static ktime_t tick_init_jiffy_update(void) 89 static ktime_t tick_init_jiffy_update(void)
90 { 90 {
91 ktime_t period; 91 ktime_t period;
92 92
93 write_seqlock(&jiffies_lock); 93 write_seqlock(&jiffies_lock);
94 /* Did we start the jiffies update yet ? */ 94 /* Did we start the jiffies update yet ? */
95 if (last_jiffies_update.tv64 == 0) 95 if (last_jiffies_update.tv64 == 0)
96 last_jiffies_update = tick_next_period; 96 last_jiffies_update = tick_next_period;
97 period = last_jiffies_update; 97 period = last_jiffies_update;
98 write_sequnlock(&jiffies_lock); 98 write_sequnlock(&jiffies_lock);
99 return period; 99 return period;
100 } 100 }
101 101
102 102
103 static void tick_sched_do_timer(ktime_t now) 103 static void tick_sched_do_timer(ktime_t now)
104 { 104 {
105 int cpu = smp_processor_id(); 105 int cpu = smp_processor_id();
106 106
107 #ifdef CONFIG_NO_HZ 107 #ifdef CONFIG_NO_HZ
108 /* 108 /*
109 * Check if the do_timer duty was dropped. We don't care about 109 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went 110 * concurrency: This happens only when the cpu in charge went
111 * into a long sleep. If two cpus happen to assign themselves to 111 * into a long sleep. If two cpus happen to assign themselves to
112 * this duty, then the jiffies update is still serialized by 112 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock. 113 * jiffies_lock.
114 */ 114 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) 115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
116 && !tick_nohz_extended_cpu(cpu))
116 tick_do_timer_cpu = cpu; 117 tick_do_timer_cpu = cpu;
117 #endif 118 #endif
118 119
119 /* Check, if the jiffies need an update */ 120 /* Check, if the jiffies need an update */
120 if (tick_do_timer_cpu == cpu) 121 if (tick_do_timer_cpu == cpu)
121 tick_do_update_jiffies64(now); 122 tick_do_update_jiffies64(now);
122 } 123 }
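The added !tick_nohz_extended_cpu() check is the core of this patch: a full dynticks CPU must never adopt a dropped do_timer duty, or it could never stop its tick again. The adoption rule boils down to this sketch (plain C, illustrative names only):

#include <stdbool.h>

#define TICK_DO_TIMER_NONE	-1

/* May this CPU pick up the dropped timekeeping duty? Only if the duty
 * is currently unassigned and the CPU is not in the nohz_extended set. */
static bool may_adopt_duty(int tick_do_timer_cpu, bool cpu_is_nohz_extended)
{
	return tick_do_timer_cpu == TICK_DO_TIMER_NONE && !cpu_is_nohz_extended;
}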
123 124
124 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 125 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125 { 126 {
126 #ifdef CONFIG_NO_HZ 127 #ifdef CONFIG_NO_HZ
127 /* 128 /*
128 * When we are idle and the tick is stopped, we have to touch 129 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long 130 * the watchdog as we might not schedule for a really long
130 * time. This happens on complete idle SMP systems while 131 * time. This happens on complete idle SMP systems while
131 * waiting on the login prompt. We also increment the "start of 132 * waiting on the login prompt. We also increment the "start of
132 * idle" jiffy stamp so the idle accounting adjustment we do 133 * idle" jiffy stamp so the idle accounting adjustment we do
133 * when we go busy again does not account too many ticks. 134 * when we go busy again does not account too many ticks.
134 */ 135 */
135 if (ts->tick_stopped) { 136 if (ts->tick_stopped) {
136 touch_softlockup_watchdog(); 137 touch_softlockup_watchdog();
137 if (is_idle_task(current)) 138 if (is_idle_task(current))
138 ts->idle_jiffies++; 139 ts->idle_jiffies++;
139 } 140 }
140 #endif 141 #endif
141 update_process_times(user_mode(regs)); 142 update_process_times(user_mode(regs));
142 profile_tick(CPU_PROFILING); 143 profile_tick(CPU_PROFILING);
143 } 144 }
144 145
145 #ifdef CONFIG_NO_HZ_EXTENDED 146 #ifdef CONFIG_NO_HZ_EXTENDED
146 static cpumask_var_t nohz_extended_mask; 147 static cpumask_var_t nohz_extended_mask;
147 bool have_nohz_extended_mask; 148 bool have_nohz_extended_mask;
148 149
149 int tick_nohz_extended_cpu(int cpu) 150 int tick_nohz_extended_cpu(int cpu)
150 { 151 {
151 if (!have_nohz_extended_mask) 152 if (!have_nohz_extended_mask)
152 return 0; 153 return 0;
153 154
154 return cpumask_test_cpu(cpu, nohz_extended_mask); 155 return cpumask_test_cpu(cpu, nohz_extended_mask);
155 } 156 }
156 157
157 /* Parse the boot-time nohz CPU list from the kernel parameters. */ 158 /* Parse the boot-time nohz CPU list from the kernel parameters. */
158 static int __init tick_nohz_extended_setup(char *str) 159 static int __init tick_nohz_extended_setup(char *str)
159 { 160 {
160 alloc_bootmem_cpumask_var(&nohz_extended_mask); 161 alloc_bootmem_cpumask_var(&nohz_extended_mask);
161 if (cpulist_parse(str, nohz_extended_mask) < 0) 162 if (cpulist_parse(str, nohz_extended_mask) < 0)
162 pr_warning("NOHZ: Incorrect nohz_extended cpumask\n"); 163 pr_warning("NOHZ: Incorrect nohz_extended cpumask\n");
163 else 164 else
164 have_nohz_extended_mask = true; 165 have_nohz_extended_mask = true;
165 return 1; 166 return 1;
166 } 167 }
167 __setup("nohz_extended=", tick_nohz_extended_setup); 168 __setup("nohz_extended=", tick_nohz_extended_setup);
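For example (illustrative values, assuming a machine with at least four CPUs), booting with:

	nohz_extended=1-3

marks CPUs 1-3 as full dynticks candidates while CPU 0, being outside the range, remains eligible for the timekeeping duty. Any syntax accepted by cpulist_parse() works, so "1,3" or "1-2,5" are equally valid.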
168 169
170 static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
171 unsigned long action,
172 void *hcpu)
173 {
174 unsigned int cpu = (unsigned long)hcpu;
175
176 switch (action & ~CPU_TASKS_FROZEN) {
177 case CPU_DOWN_PREPARE:
178 /*
179 * If we handle the timekeeping duty for full dynticks CPUs,
180 * we can't safely shutdown that CPU.
181 */
182 if (have_nohz_extended_mask && tick_do_timer_cpu == cpu)
183 return -EINVAL;
184 break;
185 }
186 return NOTIFY_OK;
187 }
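So once full dynticks CPUs exist, the timekeeping CPU is effectively pinned online. The veto reduces to a one-line predicate, sketched here with illustrative names:

#include <stdbool.h>

/* Should a CPU_DOWN_PREPARE request be refused? The do_timer CPU must
 * stay up while full dynticks CPUs rely on it for jiffies/GTOD. */
static bool refuse_cpu_down(unsigned int cpu, bool have_full_dynticks,
			    int tick_do_timer_cpu)
{
	return have_full_dynticks && (int)cpu == tick_do_timer_cpu;
}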
188
169 static int __init init_tick_nohz_extended(void) 189 static int __init init_tick_nohz_extended(void)
170 { 190 {
171 cpumask_var_t online_nohz; 191 cpumask_var_t online_nohz;
172 int cpu; 192 int cpu;
173 193
174 if (!have_nohz_extended_mask) 194 if (!have_nohz_extended_mask)
175 return 0; 195 return 0;
176 196
197 cpu_notifier(tick_nohz_cpu_down_callback, 0);
198
177 if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { 199 if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) {
178 pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); 200 pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n");
179 return -ENOMEM; 201 return -ENOMEM;
180 } 202 }
181 203
182 /* 204 /*
183 * CPUs can probably not be concurrently offlined at initcall time. 205 * CPUs can probably not be concurrently offlined at initcall time.
184 * But we are paranoid, aren't we? 206 * But we are paranoid, aren't we?
185 */ 207 */
186 get_online_cpus(); 208 get_online_cpus();
187 209
188 /* Ensure we keep a CPU outside the dynticks range for timekeeping */ 210 /* Ensure we keep a CPU outside the dynticks range for timekeeping */
189 cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); 211 cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask);
190 if (cpumask_equal(online_nohz, cpu_online_mask)) { 212 if (cpumask_equal(online_nohz, cpu_online_mask)) {
191 cpu = cpumask_any(cpu_online_mask);
192 pr_warning("NO_HZ: Must keep at least one online CPU " 213 pr_warning("NO_HZ: Must keep at least one online CPU "
193 "out of nohz_extended range\n"); 214 "out of nohz_extended range\n");
215 /*
216 * We know the current CPU doesn't have its tick stopped.
217 * Let's use it for the timekeeping duty.
218 */
219 preempt_disable();
220 cpu = smp_processor_id();
194 pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); 221 pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu);
195 cpumask_clear_cpu(cpu, nohz_extended_mask); 222 cpumask_clear_cpu(cpu, nohz_extended_mask);
223 preempt_enable();
196 } 224 }
197 put_online_cpus(); 225 put_online_cpus();
198 free_cpumask_var(online_nohz); 226 free_cpumask_var(online_nohz);
199 227
200 return 0; 228 return 0;
201 } 229 }
202 core_initcall(init_tick_nohz_extended); 230 core_initcall(init_tick_nohz_extended);
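The fallback path above can be modelled with plain bitmasks (a userspace sketch, not the kernel cpumask API): if every online CPU sits in the nohz_extended set, the current CPU is evicted from the set so that somebody can keep time.

#include <stdio.h>

static unsigned int evict_timekeeper(unsigned int online, unsigned int nohz,
				     unsigned int curr)
{
	if ((online & nohz) == online)	/* no CPU left outside the range */
		nohz &= ~(1u << curr);	/* clear the current CPU */
	return nohz;
}

int main(void)
{
	/* 4 online CPUs (bits 0-3), all requested as nohz_extended, boot CPU 0 */
	printf("%#x\n", evict_timekeeper(0xf, 0xf, 0));	/* prints 0xe */
	return 0;
}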
203 #else 231 #else
204 #define have_nohz_extended_mask (0) 232 #define have_nohz_extended_mask (0)
205 #endif 233 #endif
206 234
207 /* 235 /*
208 * NOHZ - aka dynamic tick functionality 236 * NOHZ - aka dynamic tick functionality
209 */ 237 */
210 #ifdef CONFIG_NO_HZ 238 #ifdef CONFIG_NO_HZ
211 /* 239 /*
212 * NO HZ enabled ? 240 * NO HZ enabled ?
213 */ 241 */
214 int tick_nohz_enabled __read_mostly = 1; 242 int tick_nohz_enabled __read_mostly = 1;
215 243
216 /* 244 /*
217 * Enable / Disable tickless mode 245 * Enable / Disable tickless mode
218 */ 246 */
219 static int __init setup_tick_nohz(char *str) 247 static int __init setup_tick_nohz(char *str)
220 { 248 {
221 if (!strcmp(str, "off")) 249 if (!strcmp(str, "off"))
222 tick_nohz_enabled = 0; 250 tick_nohz_enabled = 0;
223 else if (!strcmp(str, "on")) 251 else if (!strcmp(str, "on"))
224 tick_nohz_enabled = 1; 252 tick_nohz_enabled = 1;
225 else 253 else
226 return 0; 254 return 0;
227 return 1; 255 return 1;
228 } 256 }
229 257
230 __setup("nohz=", setup_tick_nohz); 258 __setup("nohz=", setup_tick_nohz);
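For reference, the matching command line is:

	nohz=off

to keep the periodic tick everywhere, or "nohz=on" (the default) to allow dyntick-idle mode; any other value is rejected by the parser above.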
231 259
232 /** 260 /**
233 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 261 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
234 * 262 *
235 * Called from interrupt entry when the CPU was idle 263 * Called from interrupt entry when the CPU was idle
236 * 264 *
237 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 265 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
238 * must be updated. Otherwise an interrupt handler could use a stale jiffy 266 * must be updated. Otherwise an interrupt handler could use a stale jiffy
239 * value. We do this unconditionally on any cpu, as we don't know whether the 267 * value. We do this unconditionally on any cpu, as we don't know whether the
240 * cpu which has the update task assigned is in a long sleep. 268 * cpu which has the update task assigned is in a long sleep.
241 */ 269 */
242 static void tick_nohz_update_jiffies(ktime_t now) 270 static void tick_nohz_update_jiffies(ktime_t now)
243 { 271 {
244 int cpu = smp_processor_id(); 272 int cpu = smp_processor_id();
245 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 273 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
246 unsigned long flags; 274 unsigned long flags;
247 275
248 ts->idle_waketime = now; 276 ts->idle_waketime = now;
249 277
250 local_irq_save(flags); 278 local_irq_save(flags);
251 tick_do_update_jiffies64(now); 279 tick_do_update_jiffies64(now);
252 local_irq_restore(flags); 280 local_irq_restore(flags);
253 281
254 touch_softlockup_watchdog(); 282 touch_softlockup_watchdog();
255 } 283 }
256 284
257 /* 285 /*
258 * Updates the per cpu time idle statistics counters 286 * Updates the per cpu time idle statistics counters
259 */ 287 */
260 static void 288 static void
261 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) 289 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
262 { 290 {
263 ktime_t delta; 291 ktime_t delta;
264 292
265 if (ts->idle_active) { 293 if (ts->idle_active) {
266 delta = ktime_sub(now, ts->idle_entrytime); 294 delta = ktime_sub(now, ts->idle_entrytime);
267 if (nr_iowait_cpu(cpu) > 0) 295 if (nr_iowait_cpu(cpu) > 0)
268 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 296 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
269 else 297 else
270 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 298 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
271 ts->idle_entrytime = now; 299 ts->idle_entrytime = now;
272 } 300 }
273 301
274 if (last_update_time) 302 if (last_update_time)
275 *last_update_time = ktime_to_us(now); 303 *last_update_time = ktime_to_us(now);
276 304
277 } 305 }
278 306
279 static void tick_nohz_stop_idle(int cpu, ktime_t now) 307 static void tick_nohz_stop_idle(int cpu, ktime_t now)
280 { 308 {
281 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 309 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
282 310
283 update_ts_time_stats(cpu, ts, now, NULL); 311 update_ts_time_stats(cpu, ts, now, NULL);
284 ts->idle_active = 0; 312 ts->idle_active = 0;
285 313
286 sched_clock_idle_wakeup_event(0); 314 sched_clock_idle_wakeup_event(0);
287 } 315 }
288 316
289 static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 317 static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
290 { 318 {
291 ktime_t now = ktime_get(); 319 ktime_t now = ktime_get();
292 320
293 ts->idle_entrytime = now; 321 ts->idle_entrytime = now;
294 ts->idle_active = 1; 322 ts->idle_active = 1;
295 sched_clock_idle_sleep_event(); 323 sched_clock_idle_sleep_event();
296 return now; 324 return now;
297 } 325 }
298 326
299 /** 327 /**
300 * get_cpu_idle_time_us - get the total idle time of a cpu 328 * get_cpu_idle_time_us - get the total idle time of a cpu
301 * @cpu: CPU number to query 329 * @cpu: CPU number to query
302 * @last_update_time: variable to store update time in. Do not update 330 * @last_update_time: variable to store update time in. Do not update
303 * counters if NULL. 331 * counters if NULL.
304 * 332 *
305 * Return the cumulative idle time (since boot) for a given 333 * Return the cumulative idle time (since boot) for a given
306 * CPU, in microseconds. 334 * CPU, in microseconds.
307 * 335 *
308 * This time is measured via accounting rather than sampling, 336 * This time is measured via accounting rather than sampling,
309 * and is as accurate as ktime_get() is. 337 * and is as accurate as ktime_get() is.
310 * 338 *
311 * This function returns -1 if NOHZ is not enabled. 339 * This function returns -1 if NOHZ is not enabled.
312 */ 340 */
313 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 341 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
314 { 342 {
315 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 343 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
316 ktime_t now, idle; 344 ktime_t now, idle;
317 345
318 if (!tick_nohz_enabled) 346 if (!tick_nohz_enabled)
319 return -1; 347 return -1;
320 348
321 now = ktime_get(); 349 now = ktime_get();
322 if (last_update_time) { 350 if (last_update_time) {
323 update_ts_time_stats(cpu, ts, now, last_update_time); 351 update_ts_time_stats(cpu, ts, now, last_update_time);
324 idle = ts->idle_sleeptime; 352 idle = ts->idle_sleeptime;
325 } else { 353 } else {
326 if (ts->idle_active && !nr_iowait_cpu(cpu)) { 354 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
327 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 355 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
328 356
329 idle = ktime_add(ts->idle_sleeptime, delta); 357 idle = ktime_add(ts->idle_sleeptime, delta);
330 } else { 358 } else {
331 idle = ts->idle_sleeptime; 359 idle = ts->idle_sleeptime;
332 } 360 }
333 } 361 }
334 362
335 return ktime_to_us(idle); 363 return ktime_to_us(idle);
336 364
337 } 365 }
338 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 366 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
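A hypothetical in-kernel caller, in the style of the cpufreq governors that consume this export (names other than get_cpu_idle_time_us() are made up; error handling and driver plumbing omitted):

/* Derive the busy time of @cpu between two samples, in microseconds. */
static u64 sample_busy_us(int cpu)
{
	u64 wall0, wall1, idle0, idle1;

	idle0 = get_cpu_idle_time_us(cpu, &wall0);
	/* ... caller waits or schedules for a while ... */
	idle1 = get_cpu_idle_time_us(cpu, &wall1);

	if (idle0 == -1ULL || idle1 == -1ULL)
		return 0;	/* NOHZ disabled: no accounting available */

	return (wall1 - wall0) - (idle1 - idle0);
}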
339 367
340 /** 368 /**
341 * get_cpu_iowait_time_us - get the total iowait time of a cpu 369 * get_cpu_iowait_time_us - get the total iowait time of a cpu
342 * @cpu: CPU number to query 370 * @cpu: CPU number to query
343 * @last_update_time: variable to store update time in. Do not update 371 * @last_update_time: variable to store update time in. Do not update
344 * counters if NULL. 372 * counters if NULL.
345 * 373 *
346 * Return the cumulative iowait time (since boot) for a given 374 * Return the cumulative iowait time (since boot) for a given
347 * CPU, in microseconds. 375 * CPU, in microseconds.
348 * 376 *
349 * This time is measured via accounting rather than sampling, 377 * This time is measured via accounting rather than sampling,
350 * and is as accurate as ktime_get() is. 378 * and is as accurate as ktime_get() is.
351 * 379 *
352 * This function returns -1 if NOHZ is not enabled. 380 * This function returns -1 if NOHZ is not enabled.
353 */ 381 */
354 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 382 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
355 { 383 {
356 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 384 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
357 ktime_t now, iowait; 385 ktime_t now, iowait;
358 386
359 if (!tick_nohz_enabled) 387 if (!tick_nohz_enabled)
360 return -1; 388 return -1;
361 389
362 now = ktime_get(); 390 now = ktime_get();
363 if (last_update_time) { 391 if (last_update_time) {
364 update_ts_time_stats(cpu, ts, now, last_update_time); 392 update_ts_time_stats(cpu, ts, now, last_update_time);
365 iowait = ts->iowait_sleeptime; 393 iowait = ts->iowait_sleeptime;
366 } else { 394 } else {
367 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { 395 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
368 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 396 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
369 397
370 iowait = ktime_add(ts->iowait_sleeptime, delta); 398 iowait = ktime_add(ts->iowait_sleeptime, delta);
371 } else { 399 } else {
372 iowait = ts->iowait_sleeptime; 400 iowait = ts->iowait_sleeptime;
373 } 401 }
374 } 402 }
375 403
376 return ktime_to_us(iowait); 404 return ktime_to_us(iowait);
377 } 405 }
378 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 406 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
379 407
380 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 408 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
381 ktime_t now, int cpu) 409 ktime_t now, int cpu)
382 { 410 {
383 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 411 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
384 ktime_t last_update, expires, ret = { .tv64 = 0 }; 412 ktime_t last_update, expires, ret = { .tv64 = 0 };
385 unsigned long rcu_delta_jiffies; 413 unsigned long rcu_delta_jiffies;
386 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 414 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
387 u64 time_delta; 415 u64 time_delta;
388 416
389 /* Read jiffies and the time when jiffies were updated last */ 417 /* Read jiffies and the time when jiffies were updated last */
390 do { 418 do {
391 seq = read_seqbegin(&jiffies_lock); 419 seq = read_seqbegin(&jiffies_lock);
392 last_update = last_jiffies_update; 420 last_update = last_jiffies_update;
393 last_jiffies = jiffies; 421 last_jiffies = jiffies;
394 time_delta = timekeeping_max_deferment(); 422 time_delta = timekeeping_max_deferment();
395 } while (read_seqretry(&jiffies_lock, seq)); 423 } while (read_seqretry(&jiffies_lock, seq));
396 424
397 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 425 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
398 arch_needs_cpu(cpu) || irq_work_needs_cpu()) { 426 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
399 next_jiffies = last_jiffies + 1; 427 next_jiffies = last_jiffies + 1;
400 delta_jiffies = 1; 428 delta_jiffies = 1;
401 } else { 429 } else {
402 /* Get the next timer wheel timer */ 430 /* Get the next timer wheel timer */
403 next_jiffies = get_next_timer_interrupt(last_jiffies); 431 next_jiffies = get_next_timer_interrupt(last_jiffies);
404 delta_jiffies = next_jiffies - last_jiffies; 432 delta_jiffies = next_jiffies - last_jiffies;
405 if (rcu_delta_jiffies < delta_jiffies) { 433 if (rcu_delta_jiffies < delta_jiffies) {
406 next_jiffies = last_jiffies + rcu_delta_jiffies; 434 next_jiffies = last_jiffies + rcu_delta_jiffies;
407 delta_jiffies = rcu_delta_jiffies; 435 delta_jiffies = rcu_delta_jiffies;
408 } 436 }
409 } 437 }
410 /* 438 /*
411 * Do not stop the tick, if we are only one jiffy off 439 * Do not stop the tick, if we are only one jiffy off
412 * or if the cpu is required for rcu 440 * or if the cpu is required for rcu
413 */ 441 */
414 if (!ts->tick_stopped && delta_jiffies == 1) 442 if (!ts->tick_stopped && delta_jiffies == 1)
415 goto out; 443 goto out;
416 444
417 /* Schedule the tick, if we are at least one jiffie off */ 445 /* Schedule the tick, if we are at least one jiffie off */
418 if ((long)delta_jiffies >= 1) { 446 if ((long)delta_jiffies >= 1) {
419 447
420 /* 448 /*
421 * If this cpu is the one which updates jiffies, then 449 * If this cpu is the one which updates jiffies, then
422 * give up the assignment and let it be taken by the 450 * give up the assignment and let it be taken by the
423 * cpu which runs the tick timer next, which might be 451 * cpu which runs the tick timer next, which might be
424 * this cpu as well. If we don't drop this here the 452 * this cpu as well. If we don't drop this here the
425 * jiffies might be stale and do_timer() never 453 * jiffies might be stale and do_timer() never
426 * invoked. Keep track of the fact that it was the one 454 * invoked. Keep track of the fact that it was the one
427 * which had the do_timer() duty last. If this cpu is 455 * which had the do_timer() duty last. If this cpu is
428 * the one which had the do_timer() duty last, we 456 * the one which had the do_timer() duty last, we
429 * limit the sleep time to the timekeeping 457 * limit the sleep time to the timekeeping
430 * max_deferment value which we retrieved 458 * max_deferment value which we retrieved
431 * above. Otherwise we can sleep as long as we want. 459 * above. Otherwise we can sleep as long as we want.
432 */ 460 */
433 if (cpu == tick_do_timer_cpu) { 461 if (cpu == tick_do_timer_cpu) {
434 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 462 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
435 ts->do_timer_last = 1; 463 ts->do_timer_last = 1;
436 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 464 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
437 time_delta = KTIME_MAX; 465 time_delta = KTIME_MAX;
438 ts->do_timer_last = 0; 466 ts->do_timer_last = 0;
439 } else if (!ts->do_timer_last) { 467 } else if (!ts->do_timer_last) {
440 time_delta = KTIME_MAX; 468 time_delta = KTIME_MAX;
441 } 469 }
442 470
443 /* 471 /*
444 * calculate the expiry time for the next timer wheel 472 * calculate the expiry time for the next timer wheel
445 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 473 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
446 * that there is no timer pending or at least extremely 474 * that there is no timer pending or at least extremely
447 * far into the future (12 days for HZ=1000). In this 475 * far into the future (12 days for HZ=1000). In this
448 * case we set the expiry to the end of time. 476 * case we set the expiry to the end of time.
449 */ 477 */
450 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { 478 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
451 /* 479 /*
452 * Calculate the time delta for the next timer event. 480 * Calculate the time delta for the next timer event.
453 * If the time delta exceeds the maximum time delta 481 * If the time delta exceeds the maximum time delta
454 * permitted by the current clocksource then adjust 482 * permitted by the current clocksource then adjust
455 * the time delta accordingly to ensure the 483 * the time delta accordingly to ensure the
456 * clocksource does not wrap. 484 * clocksource does not wrap.
457 */ 485 */
458 time_delta = min_t(u64, time_delta, 486 time_delta = min_t(u64, time_delta,
459 tick_period.tv64 * delta_jiffies); 487 tick_period.tv64 * delta_jiffies);
460 } 488 }
461 489
462 if (time_delta < KTIME_MAX) 490 if (time_delta < KTIME_MAX)
463 expires = ktime_add_ns(last_update, time_delta); 491 expires = ktime_add_ns(last_update, time_delta);
464 else 492 else
465 expires.tv64 = KTIME_MAX; 493 expires.tv64 = KTIME_MAX;
466 494
467 /* Skip reprogram of event if it's not changed */ 495 /* Skip reprogram of event if it's not changed */
468 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 496 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
469 goto out; 497 goto out;
470 498
471 ret = expires; 499 ret = expires;
472 500
473 /* 501 /*
474 * nohz_stop_sched_tick can be called several times before 502 * nohz_stop_sched_tick can be called several times before
475 * the nohz_restart_sched_tick is called. This happens when 503 * the nohz_restart_sched_tick is called. This happens when
476 * interrupts arrive which do not cause a reschedule. In the 504 * interrupts arrive which do not cause a reschedule. In the
477 * first call we save the current tick time, so we can restart 505 * first call we save the current tick time, so we can restart
478 * the scheduler tick in nohz_restart_sched_tick. 506 * the scheduler tick in nohz_restart_sched_tick.
479 */ 507 */
480 if (!ts->tick_stopped) { 508 if (!ts->tick_stopped) {
481 nohz_balance_enter_idle(cpu); 509 nohz_balance_enter_idle(cpu);
482 calc_load_enter_idle(); 510 calc_load_enter_idle();
483 511
484 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 512 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
485 ts->tick_stopped = 1; 513 ts->tick_stopped = 1;
486 } 514 }
487 515
488 /* 516 /*
489 * If the expiration time == KTIME_MAX, then 517 * If the expiration time == KTIME_MAX, then
490 * in this case we simply stop the tick timer. 518 * in this case we simply stop the tick timer.
491 */ 519 */
492 if (unlikely(expires.tv64 == KTIME_MAX)) { 520 if (unlikely(expires.tv64 == KTIME_MAX)) {
493 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 521 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
494 hrtimer_cancel(&ts->sched_timer); 522 hrtimer_cancel(&ts->sched_timer);
495 goto out; 523 goto out;
496 } 524 }
497 525
498 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 526 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
499 hrtimer_start(&ts->sched_timer, expires, 527 hrtimer_start(&ts->sched_timer, expires,
500 HRTIMER_MODE_ABS_PINNED); 528 HRTIMER_MODE_ABS_PINNED);
501 /* Check, if the timer was already in the past */ 529 /* Check, if the timer was already in the past */
502 if (hrtimer_active(&ts->sched_timer)) 530 if (hrtimer_active(&ts->sched_timer))
503 goto out; 531 goto out;
504 } else if (!tick_program_event(expires, 0)) 532 } else if (!tick_program_event(expires, 0))
505 goto out; 533 goto out;
506 /* 534 /*
507 * We are past the event already. So we crossed a 535 * We are past the event already. So we crossed a
508 * jiffie boundary. Update jiffies and raise the 536 * jiffie boundary. Update jiffies and raise the
509 * softirq. 537 * softirq.
510 */ 538 */
511 tick_do_update_jiffies64(ktime_get()); 539 tick_do_update_jiffies64(ktime_get());
512 } 540 }
513 raise_softirq_irqoff(TIMER_SOFTIRQ); 541 raise_softirq_irqoff(TIMER_SOFTIRQ);
514 out: 542 out:
515 ts->next_jiffies = next_jiffies; 543 ts->next_jiffies = next_jiffies;
516 ts->last_jiffies = last_jiffies; 544 ts->last_jiffies = last_jiffies;
517 ts->sleep_length = ktime_sub(dev->next_event, now); 545 ts->sleep_length = ktime_sub(dev->next_event, now);
518 546
519 return ret; 547 return ret;
520 } 548 }
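The interplay of the do_timer duty and timekeeping_max_deferment() above is easiest to see with numbers. A userspace model (illustrative only, ignoring the NEXT_TIMER_MAX_DELTA special case):

#include <stdio.h>
#include <stdint.h>

#define KTIME_MAX	INT64_MAX

static int64_t next_expiry(int64_t last_update_ns, uint64_t max_defer_ns,
			   uint64_t period_ns, unsigned long delta_jiffies,
			   int is_do_timer_cpu)
{
	/* the duty holder is capped by the clocksource deferment limit */
	uint64_t time_delta = is_do_timer_cpu ? max_defer_ns : (uint64_t)KTIME_MAX;

	/* everybody is capped by the next timer wheel event */
	if (period_ns * delta_jiffies < time_delta)
		time_delta = period_ns * delta_jiffies;

	return time_delta < (uint64_t)KTIME_MAX ?
		last_update_ns + (int64_t)time_delta : KTIME_MAX;
}

int main(void)
{
	/* HZ=1000, next timer 500 jiffies away, clocksource defers 200 ms */
	printf("duty holder: %lld ns\n",
	       (long long)next_expiry(0, 200000000, 1000000, 500, 1));
	printf("other cpu  : %lld ns\n",
	       (long long)next_expiry(0, 200000000, 1000000, 500, 0));
	return 0;
}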
521 549
522 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 550 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
523 { 551 {
524 /* 552 /*
525 * If this cpu is offline and it is the one which updates 553 * If this cpu is offline and it is the one which updates
526 * jiffies, then give up the assignment and let it be taken by 554 * jiffies, then give up the assignment and let it be taken by
527 * the cpu which runs the tick timer next. If we don't drop 555 * the cpu which runs the tick timer next. If we don't drop
528 * this here the jiffies might be stale and do_timer() never 556 * this here the jiffies might be stale and do_timer() never
529 * invoked. 557 * invoked.
530 */ 558 */
531 if (unlikely(!cpu_online(cpu))) { 559 if (unlikely(!cpu_online(cpu))) {
532 if (cpu == tick_do_timer_cpu) 560 if (cpu == tick_do_timer_cpu)
533 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 561 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
534 } 562 }
535 563
536 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 564 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
537 return false; 565 return false;
538 566
539 if (need_resched()) 567 if (need_resched())
540 return false; 568 return false;
541 569
542 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 570 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
543 static int ratelimit; 571 static int ratelimit;
544 572
545 if (ratelimit < 10 && 573 if (ratelimit < 10 &&
546 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 574 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
547 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 575 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
548 (unsigned int) local_softirq_pending()); 576 (unsigned int) local_softirq_pending());
549 ratelimit++; 577 ratelimit++;
550 } 578 }
551 return false; 579 return false;
580 }
581
582 if (have_nohz_extended_mask) {
583 /*
584 * Keep the tick alive to guarantee timekeeping progression
585 * if there are full dynticks CPUs around
586 */
587 if (tick_do_timer_cpu == cpu)
588 return false;
589 /*
590 * Boot safety: make sure the timekeeping duty has been
591 * assigned before entering dyntick-idle mode.
592 */
593 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
594 return false;
552 } 595 }
553 596
554 return true; 597 return true;
555 } 598 }
556 599
557 static void __tick_nohz_idle_enter(struct tick_sched *ts) 600 static void __tick_nohz_idle_enter(struct tick_sched *ts)
558 { 601 {
559 ktime_t now, expires; 602 ktime_t now, expires;
560 int cpu = smp_processor_id(); 603 int cpu = smp_processor_id();
561 604
562 now = tick_nohz_start_idle(cpu, ts); 605 now = tick_nohz_start_idle(cpu, ts);
563 606
564 if (can_stop_idle_tick(cpu, ts)) { 607 if (can_stop_idle_tick(cpu, ts)) {
565 int was_stopped = ts->tick_stopped; 608 int was_stopped = ts->tick_stopped;
566 609
567 ts->idle_calls++; 610 ts->idle_calls++;
568 611
569 expires = tick_nohz_stop_sched_tick(ts, now, cpu); 612 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
570 if (expires.tv64 > 0LL) { 613 if (expires.tv64 > 0LL) {
571 ts->idle_sleeps++; 614 ts->idle_sleeps++;
572 ts->idle_expires = expires; 615 ts->idle_expires = expires;
573 } 616 }
574 617
575 if (!was_stopped && ts->tick_stopped) 618 if (!was_stopped && ts->tick_stopped)
576 ts->idle_jiffies = ts->last_jiffies; 619 ts->idle_jiffies = ts->last_jiffies;
577 } 620 }
578 } 621 }
579 622
580 /** 623 /**
581 * tick_nohz_idle_enter - stop the idle tick from the idle task 624 * tick_nohz_idle_enter - stop the idle tick from the idle task
582 * 625 *
583 * When the next event is more than a tick into the future, stop the idle tick 626 * When the next event is more than a tick into the future, stop the idle tick
584 * Called when we start the idle loop. 627 * Called when we start the idle loop.
585 * 628 *
586 * The arch is responsible for calling: 629 * The arch is responsible for calling:
587 * 630 *
588 * - rcu_idle_enter() after its last use of RCU before the CPU is put 631 * - rcu_idle_enter() after its last use of RCU before the CPU is put
589 * to sleep. 632 * to sleep.
590 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. 633 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
591 */ 634 */
592 void tick_nohz_idle_enter(void) 635 void tick_nohz_idle_enter(void)
593 { 636 {
594 struct tick_sched *ts; 637 struct tick_sched *ts;
595 638
596 WARN_ON_ONCE(irqs_disabled()); 639 WARN_ON_ONCE(irqs_disabled());
597 640
598 /* 641 /*
599 * Update the idle state in the scheduler domain hierarchy 642 * Update the idle state in the scheduler domain hierarchy
600 * when tick_nohz_stop_sched_tick() is called from the idle loop. 643 * when tick_nohz_stop_sched_tick() is called from the idle loop.
601 * State will be updated to busy during the first busy tick after 644 * State will be updated to busy during the first busy tick after
602 * exiting idle. 645 * exiting idle.
603 */ 646 */
604 set_cpu_sd_state_idle(); 647 set_cpu_sd_state_idle();
605 648
606 local_irq_disable(); 649 local_irq_disable();
607 650
608 ts = &__get_cpu_var(tick_cpu_sched); 651 ts = &__get_cpu_var(tick_cpu_sched);
609 /* 652 /*
610 * Set ts->inidle unconditionally. Even if the system did not 653 * Set ts->inidle unconditionally. Even if the system did not
611 * switch to nohz mode the cpu frequency governors rely on the 654 * switch to nohz mode the cpu frequency governors rely on the
612 * update of the idle time accounting in tick_nohz_start_idle(). 655 * update of the idle time accounting in tick_nohz_start_idle().
613 */ 656 */
614 ts->inidle = 1; 657 ts->inidle = 1;
615 __tick_nohz_idle_enter(ts); 658 __tick_nohz_idle_enter(ts);
616 659
617 local_irq_enable(); 660 local_irq_enable();
618 } 661 }
619 EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); 662 EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
620 663
621 /** 664 /**
622 * tick_nohz_irq_exit - update next tick event from interrupt exit 665 * tick_nohz_irq_exit - update next tick event from interrupt exit
623 * 666 *
624 * When an interrupt fires while we are idle and it doesn't cause 667 * When an interrupt fires while we are idle and it doesn't cause
625 * a reschedule, it may still add, modify or delete a timer, enqueue 668 * a reschedule, it may still add, modify or delete a timer, enqueue
626 * an RCU callback, etc... 669 * an RCU callback, etc...
627 * So we need to re-calculate and reprogram the next tick event. 670 * So we need to re-calculate and reprogram the next tick event.
628 */ 671 */
629 void tick_nohz_irq_exit(void) 672 void tick_nohz_irq_exit(void)
630 { 673 {
631 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 674 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
632 675
633 if (!ts->inidle) 676 if (!ts->inidle)
634 return; 677 return;
635 678
636 /* Cancel the timer because the CPU has already woken up from the C-states */ 679 /* Cancel the timer because the CPU has already woken up from the C-states */
637 menu_hrtimer_cancel(); 680 menu_hrtimer_cancel();
638 __tick_nohz_idle_enter(ts); 681 __tick_nohz_idle_enter(ts);
639 } 682 }
640 683
641 /** 684 /**
642 * tick_nohz_get_sleep_length - return the length of the current sleep 685 * tick_nohz_get_sleep_length - return the length of the current sleep
643 * 686 *
644 * Called from power state control code with interrupts disabled 687 * Called from power state control code with interrupts disabled
645 */ 688 */
646 ktime_t tick_nohz_get_sleep_length(void) 689 ktime_t tick_nohz_get_sleep_length(void)
647 { 690 {
648 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 691 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
649 692
650 return ts->sleep_length; 693 return ts->sleep_length;
651 } 694 }
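A hypothetical caller in the style of a cpuidle governor (pick_state() and exit_latency_ns are made-up names; only tick_nohz_get_sleep_length() is real): choose the deepest C-state whose exit latency still fits in the expected sleep.

static int pick_state(const u64 *exit_latency_ns, int nstates)
{
	s64 sleep_ns = ktime_to_ns(tick_nohz_get_sleep_length());
	int i;

	/* states are assumed ordered from shallowest to deepest */
	for (i = nstates - 1; i > 0; i--)
		if ((s64)exit_latency_ns[i] < sleep_ns)
			break;
	return i;
}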
652 695
653 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 696 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
654 { 697 {
655 hrtimer_cancel(&ts->sched_timer); 698 hrtimer_cancel(&ts->sched_timer);
656 hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 699 hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
657 700
658 while (1) { 701 while (1) {
659 /* Forward the time to expire in the future */ 702 /* Forward the time to expire in the future */
660 hrtimer_forward(&ts->sched_timer, now, tick_period); 703 hrtimer_forward(&ts->sched_timer, now, tick_period);
661 704
662 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 705 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
663 hrtimer_start_expires(&ts->sched_timer, 706 hrtimer_start_expires(&ts->sched_timer,
664 HRTIMER_MODE_ABS_PINNED); 707 HRTIMER_MODE_ABS_PINNED);
665 /* Check, if the timer was already in the past */ 708 /* Check, if the timer was already in the past */
666 if (hrtimer_active(&ts->sched_timer)) 709 if (hrtimer_active(&ts->sched_timer))
667 break; 710 break;
668 } else { 711 } else {
669 if (!tick_program_event( 712 if (!tick_program_event(
670 hrtimer_get_expires(&ts->sched_timer), 0)) 713 hrtimer_get_expires(&ts->sched_timer), 0))
671 break; 714 break;
672 } 715 }
673 /* Reread time and update jiffies */ 716 /* Reread time and update jiffies */
674 now = ktime_get(); 717 now = ktime_get();
675 tick_do_update_jiffies64(now); 718 tick_do_update_jiffies64(now);
676 } 719 }
677 } 720 }
678 721
679 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 722 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
680 { 723 {
681 /* Update jiffies first */ 724 /* Update jiffies first */
682 tick_do_update_jiffies64(now); 725 tick_do_update_jiffies64(now);
683 update_cpu_load_nohz(); 726 update_cpu_load_nohz();
684 727
685 calc_load_exit_idle(); 728 calc_load_exit_idle();
686 touch_softlockup_watchdog(); 729 touch_softlockup_watchdog();
687 /* 730 /*
688 * Cancel the scheduled timer and restore the tick 731 * Cancel the scheduled timer and restore the tick
689 */ 732 */
690 ts->tick_stopped = 0; 733 ts->tick_stopped = 0;
691 ts->idle_exittime = now; 734 ts->idle_exittime = now;
692 735
693 tick_nohz_restart(ts, now); 736 tick_nohz_restart(ts, now);
694 } 737 }
695 738
696 static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 739 static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
697 { 740 {
698 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 741 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
699 unsigned long ticks; 742 unsigned long ticks;
700 743
701 if (vtime_accounting_enabled()) 744 if (vtime_accounting_enabled())
702 return; 745 return;
703 /* 746 /*
704 * We stopped the tick in idle. update_process_times() would miss the 747 * We stopped the tick in idle. update_process_times() would miss the
705 * time we slept, as it does only 1-tick accounting. 748 * time we slept, as it does only 1-tick accounting.
706 * Enforce that this is accounted to idle! 749 * Enforce that this is accounted to idle!
707 */ 750 */
708 ticks = jiffies - ts->idle_jiffies; 751 ticks = jiffies - ts->idle_jiffies;
709 /* 752 /*
710 * We might be one off. Do not randomly account a huge number of ticks! 753 * We might be one off. Do not randomly account a huge number of ticks!
711 */ 754 */
712 if (ticks && ticks < LONG_MAX) 755 if (ticks && ticks < LONG_MAX)
713 account_idle_ticks(ticks); 756 account_idle_ticks(ticks);
714 #endif 757 #endif
715 } 758 }
716 759
717 /** 760 /**
718 * tick_nohz_idle_exit - restart the idle tick from the idle task 761 * tick_nohz_idle_exit - restart the idle tick from the idle task
719 * 762 *
720 * Restart the idle tick when the CPU is woken up from idle 763 * Restart the idle tick when the CPU is woken up from idle
721 * This also exits the RCU extended quiescent state. The CPU 764 * This also exits the RCU extended quiescent state. The CPU
722 * can use RCU again after this function is called. 765 * can use RCU again after this function is called.
723 */ 766 */
724 void tick_nohz_idle_exit(void) 767 void tick_nohz_idle_exit(void)
725 { 768 {
726 int cpu = smp_processor_id(); 769 int cpu = smp_processor_id();
727 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 770 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
728 ktime_t now; 771 ktime_t now;
729 772
730 local_irq_disable(); 773 local_irq_disable();
731 774
732 WARN_ON_ONCE(!ts->inidle); 775 WARN_ON_ONCE(!ts->inidle);
733 776
734 ts->inidle = 0; 777 ts->inidle = 0;
735 778
736 /* Cancel the timer because the CPU has already woken up from the C-states */ 779 /* Cancel the timer because the CPU has already woken up from the C-states */
737 menu_hrtimer_cancel(); 780 menu_hrtimer_cancel();
738 if (ts->idle_active || ts->tick_stopped) 781 if (ts->idle_active || ts->tick_stopped)
739 now = ktime_get(); 782 now = ktime_get();
740 783
741 if (ts->idle_active) 784 if (ts->idle_active)
742 tick_nohz_stop_idle(cpu, now); 785 tick_nohz_stop_idle(cpu, now);
743 786
744 if (ts->tick_stopped) { 787 if (ts->tick_stopped) {
745 tick_nohz_restart_sched_tick(ts, now); 788 tick_nohz_restart_sched_tick(ts, now);
746 tick_nohz_account_idle_ticks(ts); 789 tick_nohz_account_idle_ticks(ts);
747 } 790 }
748 791
749 local_irq_enable(); 792 local_irq_enable();
750 } 793 }
751 EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); 794 EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
752 795
753 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 796 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
754 { 797 {
755 hrtimer_forward(&ts->sched_timer, now, tick_period); 798 hrtimer_forward(&ts->sched_timer, now, tick_period);
756 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); 799 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
757 } 800 }
758 801
759 /* 802 /*
760 * The nohz low res interrupt handler 803 * The nohz low res interrupt handler
761 */ 804 */
762 static void tick_nohz_handler(struct clock_event_device *dev) 805 static void tick_nohz_handler(struct clock_event_device *dev)
763 { 806 {
764 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 807 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
765 struct pt_regs *regs = get_irq_regs(); 808 struct pt_regs *regs = get_irq_regs();
766 ktime_t now = ktime_get(); 809 ktime_t now = ktime_get();
767 810
768 dev->next_event.tv64 = KTIME_MAX; 811 dev->next_event.tv64 = KTIME_MAX;
769 812
770 tick_sched_do_timer(now); 813 tick_sched_do_timer(now);
771 tick_sched_handle(ts, regs); 814 tick_sched_handle(ts, regs);
772 815
773 while (tick_nohz_reprogram(ts, now)) { 816 while (tick_nohz_reprogram(ts, now)) {
774 now = ktime_get(); 817 now = ktime_get();
775 tick_do_update_jiffies64(now); 818 tick_do_update_jiffies64(now);
776 } 819 }
777 } 820 }
778 821
779 /** 822 /**
780 * tick_nohz_switch_to_nohz - switch to nohz mode 823 * tick_nohz_switch_to_nohz - switch to nohz mode
781 */ 824 */
782 static void tick_nohz_switch_to_nohz(void) 825 static void tick_nohz_switch_to_nohz(void)
783 { 826 {
784 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 827 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
785 ktime_t next; 828 ktime_t next;
786 829
787 if (!tick_nohz_enabled) 830 if (!tick_nohz_enabled)
788 return; 831 return;
789 832
790 local_irq_disable(); 833 local_irq_disable();
791 if (tick_switch_to_oneshot(tick_nohz_handler)) { 834 if (tick_switch_to_oneshot(tick_nohz_handler)) {
792 local_irq_enable(); 835 local_irq_enable();
793 return; 836 return;
794 } 837 }
795 838
796 ts->nohz_mode = NOHZ_MODE_LOWRES; 839 ts->nohz_mode = NOHZ_MODE_LOWRES;
797 840
798 /* 841 /*
799 * Recycle the hrtimer in ts, so we can share the 842 * Recycle the hrtimer in ts, so we can share the
800 * hrtimer_forward with the highres code. 843 * hrtimer_forward with the highres code.
801 */ 844 */
802 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 845 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
803 /* Get the next period */ 846 /* Get the next period */
804 next = tick_init_jiffy_update(); 847 next = tick_init_jiffy_update();
805 848
806 for (;;) { 849 for (;;) {
807 hrtimer_set_expires(&ts->sched_timer, next); 850 hrtimer_set_expires(&ts->sched_timer, next);
808 if (!tick_program_event(next, 0)) 851 if (!tick_program_event(next, 0))
809 break; 852 break;
810 next = ktime_add(next, tick_period); 853 next = ktime_add(next, tick_period);
811 } 854 }
812 local_irq_enable(); 855 local_irq_enable();
813 } 856 }
814 857
815 /* 858 /*
816 * When NOHZ is enabled and the tick is stopped, we need to kick the 859 * When NOHZ is enabled and the tick is stopped, we need to kick the
817 * tick timer from irq_enter() so that the jiffies update is kept 860 * tick timer from irq_enter() so that the jiffies update is kept
818 * alive during long running softirqs. That's ugly as hell, but 861 * alive during long running softirqs. That's ugly as hell, but
819 * correctness is key even if we need to fix the offending softirq in 862 * correctness is key even if we need to fix the offending softirq in
820 * the first place. 863 * the first place.
821 * 864 *
822 * Note, this is different to tick_nohz_restart. We just kick the 865 * Note, this is different to tick_nohz_restart. We just kick the
823 * timer and do not touch the other magic bits which need to be done 866 * timer and do not touch the other magic bits which need to be done
824 * when idle is left. 867 * when idle is left.
825 */ 868 */
826 static void tick_nohz_kick_tick(int cpu, ktime_t now) 869 static void tick_nohz_kick_tick(int cpu, ktime_t now)
827 { 870 {
828 #if 0 871 #if 0
829 /* Switch back to 2.6.27 behaviour */ 872 /* Switch back to 2.6.27 behaviour */
830 873
831 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 874 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
832 ktime_t delta; 875 ktime_t delta;
833 876
834 /* 877 /*
835 * Do not touch the tick device, when the next expiry is either 878 * Do not touch the tick device, when the next expiry is either
836 * already reached or less/equal than the tick period. 879 * already reached or less/equal than the tick period.
837 */ 880 */
838 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 881 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
839 if (delta.tv64 <= tick_period.tv64) 882 if (delta.tv64 <= tick_period.tv64)
840 return; 883 return;
841 884
842 tick_nohz_restart(ts, now); 885 tick_nohz_restart(ts, now);
843 #endif 886 #endif
844 } 887 }
845 888
846 static inline void tick_check_nohz(int cpu) 889 static inline void tick_check_nohz(int cpu)
847 { 890 {
848 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 891 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
849 ktime_t now; 892 ktime_t now;
850 893
851 if (!ts->idle_active && !ts->tick_stopped) 894 if (!ts->idle_active && !ts->tick_stopped)
852 return; 895 return;
853 now = ktime_get(); 896 now = ktime_get();
854 if (ts->idle_active) 897 if (ts->idle_active)
855 tick_nohz_stop_idle(cpu, now); 898 tick_nohz_stop_idle(cpu, now);
856 if (ts->tick_stopped) { 899 if (ts->tick_stopped) {
857 tick_nohz_update_jiffies(now); 900 tick_nohz_update_jiffies(now);
858 tick_nohz_kick_tick(cpu, now); 901 tick_nohz_kick_tick(cpu, now);
859 } 902 }
860 } 903 }
861 904
862 #else 905 #else
863 906
864 static inline void tick_nohz_switch_to_nohz(void) { } 907 static inline void tick_nohz_switch_to_nohz(void) { }
865 static inline void tick_check_nohz(int cpu) { } 908 static inline void tick_check_nohz(int cpu) { }
866 909
867 #endif /* NO_HZ */ 910 #endif /* NO_HZ */
868 911
869 /* 912 /*
870 * Called from irq_enter to notify about the possible interruption of idle() 913 * Called from irq_enter to notify about the possible interruption of idle()
871 */ 914 */
872 void tick_check_idle(int cpu) 915 void tick_check_idle(int cpu)
873 { 916 {
874 tick_check_oneshot_broadcast(cpu); 917 tick_check_oneshot_broadcast(cpu);
875 tick_check_nohz(cpu); 918 tick_check_nohz(cpu);
876 } 919 }
877 920
878 /* 921 /*
879 * High resolution timer specific code 922 * High resolution timer specific code
880 */ 923 */
881 #ifdef CONFIG_HIGH_RES_TIMERS 924 #ifdef CONFIG_HIGH_RES_TIMERS
882 /* 925 /*
883 * We rearm the timer until we get disabled by the idle code. 926 * We rearm the timer until we get disabled by the idle code.
884 * Called with interrupts disabled. 927 * Called with interrupts disabled.
885 */ 928 */
886 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 929 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
887 { 930 {
888 struct tick_sched *ts = 931 struct tick_sched *ts =
889 container_of(timer, struct tick_sched, sched_timer); 932 container_of(timer, struct tick_sched, sched_timer);
890 struct pt_regs *regs = get_irq_regs(); 933 struct pt_regs *regs = get_irq_regs();
891 ktime_t now = ktime_get(); 934 ktime_t now = ktime_get();
892 935
893 tick_sched_do_timer(now); 936 tick_sched_do_timer(now);
894 937
895 /* 938 /*
896 * Do not call, when we are not in irq context and have 939 * Do not call, when we are not in irq context and have
897 * no valid regs pointer 940 * no valid regs pointer
898 */ 941 */
899 if (regs) 942 if (regs)
900 tick_sched_handle(ts, regs); 943 tick_sched_handle(ts, regs);
901 944
902 hrtimer_forward(timer, now, tick_period); 945 hrtimer_forward(timer, now, tick_period);
903 946
904 return HRTIMER_RESTART; 947 return HRTIMER_RESTART;
905 } 948 }
906 949
907 static int sched_skew_tick; 950 static int sched_skew_tick;
908 951
909 static int __init skew_tick(char *str) 952 static int __init skew_tick(char *str)
910 { 953 {
911 get_option(&str, &sched_skew_tick); 954 get_option(&str, &sched_skew_tick);
912 955
913 return 0; 956 return 0;
914 } 957 }
915 early_param("skew_tick", skew_tick); 958 early_param("skew_tick", skew_tick);
916 959
/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ts->sched_timer.function = tick_sched_timer;

	/* Get the next period (per cpu) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert jiffies_lock contention. */
	if (sched_skew_tick) {
		u64 offset = ktime_to_ns(tick_period) >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}

	for (;;) {
		hrtimer_forward(&ts->sched_timer, now, tick_period);
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED);
		/* Check whether the timer already expired in the past */
		if (hrtimer_active(&ts->sched_timer))
			break;
		now = ktime_get();
	}

#ifdef CONFIG_NO_HZ
	if (tick_nohz_enabled)
		ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}
#endif /* HIGH_RES_TIMERS */
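To make the skew arithmetic concrete, here is a hypothetical user-space rendering of the same computation; the HZ value and CPU count are assumed for illustration, not taken from the patch:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* assume HZ=250, i.e. a 4 ms tick period, and 4 possible CPUs */
		const uint64_t tick_period_ns = 4000000;
		const unsigned int ncpus = 4;
		/* half the period divided across CPUs, mirroring the do_div() above */
		uint64_t offset = (tick_period_ns >> 1) / ncpus;
		unsigned int cpu;

		for (cpu = 0; cpu < ncpus; cpu++)
			printf("cpu%u tick offset: %llu ns\n", cpu,
			       (unsigned long long)(offset * cpu));
		return 0;
	}

This prints offsets of 0, 500000, 1000000, and 1500000 ns, so the per-CPU ticks are spread across the first half of the period instead of all contending for jiffies_lock on the same edge.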
#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

# ifdef CONFIG_HIGH_RES_TIMERS
	if (ts->sched_timer.base)
		hrtimer_cancel(&ts->sched_timer);
# endif

	ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
#endif
/**
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}
/**
 * Check whether a change happened that makes oneshot mode possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
 * mode, because high resolution timers are disabled (either at
 * compile time or at runtime).
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}
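A return value of 1 tells the caller that oneshot mode is possible but the switch is left to it; a return of 0 means either nothing changed or the low-res nohz switch was already performed here. The caller in kernel/hrtimer.c looks roughly like this (a simplified sketch of hrtimer_run_pending(), not the verbatim source):

	/* in the low-resolution timer softirq path */
	void hrtimer_run_pending(void)
	{
		if (hrtimer_hres_active())
			return;

		/*
		 * If high resolution timers are enabled, request the switch
		 * to highres mode; otherwise allow dropping into low-res
		 * nohz mode via tick_nohz_switch_to_nohz().
		 */
		if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
			hrtimer_switch_to_hres();
	}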