Commit 6ff7041dbfeb3bd7dfe9aa67275c21199ef760d6

Authored by Thomas Gleixner
1 parent 7e0c5086c1

hrtimer: Fix migration expiry check

The timer migration expiry check should prevent the migration of a
timer to another CPU when the timer expires before the next event is
scheduled on the other CPU. Migrating the timer might delay it because
we cannot reprogram the clock event device on the other CPU. But the
code implementing that check has two flaws:

- for !HIGHRES the check compares the expiry value with the clock
  event device's expiry value, which is wrong for CLOCK_REALTIME based
  timers: the expiry is compared without subtracting the clock base
  offset, i.e. without converting it to CLOCK_MONOTONIC time.

- the check is racy. It holds the hrtimer base lock of the target CPU,
  but the clock event device expiry value can still be modified
  underneath it, e.g. by a timer interrupt firing on the target CPU.
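
To make the race concrete, here is a condensed sketch of the removed
check (the HIGHRES variant, simplified from the diff below; it is not
compilable as is):

    /* Old switch_hrtimer_base() logic, condensed: */
    spin_lock(&new_base->cpu_base->lock);

    expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
    next = clockevents_get_next_event(cpu); /* reads dev->next_event */

    /*
     * RACE: only the hrtimer base lock is held here. A timer
     * interrupt on the target CPU can reprogram its clock event
     * device at any point, so 'next' may be stale when compared.
     */
    if (ktime_sub(expires, next).tv64 < 0) {
            /* expires before the next event: move the timer back */
    }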

The !HIGHRES case is easy to fix: we can enqueue the timer on the
CPU which was selected by the load balancer, as that CPU runs the
idle balancing code once per jiffy anyway. So the maximum delay for
the timer is one jiffy - the same as when we keep the tick going on
the current CPU.
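
This is exactly how the patch encodes it: with CONFIG_HIGH_RES_TIMERS=n
the expiry check compiles to a stub that never vetoes the move, so the
timer is always enqueued on the CPU picked by hrtimer_get_target():

    /* !HIGHRES branch of hrtimer_check_target() from the patch below: */
    static int
    hrtimer_check_target(struct hrtimer *timer,
                         struct hrtimer_clock_base *new_base)
    {
            return 0;       /* never refuse the migration */
    }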

In the HIGHRES case we can get the next expiry value from the hrtimer
cpu_base of the target CPU and serialize the update with the cpu_base
lock. This moves the lock section in hrtimer_interrupt() so we can set
expires_next to KTIME_MAX while we are handling the expired timers,
and set it to the real next expiry value after the timers have been
handled, all under the base lock. While the expired timers are being
processed, timer migration to this CPU is blocked because any timer's
expiry time is always <= KTIME_MAX, so the check refuses the move.
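
Condensed from the hunks below, the serialization contract between the
interrupt handler and the migration path looks like this (both sides
hold cpu_base->lock):

    /* hrtimer_interrupt(), condensed: */
    spin_lock(&cpu_base->lock);
    cpu_base->expires_next.tv64 = KTIME_MAX; /* blocks migration to us */
    /* ... run the expired timers of all clock bases ... */
    cpu_base->expires_next = expires_next;   /* publish real next expiry */
    spin_unlock(&cpu_base->lock);

    /* hrtimer_check_target(), called with the same lock held: while
     * the interrupt handler runs, expires <= KTIME_MAX always holds,
     * so the check refuses any migration to this CPU. */
    return expires.tv64 <= new_base->cpu_base->expires_next.tv64;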

Also remove the now useless clockevents_get_next_event() function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 3 changed files with 64 additions and 77 deletions

include/linux/clockchips.h
... ... @@ -143,13 +143,4 @@
143 143 #endif
144 144  
145 145 #endif
146   -
147   -#ifdef CONFIG_GENERIC_CLOCKEVENTS
148   -extern ktime_t clockevents_get_next_event(int cpu);
149   -#else
150   -static inline ktime_t clockevents_get_next_event(int cpu)
151   -{
152   - return (ktime_t) { .tv64 = KTIME_MAX };
153   -}
154   -#endif
kernel/hrtimer.c
... ... @@ -191,7 +191,47 @@
191 191 }
192 192 }
193 193  
  194 +
194 195 /*
  196 + * Get the preferred target CPU for NOHZ
  197 + */
  198 +static int hrtimer_get_target(int this_cpu, int pinned)
  199 +{
  200 +#ifdef CONFIG_NO_HZ
  201 + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
  202 + int preferred_cpu = get_nohz_load_balancer();
  203 +
  204 + if (preferred_cpu >= 0)
  205 + return preferred_cpu;
  206 + }
  207 +#endif
  208 + return this_cpu;
  209 +}
  210 +
  211 +/*
  212 + * With HIGHRES=y we do not migrate the timer when it is expiring
  213 + * before the next event on the target cpu because we cannot reprogram
  214 + * the target cpu hardware and we would cause it to fire late.
  215 + *
  216 + * Called with cpu_base->lock of target cpu held.
  217 + */
  218 +static int
  219 +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
  220 +{
  221 +#ifdef CONFIG_HIGH_RES_TIMERS
  222 + ktime_t expires;
  223 +
  224 + if (!new_base->cpu_base->hres_active)
  225 + return 0;
  226 +
  227 + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
  228 + return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
  229 +#else
  230 + return 0;
  231 +#endif
  232 +}
  233 +
  234 +/*
195 235 * Switch the timer base to the current CPU when possible.
196 236 */
197 237 static inline struct hrtimer_clock_base *
198 238  
199 239  
... ... @@ -200,35 +240,16 @@
200 240 {
201 241 struct hrtimer_clock_base *new_base;
202 242 struct hrtimer_cpu_base *new_cpu_base;
203   - int cpu, preferred_cpu = -1;
  243 + int this_cpu = smp_processor_id();
  244 + int cpu = hrtimer_get_target(this_cpu, pinned);
204 245  
205   - cpu = smp_processor_id();
206   -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207   - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208   - preferred_cpu = get_nohz_load_balancer();
209   - if (preferred_cpu >= 0) {
210   - /*
211   - * We must not check the expiry value when
212   - * preferred_cpu is the current cpu. If base
213   - * != new_base we would loop forever when the
214   - * timer expires before the current programmed
215   - * next timer event.
216   - */
217   - if (preferred_cpu != cpu)
218   - cpu = preferred_cpu;
219   - else
220   - preferred_cpu = -1;
221   - }
222   - }
223   -#endif
224   -
225 246 again:
226 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
227 248 new_base = &new_cpu_base->clock_base[base->index];
228 249  
229 250 if (base != new_base) {
230 251 /*
231   - * We are trying to schedule the timer on the local CPU.
  252 + * We are trying to move timer to new_base.
232 253 * However we can't change timer's base while it is running,
233 254 * so we keep it on the same CPU. No hassle vs. reprogramming
234 255 * the event source in the high resolution case. The softirq
... ... @@ -244,38 +265,12 @@
244 265 spin_unlock(&base->cpu_base->lock);
245 266 spin_lock(&new_base->cpu_base->lock);
246 267  
247   - /* Optimized away for NOHZ=n SMP=n */
248   - if (cpu == preferred_cpu) {
249   - /* Calculate clock monotonic expiry time */
250   -#ifdef CONFIG_HIGH_RES_TIMERS
251   - ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
252   - new_base->offset);
253   -#else
254   - ktime_t expires = hrtimer_get_expires(timer);
255   -#endif
256   -
257   - /*
258   - * Get the next event on target cpu from the
259   - * clock events layer.
260   - * This covers the highres=off nohz=on case as well.
261   - */
262   - ktime_t next = clockevents_get_next_event(cpu);
263   -
264   - ktime_t delta = ktime_sub(expires, next);
265   -
266   - /*
267   - * We do not migrate the timer when it is expiring
268   - * before the next event on the target cpu because
269   - * we cannot reprogram the target cpu hardware and
270   - * we would cause it to fire late.
271   - */
272   - if (delta.tv64 < 0) {
273   - cpu = smp_processor_id();
274   - spin_unlock(&new_base->cpu_base->lock);
275   - spin_lock(&base->cpu_base->lock);
276   - timer->base = base;
277   - goto again;
278   - }
  268 + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
  269 + cpu = this_cpu;
  270 + spin_unlock(&new_base->cpu_base->lock);
  271 + spin_lock(&base->cpu_base->lock);
  272 + timer->base = base;
  273 + goto again;
279 274 }
280 275 timer->base = new_base;
281 276 }
282 277  
... ... @@ -1287,14 +1282,22 @@
1287 1282  
1288 1283 expires_next.tv64 = KTIME_MAX;
1289 1284  
  1285 + spin_lock(&cpu_base->lock);
  1286 + /*
  1287 + * We set expires_next to KTIME_MAX here with cpu_base->lock
  1288 + * held to prevent that a timer is enqueued in our queue via
  1289 + * the migration code. This does not affect enqueueing of
  1290 + * timers which run their callback and need to be requeued on
  1291 + * this CPU.
  1292 + */
  1293 + cpu_base->expires_next.tv64 = KTIME_MAX;
  1294 +
1290 1295 base = cpu_base->clock_base;
1291 1296  
1292 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1293 1298 ktime_t basenow;
1294 1299 struct rb_node *node;
1295 1300  
1296   - spin_lock(&cpu_base->lock);
1297   -
1298 1301 basenow = ktime_add(now, base->offset);
1299 1302  
1300 1303 while ((node = base->first)) {
1301 1304  
1302 1305  
... ... @@ -1327,11 +1330,15 @@
1327 1330  
1328 1331 __run_hrtimer(timer);
1329 1332 }
1330   - spin_unlock(&cpu_base->lock);
1331 1333 base++;
1332 1334 }
1333 1335  
  1336 + /*
  1337 + * Store the new expiry value so the migration code can verify
  1338 + * against it.
  1339 + */
1334 1340 cpu_base->expires_next = expires_next;
  1341 + spin_unlock(&cpu_base->lock);
1335 1342  
1336 1343 /* Reprogramming necessary ? */
1337 1344 if (expires_next.tv64 != KTIME_MAX) {
kernel/time/clockevents.c
... ... @@ -254,16 +254,5 @@
254 254 spin_unlock(&clockevents_lock);
255 255 }
256 256 EXPORT_SYMBOL_GPL(clockevents_notify);
257   -
258   -ktime_t clockevents_get_next_event(int cpu)
259   -{
260   - struct tick_device *td;
261   - struct clock_event_device *dev;
262   -
263   - td = &per_cpu(tick_cpu_device, cpu);
264   - dev = td->evtdev;
265   -
266   - return dev->next_event;
267   -}
268 257 #endif