Commit 98962465ed9e6ea99c38e0af63fe1dcb5a79dc25

Authored by Jon Hunter
Committed by Thomas Gleixner
1 parent 529eaccd90

nohz: Prevent clocksource wrapping during idle

The dynamic tick allows the kernel to sleep for periods longer than a
single tick, but it currently does not limit the sleep time. In the
worst case the kernel could sleep longer than the wrap-around time of
the timekeeping clocksource, which would result in losing track of
time.

Prevent this by limiting the sleep time to the safe maximum sleep time
of the current timekeeping clocksource. The value is calculated when
the clocksource is registered.

[ tglx: simplified the code a bit and massaged the commit msg ]

Signed-off-by: Jon Hunter <jon-hunter@ti.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1250617512-23567-2-git-send-email-jon-hunter@ti.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
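
To put a number on the failure mode (an illustration; the 32-bit,
32.768 kHz counter below is an assumed example, not taken from the
patch): such a clocksource wraps after 2^32 / 32768 = 131072 seconds,
about 36 hours, so an idle period approaching that length would
silently lose time. A minimal sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed example hardware: a 32-bit counter at 32.768 kHz. */
	uint64_t counter_span = UINT64_C(1) << 32;	/* cycles until wrap */
	uint64_t freq_hz = 32768;
	uint64_t wrap_s = counter_span / freq_hz;

	/* Sleeping longer than this loses track of time. */
	printf("wrap after %llu s (~%.1f hours)\n",
	       (unsigned long long)wrap_s, (double)wrap_s / 3600.0);
	return 0;
}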

Showing 5 changed files with 96 additions and 14 deletions

include/linux/clocksource.h
... ... @@ -151,6 +151,7 @@
151 151 * subtraction of non 64 bit counters
152 152 * @mult: cycle to nanosecond multiplier
153 153 * @shift: cycle to nanosecond divisor (power of two)
  154 + * @max_idle_ns: max idle time permitted by the clocksource (nsecs)
154 155 * @flags: flags describing special properties
155 156 * @vread: vsyscall based read
156 157 * @resume: resume function for the clocksource, if necessary
... ... @@ -168,6 +169,7 @@
168 169 cycle_t mask;
169 170 u32 mult;
170 171 u32 shift;
  172 + u64 max_idle_ns;
171 173 unsigned long flags;
172 174 cycle_t (*vread)(void);
173 175 void (*resume)(void);
include/linux/time.h
... ... @@ -148,6 +148,7 @@
148 148  
149 149 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
150 150 extern int timekeeping_valid_for_hres(void);
  151 +extern u64 timekeeping_max_deferment(void);
151 152 extern void update_wall_time(void);
152 153 extern void update_xtime_cache(u64 nsec);
153 154 extern void timekeeping_leap_insert(int leapsecond);
kernel/time/clocksource.c
... ... @@ -469,6 +469,47 @@
469 469 #ifdef CONFIG_GENERIC_TIME
470 470  
471 471 /**
  472 + * clocksource_max_deferment - Returns max time the clocksource can be deferred
  473 + * @cs: Pointer to clocksource
  474 + *
  475 + */
  476 +static u64 clocksource_max_deferment(struct clocksource *cs)
  477 +{
  478 + u64 max_nsecs, max_cycles;
  479 +
  480 + /*
  481 + * Calculate the maximum number of cycles that we can pass to the
  482 + * cyc2ns function without overflowing a 64-bit signed result. The
  483 + * maximum number of cycles is equal to LLONG_MAX/cs->mult which
  484 + * is equivalent to the below.
  485 + * max_cycles < (2^63)/cs->mult
  486 + * max_cycles < 2^(log2((2^63)/cs->mult))
  487 + * max_cycles < 2^(log2(2^63) - log2(cs->mult))
  488 + * max_cycles < 2^(63 - log2(cs->mult))
  489 + * max_cycles < 1 << (63 - log2(cs->mult))
  490 + * Please note that we add 1 to the result of the log2 to account for
  491 + * the truncation done by ilog2(), ensuring that the above inequality
  492 + * is satisfied and that no overflow will occur.
  493 + */
  494 + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
  495 +
  496 + /*
  497 + * The actual maximum number of cycles we can defer the clocksource is
  498 + * determined by the minimum of max_cycles and cs->mask.
  499 + */
  500 + max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
  501 + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
  502 +
  503 + /*
  504 + * To ensure that the clocksource does not wrap whilst we are idle,
  505 + * limit the time the clocksource can be deferred by 12.5%. Please
  506 + * note a margin of 12.5% is used because this can be computed with
  507 + * a shift, versus say 10% which would require division.
  508 + */
  509 + return max_nsecs - (max_nsecs >> 3);
  510 +}
  511 +
  512 +/**
472 513 * clocksource_select - Select the best clocksource available
473 514 *
474 515 * Private function. Must hold clocksource_mutex when called.
... ... @@ -564,6 +605,9 @@
564 605 */
565 606 int clocksource_register(struct clocksource *cs)
566 607 {
  608 + /* calculate max idle time permitted for this clocksource */
  609 + cs->max_idle_ns = clocksource_max_deferment(cs);
  610 +
567 611 mutex_lock(&clocksource_mutex);
568 612 clocksource_enqueue(cs);
569 613 clocksource_select();
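
Continuing the example above, this userspace sketch mirrors what
clocksource_max_deferment() computes; ilog2(), clocksource_cyc2ns()
and min_t() are replaced by plain C equivalents, and the values of
mult, shift and mask are assumptions for the 32-bit, 32.768 kHz
counter, not values from the patch:

#include <stdint.h>
#include <stdio.h>

static int ilog2_u32(uint32_t x)		/* stand-in for ilog2() */
{
	return 31 - __builtin_clz(x);
}

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;	/* clocksource_cyc2ns() */
}

int main(void)
{
	/* With shift = 10, mult = 10^9 * 2^10 / 32768 = 31250000 exactly. */
	uint32_t mult = 31250000, shift = 10;
	uint64_t mask = 0xffffffffULL;		/* CLOCKSOURCE_MASK(32) */
	uint64_t max_cycles, max_nsecs;

	max_cycles = UINT64_C(1) << (63 - (ilog2_u32(mult) + 1));
	if (max_cycles > mask)			/* min_t(u64, max_cycles, mask) */
		max_cycles = mask;
	max_nsecs = cyc2ns(max_cycles, mult, shift);

	/* Apply the 12.5% safety margin, as in the kernel code. */
	printf("max_idle_ns = %llu\n",
	       (unsigned long long)(max_nsecs - (max_nsecs >> 3)));
	return 0;
}

For these values the result is about 114688 seconds (roughly 32
hours), comfortably inside the ~36 hour wrap time of the counter.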
kernel/time/tick-sched.c
... ... @@ -208,6 +208,7 @@
208 208 struct tick_sched *ts;
209 209 ktime_t last_update, expires, now;
210 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
  211 + u64 time_delta;
211 212 int cpu;
212 213  
213 214 local_irq_save(flags);
... ... @@ -262,6 +263,17 @@
262 263 seq = read_seqbegin(&xtime_lock);
263 264 last_update = last_jiffies_update;
264 265 last_jiffies = jiffies;
  266 +
  267 + /*
  268 + * On SMP we really should only care for the CPU which
  269 + * has the do_timer duty assigned. All other CPUs can
  270 + * sleep as long as they want.
  271 + */
  272 + if (cpu == tick_do_timer_cpu ||
  273 + tick_do_timer_cpu == TICK_DO_TIMER_NONE)
  274 + time_delta = timekeeping_max_deferment();
  275 + else
  276 + time_delta = KTIME_MAX;
265 277 } while (read_seqretry(&xtime_lock, seq));
266 278  
267 279 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
... ... @@ -284,11 +296,26 @@
284 296 if ((long)delta_jiffies >= 1) {
285 297  
286 298 /*
287   - * calculate the expiry time for the next timer wheel
288   - * timer
289   - */
290   - expires = ktime_add_ns(last_update, tick_period.tv64 *
291   - delta_jiffies);
  299 + * calculate the expiry time for the next timer wheel
  300 + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
  301 + * that there is no timer pending or at least extremely
  302 + * far into the future (12 days for HZ=1000). In this
  303 + * case we set the expiry to the end of time.
  304 + */
  305 + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
  306 + /*
  307 + * Calculate the time delta for the next timer event.
  308 + * If the time delta exceeds the maximum time delta
  309 + * permitted by the current clocksource then adjust
  310 + * the time delta accordingly to ensure the
  311 + * clocksource does not wrap.
  312 + */
  313 + time_delta = min_t(u64, time_delta,
  314 + tick_period.tv64 * delta_jiffies);
  315 + expires = ktime_add_ns(last_update, time_delta);
  316 + } else {
  317 + expires.tv64 = KTIME_MAX;
  318 + }
292 319  
293 320 /*
294 321 * If this cpu is the one which updates jiffies, then
... ... @@ -332,21 +359,18 @@
332 359  
333 360 ts->idle_sleeps++;
334 361  
  362 + /* Mark the expiry time */
  363 + ts->idle_expires = expires;
  364 +
335 365 /*
336   - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
337   - * there is no timer pending or at least extremly far
338   - * into the future (12 days for HZ=1000). In this case
339   - * we simply stop the tick timer:
  366 + * If the expiration time == KTIME_MAX, then we
  367 + * simply stop the tick timer.
340 368 */
341   - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
342   - ts->idle_expires.tv64 = KTIME_MAX;
  369 + if (unlikely(expires.tv64 == KTIME_MAX)) {
343 370 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
344 371 hrtimer_cancel(&ts->sched_timer);
345 372 goto out;
346 373 }
347   -
348   - /* Mark expiries */
349   - ts->idle_expires = expires;
350 374  
351 375 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
352 376 hrtimer_start(&ts->sched_timer, expires,
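
Taken together, the hunks above implement a simple rule: only the CPU
holding the do_timer duty (or any CPU while the duty is unclaimed) is
bounded by the clocksource deferment limit, and the bound only bites
when it is shorter than the delta to the next pending timer. A
standalone restatement in plain C (nanosecond integers instead of
ktime_t; the function name is invented for illustration):

#include <stdint.h>

#define KTIME_MAX_NS	INT64_MAX	/* stand-in for KTIME_MAX */

/* do_timer_cpu < 0 models TICK_DO_TIMER_NONE (duty unclaimed). */
static int64_t allowed_sleep_ns(int cpu, int do_timer_cpu,
				int64_t next_timer_delta_ns,
				int64_t max_deferment_ns)
{
	int64_t bound = (cpu == do_timer_cpu || do_timer_cpu < 0) ?
			max_deferment_ns : KTIME_MAX_NS;

	/* Sleep until the next timer, but never past the bound. */
	return next_timer_delta_ns < bound ? next_timer_delta_ns : bound;
}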
kernel/time/timekeeping.c
... ... @@ -478,6 +478,17 @@
478 478 }
479 479  
480 480 /**
  481 + * timekeeping_max_deferment - Returns max time the clocksource can be deferred
  482 + *
  483 + * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
  484 + * ensure that the clocksource does not change!
  485 + */
  486 +u64 timekeeping_max_deferment(void)
  487 +{
  488 + return timekeeper.clock->max_idle_ns;
  489 +}
  490 +
  491 +/**
481 492 * read_persistent_clock - Return time from the persistent clock.
482 493 *
483 494 * Weak dummy function for arches that do not yet support it.
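
The locking requirement stated in the kerneldoc above is exactly what
the tick-sched.c caller follows; isolated here for clarity (this
restates the caller shown earlier, it is not a new API):

	u64 time_delta;
	unsigned long seq;

	do {
		seq = read_seqbegin(&xtime_lock);
		time_delta = timekeeping_max_deferment();
	} while (read_seqretry(&xtime_lock, seq));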