Commit c28800a9c3caaf387d85ac665a25ebe99e480295

Authored by Linus Torvalds

Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  hrtimer: Fix extra wakeups from __remove_hrtimer()
  timekeeping: add arch_offset hook to ktime_get functions
  clocksource: Avoid selecting mult values that might overflow when adjusted
  time: Improve documentation of timekeeping_adjust()

Showing 4 changed files

include/linux/clocksource.h
... ... @@ -156,6 +156,7 @@
156 156 * @mult: cycle to nanosecond multiplier
157 157 * @shift: cycle to nanosecond divisor (power of two)
158 158 * @max_idle_ns: max idle time permitted by the clocksource (nsecs)
  159 + * @maxadj: maximum adjustment value to mult (~11%)
159 160 * @flags: flags describing special properties
160 161 * @archdata: arch-specific data
161 162 * @suspend: suspend function for the clocksource, if necessary
... ... @@ -172,7 +173,7 @@
172 173 u32 mult;
173 174 u32 shift;
174 175 u64 max_idle_ns;
175   -
  176 + u32 maxadj;
176 177 #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
177 178 struct arch_clocksource_data archdata;
178 179 #endif
kernel/hrtimer.c
... ... @@ -885,10 +885,13 @@
885 885 struct hrtimer_clock_base *base,
886 886 unsigned long newstate, int reprogram)
887 887 {
  888 + struct timerqueue_node *next_timer;
888 889 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
889 890 goto out;
890 891  
891   - if (&timer->node == timerqueue_getnext(&base->active)) {
  892 + next_timer = timerqueue_getnext(&base->active);
  893 + timerqueue_del(&base->active, &timer->node);
  894 + if (&timer->node == next_timer) {
892 895 #ifdef CONFIG_HIGH_RES_TIMERS
893 896 /* Reprogram the clock event device, if enabled */
894 897 if (reprogram && hrtimer_hres_active()) {
... ... @@ -901,7 +904,6 @@
901 904 }
902 905 #endif
903 906 }
904   - timerqueue_del(&base->active, &timer->node);
905 907 if (!timerqueue_getnext(&base->active))
906 908 base->cpu_base->active_bases &= ~(1 << base->index);
907 909 out:
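
Annotation: the reason the del is hoisted above the reprogram check is that the reprogram path (hrtimer_force_reprogram()) recomputes the next event by scanning the queue; if the dying timer is still enqueued, the clock event device gets armed for an expiry nothing waits for, costing an extra wakeup. A toy user-space sketch of that ordering hazard — the expires/queued arrays and next_event() are invented stand-ins, not kernel APIs:

    #include <stdio.h>

    /* Toy stand-in for a timer queue: expiry times plus a queued flag. */
    #define NTIMERS 3
    static long expires[NTIMERS] = { 100, 200, 300 };
    static int  queued[NTIMERS]  = { 1, 1, 1 };

    /* Stand-in for the reprogram path: find the earliest queued expiry. */
    static long next_event(void)
    {
        long next = -1;
        for (int i = 0; i < NTIMERS; i++)
            if (queued[i] && (next < 0 || expires[i] < next))
                next = expires[i];
        return next;
    }

    int main(void)
    {
        int victim = 0; /* cancel the timer expiring at t=100 */

        /* Old order: compute the next event while the victim is still
         * queued -- the device would be armed for t=100, an expiry
         * nobody waits for anymore. */
        printf("next if reprogrammed before del: %ld\n", next_event());

        /* Fixed order: delete first, then compute the next event. */
        queued[victim] = 0;
        printf("next if reprogrammed after del:  %ld\n", next_event());
        return 0;
    }

The diff achieves the same effect by snapshotting the old queue head into next_timer, calling timerqueue_del(), and only then letting the reprogram path scan the now victim-free queue.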
kernel/time/clocksource.c
... ... @@ -492,6 +492,22 @@
492 492 }
493 493  
494 494 /**
  495 + * clocksource_max_adjustment - Returns max adjustment amount
  496 + * @cs: Pointer to clocksource
  497 + *
  498 + */
  499 +static u32 clocksource_max_adjustment(struct clocksource *cs)
  500 +{
  501 + u64 ret;
  502 + /*
  503 + * We won't try to correct for more than 11% adjustments (110,000 ppm).
  504 + */
  505 + ret = (u64)cs->mult * 11;
  506 + do_div(ret, 100);
  507 + return (u32)ret;
  508 +}
  509 +
  510 +/**
495 511 * clocksource_max_deferment - Returns max time the clocksource can be deferred
496 512 * @cs: Pointer to clocksource
497 513 *
498 514  
499 515  
500 516  
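
Annotation: the new helper just computes 11% of mult, widening to 64 bits first so the multiply cannot wrap. A user-space rendering of the same arithmetic — the sample mult is arbitrary, not taken from any real clocksource:

    #include <stdint.h>
    #include <stdio.h>

    /* 11% of mult, computed in 64 bits; mirrors the do_div() version. */
    static uint32_t max_adjustment(uint32_t mult)
    {
        uint64_t ret = (uint64_t)mult * 11;
        return (uint32_t)(ret / 100);   /* do_div(ret, 100) in kernel terms */
    }

    int main(void)
    {
        uint32_t mult = 2863311531u;    /* arbitrary large example value */
        printf("mult=%u -> maxadj=%u\n", mult, max_adjustment(mult));
        return 0;
    }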
... ... @@ -503,25 +519,28 @@
503 519 /*
504 520 * Calculate the maximum number of cycles that we can pass to the
505 521 * cyc2ns function without overflowing a 64-bit signed result. The
506   - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
507   - * is equivalent to the below.
508   - * max_cycles < (2^63)/cs->mult
509   - * max_cycles < 2^(log2((2^63)/cs->mult))
510   - * max_cycles < 2^(log2(2^63) - log2(cs->mult))
511   - * max_cycles < 2^(63 - log2(cs->mult))
512   - * max_cycles < 1 << (63 - log2(cs->mult))
  522 + * maximum number of cycles is equal to LLONG_MAX/(cs->mult+cs->maxadj)
  523 + * which is equivalent to the below.
  524 + * max_cycles < (2^63)/(cs->mult + cs->maxadj)
  525 + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
  526 + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
  527 + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
  528 + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
513 529 * Please note that we add 1 to the result of the log2 to account for
514 530 * any rounding errors, ensuring the above inequality is satisfied and
515 531 * that no overflow will occur.
516 532 */
517   - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
  533 + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
518 534  
519 535 /*
520 536 * The actual maximum number of cycles we can defer the clocksource is
521 537 * determined by the minimum of max_cycles and cs->mask.
  538 + * Note: Here we subtract the maxadj to make sure we don't sleep for
  539 + * too long if there's a large negative adjustment.
522 540 */
523 541 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
524   - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
  542 + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
  543 + cs->shift);
525 544  
526 545 /*
527 546 * To ensure that the clocksource does not wrap whilst we are idle,
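
Annotation: plugging numbers into the derivation above may help. The sketch below recomputes max_cycles and the deferment bound with an invented mult/shift/mask. The key asymmetry is that the log2 bound uses the *raised* rate (mult + maxadj), so the conversion can never overflow, while the nanosecond conversion uses the *lowered* rate (mult - maxadj), so a large negative NTP adjustment cannot make us sleep past the wrap point. (The kernel then shaves a further safety margin off this bound, past the end of the hunk.)

    #include <stdint.h>
    #include <stdio.h>

    /* floor(log2(v)) for a nonzero u32; stand-in for the kernel's ilog2() */
    static int ilog2_u32(uint32_t v)
    {
        int l = -1;
        while (v) { v >>= 1; l++; }
        return l;
    }

    int main(void)
    {
        uint32_t mult  = 4194304;   /* invented example: 2^22 */
        uint32_t shift = 22;        /* so one cycle ~= one nanosecond */
        uint32_t maxadj = (uint32_t)((uint64_t)mult * 11 / 100);
        uint64_t mask = 0xffffffffffffffffull;  /* 64-bit counter */

        /* 1 << (63 - (ilog2(mult + maxadj) + 1)), as derived above */
        uint64_t max_cycles = 1ull << (63 - (ilog2_u32(mult + maxadj) + 1));
        if (max_cycles > mask)
            max_cycles = mask;

        /* Convert with the slowest adjusted rate, per the note above. */
        uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

        printf("maxadj=%u max_cycles=%llu max_nsecs=%llu\n", maxadj,
               (unsigned long long)max_cycles,
               (unsigned long long)max_nsecs);
        return 0;
    }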
... ... @@ -640,7 +659,6 @@
640 659 void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
641 660 {
642 661 u64 sec;
643   -
644 662 /*
645 663 * Calc the maximum number of seconds which we can run before
646 664 * wrapping around. For clocksources which have a mask > 32bit
... ... @@ -661,6 +679,20 @@
661 679  
662 680 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
663 681 NSEC_PER_SEC / scale, sec * scale);
  682 +
  683 + /*
  684 + * Since mult may be adjusted by NTP, leave room for the maximum
  685 + * adjustment: for clocksources with large mults, reduce mult/shift
  686 + * until mult +/- maxadj can no longer overflow.
  687 + */
  688 + cs->maxadj = clocksource_max_adjustment(cs);
  689 + while ((cs->mult + cs->maxadj < cs->mult)
  690 + || (cs->mult - cs->maxadj > cs->mult)) {
  691 + cs->mult >>= 1;
  692 + cs->shift--;
  693 + cs->maxadj = clocksource_max_adjustment(cs);
  694 + }
  695 +
664 696 cs->max_idle_ns = clocksource_max_deferment(cs);
665 697 }
666 698 EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
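
Annotation: the while loop trades resolution for headroom. Each halving of mult costs one bit of shift (so cyc2ns stays consistent) and shrinks maxadj with it, terminating once mult + maxadj fits in a u32; clocksource_register() in the next hunk, by contrast, only WARNs rather than degrading the caller's chosen mult. A standalone sketch with a contrived oversized mult so the overflow check actually fires:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t mult  = 4000000000u;   /* contrived: near UINT32_MAX */
        uint32_t shift = 31;
        uint32_t maxadj = (uint32_t)((uint64_t)mult * 11 / 100);

        /* mult + maxadj wrapping below mult means u32 overflow
         * (the kernel loop also checks the mult - maxadj side). */
        while (mult + maxadj < mult) {
            mult >>= 1;
            shift--;
            maxadj = (uint32_t)((uint64_t)mult * 11 / 100);
        }
        printf("mult=%u shift=%u maxadj=%u\n", mult, shift, maxadj);
        return 0;
    }

With the values above, one halving suffices: mult drops to 2000000000, shift to 30, and mult + maxadj now fits.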
... ... @@ -701,6 +733,12 @@
701 733 */
702 734 int clocksource_register(struct clocksource *cs)
703 735 {
  736 + /* calculate max adjustment for given mult/shift */
  737 + cs->maxadj = clocksource_max_adjustment(cs);
  738 + WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
  739 + "Clocksource %s might overflow on 11%% adjustment\n",
  740 + cs->name);
  741 +
704 742 /* calculate max idle time permitted for this clocksource */
705 743 cs->max_idle_ns = clocksource_max_deferment(cs);
706 744  
kernel/time/timekeeping.c
... ... @@ -249,6 +249,8 @@
249 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 251 nsecs += timekeeping_get_ns();
  252 + /* If arch requires, add in gettimeoffset() */
  253 + nsecs += arch_gettimeoffset();
252 254  
253 255 } while (read_seqretry(&xtime_lock, seq));
254 256 /*
... ... @@ -280,6 +282,8 @@
280 282 *ts = xtime;
281 283 tomono = wall_to_monotonic;
282 284 nsecs = timekeeping_get_ns();
  285 + /* If arch requires, add in gettimeoffset() */
  286 + nsecs += arch_gettimeoffset();
283 287  
284 288 } while (read_seqretry(&xtime_lock, seq));
285 289  
286 290  
287 291  
288 292  
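
Annotation: both hunks above add the same hook call so that architectures still deriving sub-tick time from a legacy timer can contribute their nanosecond offset to ktime_get() and ktime_get_ts(). The hook pattern is an inline stub that compiles to nothing elsewhere; the sketch below is an approximation of the era's header, not a verbatim copy:

    typedef unsigned int u32;   /* kernel typedef, shown for self-containment */

    /* Archs selecting the legacy-offset scheme supply a real
     * arch_gettimeoffset(); everyone else gets a zero inline, so the
     * "nsecs += arch_gettimeoffset();" lines above cost nothing. */
    #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
    extern u32 arch_gettimeoffset(void);    /* nsecs since the last tick */
    #else
    static inline u32 arch_gettimeoffset(void)
    {
        return 0;
    }
    #endif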
... ... @@ -802,14 +806,44 @@
802 806 s64 error, interval = timekeeper.cycle_interval;
803 807 int adj;
804 808  
  809 + /*
  810 + * The point of this is to check if the error is greater than half
  811 + * an interval.
  812 + *
  813 + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
  814 + *
  815 + * Note we subtract one in the shift, so that error is really error*2.
  816 + * This "saves" dividing (shifting) interval twice, but keeps the
  817 + * (error > interval) comparison as still measuring if error is
  818 + * larger than half an interval.
  819 + *
  820 + * Note: It does not "save" on aggravation when reading the code.
  821 + */
805 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
806 823 if (error > interval) {
  824 + /*
  825 + * We now divide error by 4 (via shift), which checks if
  826 + * the error is greater than twice the interval.
  827 + * If it is greater, we need a bigadjust; if it's smaller,
  828 + * we can adjust by 1.
  829 + */
807 830 error >>= 2;
  831 + /*
  832 + * XXX - In update_wall_time, we round up to the next
  833 + * nanosecond, and store the amount rounded up into
  834 + * the error. This causes the likely below to be unlikely.
  835 + *
  836 + * The proper fix is to avoid rounding up by using
  837 + * the high precision timekeeper.xtime_nsec instead of
  838 + * xtime.tv_nsec everywhere. Fixing this will take some
  839 + * time.
  840 + */
808 841 if (likely(error <= interval))
809 842 adj = 1;
810 843 else
811 844 adj = timekeeping_bigadjust(error, &interval, &offset);
812 845 } else if (error < -interval) {
  846 + /* See comment above; this is the same check with the sign flipped */
813 847 error >>= 2;
814 848 if (likely(error >= -interval)) {
815 849 adj = -1;
816 850  
... ... @@ -817,9 +851,65 @@
817 851 offset = -offset;
818 852 } else
819 853 adj = timekeeping_bigadjust(error, &interval, &offset);
820   - } else
  854 + } else /* No adjustment needed */
821 855 return;
822 856  
  857 + WARN_ONCE(timekeeper.clock->maxadj &&
  858 + (timekeeper.mult + adj > timekeeper.clock->mult +
  859 + timekeeper.clock->maxadj),
  860 + "Adjusting %s more than 11%% (%ld vs %ld)\n",
  861 + timekeeper.clock->name, (long)timekeeper.mult + adj,
  862 + (long)timekeeper.clock->mult +
  863 + timekeeper.clock->maxadj);
  864 + /*
  865 + * So the following can be confusing.
  866 + *
  867 + * To keep things simple, let's assume adj == 1 for now.
  868 + *
  869 + * When adj != 1, remember that the interval and offset values
  870 + * have been appropriately scaled so the math is the same.
  871 + *
  872 + * The basic idea here is that we're increasing the multiplier
  873 + * by one; this causes the xtime_interval to be incremented by
  874 + * one cycle_interval. This is because:
  875 + * xtime_interval = cycle_interval * mult
  876 + * So if mult is being incremented by one:
  877 + * xtime_interval = cycle_interval * (mult + 1)
  878 + * It's the same as:
  879 + * xtime_interval = (cycle_interval * mult) + cycle_interval
  880 + * Which can be shortened to:
  881 + * xtime_interval += cycle_interval
  882 + *
  883 + * So offset stores the non-accumulated cycles. Thus the current
  884 + * time (in shifted nanoseconds) is:
  885 + * now = (offset * adj) + xtime_nsec
  886 + * Now, even though we're adjusting the clock frequency, we have
  887 + * to keep time consistent. In other words, we can't jump back
  888 + * in time, and we also want to avoid jumping forward in time.
  889 + *
  890 + * So given the same offset value, we need the time to be the same
  891 + * both before and after the freq adjustment.
  892 + * now = (offset * adj_1) + xtime_nsec_1
  893 + * now = (offset * adj_2) + xtime_nsec_2
  894 + * So:
  895 + * (offset * adj_1) + xtime_nsec_1 =
  896 + * (offset * adj_2) + xtime_nsec_2
  897 + * And we know:
  898 + * adj_2 = adj_1 + 1
  899 + * So:
  900 + * (offset * adj_1) + xtime_nsec_1 =
  901 + * (offset * (adj_1+1)) + xtime_nsec_2
  902 + * (offset * adj_1) + xtime_nsec_1 =
  903 + * (offset * adj_1) + offset + xtime_nsec_2
  904 + * Canceling the sides:
  905 + * xtime_nsec_1 = offset + xtime_nsec_2
  906 + * Which gives us:
  907 + * xtime_nsec_2 = xtime_nsec_1 - offset
  908 + * Which simplifies to:
  909 + * xtime_nsec -= offset
  910 + *
  911 + * XXX - TODO: Doc ntp_error calculation.
  912 + */
823 913 timekeeper.mult += adj;
824 914 timekeeper.xtime_interval += interval;
825 915 timekeeper.xtime_nsec -= offset;
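
Annotation: the long derivation above boils down to one invariant: for a fixed offset, (offset * mult) + xtime_nsec must read the same before and after the frequency nudge, which is exactly why xtime_nsec is decremented by offset when mult is bumped. A small user-space check with invented values (the comment's adj_1/adj_2 play the role of the old and new mult here):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t offset     = 123456;       /* non-accumulated cycles */
        uint64_t mult       = 1000;         /* current multiplier */
        uint64_t xtime_nsec = 987654321;    /* shifted-ns accumulator */

        uint64_t before = offset * mult + xtime_nsec;

        mult += 1;              /* timekeeper.mult += adj (adj == 1)      */
        xtime_nsec -= offset;   /* timekeeper.xtime_nsec -= offset        */

        uint64_t after = offset * mult + xtime_nsec;

        printf("before=%llu after=%llu %s\n",
               (unsigned long long)before, (unsigned long long)after,
               before == after ? "(consistent)" : "(BUG)");
        return 0;
    }

Both readings come out equal, confirming that the adjustment changes the clock's future rate without making the current time jump in either direction.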