Commit c28800a9c3caaf387d85ac665a25ebe99e480295
Exists in
master
and in
6 other branches
Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  hrtimer: Fix extra wakeups from __remove_hrtimer()
  timekeeping: add arch_offset hook to ktime_get functions
  clocksource: Avoid selecting mult values that might overflow when adjusted
  time: Improve documentation of timekeeeping_adjust()
Showing 4 changed files Side-by-side Diff
include/linux/clocksource.h
... | ... | @@ -156,6 +156,7 @@ |
156 | 156 | * @mult: cycle to nanosecond multiplier |
157 | 157 | * @shift: cycle to nanosecond divisor (power of two) |
158 | 158 | * @max_idle_ns: max idle time permitted by the clocksource (nsecs) |
159 | + * @maxadj: maximum adjustment value to mult (~11%) | |
159 | 160 | * @flags: flags describing special properties |
160 | 161 | * @archdata: arch-specific data |
161 | 162 | * @suspend: suspend function for the clocksource, if necessary |
... | ... | @@ -172,7 +173,7 @@ |
172 | 173 | u32 mult; |
173 | 174 | u32 shift; |
174 | 175 | u64 max_idle_ns; |
175 | - | |
176 | + u32 maxadj; | |
176 | 177 | #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA |
177 | 178 | struct arch_clocksource_data archdata; |
178 | 179 | #endif |
kernel/hrtimer.c
... | ... | @@ -885,10 +885,13 @@ |
885 | 885 | struct hrtimer_clock_base *base, |
886 | 886 | unsigned long newstate, int reprogram) |
887 | 887 | { |
888 | + struct timerqueue_node *next_timer; | |
888 | 889 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
889 | 890 | goto out; |
890 | 891 | |
891 | - if (&timer->node == timerqueue_getnext(&base->active)) { | |
892 | + next_timer = timerqueue_getnext(&base->active); | |
893 | + timerqueue_del(&base->active, &timer->node); | |
894 | + if (&timer->node == next_timer) { | |
892 | 895 | #ifdef CONFIG_HIGH_RES_TIMERS |
893 | 896 | /* Reprogram the clock event device. if enabled */ |
894 | 897 | if (reprogram && hrtimer_hres_active()) { |
... | ... | @@ -901,7 +904,6 @@ |
901 | 904 | } |
902 | 905 | #endif |
903 | 906 | } |
904 | - timerqueue_del(&base->active, &timer->node); | |
905 | 907 | if (!timerqueue_getnext(&base->active)) |
906 | 908 | base->cpu_base->active_bases &= ~(1 << base->index); |
907 | 909 | out: |
kernel/time/clocksource.c
... | ... | @@ -492,6 +492,22 @@ |
492 | 492 | } |
493 | 493 | |
494 | 494 | /** |
495 | + * clocksource_max_adjustment- Returns max adjustment amount | |
496 | + * @cs: Pointer to clocksource | |
497 | + * | |
498 | + */ | |
499 | +static u32 clocksource_max_adjustment(struct clocksource *cs) | |
500 | +{ | |
501 | + u64 ret; | |
502 | + /* | |
503 | + * We won't try to correct for more than 11% adjustments (110,000 ppm). | |
504 | + */ | |
505 | + ret = (u64)cs->mult * 11; | |
506 | + do_div(ret,100); | |
507 | + return (u32)ret; | |
508 | +} | |
509 | + | |
510 | +/** | |
495 | 511 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
496 | 512 | * @cs: Pointer to clocksource |
497 | 513 | * |
498 | 514 | |
499 | 515 | |
500 | 516 | |
... | ... | @@ -503,25 +519,28 @@ |
503 | 519 | /* |
504 | 520 | * Calculate the maximum number of cycles that we can pass to the |
505 | 521 | * cyc2ns function without overflowing a 64-bit signed result. The |
506 | - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | |
507 | - * is equivalent to the below. | |
508 | - * max_cycles < (2^63)/cs->mult | |
509 | - * max_cycles < 2^(log2((2^63)/cs->mult)) | |
510 | - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | |
511 | - * max_cycles < 2^(63 - log2(cs->mult)) | |
512 | - * max_cycles < 1 << (63 - log2(cs->mult)) | |
522 | + * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) | |
523 | + * which is equivalent to the below. | |
524 | + * max_cycles < (2^63)/(cs->mult + cs->maxadj) | |
525 | + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) | |
526 | + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) | |
527 | + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) | |
528 | + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) | |
513 | 529 | * Please note that we add 1 to the result of the log2 to account for |
514 | 530 | * any rounding errors, ensure the above inequality is satisfied and |
515 | 531 | * no overflow will occur. |
516 | 532 | */ |
517 | - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | |
533 | + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); | |
518 | 534 | |
519 | 535 | /* |
520 | 536 | * The actual maximum number of cycles we can defer the clocksource is |
521 | 537 | * determined by the minimum of max_cycles and cs->mask. |
538 | + * Note: Here we subtract the maxadj to make sure we don't sleep for | |
539 | + * too long if there's a large negative adjustment. | |
522 | 540 | */ |
523 | 541 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
524 | - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | |
542 | + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, | |
543 | + cs->shift); | |
525 | 544 | |
526 | 545 | /* |
527 | 546 | * To ensure that the clocksource does not wrap whilst we are idle, |
... | ... | @@ -640,7 +659,6 @@ |
640 | 659 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
641 | 660 | { |
642 | 661 | u64 sec; |
643 | - | |
644 | 662 | /* |
645 | 663 | * Calc the maximum number of seconds which we can run before |
646 | 664 | * wrapping around. For clocksources which have a mask > 32bit |
... | ... | @@ -661,6 +679,20 @@ |
661 | 679 | |
662 | 680 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
663 | 681 | NSEC_PER_SEC / scale, sec * scale); |
682 | + | |
683 | + /* | |
684 | + * for clocksources that have large mults, to avoid overflow. | |
685 | + * Since mult may be adjusted by ntp, add an extra safety margin | |
686 | + * | |
687 | + */ | |
688 | + cs->maxadj = clocksource_max_adjustment(cs); | |
689 | + while ((cs->mult + cs->maxadj < cs->mult) | |
690 | + || (cs->mult - cs->maxadj > cs->mult)) { | |
691 | + cs->mult >>= 1; | |
692 | + cs->shift--; | |
693 | + cs->maxadj = clocksource_max_adjustment(cs); | |
694 | + } | |
695 | + | |
664 | 696 | cs->max_idle_ns = clocksource_max_deferment(cs); |
665 | 697 | } |
666 | 698 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
... | ... | @@ -701,6 +733,12 @@ |
701 | 733 | */ |
702 | 734 | int clocksource_register(struct clocksource *cs) |
703 | 735 | { |
736 | + /* calculate max adjustment for given mult/shift */ | |
737 | + cs->maxadj = clocksource_max_adjustment(cs); | |
738 | + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | |
739 | + "Clocksource %s might overflow on 11%% adjustment\n", | |
740 | + cs->name); | |
741 | + | |
704 | 742 | /* calculate max idle time permitted for this clocksource */ |
705 | 743 | cs->max_idle_ns = clocksource_max_deferment(cs); |
706 | 744 |
kernel/time/timekeeping.c
... | ... | @@ -249,6 +249,8 @@ |
249 | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
250 | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
251 | 251 | nsecs += timekeeping_get_ns(); |
252 | + /* If arch requires, add in gettimeoffset() */ | |
253 | + nsecs += arch_gettimeoffset(); | |
252 | 254 | |
253 | 255 | } while (read_seqretry(&xtime_lock, seq)); |
254 | 256 | /* |
... | ... | @@ -280,6 +282,8 @@ |
280 | 282 | *ts = xtime; |
281 | 283 | tomono = wall_to_monotonic; |
282 | 284 | nsecs = timekeeping_get_ns(); |
285 | + /* If arch requires, add in gettimeoffset() */ | |
286 | + nsecs += arch_gettimeoffset(); | |
283 | 287 | |
284 | 288 | } while (read_seqretry(&xtime_lock, seq)); |
285 | 289 | |
286 | 290 | |
287 | 291 | |
288 | 292 | |
... | ... | @@ -802,14 +806,44 @@ |
802 | 806 | s64 error, interval = timekeeper.cycle_interval; |
803 | 807 | int adj; |
804 | 808 | |
809 | + /* | |
810 | + * The point of this is to check if the error is greater than half | |
811 | + * an interval. | |
812 | + * | |
813 | + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | |
814 | + * | |
815 | + * Note we subtract one in the shift, so that error is really error*2. | |
816 | + * This "saves" dividing (shifting) interval twice, but keeps the | |
817 | + * (error > interval) comparison as still measuring if error is | |
818 | + * larger than half an interval. | |
819 | + * | |
820 | + * Note: It does not "save" on aggravation when reading the code. | |
821 | + */ | |
805 | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
806 | 823 | if (error > interval) { |
824 | + /* | |
825 | + * We now divide error by 4(via shift), which checks if | |
826 | + * the error is greater than twice the interval. | |
827 | + * If it is greater, we need a bigadjust; if it's smaller, | |
828 | + * we can adjust by 1. | |
829 | + */ | |
807 | 830 | error >>= 2; |
831 | + /* | |
832 | + * XXX - In update_wall_time, we round up to the next | |
833 | + * nanosecond, and store the amount rounded up into | |
834 | + * the error. This causes the likely below to be unlikely. | |
835 | + * | |
836 | + * The proper fix is to avoid rounding up by using | |
837 | + * the high precision timekeeper.xtime_nsec instead of | |
838 | + * xtime.tv_nsec everywhere. Fixing this will take some | |
839 | + * time. | |
840 | + */ | |
808 | 841 | if (likely(error <= interval)) |
809 | 842 | adj = 1; |
810 | 843 | else |
811 | 844 | adj = timekeeping_bigadjust(error, &interval, &offset); |
812 | 845 | } else if (error < -interval) { |
846 | + /* See comment above, this is just switched for the negative */ | |
813 | 847 | error >>= 2; |
814 | 848 | if (likely(error >= -interval)) { |
815 | 849 | adj = -1; |
816 | 850 | |
... | ... | @@ -817,9 +851,65 @@ |
817 | 851 | offset = -offset; |
818 | 852 | } else |
819 | 853 | adj = timekeeping_bigadjust(error, &interval, &offset); |
820 | - } else | |
854 | + } else /* No adjustment needed */ | |
821 | 855 | return; |
822 | 856 | |
857 | + WARN_ONCE(timekeeper.clock->maxadj && | |
858 | + (timekeeper.mult + adj > timekeeper.clock->mult + | |
859 | + timekeeper.clock->maxadj), | |
860 | + "Adjusting %s more then 11%% (%ld vs %ld)\n", | |
861 | + timekeeper.clock->name, (long)timekeeper.mult + adj, | |
862 | + (long)timekeeper.clock->mult + | |
863 | + timekeeper.clock->maxadj); | |
864 | + /* | |
865 | + * So the following can be confusing. | |
866 | + * | |
867 | + * To keep things simple, let's assume adj == 1 for now. | |
868 | + * | |
869 | + * When adj != 1, remember that the interval and offset values | |
870 | + * have been appropriately scaled so the math is the same. | |
871 | + * | |
872 | + * The basic idea here is that we're increasing the multiplier | |
873 | + * by one, this causes the xtime_interval to be incremented by | |
874 | + * one cycle_interval. This is because: | |
875 | + * xtime_interval = cycle_interval * mult | |
876 | + * So if mult is being incremented by one: | |
877 | + * xtime_interval = cycle_interval * (mult + 1) | |
878 | + * Its the same as: | |
879 | + * xtime_interval = (cycle_interval * mult) + cycle_interval | |
880 | + * Which can be shortened to: | |
881 | + * xtime_interval += cycle_interval | |
882 | + * | |
883 | + * So offset stores the non-accumulated cycles. Thus the current | |
884 | + * time (in shifted nanoseconds) is: | |
885 | + * now = (offset * adj) + xtime_nsec | |
886 | + * Now, even though we're adjusting the clock frequency, we have | |
887 | + * to keep time consistent. In other words, we can't jump back | |
888 | + * in time, and we also want to avoid jumping forward in time. | |
889 | + * | |
890 | + * So given the same offset value, we need the time to be the same | |
891 | + * both before and after the freq adjustment. | |
892 | + * now = (offset * adj_1) + xtime_nsec_1 | |
893 | + * now = (offset * adj_2) + xtime_nsec_2 | |
894 | + * So: | |
895 | + * (offset * adj_1) + xtime_nsec_1 = | |
896 | + * (offset * adj_2) + xtime_nsec_2 | |
897 | + * And we know: | |
898 | + * adj_2 = adj_1 + 1 | |
899 | + * So: | |
900 | + * (offset * adj_1) + xtime_nsec_1 = | |
901 | + * (offset * (adj_1+1)) + xtime_nsec_2 | |
902 | + * (offset * adj_1) + xtime_nsec_1 = | |
903 | + * (offset * adj_1) + offset + xtime_nsec_2 | |
904 | + * Canceling the sides: | |
905 | + * xtime_nsec_1 = offset + xtime_nsec_2 | |
906 | + * Which gives us: | |
907 | + * xtime_nsec_2 = xtime_nsec_1 - offset | |
908 | + * Which simplifies to: | |
909 | + * xtime_nsec -= offset | |
910 | + * | |
911 | + * XXX - TODO: Doc ntp_error calculation. | |
912 | + */ | |
823 | 913 | timekeeper.mult += adj; |
824 | 914 | timekeeper.xtime_interval += interval; |
825 | 915 | timekeeper.xtime_nsec -= offset; |