Commit 41d2e494937715d3150e5c75d01f0e75ae899337

Authored by Thomas Gleixner
1 parent 3067e02f8f

hrtimer: Tune hrtimer_interrupt hang logic

The hrtimer_interrupt hang logic adjusts min_delta_ns based on the
execution time of the hrtimer callbacks.

This is error-prone for virtual machines, where a guest vcpu can be
scheduled out during the execution of the callbacks (and the callbacks
themselves can do operations that translate to blocking operations in
the hypervisor), which can lead to large min_delta_ns, rendering the
system unusable.

Replace the current heuristics with something more reliable. Allow the
interrupt code to try 3 times to catch up with the lost time. If that
fails, use the total time spent in the interrupt handler to defer the
next timer interrupt so the system can catch up with other things
which got delayed. Limit that deferment to 100ms.
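
The following is a minimal user-space sketch of that retry-then-defer
scheme, not the kernel code from the diff below; handle_expired_timers()
and program_next_event() are hypothetical stand-ins for the expiry loop
and tick_program_event().

/*
 * Minimal user-space sketch of the retry-then-defer scheme.  The helpers
 * handle_expired_timers() and program_next_event() are hypothetical
 * stand-ins for the kernel's expiry processing and tick_program_event().
 */
#define _POSIX_C_SOURCE 199309L         /* for clock_gettime() */
#include <stdint.h>
#include <stdbool.h>
#include <time.h>

#define MAX_RETRIES     3
#define MAX_DEFER_NS    (100ULL * 1000 * 1000)  /* 100ms cap */

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Run expired callbacks and return the next requested expiry (stub). */
static uint64_t handle_expired_timers(uint64_t now)
{
        return now + 1000000;           /* pretend a timer is due in 1ms */
}

/* Return true if the event was programmed, false if it is already past. */
static bool program_next_event(uint64_t expiry)
{
        return expiry > now_ns();
}

void timer_interrupt(void)
{
        uint64_t entry = now_ns(), delta;
        int retries = 0;

retry:
        if (program_next_event(handle_expired_timers(now_ns())))
                return;                 /* caught up, done */

        /* The requested expiry was already in the past: try again. */
        if (++retries < MAX_RETRIES)
                goto retry;

        /*
         * Still behind after three passes: defer the next event by the
         * time spent in this handler, capped at 100ms, so the rest of
         * the system gets a chance to run.
         */
        delta = now_ns() - entry;
        if (delta > MAX_DEFER_NS)
                delta = MAX_DEFER_NS;
        program_next_event(now_ns() + delta);
}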

The retry events and the maximum time spent in the interrupt handler
are recorded and exposed via /proc/timer_list.
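
For illustration, the new per-cpu fields would show up in
/proc/timer_list next to the existing nr_events line, roughly as below;
the values are invented and the exact layout follows the existing
P()/P_ns() printout:

  .nr_events      : 31804
  .nr_retries     : 14
  .nr_hangs       : 1
  .max_hang_time  : 142240 nsecs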

Inspired by a patch from Marcelo.

Reported-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: kvm@vger.kernel.org

Showing 3 changed files with 70 additions and 45 deletions

include/linux/hrtimer.h
... ... @@ -162,10 +162,11 @@
162 162 * @expires_next: absolute time of the next event which was scheduled
163 163 * via clock_set_next_event()
164 164 * @hres_active: State of high resolution mode
165   - * @check_clocks: Indictator, when set evaluate time source and clock
166   - * event devices whether high resolution mode can be
167   - * activated.
168   - * @nr_events: Total number of timer interrupt events
  165 + * @hang_detected: The last hrtimer interrupt detected a hang
  166 + * @nr_events: Total number of hrtimer interrupt events
  167 + * @nr_retries: Total number of hrtimer interrupt retries
  168 + * @nr_hangs: Total number of hrtimer interrupt hangs
  169 + * @max_hang_time: Maximum time spent in hrtimer_interrupt
169 170 */
170 171 struct hrtimer_cpu_base {
171 172 spinlock_t lock;
172 173  
... ... @@ -173,7 +174,11 @@
173 174 #ifdef CONFIG_HIGH_RES_TIMERS
174 175 ktime_t expires_next;
175 176 int hres_active;
  177 + int hang_detected;
176 178 unsigned long nr_events;
  179 + unsigned long nr_retries;
  180 + unsigned long nr_hangs;
  181 + ktime_t max_hang_time;
177 182 #endif
178 183 };
179 184  
kernel/hrtimer.c
... ... @@ -557,7 +557,7 @@
557 557 static int hrtimer_reprogram(struct hrtimer *timer,
558 558 struct hrtimer_clock_base *base)
559 559 {
560   - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
  560 + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 562 int res;
563 563  
564 564  
565 565  
... ... @@ -582,15 +582,24 @@
582 582 if (expires.tv64 < 0)
583 583 return -ETIME;
584 584  
585   - if (expires.tv64 >= expires_next->tv64)
  585 + if (expires.tv64 >= cpu_base->expires_next.tv64)
586 586 return 0;
587 587  
588 588 /*
  589 + * If a hang was detected in the last timer interrupt then we
  590 + * do not schedule a timer which is earlier than the expiry
  591 + * which we enforced in the hang detection. We want the system
  592 + * to make progress.
  593 + */
  594 + if (cpu_base->hang_detected)
  595 + return 0;
  596 +
  597 + /*
589 598 * Clockevents returns -ETIME, when the event was in the past.
590 599 */
591 600 res = tick_program_event(expires, 0);
592 601 if (!IS_ERR_VALUE(res))
593   - *expires_next = expires;
  602 + cpu_base->expires_next = expires;
594 603 return res;
595 604 }
596 605  
597 606  
... ... @@ -1217,31 +1226,7 @@
1217 1226  
1218 1227 #ifdef CONFIG_HIGH_RES_TIMERS
1219 1228  
1220   -static int force_clock_reprogram;
1221   -
1222 1229 /*
1223   - * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224   - * is hanging, which could happen with something that slows the interrupt
1225   - * such as the tracing. Then we force the clock reprogramming for each future
1226   - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227   - * threshold that we will overwrite.
1228   - * The next tick event will be scheduled to 3 times we currently spend on
1229   - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230   - * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231   - * let it running without serious starvation.
1232   - */
1233   -
1234   -static inline void
1235   -hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236   - ktime_t try_time)
1237   -{
1238   - force_clock_reprogram = 1;
1239   - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240   - printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241   - "forcing clock min delta to %llu ns\n",
1242   - (unsigned long long) dev->min_delta_ns);
1243   -}
1244   -/*
1245 1230 * High resolution timer interrupt
1246 1231 * Called with interrupts disabled
1247 1232 */
1248 1233  
... ... @@ -1249,21 +1234,15 @@
1249 1234 {
1250 1235 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1251 1236 struct hrtimer_clock_base *base;
1252   - ktime_t expires_next, now;
1253   - int nr_retries = 0;
1254   - int i;
  1237 + ktime_t expires_next, now, entry_time, delta;
  1238 + int i, retries = 0;
1255 1239  
1256 1240 BUG_ON(!cpu_base->hres_active);
1257 1241 cpu_base->nr_events++;
1258 1242 dev->next_event.tv64 = KTIME_MAX;
1259 1243  
1260   - retry:
1261   - /* 5 retries is enough to notice a hang */
1262   - if (!(++nr_retries % 5))
1263   - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1264   -
1265   - now = ktime_get();
1266   -
  1244 + entry_time = now = ktime_get();
  1245 +retry:
1267 1246 expires_next.tv64 = KTIME_MAX;
1268 1247  
1269 1248 spin_lock(&cpu_base->lock);
1270 1249  
... ... @@ -1325,10 +1304,48 @@
1325 1304 spin_unlock(&cpu_base->lock);
1326 1305  
1327 1306 /* Reprogramming necessary ? */
1328   - if (expires_next.tv64 != KTIME_MAX) {
1329   - if (tick_program_event(expires_next, force_clock_reprogram))
1330   - goto retry;
  1307 + if (expires_next.tv64 == KTIME_MAX ||
  1308 + !tick_program_event(expires_next, 0)) {
  1309 + cpu_base->hang_detected = 0;
  1310 + return;
1331 1311 }
  1312 +
  1313 + /*
  1314 + * The next timer was already expired due to:
  1315 + * - tracing
  1316 + * - long lasting callbacks
  1317 + * - being scheduled away when running in a VM
  1318 + *
  1319 + * We need to prevent that we loop forever in the hrtimer
  1320 + * interrupt routine. We give it 3 attempts to avoid
  1321 + * overreacting on some spurious event.
  1322 + */
  1323 + now = ktime_get();
  1324 + cpu_base->nr_retries++;
  1325 + if (++retries < 3)
  1326 + goto retry;
  1327 + /*
  1328 + * Give the system a chance to do something else than looping
  1329 + * here. We stored the entry time, so we know exactly how long
  1330 + * we spent here. We schedule the next event this amount of
  1331 + * time away.
  1332 + */
  1333 + cpu_base->nr_hangs++;
  1334 + cpu_base->hang_detected = 1;
  1335 + delta = ktime_sub(now, entry_time);
  1336 + if (delta.tv64 > cpu_base->max_hang_time.tv64)
  1337 + cpu_base->max_hang_time = delta;
  1338 + /*
  1339 + * Limit it to a sensible value as we enforce a longer
  1340 + * delay. Give the CPU at least 100ms to catch up.
  1341 + */
  1342 + if (delta.tv64 > 100 * NSEC_PER_MSEC)
  1343 + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
  1344 + else
  1345 + expires_next = ktime_add(now, delta);
  1346 + tick_program_event(expires_next, 1);
  1347 + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
  1348 + ktime_to_ns(delta));
1332 1349 }
1333 1350  
1334 1351 /*
kernel/time/timer_list.c
... ... @@ -150,6 +150,9 @@
150 150 P_ns(expires_next);
151 151 P(hres_active);
152 152 P(nr_events);
  153 + P(nr_retries);
  154 + P(nr_hangs);
  155 + P_ns(max_hang_time);
153 156 #endif
154 157 #undef P
155 158 #undef P_ns
... ... @@ -254,7 +257,7 @@
254 257 u64 now = ktime_to_ns(ktime_get());
255 258 int cpu;
256 259  
257   - SEQ_printf(m, "Timer List Version: v0.4\n");
  260 + SEQ_printf(m, "Timer List Version: v0.5\n");
258 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
259 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
260 263