Commit 94dba895333a4321f27360e42b807260ae36bda4

Authored by Linus Torvalds

Merge branch 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  timers: fix TIMER_ABSTIME for process wide cpu timers
  timers: split process wide cpu clocks/timers, fix
  x86: clean up hpet timer reinit
  timers: split process wide cpu clocks/timers, remove spurious warning
  timers: split process wide cpu clocks/timers
  signal: re-add dead task accumulation stats.
  x86: fix hpet timer reinit for x86_64
  sched: fix nohz load balancer on cpu offline

Showing 10 changed files

arch/x86/kernel/hpet.c
... ... @@ -897,7 +897,7 @@
897 897 static int hpet_prev_update_sec;
898 898 static struct rtc_time hpet_alarm_time;
899 899 static unsigned long hpet_pie_count;
900   -static unsigned long hpet_t1_cmp;
  900 +static u32 hpet_t1_cmp;
901 901 static unsigned long hpet_default_delta;
902 902 static unsigned long hpet_pie_delta;
903 903 static unsigned long hpet_pie_limit;
... ... @@ -905,6 +905,14 @@
905 905 static rtc_irq_handler irq_handler;
906 906  
907 907 /*
  908 + * Check that the hpet counter c1 is ahead of c2
  909 + */
  910 +static inline int hpet_cnt_ahead(u32 c1, u32 c2)
  911 +{
  912 + return (s32)(c2 - c1) < 0;
  913 +}
  914 +
  915 +/*
908 916 * Registers an IRQ handler.
909 917 */
910 918 int hpet_register_irq_handler(rtc_irq_handler handler)
... ... @@ -1075,7 +1083,7 @@
1075 1083 hpet_t1_cmp += delta;
1076 1084 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1077 1085 lost_ints++;
1078   - } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
  1086 + } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER)));
1079 1087  
1080 1088 if (lost_ints) {
1081 1089 if (hpet_rtc_flags & RTC_PIE)
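
The hpet.c change above fixes a comparison that broke on x86_64: with hpet_t1_cmp declared unsigned long, the subtraction was done in 64 bits there, so a wrapped 32-bit counter value could never test negative. hpet_cnt_ahead() forces the arithmetic back into 32 bits. A minimal userspace sketch of the same arithmetic (stand-in names, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Userspace restatement of hpet_cnt_ahead(): the HPET comparator and
 * main counter are 32-bit values that wrap.  Doing the subtraction in
 * 32 bits and testing the sign gives the right answer for any two
 * values less than 2^31 ticks apart. */
static inline int cnt_ahead(uint32_t c1, uint32_t c2)
{
	return (int32_t)(c2 - c1) < 0;
}

int main(void)
{
	uint32_t cmp = 0x00000005;  /* comparator, just past a wrap */
	uint32_t cnt = 0xfffffff0;  /* counter, not yet wrapped */

	/* Correct: the comparator is still ahead of the counter. */
	printf("ahead: %d\n", cnt_ahead(cmp, cnt));          /* 1 */

	/* The old test, (long)(cnt - cmp) > 0, is fine where long is
	 * 32 bits, but on x86_64 the unsigned difference is widened to
	 * 64 bits and can never be negative, so the wrap is missed. */
	printf("old 64-bit test: %ld\n", (long)(cnt - cmp)); /* huge, > 0 */
	return 0;
}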
include/linux/init_task.h
... ... @@ -48,12 +48,11 @@
48 48 .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \
49 49 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
50 50 .rlim = INIT_RLIMITS, \
51   - .cputime = { .totals = { \
52   - .utime = cputime_zero, \
53   - .stime = cputime_zero, \
54   - .sum_exec_runtime = 0, \
55   - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \
56   - }, }, \
  51 + .cputimer = { \
  52 + .cputime = INIT_CPUTIME, \
  53 + .running = 0, \
  54 + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
  55 + }, \
57 56 }
58 57  
59 58 extern struct nsproxy init_nsproxy;
include/linux/sched.h
... ... @@ -443,7 +443,6 @@
443 443 * @utime: time spent in user mode, in &cputime_t units
444 444 * @stime: time spent in kernel mode, in &cputime_t units
445 445 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
446   - * @lock: lock for fields in this struct
447 446 *
448 447 * This structure groups together three kinds of CPU time that are
449 448 * tracked for threads and thread groups. Most things considering
450 449  
451 450  
452 451  
453 452  
... ... @@ -454,23 +453,33 @@
454 453 cputime_t utime;
455 454 cputime_t stime;
456 455 unsigned long long sum_exec_runtime;
457   - spinlock_t lock;
458 456 };
459 457 /* Alternate field names when used to cache expirations. */
460 458 #define prof_exp stime
461 459 #define virt_exp utime
462 460 #define sched_exp sum_exec_runtime
463 461  
  462 +#define INIT_CPUTIME \
  463 + (struct task_cputime) { \
  464 + .utime = cputime_zero, \
  465 + .stime = cputime_zero, \
  466 + .sum_exec_runtime = 0, \
  467 + }
  468 +
464 469 /**
465   - * struct thread_group_cputime - thread group interval timer counts
466   - * @totals: thread group interval timers; substructure for
467   - * uniprocessor kernel, per-cpu for SMP kernel.
  470 + * struct thread_group_cputimer - thread group interval timer counts
  471 + * @cputime: thread group interval timers.
  472 + * @running: non-zero when there are timers running and
  473 + * @cputime receives updates.
  474 + * @lock: lock for fields in this struct.
468 475 *
469 476 * This structure contains the version of task_cputime, above, that is
470   - * used for thread group CPU clock calculations.
  477 + * used for thread group CPU timer calculations.
471 478 */
472   -struct thread_group_cputime {
473   - struct task_cputime totals;
  479 +struct thread_group_cputimer {
  480 + struct task_cputime cputime;
  481 + int running;
  482 + spinlock_t lock;
474 483 };
475 484  
476 485 /*
477 486  
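
INIT_CPUTIME above is a C99 compound literal wrapped in a macro, so the same expression serves both as an initializer (include/linux/init_task.h above) and as a plain whole-struct assignment (thread_group_cputime_init() and thread_group_cputime() below). A self-contained sketch, with stand-in typedefs for the kernel types:

#include <stdio.h>

/* Stand-ins for the kernel types, so the sketch compiles alone. */
typedef unsigned long cputime_t;
#define cputime_zero ((cputime_t)0)

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

/* Same shape as the INIT_CPUTIME added above: a compound literal,
 * usable anywhere an expression of type struct task_cputime is. */
#define INIT_CPUTIME				\
	(struct task_cputime) {			\
		.utime = cputime_zero,		\
		.stime = cputime_zero,		\
		.sum_exec_runtime = 0,		\
	}

int main(void)
{
	struct task_cputime t = INIT_CPUTIME;	/* initialization */

	t.utime = 42;
	t = INIT_CPUTIME;			/* reset by assignment */
	printf("%lu %lu %llu\n", t.utime, t.stime, t.sum_exec_runtime);
	return 0;
}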
... ... @@ -519,10 +528,10 @@
519 528 cputime_t it_prof_incr, it_virt_incr;
520 529  
521 530 /*
522   - * Thread group totals for process CPU clocks.
523   - * See thread_group_cputime(), et al, for details.
  531 + * Thread group totals for process CPU timers.
  532 + * See thread_group_cputimer(), et al, for details.
524 533 */
525   - struct thread_group_cputime cputime;
  534 + struct thread_group_cputimer cputimer;
526 535  
527 536 /* Earliest-expiration cache. */
528 537 struct task_cputime cputime_expires;
... ... @@ -559,7 +568,7 @@
559 568 * Live threads maintain their own counters and add to these
560 569 * in __exit_signal, except for the group leader.
561 570 */
562   - cputime_t cutime, cstime;
  571 + cputime_t utime, stime, cutime, cstime;
563 572 cputime_t gtime;
564 573 cputime_t cgtime;
565 574 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
... ... @@ -568,6 +577,14 @@
568 577 struct task_io_accounting ioac;
569 578  
570 579 /*
  580 + * Cumulative ns of scheduled CPU time for dead threads in the
  581 + * group, not including a zombie group leader. (This only differs
  582 + * from jiffies_to_ns(utime + stime) if sched_clock uses something
  583 + * other than jiffies.)
  584 + */
  585 + unsigned long long sum_sched_runtime;
  586 +
  587 + /*
571 588 * We don't bother to synchronize most readers of this at all,
572 589 * because there is no reader checking a limit that actually needs
573 590 * to get both rlim_cur and rlim_max atomically, and either one
574 591  
575 592  
... ... @@ -2183,27 +2200,14 @@
2183 2200 /*
2184 2201 * Thread group CPU time accounting.
2185 2202 */
  2203 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
  2204 +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2186 2205  
2187   -static inline
2188   -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
2189   -{
2190   - struct task_cputime *totals = &tsk->signal->cputime.totals;
2191   - unsigned long flags;
2192   -
2193   - spin_lock_irqsave(&totals->lock, flags);
2194   - *times = *totals;
2195   - spin_unlock_irqrestore(&totals->lock, flags);
2196   -}
2197   -
2198 2206 static inline void thread_group_cputime_init(struct signal_struct *sig)
2199 2207 {
2200   - sig->cputime.totals = (struct task_cputime){
2201   - .utime = cputime_zero,
2202   - .stime = cputime_zero,
2203   - .sum_exec_runtime = 0,
2204   - };
2205   -
2206   - spin_lock_init(&sig->cputime.totals.lock);
  2208 + sig->cputimer.cputime = INIT_CPUTIME;
  2209 + spin_lock_init(&sig->cputimer.lock);
  2210 + sig->cputimer.running = 0;
2207 2211 }
2208 2212  
2209 2213 static inline void thread_group_cputime_free(struct signal_struct *sig)
kernel/exit.c
... ... @@ -118,6 +118,8 @@
118 118 * We won't ever get here for the group leader, since it
119 119 * will have been the last reference on the signal_struct.
120 120 */
  121 + sig->utime = cputime_add(sig->utime, task_utime(tsk));
  122 + sig->stime = cputime_add(sig->stime, task_stime(tsk));
121 123 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
122 124 sig->min_flt += tsk->min_flt;
123 125 sig->maj_flt += tsk->maj_flt;
... ... @@ -126,6 +128,7 @@
126 128 sig->inblock += task_io_get_inblock(tsk);
127 129 sig->oublock += task_io_get_oublock(tsk);
128 130 task_io_accounting_add(&sig->ioac, &tsk->ioac);
  131 + sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
129 132 sig = NULL; /* Marker for below. */
130 133 }
131 134  
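
This __exit_signal() hunk carries the "dead task accumulation stats" commit: a dying thread folds its utime/stime/sum_exec_runtime into the signal_struct, so an exact group sum only has to walk the threads still alive and add the accumulated remainder. An illustrative sketch of that scheme, under assumed non-kernel names:

#include <stdio.h>

struct group_totals {
	unsigned long long utime, stime, sum_sched_runtime;
};

struct thread_times {
	unsigned long long utime, stime, sum_exec_runtime;
};

/* Called once per dying thread, like the __exit_signal() hunk above. */
static void fold_dead_thread(struct group_totals *sig,
			     const struct thread_times *tsk)
{
	sig->utime             += tsk->utime;
	sig->stime             += tsk->stime;
	sig->sum_sched_runtime += tsk->sum_exec_runtime;
}

/* Exact group time = totals of the dead + a walk over the living. */
static unsigned long long group_runtime(const struct group_totals *sig,
					const struct thread_times *live,
					int nr_live)
{
	unsigned long long sum = sig->sum_sched_runtime;
	int i;

	for (i = 0; i < nr_live; i++)
		sum += live[i].sum_exec_runtime;
	return sum;
}

int main(void)
{
	struct group_totals sig = { 0, 0, 0 };
	struct thread_times dead = { 10, 4, 14 }, live[] = { { 5, 2, 7 } };

	fold_dead_thread(&sig, &dead);
	printf("group runtime: %llu\n", group_runtime(&sig, live, 1)); /* 21 */
	return 0;
}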
kernel/fork.c
... ... @@ -851,13 +851,14 @@
851 851 sig->tty_old_pgrp = NULL;
852 852 sig->tty = NULL;
853 853  
854   - sig->cutime = sig->cstime = cputime_zero;
  854 + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
855 855 sig->gtime = cputime_zero;
856 856 sig->cgtime = cputime_zero;
857 857 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
858 858 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
859 859 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
860 860 task_io_accounting_init(&sig->ioac);
  861 + sig->sum_sched_runtime = 0;
861 862 taskstats_tgid_init(sig);
862 863  
863 864 task_lock(current->group_leader);
kernel/itimer.c
... ... @@ -62,7 +62,7 @@
62 62 struct task_cputime cputime;
63 63 cputime_t utime;
64 64  
65   - thread_group_cputime(tsk, &cputime);
  65 + thread_group_cputimer(tsk, &cputime);
66 66 utime = cputime.utime;
67 67 if (cputime_le(cval, utime)) { /* about to fire */
68 68 cval = jiffies_to_cputime(1);
... ... @@ -82,7 +82,7 @@
82 82 struct task_cputime times;
83 83 cputime_t ptime;
84 84  
85   - thread_group_cputime(tsk, &times);
  85 + thread_group_cputimer(tsk, &times);
86 86 ptime = cputime_add(times.utime, times.stime);
87 87 if (cputime_le(cval, ptime)) { /* about to fire */
88 88 cval = jiffies_to_cputime(1);
kernel/posix-cpu-timers.c
... ... @@ -230,6 +230,71 @@
230 230 return 0;
231 231 }
232 232  
  233 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
  234 +{
  235 + struct sighand_struct *sighand;
  236 + struct signal_struct *sig;
  237 + struct task_struct *t;
  238 +
  239 + *times = INIT_CPUTIME;
  240 +
  241 + rcu_read_lock();
  242 + sighand = rcu_dereference(tsk->sighand);
  243 + if (!sighand)
  244 + goto out;
  245 +
  246 + sig = tsk->signal;
  247 +
  248 + t = tsk;
  249 + do {
  250 + times->utime = cputime_add(times->utime, t->utime);
  251 + times->stime = cputime_add(times->stime, t->stime);
  252 + times->sum_exec_runtime += t->se.sum_exec_runtime;
  253 +
  254 + t = next_thread(t);
  255 + } while (t != tsk);
  256 +
  257 + times->utime = cputime_add(times->utime, sig->utime);
  258 + times->stime = cputime_add(times->stime, sig->stime);
  259 + times->sum_exec_runtime += sig->sum_sched_runtime;
  260 +out:
  261 + rcu_read_unlock();
  262 +}
  263 +
  264 +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
  265 +{
  266 + if (cputime_gt(b->utime, a->utime))
  267 + a->utime = b->utime;
  268 +
  269 + if (cputime_gt(b->stime, a->stime))
  270 + a->stime = b->stime;
  271 +
  272 + if (b->sum_exec_runtime > a->sum_exec_runtime)
  273 + a->sum_exec_runtime = b->sum_exec_runtime;
  274 +}
  275 +
  276 +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
  277 +{
  278 + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
  279 + struct task_cputime sum;
  280 + unsigned long flags;
  281 +
  282 + spin_lock_irqsave(&cputimer->lock, flags);
  283 + if (!cputimer->running) {
  284 + cputimer->running = 1;
  285 + /*
  286 + * The POSIX timer interface allows for absolute time expiry
  287 + * values through the TIMER_ABSTIME flag, therefore we have
  288 + * to synchronize the timer to the clock every time we start
  289 + * it.
  290 + */
  291 + thread_group_cputime(tsk, &sum);
  292 + update_gt_cputime(&cputimer->cputime, &sum);
  293 + }
  294 + *times = cputimer->cputime;
  295 + spin_unlock_irqrestore(&cputimer->lock, flags);
  296 +}
  297 +
233 298 /*
234 299 * Sample a process (thread group) clock for the given group_leader task.
235 300 * Must be called with tasklist_lock held for reading.
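
update_gt_cputime() above merges an exact per-thread sample into the cached cputimer by taking the element-wise maximum, so no cached field can move backwards even if the tick path already bumped it past the sample being merged. A standalone sketch of that merge, using plain integer compares in place of cputime_gt():

#include <stdio.h>

typedef unsigned long cputime_t;

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

/* Element-wise maximum, as in update_gt_cputime(): fold an exact
 * sample b into the cached totals a without regressing any field. */
static void merge_gt(struct task_cputime *a, const struct task_cputime *b)
{
	if (b->utime > a->utime)
		a->utime = b->utime;
	if (b->stime > a->stime)
		a->stime = b->stime;
	if (b->sum_exec_runtime > a->sum_exec_runtime)
		a->sum_exec_runtime = b->sum_exec_runtime;
}

int main(void)
{
	/* The cache went stale while no timers were running... */
	struct task_cputime cached = { .utime = 10, .stime = 3,
				       .sum_exec_runtime = 900 };
	/* ...so an exact walk reads higher values, except one field
	 * the tick path already pushed past the walk. */
	struct task_cputime exact  = { .utime = 25, .stime = 9,
				       .sum_exec_runtime = 850 };

	merge_gt(&cached, &exact);
	printf("%lu %lu %llu\n", cached.utime, cached.stime,
	       cached.sum_exec_runtime);	/* 25 9 900 */
	return 0;
}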
... ... @@ -457,7 +522,7 @@
457 522 {
458 523 struct task_cputime cputime;
459 524  
460   - thread_group_cputime(tsk, &cputime);
  525 + thread_group_cputimer(tsk, &cputime);
461 526 cleanup_timers(tsk->signal->cpu_timers,
462 527 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
463 528 }
... ... @@ -964,6 +1029,19 @@
964 1029 }
965 1030 }
966 1031  
  1032 +static void stop_process_timers(struct task_struct *tsk)
  1033 +{
  1034 + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
  1035 + unsigned long flags;
  1036 +
  1037 + if (!cputimer->running)
  1038 + return;
  1039 +
  1040 + spin_lock_irqsave(&cputimer->lock, flags);
  1041 + cputimer->running = 0;
  1042 + spin_unlock_irqrestore(&cputimer->lock, flags);
  1043 +}
  1044 +
967 1045 /*
968 1046 * Check for any per-thread CPU timers that have fired and move them
969 1047 * off the tsk->*_timers list onto the firing list. Per-thread timers
970 1048  
971 1049  
... ... @@ -987,13 +1065,15 @@
987 1065 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
988 1066 list_empty(&timers[CPUCLOCK_VIRT]) &&
989 1067 cputime_eq(sig->it_virt_expires, cputime_zero) &&
990   - list_empty(&timers[CPUCLOCK_SCHED]))
  1068 + list_empty(&timers[CPUCLOCK_SCHED])) {
  1069 + stop_process_timers(tsk);
991 1070 return;
  1071 + }
992 1072  
993 1073 /*
994 1074 * Collect the current process totals.
995 1075 */
996   - thread_group_cputime(tsk, &cputime);
  1076 + thread_group_cputimer(tsk, &cputime);
997 1077 utime = cputime.utime;
998 1078 ptime = cputime_add(utime, cputime.stime);
999 1079 sum_sched_runtime = cputime.sum_exec_runtime;
... ... @@ -1259,7 +1339,7 @@
1259 1339 if (!task_cputime_zero(&sig->cputime_expires)) {
1260 1340 struct task_cputime group_sample;
1261 1341  
1262   - thread_group_cputime(tsk, &group_sample);
  1342 + thread_group_cputimer(tsk, &group_sample);
1263 1343 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1264 1344 return 1;
1265 1345 }
... ... @@ -1329,6 +1409,33 @@
1329 1409 }
1330 1410  
1331 1411 /*
  1412 + * Sample a process (thread group) timer for the given group_leader task.
  1413 + * Must be called with tasklist_lock held for reading.
  1414 + */
  1415 +static int cpu_timer_sample_group(const clockid_t which_clock,
  1416 + struct task_struct *p,
  1417 + union cpu_time_count *cpu)
  1418 +{
  1419 + struct task_cputime cputime;
  1420 +
  1421 + thread_group_cputimer(p, &cputime);
  1422 + switch (CPUCLOCK_WHICH(which_clock)) {
  1423 + default:
  1424 + return -EINVAL;
  1425 + case CPUCLOCK_PROF:
  1426 + cpu->cpu = cputime_add(cputime.utime, cputime.stime);
  1427 + break;
  1428 + case CPUCLOCK_VIRT:
  1429 + cpu->cpu = cputime.utime;
  1430 + break;
  1431 + case CPUCLOCK_SCHED:
  1432 + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
  1433 + break;
  1434 + }
  1435 + return 0;
  1436 +}
  1437 +
  1438 +/*
1332 1439 * Set one of the process-wide special case CPU timers.
1333 1440 * The tsk->sighand->siglock must be held by the caller.
1334 1441 * The *newval argument is relative and we update it to be absolute, *oldval
... ... @@ -1341,7 +1448,7 @@
1341 1448 struct list_head *head;
1342 1449  
1343 1450 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1344   - cpu_clock_sample_group(clock_idx, tsk, &now);
  1451 + cpu_timer_sample_group(clock_idx, tsk, &now);
1345 1452  
1346 1453 if (oldval) {
1347 1454 if (!cputime_eq(*oldval, cputime_zero)) {
kernel/sched.c
... ... @@ -3890,18 +3890,23 @@
3890 3890 int cpu = smp_processor_id();
3891 3891  
3892 3892 if (stop_tick) {
3893   - cpumask_set_cpu(cpu, nohz.cpu_mask);
3894 3893 cpu_rq(cpu)->in_nohz_recently = 1;
3895 3894  
3896   - /*
3897   - * If we are going offline and still the leader, give up!
3898   - */
3899   - if (!cpu_active(cpu) &&
3900   - atomic_read(&nohz.load_balancer) == cpu) {
  3895 + if (!cpu_active(cpu)) {
  3896 + if (atomic_read(&nohz.load_balancer) != cpu)
  3897 + return 0;
  3898 +
  3899 + /*
  3900 + * If we are going offline and still the leader,
  3901 + * give up!
  3902 + */
3901 3903 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3902 3904 BUG();
  3905 +
3903 3906 return 0;
3904 3907 }
  3908 +
  3909 + cpumask_set_cpu(cpu, nohz.cpu_mask);
3905 3910  
3906 3911 /* time for ilb owner also to sleep */
3907 3912 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
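
The sched.c reordering makes an offlining CPU resign idle-load-balancer ownership and return before it ever sets itself in nohz.cpu_mask. The resignation itself is the usual compare-and-exchange handoff: only the current owner may store -1, and the kernel BUG()s if that store ever fails after the owner check. A minimal C11 sketch of the same handoff, with stand-in names:

#include <stdatomic.h>
#include <stdio.h>

/* -1 means "no ilb owner", mirroring nohz.load_balancer above. */
static atomic_int load_balancer = -1;

/* Resign leadership only if we actually hold it; mirrors
 * atomic_cmpxchg(&nohz.load_balancer, cpu, -1) in the hunk above. */
static int resign_if_leader(int cpu)
{
	int expected = cpu;

	/* Succeeds (and stores -1) only when load_balancer == cpu. */
	return atomic_compare_exchange_strong(&load_balancer,
					      &expected, -1);
}

int main(void)
{
	atomic_store(&load_balancer, 3);

	printf("cpu 5 resigns: %d\n", resign_if_leader(5));     /* 0 */
	printf("cpu 3 resigns: %d\n", resign_if_leader(3));     /* 1 */
	printf("owner now: %d\n", atomic_load(&load_balancer)); /* -1 */
	return 0;
}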
kernel/sched_stats.h
... ... @@ -296,19 +296,21 @@
296 296 static inline void account_group_user_time(struct task_struct *tsk,
297 297 cputime_t cputime)
298 298 {
299   - struct task_cputime *times;
300   - struct signal_struct *sig;
  299 + struct thread_group_cputimer *cputimer;
301 300  
302 301 /* tsk == current, ensure it is safe to use ->signal */
303 302 if (unlikely(tsk->exit_state))
304 303 return;
305 304  
306   - sig = tsk->signal;
307   - times = &sig->cputime.totals;
  305 + cputimer = &tsk->signal->cputimer;
308 306  
309   - spin_lock(&times->lock);
310   - times->utime = cputime_add(times->utime, cputime);
311   - spin_unlock(&times->lock);
  307 + if (!cputimer->running)
  308 + return;
  309 +
  310 + spin_lock(&cputimer->lock);
  311 + cputimer->cputime.utime =
  312 + cputime_add(cputimer->cputime.utime, cputime);
  313 + spin_unlock(&cputimer->lock);
312 314 }
313 315  
314 316 /**
315 317  
316 318  
... ... @@ -324,19 +326,21 @@
324 326 static inline void account_group_system_time(struct task_struct *tsk,
325 327 cputime_t cputime)
326 328 {
327   - struct task_cputime *times;
328   - struct signal_struct *sig;
  329 + struct thread_group_cputimer *cputimer;
329 330  
330 331 /* tsk == current, ensure it is safe to use ->signal */
331 332 if (unlikely(tsk->exit_state))
332 333 return;
333 334  
334   - sig = tsk->signal;
335   - times = &sig->cputime.totals;
  335 + cputimer = &tsk->signal->cputimer;
336 336  
337   - spin_lock(&times->lock);
338   - times->stime = cputime_add(times->stime, cputime);
339   - spin_unlock(&times->lock);
  337 + if (!cputimer->running)
  338 + return;
  339 +
  340 + spin_lock(&cputimer->lock);
  341 + cputimer->cputime.stime =
  342 + cputime_add(cputimer->cputime.stime, cputime);
  343 + spin_unlock(&cputimer->lock);
340 344 }
341 345  
342 346 /**
... ... @@ -352,7 +356,7 @@
352 356 static inline void account_group_exec_runtime(struct task_struct *tsk,
353 357 unsigned long long ns)
354 358 {
355   - struct task_cputime *times;
  359 + struct thread_group_cputimer *cputimer;
356 360 struct signal_struct *sig;
357 361  
358 362 sig = tsk->signal;
359 363  
... ... @@ -361,10 +365,13 @@
361 365 if (unlikely(!sig))
362 366 return;
363 367  
364   - times = &sig->cputime.totals;
  368 + cputimer = &sig->cputimer;
365 369  
366   - spin_lock(&times->lock);
367   - times->sum_exec_runtime += ns;
368   - spin_unlock(&times->lock);
  370 + if (!cputimer->running)
  371 + return;
  372 +
  373 + spin_lock(&cputimer->lock);
  374 + cputimer->cputime.sum_exec_runtime += ns;
  375 + spin_unlock(&cputimer->lock);
369 376 }
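
All three account_group_*() helpers now test cputimer->running without the lock and bail out, so timer ticks in groups with no armed process-wide timers never touch the shared spinlock; a tick lost to that unlocked test is compensated the next time a timer starts, via the update_gt_cputime() resync above. A pthread sketch of the same gate-then-lock pattern (illustrative names, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Stand-in for struct thread_group_cputimer: a gate flag plus a
 * lock-protected total. */
struct group_timer {
	int running;			/* read without the lock */
	pthread_spinlock_t lock;
	unsigned long long total;
};

/* Per-tick accounting, shaped like account_group_exec_runtime():
 * the unlocked ->running test keeps idle groups off the shared lock. */
static void account_tick(struct group_timer *gt, unsigned long long ns)
{
	if (!gt->running)
		return;

	pthread_spin_lock(&gt->lock);
	gt->total += ns;
	pthread_spin_unlock(&gt->lock);
}

int main(void)
{
	struct group_timer gt = { .running = 0, .total = 0 };

	pthread_spin_init(&gt.lock, PTHREAD_PROCESS_PRIVATE);
	account_tick(&gt, 1000);	/* no timers armed: ignored */
	gt.running = 1;
	account_tick(&gt, 1000);	/* armed: accounted */
	printf("%llu\n", gt.total);	/* 1000 */
	pthread_spin_destroy(&gt.lock);
	return 0;
}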
kernel/signal.c
... ... @@ -1367,7 +1367,6 @@
1367 1367 struct siginfo info;
1368 1368 unsigned long flags;
1369 1369 struct sighand_struct *psig;
1370   - struct task_cputime cputime;
1371 1370 int ret = sig;
1372 1371  
1373 1372 BUG_ON(sig == -1);
... ... @@ -1397,9 +1396,10 @@
1397 1396 info.si_uid = __task_cred(tsk)->uid;
1398 1397 rcu_read_unlock();
1399 1398  
1400   - thread_group_cputime(tsk, &cputime);
1401   - info.si_utime = cputime_to_jiffies(cputime.utime);
1402   - info.si_stime = cputime_to_jiffies(cputime.stime);
  1399 + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
  1400 + tsk->signal->utime));
  1401 + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
  1402 + tsk->signal->stime));
1403 1403  
1404 1404 info.si_status = tsk->exit_code & 0x7f;
1405 1405 if (tsk->exit_code & 0x80)
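
Besides dropping the full thread-group sum, this signal.c hunk also fixes the units: si_utime/si_stime are defined in clock_t (USER_HZ) ticks, which is what cputime_to_clock_t() yields, whereas cputime_to_jiffies() gave kernel-HZ jiffies. A small userspace sketch of how such a value is read back, using a hypothetical sampled value:

#include <stdio.h>
#include <unistd.h>

/* si_utime/si_stime delivered with SIGCHLD are in clock_t units,
 * i.e. USER_HZ ticks.  Userspace converts them to seconds with
 * sysconf(_SC_CLK_TCK): */
int main(void)
{
	long clk_tck = sysconf(_SC_CLK_TCK);	/* typically 100 */
	long si_utime = 250;			/* hypothetical sample */

	printf("%.2f s of user time (CLK_TCK=%ld)\n",
	       (double)si_utime / clk_tck, clk_tck);
	return 0;
}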