Commit 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53

Authored by Peter Zijlstra
Committed by Ingo Molnar
1 parent 32bd671d6c

timers: split process wide cpu clocks/timers

Change the process wide cpu timers/clocks so that we:

 1) don't slow the kernel down when there are many threads,
 2) don't have a per-cpu allocation for each process,
 3) have no impact when not used.

In order to accomplish this we're going to split it into two parts:

 - clocks; which can take all the time they want since they run
           from user context -- i.e. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID)

 - timers; which need constant time sampling, but since they're
           explicitly used, the user can pay the overhead.

The clock readout will go back to a full sum over the thread group, while the
timers will run off a global 'clock' that only runs when needed, so only
programs that make use of the facility pay the price.
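
For reference (not part of the patch): a minimal userspace sketch of the two
facilities being split. The clock read below takes the summing path, while
arming the timer is what turns the new per-tick accounting on. Link with
-lrt on older glibc.

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	struct itimerspec its = { .it_value = { .tv_sec = 1 } };
	timer_t tid;

	/* clock: a full sum over the thread group, done from user context */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) == 0)
		printf("process cpu time: %ld.%09ld\n",
		       (long)ts.tv_sec, ts.tv_nsec);

	/* timer: explicitly armed, so this process pays the overhead;
	 * delivers SIGALRM after one second of process CPU time */
	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &tid) == 0)
		timer_settime(tid, 0, &its, NULL);

	return 0;
}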

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 155 additions and 54 deletions

include/linux/init_task.h
... ... @@ -48,12 +48,11 @@
48 48 .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \
49 49 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
50 50 .rlim = INIT_RLIMITS, \
51   - .cputime = { .totals = { \
52   - .utime = cputime_zero, \
53   - .stime = cputime_zero, \
54   - .sum_exec_runtime = 0, \
55   - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \
56   - }, }, \
  51 + .cputimer = { \
  52 + .cputime = INIT_CPUTIME, \
  53 + .running = 0, \
  54 + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
  55 + }, \
57 56 }
58 57  
59 58 extern struct nsproxy init_nsproxy;
include/linux/sched.h
... ... @@ -443,7 +443,6 @@
443 443 * @utime: time spent in user mode, in &cputime_t units
444 444 * @stime: time spent in kernel mode, in &cputime_t units
445 445 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
446   - * @lock: lock for fields in this struct
447 446 *
448 447 * This structure groups together three kinds of CPU time that are
449 448 * tracked for threads and thread groups. Most things considering
450 449 * CPU time want to group these counts together and treat all three
451 450 * of them in parallel.
452 451 */
453 452 struct task_cputime {
... ... @@ -454,23 +453,33 @@
454 453 cputime_t utime;
455 454 cputime_t stime;
456 455 unsigned long long sum_exec_runtime;
457   - spinlock_t lock;
458 456 };
459 457 /* Alternate field names when used to cache expirations. */
460 458 #define prof_exp stime
461 459 #define virt_exp utime
462 460 #define sched_exp sum_exec_runtime
463 461  
  462 +#define INIT_CPUTIME \
  463 + (struct task_cputime) { \
  464 + .utime = cputime_zero, \
  465 + .stime = cputime_zero, \
  466 + .sum_exec_runtime = 0, \
  467 + }
  468 +
464 469 /**
465   - * struct thread_group_cputime - thread group interval timer counts
466   - * @totals: thread group interval timers; substructure for
467   - * uniprocessor kernel, per-cpu for SMP kernel.
  470 + * struct thread_group_cputimer - thread group interval timer counts
  471 + * @cputime: thread group interval timers.
  472 + * @running: non-zero when there are timers running and
  473 + * @cputime receives updates.
  474 + * @lock: lock for fields in this struct.
468 475 *
469 476 * This structure contains the version of task_cputime, above, that is
470   - * used for thread group CPU clock calculations.
  477 + * used for thread group CPU timer calculations.
471 478 */
472   -struct thread_group_cputime {
473   - struct task_cputime totals;
  479 +struct thread_group_cputimer {
  480 + struct task_cputime cputime;
  481 + int running;
  482 + spinlock_t lock;
474 483 };
475 484  
476 485 /*
477 486 * NOTE! "signal_struct" does not have it's own
... ... @@ -519,10 +528,10 @@
519 528 cputime_t it_prof_incr, it_virt_incr;
520 529  
521 530 /*
522   - * Thread group totals for process CPU clocks.
523   - * See thread_group_cputime(), et al, for details.
  531 + * Thread group totals for process CPU timers.
  532 + * See thread_group_cputimer(), et al, for details.
524 533 */
525   - struct thread_group_cputime cputime;
  534 + struct thread_group_cputimer cputimer;
526 535  
527 536 /* Earliest-expiration cache. */
528 537 struct task_cputime cputime_expires;
... ... @@ -2191,27 +2200,26 @@
2191 2200 /*
2192 2201 * Thread group CPU time accounting.
2193 2202 */
  2203 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2194 2204  
2195 2205 static inline
2196   -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
  2206 +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
2197 2207 {
2198   - struct task_cputime *totals = &tsk->signal->cputime.totals;
  2208 + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
2199 2209 unsigned long flags;
2200 2210  
2201   - spin_lock_irqsave(&totals->lock, flags);
2202   - *times = *totals;
2203   - spin_unlock_irqrestore(&totals->lock, flags);
  2211 + WARN_ON(!cputimer->running);
  2212 +
  2213 + spin_lock_irqsave(&cputimer->lock, flags);
  2214 + *times = cputimer->cputime;
  2215 + spin_unlock_irqrestore(&cputimer->lock, flags);
2204 2216 }
2205 2217  
2206 2218 static inline void thread_group_cputime_init(struct signal_struct *sig)
2207 2219 {
2208   - sig->cputime.totals = (struct task_cputime){
2209   - .utime = cputime_zero,
2210   - .stime = cputime_zero,
2211   - .sum_exec_runtime = 0,
2212   - };
2213   -
2214   - spin_lock_init(&sig->cputime.totals.lock);
  2220 + sig->cputimer.cputime = INIT_CPUTIME;
  2221 + spin_lock_init(&sig->cputimer.lock);
  2222 + sig->cputimer.running = 0;
2215 2223 }
2216 2224  
2217 2225 static inline void thread_group_cputime_free(struct signal_struct *sig)
kernel/itimer.c
... ... @@ -62,7 +62,7 @@
62 62 struct task_cputime cputime;
63 63 cputime_t utime;
64 64  
65   - thread_group_cputime(tsk, &cputime);
  65 + thread_group_cputimer(tsk, &cputime);
66 66 utime = cputime.utime;
67 67 if (cputime_le(cval, utime)) { /* about to fire */
68 68 cval = jiffies_to_cputime(1);
... ... @@ -82,7 +82,7 @@
82 82 struct task_cputime times;
83 83 cputime_t ptime;
84 84  
85   - thread_group_cputime(tsk, &times);
  85 + thread_group_cputimer(tsk, &times);
86 86 ptime = cputime_add(times.utime, times.stime);
87 87 if (cputime_le(cval, ptime)) { /* about to fire */
88 88 cval = jiffies_to_cputime(1);
kernel/posix-cpu-timers.c
... ... @@ -230,6 +230,37 @@
230 230 return 0;
231 231 }
232 232  
  233 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
  234 +{
  235 + struct sighand_struct *sighand;
  236 + struct signal_struct *sig;
  237 + struct task_struct *t;
  238 +
  239 + *times = INIT_CPUTIME;
  240 +
  241 + rcu_read_lock();
  242 + sighand = rcu_dereference(tsk->sighand);
  243 + if (!sighand)
  244 + goto out;
  245 +
  246 + sig = tsk->signal;
  247 +
  248 + t = tsk;
  249 + do {
  250 + times->utime = cputime_add(times->utime, t->utime);
  251 + times->stime = cputime_add(times->stime, t->stime);
  252 + times->sum_exec_runtime += t->se.sum_exec_runtime;
  253 +
  254 + t = next_thread(t);
  255 + } while (t != tsk);
  256 +
  257 + times->utime = cputime_add(times->utime, sig->utime);
  258 + times->stime = cputime_add(times->stime, sig->stime);
  259 + times->sum_exec_runtime += sig->sum_sched_runtime;
  260 +out:
  261 + rcu_read_unlock();
  262 +}
  263 +
233 264 /*
234 265 * Sample a process (thread group) clock for the given group_leader task.
235 266 * Must be called with tasklist_lock held for reading.
... ... @@ -476,6 +507,29 @@
476 507 }
477 508  
478 509 /*
  510 + * Enable the process wide cpu timer accounting.
  511 + *
  512 + * serialized using ->sighand->siglock
  513 + */
  514 +static void start_process_timers(struct task_struct *tsk)
  515 +{
  516 + tsk->signal->cputimer.running = 1;
  517 + barrier();
  518 +}
  519 +
  520 +/*
  521 + * Release the process wide timer accounting -- timer stops ticking when
  522 + * nobody cares about it.
  523 + *
  524 + * serialized using ->sighand->siglock
  525 + */
  526 +static void stop_process_timers(struct task_struct *tsk)
  527 +{
  528 + tsk->signal->cputimer.running = 0;
  529 + barrier();
  530 +}
  531 +
  532 +/*
479 533 * Insert the timer on the appropriate list before any timers that
480 534 * expire later. This must be called with the tasklist_lock held
481 535 * for reading, and interrupts disabled.
... ... @@ -495,6 +549,9 @@
495 549 BUG_ON(!irqs_disabled());
496 550 spin_lock(&p->sighand->siglock);
497 551  
  552 + if (!CPUCLOCK_PERTHREAD(timer->it_clock))
  553 + start_process_timers(p);
  554 +
498 555 listpos = head;
499 556 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
500 557 list_for_each_entry(next, head, entry) {
... ... @@ -987,13 +1044,15 @@
987 1044 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
988 1045 list_empty(&timers[CPUCLOCK_VIRT]) &&
989 1046 cputime_eq(sig->it_virt_expires, cputime_zero) &&
990   - list_empty(&timers[CPUCLOCK_SCHED]))
  1047 + list_empty(&timers[CPUCLOCK_SCHED])) {
  1048 + stop_process_timers(tsk);
991 1049 return;
  1050 + }
992 1051  
993 1052 /*
994 1053 * Collect the current process totals.
995 1054 */
996   - thread_group_cputime(tsk, &cputime);
  1055 + thread_group_cputimer(tsk, &cputime);
997 1056 utime = cputime.utime;
998 1057 ptime = cputime_add(utime, cputime.stime);
999 1058 sum_sched_runtime = cputime.sum_exec_runtime;
... ... @@ -1259,7 +1318,7 @@
1259 1318 if (!task_cputime_zero(&sig->cputime_expires)) {
1260 1319 struct task_cputime group_sample;
1261 1320  
1262   - thread_group_cputime(tsk, &group_sample);
  1321 + thread_group_cputimer(tsk, &group_sample);
1263 1322 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1264 1323 return 1;
1265 1324 }
... ... @@ -1329,6 +1388,33 @@
1329 1388 }
1330 1389  
1331 1390 /*
  1391 + * Sample a process (thread group) timer for the given group_leader task.
  1392 + * Must be called with tasklist_lock held for reading.
  1393 + */
  1394 +static int cpu_timer_sample_group(const clockid_t which_clock,
  1395 + struct task_struct *p,
  1396 + union cpu_time_count *cpu)
  1397 +{
  1398 + struct task_cputime cputime;
  1399 +
  1400 + thread_group_cputimer(p, &cputime);
  1401 + switch (CPUCLOCK_WHICH(which_clock)) {
  1402 + default:
  1403 + return -EINVAL;
  1404 + case CPUCLOCK_PROF:
  1405 + cpu->cpu = cputime_add(cputime.utime, cputime.stime);
  1406 + break;
  1407 + case CPUCLOCK_VIRT:
  1408 + cpu->cpu = cputime.utime;
  1409 + break;
  1410 + case CPUCLOCK_SCHED:
  1411 + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
  1412 + break;
  1413 + }
  1414 + return 0;
  1415 +}
  1416 +
  1417 +/*
1332 1418 * Set one of the process-wide special case CPU timers.
1333 1419 * The tsk->sighand->siglock must be held by the caller.
1334 1420 * The *newval argument is relative and we update it to be absolute, *oldval
... ... @@ -1341,7 +1427,8 @@
1341 1427 struct list_head *head;
1342 1428  
1343 1429 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1344   - cpu_clock_sample_group(clock_idx, tsk, &now);
  1430 + start_process_timers(tsk);
  1431 + cpu_timer_sample_group(clock_idx, tsk, &now);
1345 1432  
1346 1433 if (oldval) {
1347 1434 if (!cputime_eq(*oldval, cputime_zero)) {
kernel/sched_stats.h
... ... @@ -296,19 +296,21 @@
296 296 static inline void account_group_user_time(struct task_struct *tsk,
297 297 cputime_t cputime)
298 298 {
299   - struct task_cputime *times;
300   - struct signal_struct *sig;
  299 + struct thread_group_cputimer *cputimer;
301 300  
302 301 /* tsk == current, ensure it is safe to use ->signal */
303 302 if (unlikely(tsk->exit_state))
304 303 return;
305 304  
306   - sig = tsk->signal;
307   - times = &sig->cputime.totals;
  305 + cputimer = &tsk->signal->cputimer;
308 306  
309   - spin_lock(&times->lock);
310   - times->utime = cputime_add(times->utime, cputime);
311   - spin_unlock(&times->lock);
  307 + if (!cputimer->running)
  308 + return;
  309 +
  310 + spin_lock(&cputimer->lock);
  311 + cputimer->cputime.utime =
  312 + cputime_add(cputimer->cputime.utime, cputime);
  313 + spin_unlock(&cputimer->lock);
312 314 }
313 315  
314 316 /**
... ... @@ -324,19 +326,21 @@
324 326 static inline void account_group_system_time(struct task_struct *tsk,
325 327 cputime_t cputime)
326 328 {
327   - struct task_cputime *times;
328   - struct signal_struct *sig;
  329 + struct thread_group_cputimer *cputimer;
329 330  
330 331 /* tsk == current, ensure it is safe to use ->signal */
331 332 if (unlikely(tsk->exit_state))
332 333 return;
333 334  
334   - sig = tsk->signal;
335   - times = &sig->cputime.totals;
  335 + cputimer = &tsk->signal->cputimer;
336 336  
337   - spin_lock(&times->lock);
338   - times->stime = cputime_add(times->stime, cputime);
339   - spin_unlock(&times->lock);
  337 + if (!cputimer->running)
  338 + return;
  339 +
  340 + spin_lock(&cputimer->lock);
  341 + cputimer->cputime.stime =
  342 + cputime_add(cputimer->cputime.stime, cputime);
  343 + spin_unlock(&cputimer->lock);
340 344 }
341 345  
342 346 /**
... ... @@ -352,7 +356,7 @@
352 356 static inline void account_group_exec_runtime(struct task_struct *tsk,
353 357 unsigned long long ns)
354 358 {
355   - struct task_cputime *times;
  359 + struct thread_group_cputimer *cputimer;
356 360 struct signal_struct *sig;
357 361  
358 362 sig = tsk->signal;
359 363  
... ... @@ -361,10 +365,13 @@
361 365 if (unlikely(!sig))
362 366 return;
363 367  
364   - times = &sig->cputime.totals;
  368 + cputimer = &sig->cputimer;
365 369  
366   - spin_lock(&times->lock);
367   - times->sum_exec_runtime += ns;
368   - spin_unlock(&times->lock);
  370 + if (!cputimer->running)
  371 + return;
  372 +
  373 + spin_lock(&cputimer->lock);
  374 + cputimer->cputime.sum_exec_runtime += ns;
  375 + spin_unlock(&cputimer->lock);
369 376 }
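
Taken together, the hunks above give the per-tick accounting helpers a
lockless fast path and move all heavy work to the explicit users. A
schematic walkthrough of the resulting lifecycle (an illustrative summary,
not verbatim kernel code):

/*
 * arm a process-wide CPU timer               [under ->sighand->siglock]
 *     arm_timer() / set_process_cpu_timer()
 *         start_process_timers()  ->  cputimer.running = 1
 *
 * scheduler tick, on every CPU:
 *     account_group_{user,system}_time(), account_group_exec_runtime()
 *         if (!cputimer.running) return;     <- new fast path, no locking
 *         else accumulate into cputimer.cputime under cputimer.lock
 *
 * run_posix_cpu_timers() -> check_process_timers():
 *     once no process-wide timers remain armed
 *         stop_process_timers()  ->  cputimer.running = 0
 *
 * sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) bypasses all of this: it
 * ends up in thread_group_cputime(), summing utime/stime/sum_exec_runtime
 * over all live threads under rcu_read_lock().
 */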