Commit 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53
Committed by
Ingo Molnar
1 parent
32bd671d6c
Exists in
master
and in
4 other branches
timers: split process wide cpu clocks/timers
Change the process-wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- i.e. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicitly used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run off a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 5 changed files with 155 additions and 54 deletions Side-by-side Diff
include/linux/init_task.h
... | ... | @@ -48,12 +48,11 @@ |
48 | 48 | .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ |
49 | 49 | .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ |
50 | 50 | .rlim = INIT_RLIMITS, \ |
51 | - .cputime = { .totals = { \ | |
52 | - .utime = cputime_zero, \ | |
53 | - .stime = cputime_zero, \ | |
54 | - .sum_exec_runtime = 0, \ | |
55 | - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ | |
56 | - }, }, \ | |
51 | + .cputimer = { \ | |
52 | + .cputime = INIT_CPUTIME, \ | |
53 | + .running = 0, \ | |
54 | + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ | |
55 | + }, \ | |
57 | 56 | } |
58 | 57 | |
59 | 58 | extern struct nsproxy init_nsproxy; |
include/linux/sched.h
... | ... | @@ -443,7 +443,6 @@ |
443 | 443 | * @utime: time spent in user mode, in &cputime_t units |
444 | 444 | * @stime: time spent in kernel mode, in &cputime_t units |
445 | 445 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds |
446 | - * @lock: lock for fields in this struct | |
447 | 446 | * |
448 | 447 | * This structure groups together three kinds of CPU time that are |
449 | 448 | * tracked for threads and thread groups. Most things considering |
450 | 449 | |
451 | 450 | |
452 | 451 | |
453 | 452 | |
... | ... | @@ -454,23 +453,33 @@ |
454 | 453 | cputime_t utime; |
455 | 454 | cputime_t stime; |
456 | 455 | unsigned long long sum_exec_runtime; |
457 | - spinlock_t lock; | |
458 | 456 | }; |
459 | 457 | /* Alternate field names when used to cache expirations. */ |
460 | 458 | #define prof_exp stime |
461 | 459 | #define virt_exp utime |
462 | 460 | #define sched_exp sum_exec_runtime |
463 | 461 | |
462 | +#define INIT_CPUTIME \ | |
463 | + (struct task_cputime) { \ | |
464 | + .utime = cputime_zero, \ | |
465 | + .stime = cputime_zero, \ | |
466 | + .sum_exec_runtime = 0, \ | |
467 | + } | |
468 | + | |
464 | 469 | /** |
465 | - * struct thread_group_cputime - thread group interval timer counts | |
466 | - * @totals: thread group interval timers; substructure for | |
467 | - * uniprocessor kernel, per-cpu for SMP kernel. | |
470 | + * struct thread_group_cputimer - thread group interval timer counts | |
471 | + * @cputime: thread group interval timers. | |
472 | + * @running: non-zero when there are timers running and | |
473 | + * @cputime receives updates. | |
474 | + * @lock: lock for fields in this struct. | |
468 | 475 | * |
469 | 476 | * This structure contains the version of task_cputime, above, that is |
470 | - * used for thread group CPU clock calculations. | |
477 | + * used for thread group CPU timer calculations. | |
471 | 478 | */ |
472 | -struct thread_group_cputime { | |
473 | - struct task_cputime totals; | |
479 | +struct thread_group_cputimer { | |
480 | + struct task_cputime cputime; | |
481 | + int running; | |
482 | + spinlock_t lock; | |
474 | 483 | }; |
475 | 484 | |
476 | 485 | /* |
477 | 486 | |
... | ... | @@ -519,10 +528,10 @@ |
519 | 528 | cputime_t it_prof_incr, it_virt_incr; |
520 | 529 | |
521 | 530 | /* |
522 | - * Thread group totals for process CPU clocks. | |
523 | - * See thread_group_cputime(), et al, for details. | |
531 | + * Thread group totals for process CPU timers. | |
532 | + * See thread_group_cputimer(), et al, for details. | |
524 | 533 | */ |
525 | - struct thread_group_cputime cputime; | |
534 | + struct thread_group_cputimer cputimer; | |
526 | 535 | |
527 | 536 | /* Earliest-expiration cache. */ |
528 | 537 | struct task_cputime cputime_expires; |
529 | 538 | |
530 | 539 | |
531 | 540 | |
532 | 541 | |
... | ... | @@ -2191,27 +2200,26 @@ |
2191 | 2200 | /* |
2192 | 2201 | * Thread group CPU time accounting. |
2193 | 2202 | */ |
2203 | +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); | |
2194 | 2204 | |
2195 | 2205 | static inline |
2196 | -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |
2206 | +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |
2197 | 2207 | { |
2198 | - struct task_cputime *totals = &tsk->signal->cputime.totals; | |
2208 | + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | |
2199 | 2209 | unsigned long flags; |
2200 | 2210 | |
2201 | - spin_lock_irqsave(&totals->lock, flags); | |
2202 | - *times = *totals; | |
2203 | - spin_unlock_irqrestore(&totals->lock, flags); | |
2211 | + WARN_ON(!cputimer->running); | |
2212 | + | |
2213 | + spin_lock_irqsave(&cputimer->lock, flags); | |
2214 | + *times = cputimer->cputime; | |
2215 | + spin_unlock_irqrestore(&cputimer->lock, flags); | |
2204 | 2216 | } |
2205 | 2217 | |
2206 | 2218 | static inline void thread_group_cputime_init(struct signal_struct *sig) |
2207 | 2219 | { |
2208 | - sig->cputime.totals = (struct task_cputime){ | |
2209 | - .utime = cputime_zero, | |
2210 | - .stime = cputime_zero, | |
2211 | - .sum_exec_runtime = 0, | |
2212 | - }; | |
2213 | - | |
2214 | - spin_lock_init(&sig->cputime.totals.lock); | |
2220 | + sig->cputimer.cputime = INIT_CPUTIME; | |
2221 | + spin_lock_init(&sig->cputimer.lock); | |
2222 | + sig->cputimer.running = 0; | |
2215 | 2223 | } |
2216 | 2224 | |
2217 | 2225 | static inline void thread_group_cputime_free(struct signal_struct *sig) |
kernel/itimer.c
... | ... | @@ -62,7 +62,7 @@ |
62 | 62 | struct task_cputime cputime; |
63 | 63 | cputime_t utime; |
64 | 64 | |
65 | - thread_group_cputime(tsk, &cputime); | |
65 | + thread_group_cputimer(tsk, &cputime); | |
66 | 66 | utime = cputime.utime; |
67 | 67 | if (cputime_le(cval, utime)) { /* about to fire */ |
68 | 68 | cval = jiffies_to_cputime(1); |
... | ... | @@ -82,7 +82,7 @@ |
82 | 82 | struct task_cputime times; |
83 | 83 | cputime_t ptime; |
84 | 84 | |
85 | - thread_group_cputime(tsk, ×); | |
85 | + thread_group_cputimer(tsk, ×); | |
86 | 86 | ptime = cputime_add(times.utime, times.stime); |
87 | 87 | if (cputime_le(cval, ptime)) { /* about to fire */ |
88 | 88 | cval = jiffies_to_cputime(1); |
kernel/posix-cpu-timers.c
... | ... | @@ -230,6 +230,37 @@ |
230 | 230 | return 0; |
231 | 231 | } |
232 | 232 | |
233 | +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |
234 | +{ | |
235 | + struct sighand_struct *sighand; | |
236 | + struct signal_struct *sig; | |
237 | + struct task_struct *t; | |
238 | + | |
239 | + *times = INIT_CPUTIME; | |
240 | + | |
241 | + rcu_read_lock(); | |
242 | + sighand = rcu_dereference(tsk->sighand); | |
243 | + if (!sighand) | |
244 | + goto out; | |
245 | + | |
246 | + sig = tsk->signal; | |
247 | + | |
248 | + t = tsk; | |
249 | + do { | |
250 | + times->utime = cputime_add(times->utime, t->utime); | |
251 | + times->stime = cputime_add(times->stime, t->stime); | |
252 | + times->sum_exec_runtime += t->se.sum_exec_runtime; | |
253 | + | |
254 | + t = next_thread(t); | |
255 | + } while (t != tsk); | |
256 | + | |
257 | + times->utime = cputime_add(times->utime, sig->utime); | |
258 | + times->stime = cputime_add(times->stime, sig->stime); | |
259 | + times->sum_exec_runtime += sig->sum_sched_runtime; | |
260 | +out: | |
261 | + rcu_read_unlock(); | |
262 | +} | |
263 | + | |
233 | 264 | /* |
234 | 265 | * Sample a process (thread group) clock for the given group_leader task. |
235 | 266 | * Must be called with tasklist_lock held for reading. |
... | ... | @@ -476,6 +507,29 @@ |
476 | 507 | } |
477 | 508 | |
478 | 509 | /* |
510 | + * Enable the process wide cpu timer accounting. | |
511 | + * | |
512 | + * serialized using ->sighand->siglock | |
513 | + */ | |
514 | +static void start_process_timers(struct task_struct *tsk) | |
515 | +{ | |
516 | + tsk->signal->cputimer.running = 1; | |
517 | + barrier(); | |
518 | +} | |
519 | + | |
520 | +/* | |
521 | + * Release the process wide timer accounting -- timer stops ticking when | |
522 | + * nobody cares about it. | |
523 | + * | |
524 | + * serialized using ->sighand->siglock | |
525 | + */ | |
526 | +static void stop_process_timers(struct task_struct *tsk) | |
527 | +{ | |
528 | + tsk->signal->cputimer.running = 0; | |
529 | + barrier(); | |
530 | +} | |
531 | + | |
532 | +/* | |
479 | 533 | * Insert the timer on the appropriate list before any timers that |
480 | 534 | * expire later. This must be called with the tasklist_lock held |
481 | 535 | * for reading, and interrupts disabled. |
... | ... | @@ -495,6 +549,9 @@ |
495 | 549 | BUG_ON(!irqs_disabled()); |
496 | 550 | spin_lock(&p->sighand->siglock); |
497 | 551 | |
552 | + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) | |
553 | + start_process_timers(p); | |
554 | + | |
498 | 555 | listpos = head; |
499 | 556 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { |
500 | 557 | list_for_each_entry(next, head, entry) { |
501 | 558 | |
502 | 559 | |
... | ... | @@ -987,13 +1044,15 @@ |
987 | 1044 | sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && |
988 | 1045 | list_empty(&timers[CPUCLOCK_VIRT]) && |
989 | 1046 | cputime_eq(sig->it_virt_expires, cputime_zero) && |
990 | - list_empty(&timers[CPUCLOCK_SCHED])) | |
1047 | + list_empty(&timers[CPUCLOCK_SCHED])) { | |
1048 | + stop_process_timers(tsk); | |
991 | 1049 | return; |
1050 | + } | |
992 | 1051 | |
993 | 1052 | /* |
994 | 1053 | * Collect the current process totals. |
995 | 1054 | */ |
996 | - thread_group_cputime(tsk, &cputime); | |
1055 | + thread_group_cputimer(tsk, &cputime); | |
997 | 1056 | utime = cputime.utime; |
998 | 1057 | ptime = cputime_add(utime, cputime.stime); |
999 | 1058 | sum_sched_runtime = cputime.sum_exec_runtime; |
... | ... | @@ -1259,7 +1318,7 @@ |
1259 | 1318 | if (!task_cputime_zero(&sig->cputime_expires)) { |
1260 | 1319 | struct task_cputime group_sample; |
1261 | 1320 | |
1262 | - thread_group_cputime(tsk, &group_sample); | |
1321 | + thread_group_cputimer(tsk, &group_sample); | |
1263 | 1322 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1264 | 1323 | return 1; |
1265 | 1324 | } |
... | ... | @@ -1329,6 +1388,33 @@ |
1329 | 1388 | } |
1330 | 1389 | |
1331 | 1390 | /* |
1391 | + * Sample a process (thread group) timer for the given group_leader task. | |
1392 | + * Must be called with tasklist_lock held for reading. | |
1393 | + */ | |
1394 | +static int cpu_timer_sample_group(const clockid_t which_clock, | |
1395 | + struct task_struct *p, | |
1396 | + union cpu_time_count *cpu) | |
1397 | +{ | |
1398 | + struct task_cputime cputime; | |
1399 | + | |
1400 | + thread_group_cputimer(p, &cputime); | |
1401 | + switch (CPUCLOCK_WHICH(which_clock)) { | |
1402 | + default: | |
1403 | + return -EINVAL; | |
1404 | + case CPUCLOCK_PROF: | |
1405 | + cpu->cpu = cputime_add(cputime.utime, cputime.stime); | |
1406 | + break; | |
1407 | + case CPUCLOCK_VIRT: | |
1408 | + cpu->cpu = cputime.utime; | |
1409 | + break; | |
1410 | + case CPUCLOCK_SCHED: | |
1411 | + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); | |
1412 | + break; | |
1413 | + } | |
1414 | + return 0; | |
1415 | +} | |
1416 | + | |
1417 | +/* | |
1332 | 1418 | * Set one of the process-wide special case CPU timers. |
1333 | 1419 | * The tsk->sighand->siglock must be held by the caller. |
1334 | 1420 | * The *newval argument is relative and we update it to be absolute, *oldval |
... | ... | @@ -1341,7 +1427,8 @@ |
1341 | 1427 | struct list_head *head; |
1342 | 1428 | |
1343 | 1429 | BUG_ON(clock_idx == CPUCLOCK_SCHED); |
1344 | - cpu_clock_sample_group(clock_idx, tsk, &now); | |
1430 | + start_process_timers(tsk); | |
1431 | + cpu_timer_sample_group(clock_idx, tsk, &now); | |
1345 | 1432 | |
1346 | 1433 | if (oldval) { |
1347 | 1434 | if (!cputime_eq(*oldval, cputime_zero)) { |
kernel/sched_stats.h
... | ... | @@ -296,19 +296,21 @@ |
296 | 296 | static inline void account_group_user_time(struct task_struct *tsk, |
297 | 297 | cputime_t cputime) |
298 | 298 | { |
299 | - struct task_cputime *times; | |
300 | - struct signal_struct *sig; | |
299 | + struct thread_group_cputimer *cputimer; | |
301 | 300 | |
302 | 301 | /* tsk == current, ensure it is safe to use ->signal */ |
303 | 302 | if (unlikely(tsk->exit_state)) |
304 | 303 | return; |
305 | 304 | |
306 | - sig = tsk->signal; | |
307 | - times = &sig->cputime.totals; | |
305 | + cputimer = &tsk->signal->cputimer; | |
308 | 306 | |
309 | - spin_lock(×->lock); | |
310 | - times->utime = cputime_add(times->utime, cputime); | |
311 | - spin_unlock(×->lock); | |
307 | + if (!cputimer->running) | |
308 | + return; | |
309 | + | |
310 | + spin_lock(&cputimer->lock); | |
311 | + cputimer->cputime.utime = | |
312 | + cputime_add(cputimer->cputime.utime, cputime); | |
313 | + spin_unlock(&cputimer->lock); | |
312 | 314 | } |
313 | 315 | |
314 | 316 | /** |
315 | 317 | |
316 | 318 | |
... | ... | @@ -324,19 +326,21 @@ |
324 | 326 | static inline void account_group_system_time(struct task_struct *tsk, |
325 | 327 | cputime_t cputime) |
326 | 328 | { |
327 | - struct task_cputime *times; | |
328 | - struct signal_struct *sig; | |
329 | + struct thread_group_cputimer *cputimer; | |
329 | 330 | |
330 | 331 | /* tsk == current, ensure it is safe to use ->signal */ |
331 | 332 | if (unlikely(tsk->exit_state)) |
332 | 333 | return; |
333 | 334 | |
334 | - sig = tsk->signal; | |
335 | - times = &sig->cputime.totals; | |
335 | + cputimer = &tsk->signal->cputimer; | |
336 | 336 | |
337 | - spin_lock(×->lock); | |
338 | - times->stime = cputime_add(times->stime, cputime); | |
339 | - spin_unlock(×->lock); | |
337 | + if (!cputimer->running) | |
338 | + return; | |
339 | + | |
340 | + spin_lock(&cputimer->lock); | |
341 | + cputimer->cputime.stime = | |
342 | + cputime_add(cputimer->cputime.stime, cputime); | |
343 | + spin_unlock(&cputimer->lock); | |
340 | 344 | } |
341 | 345 | |
342 | 346 | /** |
... | ... | @@ -352,7 +356,7 @@ |
352 | 356 | static inline void account_group_exec_runtime(struct task_struct *tsk, |
353 | 357 | unsigned long long ns) |
354 | 358 | { |
355 | - struct task_cputime *times; | |
359 | + struct thread_group_cputimer *cputimer; | |
356 | 360 | struct signal_struct *sig; |
357 | 361 | |
358 | 362 | sig = tsk->signal; |
359 | 363 | |
... | ... | @@ -361,10 +365,13 @@ |
361 | 365 | if (unlikely(!sig)) |
362 | 366 | return; |
363 | 367 | |
364 | - times = &sig->cputime.totals; | |
368 | + cputimer = &sig->cputimer; | |
365 | 369 | |
366 | - spin_lock(×->lock); | |
367 | - times->sum_exec_runtime += ns; | |
368 | - spin_unlock(×->lock); | |
370 | + if (!cputimer->running) | |
371 | + return; | |
372 | + | |
373 | + spin_lock(&cputimer->lock); | |
374 | + cputimer->cputime.sum_exec_runtime += ns; | |
375 | + spin_unlock(&cputimer->lock); | |
369 | 376 | } |