Commit 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239
Committed by
Ingo Molnar
1 parent
d99ca3b977
Exists in
master
and in
4 other branches
sched, cputime: Introduce thread_group_times()
This is a real fix for the problem of utime/stime values decreasing, described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after being adjusted by task_times(). - When all threads in a thread_group exit, the accumulated {u,s}time (and also c{u,s}time) in the signal struct are added to the c{u,s}time in the signal struct of the group's parent. So {u,s}time in the task struct are "raw" tick counts, while {u,s}time and c{u,s}time in the signal struct are "adjusted" values. And the accounted values are used by: - task_times(), to get the cputime of a thread: This function returns adjusted values that originate from the raw {u,s}time, scaled by the sum_exec_runtime that is accounted by CFS. - thread_group_cputime(), to get the cputime of a thread group: This function returns the sum of the {u,s}time of all living threads in the group, plus the {u,s}time in the signal struct, which is the sum of the adjusted cputimes of all exited threads that belonged to the group. The problem is the return value of thread_group_cputime(), because it is a mixed sum of "raw" and "adjusted" values: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume there is a thread whose raw values are greater than its adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only running for 45ms); if it exits, the cputime will decrease (e.g. by 5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it to every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread exit (= __exit_signal()) not to use task_times(). This means that {u,s}time in the signal struct accumulate raw values instead of adjusted values. 
As a result, thread_group_cputime() returns a pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts the "raw" values of thread_group_cputime() to "adjusted" values, using the same calculation procedure as task_times(). - Modify group exit (= wait_task_zombie()) to use the newly introduced thread_group_times(). This makes c{u,s}time in the signal struct hold adjusted values, as before this patch. - Replace some uses of thread_group_cputime() with thread_group_times(). These replacements are only applied where the "adjusted" cputime is conveyed to users, and where task_times() is already used nearby. (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.) This patch has a positive side effect: - Before this patch, if a group contained many short-lived threads (e.g. each running for 0.9ms and not interrupted by ticks), the group's cputime could be invisible, since each thread's cputime was accumulated after being adjusted: imagining the adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch that no longer happens, because the adjustment is applied after accumulation. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Spencer Candland <spencer@bluehost.com> Cc: Americo Wang <xiyou.wangcong@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Stanislaw Gruszka <sgruszka@redhat.com> LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 6 changed files with 69 additions and 25 deletions Side-by-side Diff
fs/proc/array.c
... | ... | @@ -506,7 +506,6 @@ |
506 | 506 | |
507 | 507 | /* add up live thread stats at the group level */ |
508 | 508 | if (whole) { |
509 | - struct task_cputime cputime; | |
510 | 509 | struct task_struct *t = task; |
511 | 510 | do { |
512 | 511 | min_flt += t->min_flt; |
... | ... | @@ -517,9 +516,7 @@ |
517 | 516 | |
518 | 517 | min_flt += sig->min_flt; |
519 | 518 | maj_flt += sig->maj_flt; |
520 | - thread_group_cputime(task, &cputime); | |
521 | - utime = cputime.utime; | |
522 | - stime = cputime.stime; | |
519 | + thread_group_times(task, &utime, &stime); | |
523 | 520 | gtime = cputime_add(gtime, sig->gtime); |
524 | 521 | } |
525 | 522 |
include/linux/sched.h
... | ... | @@ -624,6 +624,9 @@ |
624 | 624 | cputime_t utime, stime, cutime, cstime; |
625 | 625 | cputime_t gtime; |
626 | 626 | cputime_t cgtime; |
627 | +#ifndef CONFIG_VIRT_CPU_ACCOUNTING | |
628 | + cputime_t prev_utime, prev_stime; | |
629 | +#endif | |
627 | 630 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
628 | 631 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
629 | 632 | unsigned long inblock, oublock, cinblock, coublock; |
... | ... | @@ -1723,6 +1726,7 @@ |
1723 | 1726 | } |
1724 | 1727 | |
1725 | 1728 | extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); |
1729 | +extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); | |
1726 | 1730 | |
1727 | 1731 | /* |
1728 | 1732 | * Per process flags |
kernel/exit.c
... | ... | @@ -91,8 +91,6 @@ |
91 | 91 | if (atomic_dec_and_test(&sig->count)) |
92 | 92 | posix_cpu_timers_exit_group(tsk); |
93 | 93 | else { |
94 | - cputime_t utime, stime; | |
95 | - | |
96 | 94 | /* |
97 | 95 | * If there is any task waiting for the group exit |
98 | 96 | * then notify it: |
... | ... | @@ -112,9 +110,8 @@ |
112 | 110 | * We won't ever get here for the group leader, since it |
113 | 111 | * will have been the last reference on the signal_struct. |
114 | 112 | */ |
115 | - task_times(tsk, &utime, &stime); | |
116 | - sig->utime = cputime_add(sig->utime, utime); | |
117 | - sig->stime = cputime_add(sig->stime, stime); | |
113 | + sig->utime = cputime_add(sig->utime, tsk->utime); | |
114 | + sig->stime = cputime_add(sig->stime, tsk->stime); | |
118 | 115 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); |
119 | 116 | sig->min_flt += tsk->min_flt; |
120 | 117 | sig->maj_flt += tsk->maj_flt; |
... | ... | @@ -1208,6 +1205,7 @@ |
1208 | 1205 | struct signal_struct *psig; |
1209 | 1206 | struct signal_struct *sig; |
1210 | 1207 | unsigned long maxrss; |
1208 | + cputime_t tgutime, tgstime; | |
1211 | 1209 | |
1212 | 1210 | /* |
1213 | 1211 | * The resource counters for the group leader are in its |
1214 | 1212 | |
1215 | 1213 | |
1216 | 1214 | |
... | ... | @@ -1223,20 +1221,23 @@ |
1223 | 1221 | * need to protect the access to parent->signal fields, |
1224 | 1222 | * as other threads in the parent group can be right |
1225 | 1223 | * here reaping other children at the same time. |
1224 | + * | |
1225 | + * We use thread_group_times() to get times for the thread | |
1226 | + * group, which consolidates times for all threads in the | |
1227 | + * group including the group leader. | |
1226 | 1228 | */ |
1229 | + thread_group_times(p, &tgutime, &tgstime); | |
1227 | 1230 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1228 | 1231 | psig = p->real_parent->signal; |
1229 | 1232 | sig = p->signal; |
1230 | 1233 | psig->cutime = |
1231 | 1234 | cputime_add(psig->cutime, |
1232 | - cputime_add(p->utime, | |
1233 | - cputime_add(sig->utime, | |
1234 | - sig->cutime))); | |
1235 | + cputime_add(tgutime, | |
1236 | + sig->cutime)); | |
1235 | 1237 | psig->cstime = |
1236 | 1238 | cputime_add(psig->cstime, |
1237 | - cputime_add(p->stime, | |
1238 | - cputime_add(sig->stime, | |
1239 | - sig->cstime))); | |
1239 | + cputime_add(tgstime, | |
1240 | + sig->cstime)); | |
1240 | 1241 | psig->cgtime = |
1241 | 1242 | cputime_add(psig->cgtime, |
1242 | 1243 | cputime_add(p->gtime, |
kernel/fork.c
... | ... | @@ -884,6 +884,9 @@ |
884 | 884 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; |
885 | 885 | sig->gtime = cputime_zero; |
886 | 886 | sig->cgtime = cputime_zero; |
887 | +#ifndef CONFIG_VIRT_CPU_ACCOUNTING | |
888 | + sig->prev_utime = sig->prev_stime = cputime_zero; | |
889 | +#endif | |
887 | 890 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
888 | 891 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
889 | 892 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
kernel/sched.c
... | ... | @@ -5187,6 +5187,16 @@ |
5187 | 5187 | *ut = p->utime; |
5188 | 5188 | *st = p->stime; |
5189 | 5189 | } |
5190 | + | |
5191 | +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |
5192 | +{ | |
5193 | + struct task_cputime cputime; | |
5194 | + | |
5195 | + thread_group_cputime(p, &cputime); | |
5196 | + | |
5197 | + *ut = cputime.utime; | |
5198 | + *st = cputime.stime; | |
5199 | +} | |
5190 | 5200 | #else |
5191 | 5201 | |
5192 | 5202 | #ifndef nsecs_to_cputime |
... | ... | @@ -5219,6 +5229,37 @@ |
5219 | 5229 | |
5220 | 5230 | *ut = p->prev_utime; |
5221 | 5231 | *st = p->prev_stime; |
5232 | +} | |
5233 | + | |
5234 | +/* | |
5235 | + * Must be called with siglock held. | |
5236 | + */ | |
5237 | +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |
5238 | +{ | |
5239 | + struct signal_struct *sig = p->signal; | |
5240 | + struct task_cputime cputime; | |
5241 | + cputime_t rtime, utime, total; | |
5242 | + | |
5243 | + thread_group_cputime(p, &cputime); | |
5244 | + | |
5245 | + total = cputime_add(cputime.utime, cputime.stime); | |
5246 | + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | |
5247 | + | |
5248 | + if (total) { | |
5249 | + u64 temp; | |
5250 | + | |
5251 | + temp = (u64)(rtime * cputime.utime); | |
5252 | + do_div(temp, total); | |
5253 | + utime = (cputime_t)temp; | |
5254 | + } else | |
5255 | + utime = rtime; | |
5256 | + | |
5257 | + sig->prev_utime = max(sig->prev_utime, utime); | |
5258 | + sig->prev_stime = max(sig->prev_stime, | |
5259 | + cputime_sub(rtime, sig->prev_utime)); | |
5260 | + | |
5261 | + *ut = sig->prev_utime; | |
5262 | + *st = sig->prev_stime; | |
5222 | 5263 | } |
5223 | 5264 | #endif |
5224 | 5265 |
kernel/sys.c
... | ... | @@ -911,16 +911,15 @@ |
911 | 911 | |
912 | 912 | void do_sys_times(struct tms *tms) |
913 | 913 | { |
914 | - struct task_cputime cputime; | |
915 | - cputime_t cutime, cstime; | |
914 | + cputime_t tgutime, tgstime, cutime, cstime; | |
916 | 915 | |
917 | - thread_group_cputime(current, &cputime); | |
918 | 916 | spin_lock_irq(¤t->sighand->siglock); |
917 | + thread_group_times(current, &tgutime, &tgstime); | |
919 | 918 | cutime = current->signal->cutime; |
920 | 919 | cstime = current->signal->cstime; |
921 | 920 | spin_unlock_irq(¤t->sighand->siglock); |
922 | - tms->tms_utime = cputime_to_clock_t(cputime.utime); | |
923 | - tms->tms_stime = cputime_to_clock_t(cputime.stime); | |
921 | + tms->tms_utime = cputime_to_clock_t(tgutime); | |
922 | + tms->tms_stime = cputime_to_clock_t(tgstime); | |
924 | 923 | tms->tms_cutime = cputime_to_clock_t(cutime); |
925 | 924 | tms->tms_cstime = cputime_to_clock_t(cstime); |
926 | 925 | } |
... | ... | @@ -1338,8 +1337,7 @@ |
1338 | 1337 | { |
1339 | 1338 | struct task_struct *t; |
1340 | 1339 | unsigned long flags; |
1341 | - cputime_t utime, stime; | |
1342 | - struct task_cputime cputime; | |
1340 | + cputime_t tgutime, tgstime, utime, stime; | |
1343 | 1341 | unsigned long maxrss = 0; |
1344 | 1342 | |
1345 | 1343 | memset((char *) r, 0, sizeof *r); |
... | ... | @@ -1372,9 +1370,9 @@ |
1372 | 1370 | break; |
1373 | 1371 | |
1374 | 1372 | case RUSAGE_SELF: |
1375 | - thread_group_cputime(p, &cputime); | |
1376 | - utime = cputime_add(utime, cputime.utime); | |
1377 | - stime = cputime_add(stime, cputime.stime); | |
1373 | + thread_group_times(p, &tgutime, &tgstime); | |
1374 | + utime = cputime_add(utime, tgutime); | |
1375 | + stime = cputime_add(stime, tgstime); | |
1378 | 1376 | r->ru_nvcsw += p->signal->nvcsw; |
1379 | 1377 | r->ru_nivcsw += p->signal->nivcsw; |
1380 | 1378 | r->ru_minflt += p->signal->min_flt; |