Commit 74f5187ac873042f502227701ed1727e7c5fbfa9

Authored by Peter Zijlstra
Committed by Ingo Molnar
1 parent 09a40af524

sched: Cure load average vs NO_HZ woes

Chase reported that because we decrement calc_load_tasks prematurely
(before the next LOAD_FREQ sample), the load average can be skewed
by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by accounting
the delta of a CPU going into NO_HZ idle separately and folding it
back in on the next LOAD_FREQ update.

This restores the balance and we get strict LOAD_FREQ period samples.
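
For context, the calc_load_tasks count sampled every LOAD_FREQ ticks
feeds the classic fixed-point exponential average behind /proc/loadavg.
A minimal sketch of that consumer side, using the constants published
in include/linux/sched.h (the helper shape is illustrative context for
this patch, not part of its diff):

    /*
     * Fixed-point exponential moving average used for the 1/5/15
     * minute load figures (constants from include/linux/sched.h).
     */
    #define FSHIFT  11                  /* bits of fractional precision */
    #define FIXED_1 (1 << FSHIFT)       /* 1.0 in fixed-point */
    #define EXP_1   1884                /* FIXED_1/exp(5sec/1min) */
    #define EXP_5   2014                /* FIXED_1/exp(5sec/5min) */
    #define EXP_15  2037                /* FIXED_1/exp(5sec/15min) */

    static unsigned long
    calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            return load >> FSHIFT;
    }

Each LOAD_FREQ interval the avenrun[] slots are updated through this
helper with an active count derived from calc_load_tasks, so a CPU
folding its delta early perturbs the sampled count for all three
averages at once; that is the skew described above.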

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 2 changed files with 68 additions and 15 deletions

kernel/sched.c
@@ -1815,7 +1815,7 @@
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -2950,6 +2950,61 @@
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+        long nr_active, delta = 0;
+
+        nr_active = this_rq->nr_running;
+        nr_active += (long) this_rq->nr_uninterruptible;
+
+        if (nr_active != this_rq->calc_load_active) {
+                delta = nr_active - this_rq->calc_load_active;
+                this_rq->calc_load_active = nr_active;
+        }
+
+        return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+        long delta;
+
+        delta = calc_load_fold_active(this_rq);
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+        long delta = 0;
+
+        /*
+         * Its got a race, we don't care...
+         */
+        if (atomic_long_read(&calc_load_tasks_idle))
+                delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+        return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+        return 0;
+}
+#endif
+
 /**
  * get_avenrun - get the load average array
  * @loads:	pointer to dest load array
@@ -2996,20 +3051,22 @@
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-        long nr_active, delta;
+        long delta;
 
-        nr_active = this_rq->nr_running;
-        nr_active += (long) this_rq->nr_uninterruptible;
+        if (time_before(jiffies, this_rq->calc_load_update))
+                return;
 
-        if (nr_active != this_rq->calc_load_active) {
-                delta = nr_active - this_rq->calc_load_active;
-                this_rq->calc_load_active = nr_active;
+        delta = calc_load_fold_active(this_rq);
+        delta += calc_load_fold_idle();
+        if (delta)
                 atomic_long_add(delta, &calc_load_tasks);
-        }
+
+        this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3041,10 +3098,7 @@
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
 
-        if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-                this_rq->calc_load_update += LOAD_FREQ;
-                calc_load_account_active(this_rq);
-        }
+        calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
kernel/sched_idletask.c
@@ -23,8 +23,7 @@
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
         schedstat_inc(rq, sched_goidle);
-        /* adjust the active tasks as we might go into a long sleep */
-        calc_load_account_active(rq);
+        calc_load_account_idle(rq);
         return rq->idle;
 }
 
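
Taken together: idle entry parks a CPU's delta in calc_load_tasks_idle,
and the next strictly periodic LOAD_FREQ tick folds it into
calc_load_tasks along with the sampling CPU's own delta. A minimal
single-threaded userspace sketch of that hand-off (no atomics or
jiffies gating; the struct and driver code are stand-ins, not kernel
code):

    #include <stdio.h>

    static long calc_load_tasks;            /* global, sampled count */
    static long calc_load_tasks_idle;       /* deltas parked at idle entry */

    struct rq {
            long nr_running;
            long calc_load_active;          /* last value folded per CPU */
    };

    static long calc_load_fold_active(struct rq *rq)
    {
            long delta = rq->nr_running - rq->calc_load_active;

            rq->calc_load_active = rq->nr_running;
            return delta;
    }

    /* CPU enters NO_HZ idle: park the delta instead of applying it */
    static void calc_load_account_idle(struct rq *rq)
    {
            calc_load_tasks_idle += calc_load_fold_active(rq);
    }

    /* strict LOAD_FREQ sample: fold own delta plus parked idle deltas */
    static void calc_load_account_active(struct rq *rq)
    {
            calc_load_tasks += calc_load_fold_active(rq)
                             + calc_load_tasks_idle;
            calc_load_tasks_idle = 0;
    }

    int main(void)
    {
            struct rq cpu0 = { 2, 2 };      /* 2 tasks, already counted */
            struct rq cpu1 = { 1, 1 };      /* 1 task, already counted  */

            calc_load_tasks = 3;

            cpu1.nr_running = 0;            /* cpu1's task blocks...    */
            calc_load_account_idle(&cpu1);  /* ...and cpu1 goes tickless */

            /* pre-patch, cpu1 would have decremented calc_load_tasks
             * right here, ahead of the LOAD_FREQ boundary */
            calc_load_account_active(&cpu0); /* LOAD_FREQ tick on cpu0 */
            printf("sampled calc_load_tasks = %ld\n", calc_load_tasks);
            return 0;                        /* prints 2 */
    }

The only behavioural change is when the -1 lands: with the patch it
lands exactly at the sample boundary, which is what "strict LOAD_FREQ
period samples" means above.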