Commit 095c0aa83e52d6c3dd7168610746703921f570af

Authored by Glauber Costa
Committed by Avi Kivity
1 parent e6e6685acc

sched: adjust scheduler cpu power for stolen time

This patch makes update_rq_clock() aware of steal time.
The mechanism of operation is no different from irq_time,
and follows the same principles. It lives behind its own CONFIG
option and can be compiled out independently of the rest of
steal time reporting. With it disabled, the scheduler will still
report steal time (that cannot be disabled), but won't use this
information for cpu power adjustments.
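
For instance, a guest config that keeps steal time reporting but
compiles the power adjustment out could look like this (illustrative
.config fragment; option names as introduced by this patch):

    CONFIG_PARAVIRT_GUEST=y
    CONFIG_PARAVIRT=y
    # CONFIG_PARAVIRT_TIME_ACCOUNTING is not set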

Every time update_rq_clock_task() is invoked, we query how much
time was stolen since the last call, and feed it into
sched_rt_avg_update().
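
In essence, the adjustment amounts to the following (a simplified
sketch of the update_rq_clock_task() hunk below; steal_adjust() is a
hypothetical helper name, and the static branch, the irq-time path,
and the NONTASK_POWER feature check are omitted for clarity):

    /* Sketch only: simplified from the diff below. */
    static void steal_adjust(struct rq *rq, s64 *delta)
    {
        s64 steal;

        /* steal clock advance since the last rq clock update */
        steal = paravirt_steal_clock(cpu_of(rq)) - rq->prev_steal_time_rq;

        /* never discount more than the wall time that passed */
        if (steal > *delta)
            steal = *delta;

        /* fold down to whole ticks, like the tick-based accounting */
        steal = steal_ticks(steal) * TICK_NSEC;

        rq->prev_steal_time_rq += steal;

        /* clock_task advances only by time the task actually ran */
        *delta -= steal;

        /* stolen time lowers this cpu's power, like irq time does */
        sched_rt_avg_update(rq, steal);
    }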

Although steal time reporting in account_process_tick() keeps
track of the last time we read the steal clock, in prev_steal_time,
this patch does it independently, using another field,
prev_steal_time_rq. Otherwise, information about time already
accounted in account_process_tick() would never reach us in
update_rq_clock().
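
In other words, each reader keeps its own cursor into the same
monotonically increasing steal clock (field names as in the patch;
the surrounding sketch is illustrative only):

    struct rq {
        ...
    #ifdef CONFIG_PARAVIRT
        u64 prev_steal_time;    /* cursor for account_process_tick() */
    #endif
    #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        u64 prev_steal_time_rq; /* cursor for update_rq_clock_task() */
    #endif
        ...
    };

If both paths shared a single cursor, whichever ran first would
advance it and the other would always see a delta of zero.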

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric B Munson <emunson@mgebm.net>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

Showing 3 changed files with 51 additions and 12 deletions

arch/x86/Kconfig
@@ -512,6 +512,18 @@
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+	bool "Paravirtual steal time accounting"
+	select PARAVIRT
+	default n
+	---help---
+	  Select this option to enable fine granularity task steal time
+	  accounting. Time spent executing other tasks in parallel with
+	  the current vCPU is discounted from the vCPU power. To account for
+	  that, there can be a small performance impact.
+
+	  If in doubt, say N here.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_CLOCK
kernel/sched.c
@@ -534,6 +534,9 @@
 #ifdef CONFIG_PARAVIRT
 	u64 prev_steal_time;
 #endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -1973,8 +1976,14 @@
 
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	s64 irq_delta;
-
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
@@ -1997,12 +2006,35 @@
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		u64 st;
+
+		steal = paravirt_steal_clock(cpu_of(rq));
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		st = steal_ticks(steal);
+		steal = st * TICK_NSEC;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2037,12 +2069,7 @@
 
 #define sched_clock_irqtime	(0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
kernel/sched_features.h
@@ -61,9 +61,9 @@
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them