Commit d842de871c8c5e2110c7e4f3f29bbe7b1a519ab8

Authored by Srivatsa Vaddagiri
Committed by Ingo Molnar
1 parent 92d499d991

sched: cpu accounting controller (V2)

Commit cfb5285660aad4931b2ebbfa902ea48a37dfffa1 removed a useful feature for
us, which provided a cpu accounting resource controller.  This feature would be
useful if someone wants to group tasks only for accounting purposes and doesn't
really want to exercise any control over their cpu consumption.

The patch below reintroduces the feature. It is based on Paul Menage's
original patch (Commit 62d0df64065e7c135d0002f069444fbdfc64768f), with
these differences:

        - Removed load average information. I felt it needs more thought (esp.
	  to deal with SMP and virtualized platforms) and can be added for
	  2.6.25 after more discussions.
        - Convert group cpu usage to be nanosecond accurate (as the rest of the
	  cfs stats are) and invoke cpuacct_charge() from the respective
	  scheduler classes
	- Make accounting scalable on SMP systems by splitting the usage
	  counter to be per-cpu
	- Move the code from kernel/cpu_acct.c to kernel/sched.c (since the
	  code is not big enough to warrant a new file and also this rightly
	  needs to live inside the scheduler. Also things like accessing
	  rq->lock while reading cpu usage become easier if the code lives in
	  kernel/sched.c)

The patch also modifies the cpu controller not to provide the same accounting
information.

Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>

 Tested the patches on top of 2.6.24-rc3. The patches work fine. Ran
 some simple tests like cpuspin (spin on the cpu), ran several tasks in
 the same group and timed them. Compared their time stamps with
 cpuacct.usage.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 150 additions and 26 deletions Side-by-side Diff

include/linux/cgroup_subsys.h
... ... @@ -30,4 +30,10 @@
30 30 #endif
31 31  
32 32 /* */
  33 +
  34 +#ifdef CONFIG_CGROUP_CPUACCT
  35 +SUBSYS(cpuacct)
  36 +#endif
  37 +
  38 +/* */
... ... @@ -354,6 +354,13 @@
354 354  
355 355 endchoice
356 356  
  357 +config CGROUP_CPUACCT
  358 + bool "Simple CPU accounting cgroup subsystem"
  359 + depends on CGROUPS
  360 + help
  361 + Provides a simple Resource Controller for monitoring the
  362 + total CPU consumed by the tasks in a cgroup.
  363 +
357 364 config SYSFS_DEPRECATED
358 365 bool "Create deprecated sysfs files"
359 366 default y
... ... @@ -854,6 +854,12 @@
854 854 struct rq_iterator *iterator);
855 855 #endif
856 856  
  857 +#ifdef CONFIG_CGROUP_CPUACCT
  858 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime); /* forward declaration; presumably needed because the sched class files included just below call it before its definition */
  859 +#else
  860 +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} /* empty stub so callers need no #ifdef when CONFIG_CGROUP_CPUACCT is not set */
  861 +#endif
  862 +
857 863 #include "sched_stats.h"
858 864 #include "sched_idletask.c"
859 865 #include "sched_fair.c"
860 866  
... ... @@ -7221,38 +7227,12 @@
7221 7227 return (u64) tg->shares;
7222 7228 }
7223 7229  
7224   -static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
7225   -{
7226   - struct task_group *tg = cgroup_tg(cgrp);
7227   - unsigned long flags;
7228   - u64 res = 0;
7229   - int i;
7230   -
7231   - for_each_possible_cpu(i) {
7232   - /*
7233   - * Lock to prevent races with updating 64-bit counters
7234   - * on 32-bit arches.
7235   - */
7236   - spin_lock_irqsave(&cpu_rq(i)->lock, flags);
7237   - res += tg->se[i]->sum_exec_runtime;
7238   - spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
7239   - }
7240   - /* Convert from ns to ms */
7241   - do_div(res, NSEC_PER_MSEC);
7242   -
7243   - return res;
7244   -}
7245   -
7246 7230 static struct cftype cpu_files[] = {
7247 7231 {
7248 7232 .name = "shares",
7249 7233 .read_uint = cpu_shares_read_uint,
7250 7234 .write_uint = cpu_shares_write_uint,
7251 7235 },
7252   - {
7253   - .name = "usage",
7254   - .read_uint = cpu_usage_read,
7255   - },
7256 7236 };
7257 7237  
7258 7238 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
... ... @@ -7272,4 +7252,127 @@
7272 7252 };
7273 7253  
7274 7254 #endif /* CONFIG_FAIR_CGROUP_SCHED */
  7255 +
  7256 +#ifdef CONFIG_CGROUP_CPUACCT
  7257 +
  7258 +/*
  7259 + * CPU accounting code for task groups.
  7260 + *
  7261 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  7262 + * (balbir@in.ibm.com).
  7263 + */
  7264 +
  7265 +/* track cpu usage of a group of tasks; one instance is allocated per group by cpuacct_create() */
  7266 +struct cpuacct {
  7267 + struct cgroup_subsys_state css; /* embedded state; cgroup_ca() and task_ca() recover the enclosing struct from it with container_of() */
  7268 + /* cpuusage holds pointer to a u64-type object on every cpu (obtained from alloc_percpu); cpuacct_charge() adds to it and cpuusage_read() sums it */
  7269 + u64 *cpuusage;
  7270 +};
  7271 +
  7272 +struct cgroup_subsys cpuacct_subsys;
  7273 +
  7274 +/* return cpu accounting group corresponding to this container: fetches cont's state for cpuacct_subsys_id and maps it back to the enclosing struct cpuacct through its css member */
  7275 +static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
  7276 +{
  7277 + return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
  7278 + struct cpuacct, css);
  7279 +}
  7280 +
  7281 +/* return cpu accounting group to which this task belongs: fetches tsk's state for cpuacct_subsys_id and maps it back to the enclosing struct cpuacct through its css member */
  7282 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
  7283 +{
  7284 + return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
  7285 + struct cpuacct, css);
  7286 +}
  7287 +
  7288 +/* create a new cpu accounting group: allocates a zeroed struct cpuacct plus its per-cpu u64 counters and returns &ca->css, or ERR_PTR(-ENOMEM) if either allocation fails (ss and cont are not used here) */
  7289 +static struct cgroup_subsys_state *cpuacct_create(
  7290 + struct cgroup_subsys *ss, struct cgroup *cont)
  7291 +{
  7292 + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
  7293 +
  7294 + if (!ca)
  7295 + return ERR_PTR(-ENOMEM);
  7296 +
  7297 + ca->cpuusage = alloc_percpu(u64);
  7298 + if (!ca->cpuusage) {
  7299 + kfree(ca); /* undo the first allocation so nothing leaks on this error path */
  7300 + return ERR_PTR(-ENOMEM);
  7301 + }
  7302 +
  7303 + return &ca->css;
  7304 +}
  7305 +
  7306 +/* destroy an existing cpu accounting group: releases the per-cpu counters and then the struct cpuacct that cpuacct_create() allocated for cont (ss is not used here) */
  7307 +static void cpuacct_destroy(struct cgroup_subsys *ss,
  7308 + struct cgroup *cont)
  7309 +{
  7310 + struct cpuacct *ca = cgroup_ca(cont);
  7311 +
  7312 + free_percpu(ca->cpuusage); /* must precede kfree(ca): the pointer being freed is stored inside ca */
  7313 + kfree(ca);
  7314 +}
  7315 +
  7316 +/* return total cpu usage (in nanoseconds) of a group: sums the group's per-cpu counter over every possible cpu, reading each one under that cpu's rq->lock (cft is not used here) */
  7317 +static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
  7318 +{
  7319 + struct cpuacct *ca = cgroup_ca(cont);
  7320 + u64 totalcpuusage = 0;
  7321 + int i;
  7322 +
  7323 + for_each_possible_cpu(i) {
  7324 + u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
  7325 +
  7326 + /*
  7327 + * Take rq->lock to make 64-bit addition safe on 32-bit
  7328 + * platforms.
  7329 + */
  7330 + spin_lock_irq(&cpu_rq(i)->lock);
  7331 + totalcpuusage += *cpuusage;
  7332 + spin_unlock_irq(&cpu_rq(i)->lock); /* NOTE(review): the _irq variants re-enable interrupts unconditionally here; assumes the caller has them enabled - confirm */
  7333 + }
  7334 +
  7335 + return totalcpuusage;
  7336 +}
  7337 +
  7338 +static struct cftype files[] = { /* control files for this subsystem; registered by cpuacct_populate() */
  7339 + {
  7340 + .name = "usage",
  7341 + .read_uint = cpuusage_read, /* reading the file reports the summed per-cpu usage */
  7342 + },
  7343 +};
  7344 +
  7345 +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
  7346 +{
  7347 + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); /* add every entry of files[] to cont and pass cgroup_add_files()'s result back to the caller */
  7348 +}
  7349 +
  7350 +/*
  7351 + * charge this task's execution time to its accounting group: adds cputime to the group's per-cpu counter for the cpu reported by task_cpu(tsk).
  7352 + *
  7353 + * called with rq->lock held.
  7354 + */
  7355 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  7356 +{
  7357 + struct cpuacct *ca;
  7358 +
  7359 + if (!cpuacct_subsys.active) /* do no accounting while the subsystem's active flag is clear */
  7360 + return;
  7361 +
  7362 + ca = task_ca(tsk);
  7363 + if (ca) {
  7364 + u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
  7365 +
  7366 + *cpuusage += cputime; /* plain 64-bit add; cpuusage_read() takes the same rq->lock around its read of this counter */
  7367 + }
  7368 +}
  7369 +
  7370 +struct cgroup_subsys cpuacct_subsys = { /* cpuacct subsystem descriptor: names the subsystem and hooks up the create/destroy/populate callbacks defined above */
  7371 + .name = "cpuacct",
  7372 + .create = cpuacct_create,
  7373 + .destroy = cpuacct_destroy,
  7374 + .populate = cpuacct_populate,
  7375 + .subsys_id = cpuacct_subsys_id,
  7376 +};
  7377 +#endif /* CONFIG_CGROUP_CPUACCT */
... ... @@ -351,6 +351,12 @@
351 351  
352 352 __update_curr(cfs_rq, curr, delta_exec);
353 353 curr->exec_start = now;
  354 +
  355 + if (entity_is_task(curr)) {
  356 + struct task_struct *curtask = task_of(curr);
  357 +
  358 + cpuacct_charge(curtask, delta_exec);
  359 + }
354 360 }
355 361  
356 362 static inline void
... ... @@ -23,6 +23,7 @@
23 23  
24 24 curr->se.sum_exec_runtime += delta_exec;
25 25 curr->se.exec_start = rq->clock;
  26 + cpuacct_charge(curr, delta_exec);
26 27 }
27 28  
28 29 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)