Commit ac9a19745196388ae5d828c0be7a1d6e472101f3

Authored by Jens Axboe

Merge branch 'blkcg-cfq-hierarchy' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.9/core

Tejun writes:

Hello, Jens.

Please consider pulling from the following branch to receive cfq blkcg
hierarchy support.  The branch is based on top of v3.8-rc2.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git blkcg-cfq-hierarchy

The patchset was reviewed in the following thread.

  http://thread.gmane.org/gmane.linux.kernel.cgroups/5571

Showing 7 changed files

Documentation/block/cfq-iosched.txt
... ... @@ -102,6 +102,64 @@
102 102 performance although this can cause the latency of some I/O to increase due
103 103 to a larger number of requests.
104 104  
  105 +CFQ Group scheduling
  106 +====================
  107 +
  108 +CFQ supports blkio cgroup and has "blkio." prefixed files in each
  109 +blkio cgroup directory. It is weight-based and there are four knobs
  110 +for configuration - weight[_device] and leaf_weight[_device].
  111 +Internal cgroup nodes (the ones with children) can also have tasks in
  112 +them, so the former two configure what proportion the cgroup as a
  113 +whole is entitled to at its parent's level, while the latter two
  114 +configure what proportion the tasks in the cgroup get compared to
  115 +the cgroup's direct children.
  116 +
  117 +Another way to think about it is to assume that each internal node has
  118 +an implicit leaf child node which hosts all the tasks whose weight is
  119 +configured by leaf_weight[_device]. Let's assume a blkio hierarchy
  120 +composed of five cgroups - root, A, B, AA and AB - with the following
  121 +weights where the names represent the hierarchy.
  122 +
  123 +        weight leaf_weight
  124 + root :  125    125
  125 + A    :  500    750
  126 + B    :  250    500
  127 + AA   :  500    500
  128 + AB   : 1000    500
  129 +
  130 +root never has a parent, making its weight meaningless. For backward
  131 +compatibility, weight is always kept in sync with leaf_weight. B, AA
  132 +and AB have no children and thus their tasks have no child cgroups to
  133 +compete with. They always get 100% of what the cgroup won at the
  134 +parent level. Considering only the weights which matter, the hierarchy
  135 +looks like the following.
  136 +
  137 +            root
  138 +          /    |   \
  139 +         A     B    leaf
  140 +        500   250   125
  141 +       /  |  \
  142 +      AA  AB  leaf
  143 +     500 1000 750
  144 +
  145 +If all cgroups have active IOs and are competing with each other, disk
  146 +time will be distributed as follows.
  147 +
  148 +Distribution below root. The total active weight at this level is
  149 +A:500 + B:250 + root-leaf:125 = 875.
  150 +
  151 + root-leaf :  125 / 875 =~ 14%
  152 + A         :  500 / 875 =~ 57%
  153 + B(-leaf)  :  250 / 875 =~ 28%
  154 +
  155 +A has children and further distributes its 57% among the children and
  156 +the implicit leaf node. The total active weight at this level is
  157 +AA:500 + AB:1000 + A-leaf:750 = 2250.
  158 +
  159 + A-leaf    : ( 750 / 2250) * A =~ 19%
  160 + AA(-leaf) : ( 500 / 2250) * A =~ 12%
  161 + AB(-leaf) : (1000 / 2250) * A =~ 25%
  162 +
105 163 CFQ IOPS Mode for group scheduling
106 164 ===================================
107 165 Basic CFQ design is to provide priority based time slices. Higher priority
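The distribution documented in the new "CFQ Group scheduling" section above can be sanity-checked with a small user-space sketch. This is not kernel code; it only compounds the example weights level by level, and all numbers are the ones from the text above.

/*
 * Minimal user-space sketch (not kernel code): reproduce the disk time
 * split documented above by compounding the example weights per level.
 */
#include <stdio.h>

int main(void)
{
        /* level below root: A, B and root's implicit leaf */
        double top = 500 + 250 + 125;
        double root_leaf = 125 / top;                   /* =~ 14% */
        double share_A   = 500 / top;                   /* =~ 57% */
        double share_B   = 250 / top;                   /* =~ 28% */

        /* level below A: AA, AB and A's implicit leaf (leaf_weight 750) */
        double sub = 500 + 1000 + 750;
        double A_leaf   = share_A * ( 750 / sub);       /* =~ 19% */
        double share_AA = share_A * ( 500 / sub);       /* =~ 12% */
        double share_AB = share_A * (1000 / sub);       /* =~ 25% */

        printf("root-leaf %.0f%%  A-leaf %.0f%%  AA %.0f%%  AB %.0f%%  B %.0f%%\n",
               root_leaf * 100, A_leaf * 100, share_AA * 100,
               share_AB * 100, share_B * 100);
        return 0;
}

It prints 14%, 19%, 13%, 25% and 29%, matching the figures above up to rounding.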
Documentation/cgroups/blkio-controller.txt
... ... @@ -94,13 +94,11 @@
94 94  
95 95 Hierarchical Cgroups
96 96 ====================
97   -- Currently none of the IO control policy supports hierarchical groups. But
98   - cgroup interface does allow creation of hierarchical cgroups and internally
99   - IO policies treat them as flat hierarchy.
   97 +- Currently only CFQ supports hierarchical groups. For throttling,
   98 + the cgroup interface does allow creation of hierarchical cgroups,
   99 + but internally it treats them as a flat hierarchy.
100 100  
101   - So this patch will allow creation of cgroup hierarchcy but at the backend
102   - everything will be treated as flat. So if somebody created a hierarchy like
103   - as follows.
  101 + If somebody created a hierarchy as follows.
104 102  
105 103 root
106 104 / \
107 105  
... ... @@ -108,16 +106,20 @@
108 106 |
109 107 test3
110 108  
111   - CFQ and throttling will practically treat all groups at same level.
  109 + CFQ will handle the hierarchy correctly but throttling will
  110 + practically treat all groups at the same level. For details on CFQ
  111 + hierarchy support, refer to Documentation/block/cfq-iosched.txt.
  112 + Throttling will treat the hierarchy as if it looks like the
  113 + following.
112 114  
113 115                   pivot
114 116                /  /  \  \
115 117            root test1 test2 test3
116 118  
117   - Down the line we can implement hierarchical accounting/control support
118   - and also introduce a new cgroup file "use_hierarchy" which will control
119   - whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
120   - This is how memory controller also has implemented the things.
  119 + Nesting cgroups, while allowed, isn't officially supported and blkio
  120 + generates a warning when cgroups nest. Once throttling implements
  121 + hierarchy support, hierarchies will be fully supported and the warning will
  122 + be removed.
121 123  
122 124 Various user visible config options
123 125 ===================================
... ... @@ -172,6 +174,12 @@
172 174 dev weight
173 175 8:16 300
174 176  
  177 +- blkio.leaf_weight[_device]
  178 + - Equivalents of blkio.weight[_device] for the purpose of
  179 + deciding how much weight the tasks in the given cgroup have while
  180 + competing with the cgroup's child cgroups. For details,
  181 + please refer to Documentation/block/cfq-iosched.txt.
  182 +
175 183 - blkio.time
176 184 - disk time allocated to cgroup per device in milliseconds. First
177 185 two fields specify the major and minor number of the device and
... ... @@ -278,6 +286,11 @@
278 286 from service tree of the device. First two fields specify the major
279 287 and minor number of the device and third field specifies the number
280 288 of times a group was dequeued from a particular device.
  289 +
  290 +- blkio.*_recursive
  291 + - Recursive version of various stats. These files show the
  292 + same information as their non-recursive counterparts but
  293 + include stats from all the descendant cgroups.
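The blkio.leaf_weight[_device] and blkio.*_recursive files described above are ordinary cgroupfs files, so any language can drive them. A hedged C sketch follows; the mount point /sys/fs/cgroup/blkio and the cgroup name "A" are assumptions and depend on how the blkio hierarchy is mounted on the target system.

#include <stdio.h>

int main(void)
{
        const char *grp = "/sys/fs/cgroup/blkio/A";     /* hypothetical cgroup */
        char path[256], line[256];
        FILE *f;

        /* weight of A's own tasks when competing with A's child cgroups */
        snprintf(path, sizeof(path), "%s/blkio.leaf_weight", grp);
        f = fopen(path, "w");
        if (f) {
                fprintf(f, "750\n");
                fclose(f);
        }

        /* recursive stats include all descendant cgroups */
        snprintf(path, sizeof(path), "%s/blkio.io_service_bytes_recursive", grp);
        f = fopen(path, "r");
        if (f) {
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);
                fclose(f);
        }
        return 0;
}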
281 294  
282 295 Throttling/Upper limit policy files
283 296 -----------------------------------
block/blk-cgroup.c
... ... @@ -26,11 +26,32 @@
26 26  
27 27 static DEFINE_MUTEX(blkcg_pol_mutex);
28 28  
29   -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
  29 +struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
  30 + .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
30 31 EXPORT_SYMBOL_GPL(blkcg_root);
31 32  
32 33 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33 34  
  35 +static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
  36 + struct request_queue *q, bool update_hint);
  37 +
  38 +/**
  39 + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  40 + * @d_blkg: loop cursor pointing to the current descendant
  41 + * @pos_cgrp: used for iteration
  42 + * @p_blkg: target blkg to walk descendants of
  43 + *
  44 + * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
  45 + * read locked. If called under either blkcg or queue lock, the iteration
  46 + * is guaranteed to include all and only online blkgs. The caller may
  47 + * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip a
  48 + * subtree.
  49 + */
  50 +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
  51 + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
  52 + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
  53 + (p_blkg)->q, false)))
  54 +
34 55 static bool blkcg_policy_enabled(struct request_queue *q,
35 56 const struct blkcg_policy *pol)
36 57 {
37 58  
... ... @@ -112,9 +133,10 @@
112 133  
113 134 blkg->pd[i] = pd;
114 135 pd->blkg = blkg;
  136 + pd->plid = i;
115 137  
116 138 /* invoke per-policy init */
117   - if (blkcg_policy_enabled(blkg->q, pol))
  139 + if (pol->pd_init_fn)
118 140 pol->pd_init_fn(blkg);
119 141 }
120 142  
121 143  
... ... @@ -125,8 +147,19 @@
125 147 return NULL;
126 148 }
127 149  
  150 +/**
  151 + * __blkg_lookup - internal version of blkg_lookup()
  152 + * @blkcg: blkcg of interest
  153 + * @q: request_queue of interest
  154 + * @update_hint: whether to update lookup hint with the result or not
  155 + *
  156 + * This is the internal version and shouldn't be used by policy
  157 + * implementations. Looks up the blkg for the @blkcg - @q pair regardless of
  158 + * @q's bypass state. If @update_hint is %true, the caller should be
  159 + * holding @q->queue_lock and lookup hint is updated on success.
  160 + */
128 161 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
129   - struct request_queue *q)
  162 + struct request_queue *q, bool update_hint)
130 163 {
131 164 struct blkcg_gq *blkg;
132 165  
133 166  
134 167  
... ... @@ -135,14 +168,19 @@
135 168 return blkg;
136 169  
137 170 /*
138   - * Hint didn't match. Look up from the radix tree. Note that we
139   - * may not be holding queue_lock and thus are not sure whether
140   - * @blkg from blkg_tree has already been removed or not, so we
141   - * can't update hint to the lookup result. Leave it to the caller.
  171 + * Hint didn't match. Look up from the radix tree. Note that the
  172 + * hint can only be updated under queue_lock as otherwise @blkg
  173 + * could have already been removed from blkg_tree. The caller is
  174 + * responsible for grabbing queue_lock if @update_hint.
142 175 */
143 176 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
144   - if (blkg && blkg->q == q)
  177 + if (blkg && blkg->q == q) {
  178 + if (update_hint) {
  179 + lockdep_assert_held(q->queue_lock);
  180 + rcu_assign_pointer(blkcg->blkg_hint, blkg);
  181 + }
145 182 return blkg;
  183 + }
146 184  
147 185 return NULL;
148 186 }
... ... @@ -162,7 +200,7 @@
162 200  
163 201 if (unlikely(blk_queue_bypass(q)))
164 202 return NULL;
165   - return __blkg_lookup(blkcg, q);
  203 + return __blkg_lookup(blkcg, q, false);
166 204 }
167 205 EXPORT_SYMBOL_GPL(blkg_lookup);
168 206  
169 207  
170 208  
171 209  
172 210  
173 211  
174 212  
175 213  
176 214  
177 215  
178 216  
179 217  
180 218  
181 219  
182 220  
... ... @@ -170,75 +208,129 @@
170 208 * If @new_blkg is %NULL, this function tries to allocate a new one as
171 209 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
172 210 */
173   -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
174   - struct request_queue *q,
175   - struct blkcg_gq *new_blkg)
  211 +static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
  212 + struct request_queue *q,
  213 + struct blkcg_gq *new_blkg)
176 214 {
177 215 struct blkcg_gq *blkg;
178   - int ret;
  216 + int i, ret;
179 217  
180 218 WARN_ON_ONCE(!rcu_read_lock_held());
181 219 lockdep_assert_held(q->queue_lock);
182 220  
183   - /* lookup and update hint on success, see __blkg_lookup() for details */
184   - blkg = __blkg_lookup(blkcg, q);
185   - if (blkg) {
186   - rcu_assign_pointer(blkcg->blkg_hint, blkg);
187   - goto out_free;
188   - }
189   -
190 221 /* blkg holds a reference to blkcg */
191 222 if (!css_tryget(&blkcg->css)) {
192   - blkg = ERR_PTR(-EINVAL);
193   - goto out_free;
  223 + ret = -EINVAL;
  224 + goto err_free_blkg;
194 225 }
195 226  
196 227 /* allocate */
197 228 if (!new_blkg) {
198 229 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
199 230 if (unlikely(!new_blkg)) {
200   - blkg = ERR_PTR(-ENOMEM);
201   - goto out_put;
  231 + ret = -ENOMEM;
  232 + goto err_put_css;
202 233 }
203 234 }
204 235 blkg = new_blkg;
205 236  
206   - /* insert */
  237 + /* link parent and insert */
  238 + if (blkcg_parent(blkcg)) {
  239 + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
  240 + if (WARN_ON_ONCE(!blkg->parent)) {
  241 + blkg = ERR_PTR(-EINVAL);
  242 + goto err_put_css;
  243 + }
  244 + blkg_get(blkg->parent);
  245 + }
  246 +
207 247 spin_lock(&blkcg->lock);
208 248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
209 249 if (likely(!ret)) {
210 250 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
211 251 list_add(&blkg->q_node, &q->blkg_list);
  252 +
  253 + for (i = 0; i < BLKCG_MAX_POLS; i++) {
  254 + struct blkcg_policy *pol = blkcg_policy[i];
  255 +
  256 + if (blkg->pd[i] && pol->pd_online_fn)
  257 + pol->pd_online_fn(blkg);
  258 + }
212 259 }
  260 + blkg->online = true;
213 261 spin_unlock(&blkcg->lock);
214 262  
215 263 if (!ret)
216 264 return blkg;
217 265  
218   - blkg = ERR_PTR(ret);
219   -out_put:
  266 + /* @blkg failed to be fully initialized, use the usual release path */
  267 + blkg_put(blkg);
  268 + return ERR_PTR(ret);
  269 +
  270 +err_put_css:
220 271 css_put(&blkcg->css);
221   -out_free:
  272 +err_free_blkg:
222 273 blkg_free(new_blkg);
223   - return blkg;
  274 + return ERR_PTR(ret);
224 275 }
225 276  
  277 +/**
  278 + * blkg_lookup_create - lookup blkg, try to create one if not there
  279 + * @blkcg: blkcg of interest
  280 + * @q: request_queue of interest
  281 + *
  282 + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
  283 + * create one. blkg creation is performed recursively from blkcg_root such
  284 + * that all non-root blkgs have access to the parent blkg. This function
  285 + * should be called under RCU read lock and @q->queue_lock.
  286 + *
  287 + * Returns pointer to the looked up or created blkg on success, ERR_PTR()
  288 + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
  289 + * dead and bypassing, returns ERR_PTR(-EBUSY).
  290 + */
226 291 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
227 292 struct request_queue *q)
228 293 {
  294 + struct blkcg_gq *blkg;
  295 +
  296 + WARN_ON_ONCE(!rcu_read_lock_held());
  297 + lockdep_assert_held(q->queue_lock);
  298 +
229 299 /*
230 300 * This could be the first entry point of blkcg implementation and
231 301 * we shouldn't allow anything to go through for a bypassing queue.
232 302 */
233 303 if (unlikely(blk_queue_bypass(q)))
234 304 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
235   - return __blkg_lookup_create(blkcg, q, NULL);
  305 +
  306 + blkg = __blkg_lookup(blkcg, q, true);
  307 + if (blkg)
  308 + return blkg;
  309 +
  310 + /*
  311 + * Create blkgs walking down from blkcg_root to @blkcg, so that all
  312 + * non-root blkgs have access to their parents.
  313 + */
  314 + while (true) {
  315 + struct blkcg *pos = blkcg;
  316 + struct blkcg *parent = blkcg_parent(blkcg);
  317 +
  318 + while (parent && !__blkg_lookup(parent, q, false)) {
  319 + pos = parent;
  320 + parent = blkcg_parent(parent);
  321 + }
  322 +
  323 + blkg = blkg_create(pos, q, NULL);
  324 + if (pos == blkcg || IS_ERR(blkg))
  325 + return blkg;
  326 + }
236 327 }
237 328 EXPORT_SYMBOL_GPL(blkg_lookup_create);
238 329  
239 330 static void blkg_destroy(struct blkcg_gq *blkg)
240 331 {
241 332 struct blkcg *blkcg = blkg->blkcg;
  333 + int i;
242 334  
243 335 lockdep_assert_held(blkg->q->queue_lock);
244 336 lockdep_assert_held(&blkcg->lock);
... ... @@ -247,6 +339,14 @@
247 339 WARN_ON_ONCE(list_empty(&blkg->q_node));
248 340 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
249 341  
  342 + for (i = 0; i < BLKCG_MAX_POLS; i++) {
  343 + struct blkcg_policy *pol = blkcg_policy[i];
  344 +
  345 + if (blkg->pd[i] && pol->pd_offline_fn)
  346 + pol->pd_offline_fn(blkg);
  347 + }
  348 + blkg->online = false;
  349 +
250 350 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
251 351 list_del_init(&blkg->q_node);
252 352 hlist_del_init_rcu(&blkg->blkcg_node);
253 353  
... ... @@ -301,8 +401,10 @@
301 401  
302 402 void __blkg_release(struct blkcg_gq *blkg)
303 403 {
304   - /* release the extra blkcg reference this blkg has been holding */
  404 + /* release the blkcg and parent blkg refs this blkg has been holding */
305 405 css_put(&blkg->blkcg->css);
  406 + if (blkg->parent)
  407 + blkg_put(blkg->parent);
306 408  
307 409 /*
308 410 * A group is freed in rcu manner. But having an rcu lock does not
... ... @@ -402,8 +504,9 @@
402 504 *
403 505 * This function invokes @prfill on each blkg of @blkcg if pd for the
404 506 * policy specified by @pol exists. @prfill is invoked with @sf, the
405   - * policy data and @data. If @show_total is %true, the sum of the return
406   - * values from @prfill is printed with "Total" label at the end.
  507 + * policy data and @data, with the matching queue lock held. If @show_total
  508 + * is %true, the sum of the return values from @prfill is printed with
  509 + * "Total" label at the end.
407 510 *
408 511 * This is to be used to construct print functions for
409 512 * cftype->read_seq_string method.
410 513  
... ... @@ -418,11 +521,14 @@
418 521 struct hlist_node *n;
419 522 u64 total = 0;
420 523  
421   - spin_lock_irq(&blkcg->lock);
422   - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
  524 + rcu_read_lock();
  525 + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
  526 + spin_lock_irq(blkg->q->queue_lock);
423 527 if (blkcg_policy_enabled(blkg->q, pol))
424 528 total += prfill(sf, blkg->pd[pol->plid], data);
425   - spin_unlock_irq(&blkcg->lock);
  529 + spin_unlock_irq(blkg->q->queue_lock);
  530 + }
  531 + rcu_read_unlock();
426 532  
427 533 if (show_total)
428 534 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
... ... @@ -481,6 +587,7 @@
481 587 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
482 588 return v;
483 589 }
  590 +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
484 591  
485 592 /**
486 593 * blkg_prfill_stat - prfill callback for blkg_stat
... ... @@ -514,6 +621,82 @@
514 621 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
515 622  
516 623 /**
  624 + * blkg_stat_recursive_sum - collect hierarchical blkg_stat
  625 + * @pd: policy private data of interest
  626 + * @off: offset to the blkg_stat in @pd
  627 + *
  628 + * Collect the blkg_stat specified by @off from @pd and all its online
  629 + * descendants and return the sum. The caller must be holding the queue
  630 + * lock for online tests.
  631 + */
  632 +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
  633 +{
  634 + struct blkcg_policy *pol = blkcg_policy[pd->plid];
  635 + struct blkcg_gq *pos_blkg;
  636 + struct cgroup *pos_cgrp;
  637 + u64 sum;
  638 +
  639 + lockdep_assert_held(pd->blkg->q->queue_lock);
  640 +
  641 + sum = blkg_stat_read((void *)pd + off);
  642 +
  643 + rcu_read_lock();
  644 + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
  645 + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
  646 + struct blkg_stat *stat = (void *)pos_pd + off;
  647 +
  648 + if (pos_blkg->online)
  649 + sum += blkg_stat_read(stat);
  650 + }
  651 + rcu_read_unlock();
  652 +
  653 + return sum;
  654 +}
  655 +EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  656 +
  657 +/**
  658 + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
  659 + * @pd: policy private data of interest
  660 + * @off: offset to the blkg_rwstat in @pd
  661 + *
  662 + * Collect the blkg_rwstat specified by @off from @pd and all its online
  663 + * descendants and return the sum. The caller must be holding the queue
  664 + * lock for online tests.
  665 + */
  666 +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
  667 + int off)
  668 +{
  669 + struct blkcg_policy *pol = blkcg_policy[pd->plid];
  670 + struct blkcg_gq *pos_blkg;
  671 + struct cgroup *pos_cgrp;
  672 + struct blkg_rwstat sum;
  673 + int i;
  674 +
  675 + lockdep_assert_held(pd->blkg->q->queue_lock);
  676 +
  677 + sum = blkg_rwstat_read((void *)pd + off);
  678 +
  679 + rcu_read_lock();
  680 + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
  681 + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
  682 + struct blkg_rwstat *rwstat = (void *)pos_pd + off;
  683 + struct blkg_rwstat tmp;
  684 +
  685 + if (!pos_blkg->online)
  686 + continue;
  687 +
  688 + tmp = blkg_rwstat_read(rwstat);
  689 +
  690 + for (i = 0; i < BLKG_RWSTAT_NR; i++)
  691 + sum.cnt[i] += tmp.cnt[i];
  692 + }
  693 + rcu_read_unlock();
  694 +
  695 + return sum;
  696 +}
  697 +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  698 +
  699 +/**
517 700 * blkg_conf_prep - parse and prepare for per-blkg config update
518 701 * @blkcg: target block cgroup
519 702 * @pol: target policy
... ... @@ -658,6 +841,7 @@
658 841 return ERR_PTR(-ENOMEM);
659 842  
660 843 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
  844 + blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
661 845 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
662 846 done:
663 847 spin_lock_init(&blkcg->lock);
... ... @@ -777,7 +961,7 @@
777 961 const struct blkcg_policy *pol)
778 962 {
779 963 LIST_HEAD(pds);
780   - struct blkcg_gq *blkg;
  964 + struct blkcg_gq *blkg, *new_blkg;
781 965 struct blkg_policy_data *pd, *n;
782 966 int cnt = 0, ret;
783 967 bool preloaded;
784 968  
785 969  
... ... @@ -786,19 +970,27 @@
786 970 return 0;
787 971  
788 972 /* preallocations for root blkg */
789   - blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
790   - if (!blkg)
  973 + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
  974 + if (!new_blkg)
791 975 return -ENOMEM;
792 976  
793 977 preloaded = !radix_tree_preload(GFP_KERNEL);
794 978  
795 979 blk_queue_bypass_start(q);
796 980  
797   - /* make sure the root blkg exists and count the existing blkgs */
  981 + /*
  982 + * Make sure the root blkg exists and count the existing blkgs. As
  983 + * @q is bypassing at this point, blkg_lookup_create() can't be
  984 + * used. Open code it.
  985 + */
798 986 spin_lock_irq(q->queue_lock);
799 987  
800 988 rcu_read_lock();
801   - blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
  989 + blkg = __blkg_lookup(&blkcg_root, q, false);
  990 + if (blkg)
  991 + blkg_free(new_blkg);
  992 + else
  993 + blkg = blkg_create(&blkcg_root, q, new_blkg);
802 994 rcu_read_unlock();
803 995  
804 996 if (preloaded)
... ... @@ -846,6 +1038,7 @@
846 1038  
847 1039 blkg->pd[pol->plid] = pd;
848 1040 pd->blkg = blkg;
  1041 + pd->plid = pol->plid;
849 1042 pol->pd_init_fn(blkg);
850 1043  
851 1044 spin_unlock(&blkg->blkcg->lock);
... ... @@ -892,6 +1085,8 @@
892 1085 /* grab blkcg lock too while removing @pd from @blkg */
893 1086 spin_lock(&blkg->blkcg->lock);
894 1087  
  1088 + if (pol->pd_offline_fn)
  1089 + pol->pd_offline_fn(blkg);
895 1090 if (pol->pd_exit_fn)
896 1091 pol->pd_exit_fn(blkg);
897 1092  
block/blk-cgroup.h
... ... @@ -54,6 +54,7 @@
54 54  
55 55 /* TODO: per-policy storage in blkcg */
56 56 unsigned int cfq_weight; /* belongs to cfq */
  57 + unsigned int cfq_leaf_weight;
57 58 };
58 59  
59 60 struct blkg_stat {
60 61  
... ... @@ -80,8 +81,9 @@
80 81 * beginning and pd_size can't be smaller than pd.
81 82 */
82 83 struct blkg_policy_data {
83   - /* the blkg this per-policy data belongs to */
  84 + /* the blkg and policy id this per-policy data belongs to */
84 85 struct blkcg_gq *blkg;
  86 + int plid;
85 87  
86 88 /* used during policy activation */
87 89 struct list_head alloc_node;
88 90  
89 91  
90 92  
... ... @@ -94,17 +96,27 @@
94 96 struct list_head q_node;
95 97 struct hlist_node blkcg_node;
96 98 struct blkcg *blkcg;
  99 +
  100 + /* all non-root blkcg_gq's are guaranteed to have access to parent */
  101 + struct blkcg_gq *parent;
  102 +
97 103 /* request allocation list for this blkcg-q pair */
98 104 struct request_list rl;
  105 +
99 106 /* reference count */
100 107 int refcnt;
101 108  
  109 + /* is this blkg online? protected by both blkcg and q locks */
  110 + bool online;
  111 +
102 112 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
103 113  
104 114 struct rcu_head rcu_head;
105 115 };
106 116  
107 117 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
  118 +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
  119 +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
108 120 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
109 121 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
110 122  
... ... @@ -117,6 +129,8 @@
117 129  
118 130 /* operations */
119 131 blkcg_pol_init_pd_fn *pd_init_fn;
  132 + blkcg_pol_online_pd_fn *pd_online_fn;
  133 + blkcg_pol_offline_pd_fn *pd_offline_fn;
120 134 blkcg_pol_exit_pd_fn *pd_exit_fn;
121 135 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
122 136 };
... ... @@ -150,6 +164,10 @@
150 164 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
151 165 int off);
152 166  
  167 +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
  168 +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
  169 + int off);
  170 +
153 171 struct blkg_conf_ctx {
154 172 struct gendisk *disk;
155 173 struct blkcg_gq *blkg;
... ... @@ -181,6 +199,19 @@
181 199 }
182 200  
183 201 /**
  202 + * blkcg_parent - get the parent of a blkcg
  203 + * @blkcg: blkcg of interest
  204 + *
  205 + * Return the parent blkcg of @blkcg. Can be called anytime.
  206 + */
  207 +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
  208 +{
  209 + struct cgroup *pcg = blkcg->css.cgroup->parent;
  210 +
  211 + return pcg ? cgroup_to_blkcg(pcg) : NULL;
  212 +}
  213 +
  214 +/**
184 215 * blkg_to_pdata - get policy private data
185 216 * @blkg: blkg of interest
186 217 * @pol: policy of interest
... ... @@ -387,6 +418,18 @@
387 418 }
388 419  
389 420 /**
  421 + * blkg_stat_merge - merge a blkg_stat into another
  422 + * @to: the destination blkg_stat
  423 + * @from: the source
  424 + *
  425 + * Add @from's count to @to.
  426 + */
  427 +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
  428 +{
  429 + blkg_stat_add(to, blkg_stat_read(from));
  430 +}
  431 +
  432 +/**
390 433 * blkg_rwstat_add - add a value to a blkg_rwstat
391 434 * @rwstat: target blkg_rwstat
392 435 * @rw: mask of REQ_{WRITE|SYNC}
393 436  
... ... @@ -434,14 +477,14 @@
434 477 }
435 478  
436 479 /**
437   - * blkg_rwstat_sum - read the total count of a blkg_rwstat
  480 + * blkg_rwstat_total - read the total count of a blkg_rwstat
438 481 * @rwstat: blkg_rwstat to read
439 482 *
440 483 * Return the total count of @rwstat regardless of the IO direction. This
441 484 * function can be called without synchronization and takes care of u64
442 485 * atomicity.
443 486 */
444   -static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
  487 +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
445 488 {
446 489 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
447 490  
... ... @@ -455,6 +498,25 @@
455 498 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
456 499 {
457 500 memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
  501 +}
  502 +
  503 +/**
  504 + * blkg_rwstat_merge - merge a blkg_rwstat into another
  505 + * @to: the destination blkg_rwstat
  506 + * @from: the source
  507 + *
  508 + * Add @from's counts to @to.
  509 + */
  510 +static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
  511 + struct blkg_rwstat *from)
  512 +{
  513 + struct blkg_rwstat v = blkg_rwstat_read(from);
  514 + int i;
  515 +
  516 + u64_stats_update_begin(&to->syncp);
  517 + for (i = 0; i < BLKG_RWSTAT_NR; i++)
  518 + to->cnt[i] += v.cnt[i];
  519 + u64_stats_update_end(&to->syncp);
458 520 }
459 521  
460 522 #else /* CONFIG_BLK_CGROUP */
block/blk-sysfs.c
... ... @@ -497,6 +497,13 @@
497 497 return res;
498 498 }
499 499  
  500 +static void blk_free_queue_rcu(struct rcu_head *rcu_head)
  501 +{
  502 + struct request_queue *q = container_of(rcu_head, struct request_queue,
  503 + rcu_head);
  504 + kmem_cache_free(blk_requestq_cachep, q);
  505 +}
  506 +
500 507 /**
501 508 * blk_release_queue: - release a &struct request_queue when it is no longer needed
502 509 * @kobj: the kobj belonging to the request queue to be released
... ... @@ -538,7 +545,7 @@
538 545 bdi_destroy(&q->backing_dev_info);
539 546  
540 547 ida_simple_remove(&blk_queue_ida, q->id);
541   - kmem_cache_free(blk_requestq_cachep, q);
  548 + call_rcu(&q->rcu_head, blk_free_queue_rcu);
542 549 }
543 550  
544 551 static const struct sysfs_ops queue_sysfs_ops = {
block/cfq-iosched.c
... ... @@ -85,7 +85,6 @@
85 85 struct rb_root rb;
86 86 struct rb_node *left;
87 87 unsigned count;
88   - unsigned total_weight;
89 88 u64 min_vdisktime;
90 89 struct cfq_ttime ttime;
91 90 };
... ... @@ -155,7 +154,7 @@
155 154 * First index in the service_trees.
156 155 * IDLE is handled separately, so it has negative index
157 156 */
158   -enum wl_prio_t {
  157 +enum wl_class_t {
159 158 BE_WORKLOAD = 0,
160 159 RT_WORKLOAD = 1,
161 160 IDLE_WORKLOAD = 2,
162 161  
... ... @@ -223,10 +222,45 @@
223 222  
224 223 /* group service_tree key */
225 224 u64 vdisktime;
  225 +
  226 + /*
  227 + * The number of active cfqgs and sum of their weights under this
  228 + * cfqg. This covers this cfqg's leaf_weight and all children's
  229 + * weights, but does not cover weights of further descendants.
  230 + *
  231 + * If a cfqg is on the service tree, it's active. An active cfqg
  232 + * also activates its parent and contributes to the children_weight
  233 + * of the parent.
  234 + */
  235 + int nr_active;
  236 + unsigned int children_weight;
  237 +
  238 + /*
  239 + * vfraction is the fraction of vdisktime that the tasks in this
  240 + * cfqg are entitled to. This is determined by compounding the
  241 + * ratios walking up from this cfqg to the root.
  242 + *
  243 + * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
  244 + * vfractions on a service tree is approximately 1. The sum may
  245 + * deviate a bit due to rounding errors and fluctuations caused by
  246 + * cfqgs entering and leaving the service tree.
  247 + */
  248 + unsigned int vfraction;
  249 +
  250 + /*
  251 + * There are two weights - (internal) weight is the weight of this
  252 + * cfqg against the sibling cfqgs. leaf_weight is the weight of
  253 + * this cfqg against the child cfqgs. For the root cfqg, both
  254 + * weights are kept in sync for backward compatibility.
  255 + */
226 256 unsigned int weight;
227 257 unsigned int new_weight;
228 258 unsigned int dev_weight;
229 259  
  260 + unsigned int leaf_weight;
  261 + unsigned int new_leaf_weight;
  262 + unsigned int dev_leaf_weight;
  263 +
230 264 /* number of cfqq currently on this group */
231 265 int nr_cfqq;
232 266  
233 267  
... ... @@ -248,14 +282,15 @@
248 282 struct cfq_rb_root service_trees[2][3];
249 283 struct cfq_rb_root service_tree_idle;
250 284  
251   - unsigned long saved_workload_slice;
252   - enum wl_type_t saved_workload;
253   - enum wl_prio_t saved_serving_prio;
  285 + unsigned long saved_wl_slice;
  286 + enum wl_type_t saved_wl_type;
  287 + enum wl_class_t saved_wl_class;
254 288  
255 289 /* number of requests that are on the dispatch list or inside driver */
256 290 int dispatched;
257 291 struct cfq_ttime ttime;
258   - struct cfqg_stats stats;
  292 + struct cfqg_stats stats; /* stats for this cfqg */
  293 + struct cfqg_stats dead_stats; /* stats pushed from dead children */
259 294 };
260 295  
261 296 struct cfq_io_cq {
... ... @@ -280,8 +315,8 @@
280 315 /*
281 316 * The priority currently being served
282 317 */
283   - enum wl_prio_t serving_prio;
284   - enum wl_type_t serving_type;
  318 + enum wl_class_t serving_wl_class;
  319 + enum wl_type_t serving_wl_type;
285 320 unsigned long workload_expires;
286 321 struct cfq_group *serving_group;
287 322  
288 323  
289 324  
... ... @@ -353,17 +388,17 @@
353 388  
354 389 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
355 390  
356   -static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
357   - enum wl_prio_t prio,
  391 +static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
  392 + enum wl_class_t class,
358 393 enum wl_type_t type)
359 394 {
360 395 if (!cfqg)
361 396 return NULL;
362 397  
363   - if (prio == IDLE_WORKLOAD)
  398 + if (class == IDLE_WORKLOAD)
364 399 return &cfqg->service_tree_idle;
365 400  
366   - return &cfqg->service_trees[prio][type];
  401 + return &cfqg->service_trees[class][type];
367 402 }
368 403  
369 404 enum cfqq_state_flags {
... ... @@ -502,7 +537,7 @@
502 537 {
503 538 struct cfqg_stats *stats = &cfqg->stats;
504 539  
505   - if (blkg_rwstat_sum(&stats->queued))
  540 + if (blkg_rwstat_total(&stats->queued))
506 541 return;
507 542  
508 543 /*
... ... @@ -546,7 +581,7 @@
546 581 struct cfqg_stats *stats = &cfqg->stats;
547 582  
548 583 blkg_stat_add(&stats->avg_queue_size_sum,
549   - blkg_rwstat_sum(&stats->queued));
  584 + blkg_rwstat_total(&stats->queued));
550 585 blkg_stat_add(&stats->avg_queue_size_samples, 1);
551 586 cfqg_stats_update_group_wait_time(stats);
552 587 }
... ... @@ -572,6 +607,13 @@
572 607 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
573 608 }
574 609  
  610 +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
  611 +{
  612 + struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
  613 +
  614 + return pblkg ? blkg_to_cfqg(pblkg) : NULL;
  615 +}
  616 +
575 617 static inline void cfqg_get(struct cfq_group *cfqg)
576 618 {
577 619 return blkg_get(cfqg_to_blkg(cfqg));
... ... @@ -586,8 +628,9 @@
586 628 char __pbuf[128]; \
587 629 \
588 630 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
589   - blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
590   - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
  631 + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
  632 + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
  633 + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
591 634 __pbuf, ##args); \
592 635 } while (0)
593 636  
594 637  
... ... @@ -646,11 +689,9 @@
646 689 io_start_time - start_time);
647 690 }
648 691  
649   -static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
  692 +/* @stats = 0 */
  693 +static void cfqg_stats_reset(struct cfqg_stats *stats)
650 694 {
651   - struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652   - struct cfqg_stats *stats = &cfqg->stats;
653   -
654 695 /* queued stats shouldn't be cleared */
655 696 blkg_rwstat_reset(&stats->service_bytes);
656 697 blkg_rwstat_reset(&stats->serviced);
657 698  
658 699  
... ... @@ -669,13 +710,58 @@
669 710 #endif
670 711 }
671 712  
  713 +/* @to += @from */
  714 +static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
  715 +{
  716 + /* queued stats shouldn't be cleared */
  717 + blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
  718 + blkg_rwstat_merge(&to->serviced, &from->serviced);
  719 + blkg_rwstat_merge(&to->merged, &from->merged);
  720 + blkg_rwstat_merge(&to->service_time, &from->service_time);
  721 + blkg_rwstat_merge(&to->wait_time, &from->wait_time);
  722 + blkg_stat_merge(&to->time, &from->time);
  723 +#ifdef CONFIG_DEBUG_BLK_CGROUP
  724 + blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
  725 + blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
  726 + blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
  727 + blkg_stat_merge(&to->dequeue, &from->dequeue);
  728 + blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
  729 + blkg_stat_merge(&to->idle_time, &from->idle_time);
  730 + blkg_stat_merge(&to->empty_time, &from->empty_time);
  731 +#endif
  732 +}
  733 +
  734 +/*
  735 + * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
  736 + * recursive stats can still account for the amount used by this cfqg after
  737 + * it's gone.
  738 + */
  739 +static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
  740 +{
  741 + struct cfq_group *parent = cfqg_parent(cfqg);
  742 +
  743 + lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
  744 +
  745 + if (unlikely(!parent))
  746 + return;
  747 +
  748 + cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
  749 + cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
  750 + cfqg_stats_reset(&cfqg->stats);
  751 + cfqg_stats_reset(&cfqg->dead_stats);
  752 +}
  753 +
672 754 #else /* CONFIG_CFQ_GROUP_IOSCHED */
673 755  
  756 +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
674 757 static inline void cfqg_get(struct cfq_group *cfqg) { }
675 758 static inline void cfqg_put(struct cfq_group *cfqg) { }
676 759  
677 760 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
678   - blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
  761 + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
  762 + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
  763 + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
  764 + ##args)
679 765 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
680 766  
681 767 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
... ... @@ -732,7 +818,7 @@
732 818 return false;
733 819 }
734 820  
735   -static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
  821 +static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
736 822 {
737 823 if (cfq_class_idle(cfqq))
738 824 return IDLE_WORKLOAD;
739 825  
740 826  
741 827  
... ... @@ -751,23 +837,23 @@
751 837 return SYNC_WORKLOAD;
752 838 }
753 839  
754   -static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
  840 +static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
755 841 struct cfq_data *cfqd,
756 842 struct cfq_group *cfqg)
757 843 {
758   - if (wl == IDLE_WORKLOAD)
  844 + if (wl_class == IDLE_WORKLOAD)
759 845 return cfqg->service_tree_idle.count;
760 846  
761   - return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
762   - + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
763   - + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
  847 + return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
  848 + cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
  849 + cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
764 850 }
765 851  
766 852 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
767 853 struct cfq_group *cfqg)
768 854 {
769   - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
770   - + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
  855 + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
  856 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
771 857 }
772 858  
773 859 static void cfq_dispatch_insert(struct request_queue *, struct request *);
774 860  
775 861  
... ... @@ -847,13 +933,27 @@
847 933 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
848 934 }
849 935  
850   -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
  936 +/**
  937 + * cfqg_scale_charge - scale disk time charge according to cfqg weight
  938 + * @charge: disk time being charged
  939 + * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
  940 + *
  941 + * Scale @charge according to @vfraction, which is in range (0, 1]. The
  942 + * scaling is inversely proportional.
  943 + *
  944 + * scaled = charge / vfraction
  945 + *
  946 + * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
  947 + */
  948 +static inline u64 cfqg_scale_charge(unsigned long charge,
  949 + unsigned int vfraction)
851 950 {
852   - u64 d = delta << CFQ_SERVICE_SHIFT;
  951 + u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
853 952  
854   - d = d * CFQ_WEIGHT_DEFAULT;
855   - do_div(d, cfqg->weight);
856   - return d;
  953 + /* charge / vfraction */
  954 + c <<= CFQ_SERVICE_SHIFT;
  955 + do_div(c, vfraction);
  956 + return c;
857 957 }
858 958  
859 959 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
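The fixed-point math in cfqg_scale_charge() above can be exercised in user space. The sketch below mirrors only the shifts and the division; SERVICE_SHIFT stands in for CFQ_SERVICE_SHIFT (assumed to be 12 here) and plain 64-bit division replaces do_div().

#include <stdio.h>
#include <stdint.h>

#define SERVICE_SHIFT   12              /* stand-in for CFQ_SERVICE_SHIFT */

static uint64_t scale_charge(uint64_t charge, unsigned int vfraction)
{
        uint64_t c = charge << SERVICE_SHIFT;   /* make it fixed point */

        c <<= SERVICE_SHIFT;                    /* charge / vfraction */
        return c / vfraction;
}

int main(void)
{
        /* a cfqg entitled to 1/4 of the device: vfraction = 0.25 fixed point */
        unsigned int vfraction = (1u << SERVICE_SHIFT) / 4;
        uint64_t charge = 100;                  /* e.g. 100 jiffies of disk time */
        uint64_t scaled = scale_charge(charge, vfraction);

        /* a smaller vfraction makes vdisktime advance faster: 100 / 0.25 = 400 */
        printf("scaled charge = %llu\n",
               (unsigned long long)(scaled >> SERVICE_SHIFT));
        return 0;
}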
... ... @@ -909,9 +1009,7 @@
909 1009 static inline unsigned
910 1010 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
911 1011 {
912   - struct cfq_rb_root *st = &cfqd->grp_service_tree;
913   -
914   - return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
  1012 + return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
915 1013 }
916 1014  
917 1015 static inline unsigned
918 1016  
919 1017  
920 1018  
... ... @@ -1178,20 +1276,61 @@
1178 1276 cfq_update_group_weight(struct cfq_group *cfqg)
1179 1277 {
1180 1278 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
  1279 +
1181 1280 if (cfqg->new_weight) {
1182 1281 cfqg->weight = cfqg->new_weight;
1183 1282 cfqg->new_weight = 0;
1184 1283 }
  1284 +
  1285 + if (cfqg->new_leaf_weight) {
  1286 + cfqg->leaf_weight = cfqg->new_leaf_weight;
  1287 + cfqg->new_leaf_weight = 0;
  1288 + }
1185 1289 }
1186 1290  
1187 1291 static void
1188 1292 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1189 1293 {
  1294 + unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
  1295 + struct cfq_group *pos = cfqg;
  1296 + struct cfq_group *parent;
  1297 + bool propagate;
  1298 +
  1299 + /* add to the service tree */
1190 1300 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1191 1301  
1192 1302 cfq_update_group_weight(cfqg);
1193 1303 __cfq_group_service_tree_add(st, cfqg);
1194   - st->total_weight += cfqg->weight;
  1304 +
  1305 + /*
  1306 + * Activate @cfqg and calculate the portion of vfraction @cfqg is
  1307 + * entitled to. vfraction is calculated by walking the tree
  1308 + * towards the root calculating the fraction it has at each level.
  1309 + * The compounded ratio is how much vfraction @cfqg owns.
  1310 + *
  1311 + * Start with the proportion the tasks in this cfqg have against active
  1312 + * children cfqgs - its leaf_weight against children_weight.
  1313 + */
  1314 + propagate = !pos->nr_active++;
  1315 + pos->children_weight += pos->leaf_weight;
  1316 + vfr = vfr * pos->leaf_weight / pos->children_weight;
  1317 +
  1318 + /*
  1319 + * Compound ->weight walking up the tree. Both activation and
  1320 + * vfraction calculation are done in the same loop. Propagation
  1321 + * stops once an already activated node is met. vfraction
  1322 + * calculation should always continue to the root.
  1323 + */
  1324 + while ((parent = cfqg_parent(pos))) {
  1325 + if (propagate) {
  1326 + propagate = !parent->nr_active++;
  1327 + parent->children_weight += pos->weight;
  1328 + }
  1329 + vfr = vfr * pos->weight / parent->children_weight;
  1330 + pos = parent;
  1331 + }
  1332 +
  1333 + cfqg->vfraction = max_t(unsigned, vfr, 1);
1195 1334 }
1196 1335  
1197 1336 static void
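The compounding loop in cfq_group_service_tree_add() above can also be modeled outside the kernel. The sketch below keeps only the ratio arithmetic (no service tree, no activation bookkeeping); the struct layout, the SERVICE_SHIFT value of 12 and the sample weights, taken from the documentation example, are assumptions for illustration.

#include <stdio.h>

#define SERVICE_SHIFT   12              /* stand-in for CFQ_SERVICE_SHIFT */

struct node {
        struct node *parent;
        unsigned int weight;            /* vs. sibling groups */
        unsigned int leaf_weight;       /* own tasks vs. child groups */
        unsigned int children_weight;   /* active children + own leaf weight */
};

/* compound the per-level ratios from @pos up to the root, like vfraction */
static unsigned int vfraction(struct node *pos)
{
        unsigned int vfr = 1 << SERVICE_SHIFT;  /* start with 1.0 */

        /* tasks in @pos against @pos's active children */
        vfr = vfr * pos->leaf_weight / pos->children_weight;

        /* then ->weight against the parent's children_weight, up the tree */
        while (pos->parent) {
                vfr = vfr * pos->weight / pos->parent->children_weight;
                pos = pos->parent;
        }
        return vfr;
}

int main(void)
{
        /* root and A from the documentation example, everything active */
        struct node root = { NULL,  125, 125,  875 };   /* 500 + 250 + 125  */
        struct node A    = { &root, 500, 750, 2250 };   /* 500 + 1000 + 750 */

        /* 750/2250 * 500/875 =~ 0.19, i.e. ~19% for the tasks in A */
        printf("A vfraction = %u / %u\n", vfraction(&A), 1u << SERVICE_SHIFT);
        return 0;
}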
... ... @@ -1222,7 +1361,32 @@
1222 1361 static void
1223 1362 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1224 1363 {
1225   - st->total_weight -= cfqg->weight;
  1364 + struct cfq_group *pos = cfqg;
  1365 + bool propagate;
  1366 +
  1367 + /*
  1368 + * Undo activation from cfq_group_service_tree_add(). Deactivate
  1369 + * @cfqg and propagate deactivation upwards.
  1370 + */
  1371 + propagate = !--pos->nr_active;
  1372 + pos->children_weight -= pos->leaf_weight;
  1373 +
  1374 + while (propagate) {
  1375 + struct cfq_group *parent = cfqg_parent(pos);
  1376 +
  1377 + /* @pos has 0 nr_active at this point */
  1378 + WARN_ON_ONCE(pos->children_weight);
  1379 + pos->vfraction = 0;
  1380 +
  1381 + if (!parent)
  1382 + break;
  1383 +
  1384 + propagate = !--parent->nr_active;
  1385 + parent->children_weight -= pos->weight;
  1386 + pos = parent;
  1387 + }
  1388 +
  1389 + /* remove from the service tree */
1226 1390 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1227 1391 cfq_rb_erase(&cfqg->rb_node, st);
1228 1392 }
... ... @@ -1241,7 +1405,7 @@
1241 1405  
1242 1406 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1243 1407 cfq_group_service_tree_del(st, cfqg);
1244   - cfqg->saved_workload_slice = 0;
  1408 + cfqg->saved_wl_slice = 0;
1245 1409 cfqg_stats_update_dequeue(cfqg);
1246 1410 }
1247 1411  
... ... @@ -1284,6 +1448,7 @@
1284 1448 unsigned int used_sl, charge, unaccounted_sl = 0;
1285 1449 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1286 1450 - cfqg->service_tree_idle.count;
  1451 + unsigned int vfr;
1287 1452  
1288 1453 BUG_ON(nr_sync < 0);
1289 1454 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1290 1455  
1291 1456  
1292 1457  
1293 1458  
... ... @@ -1293,20 +1458,25 @@
1293 1458 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1294 1459 charge = cfqq->allocated_slice;
1295 1460  
1296   - /* Can't update vdisktime while group is on service tree */
  1461 + /*
  1462 + * Can't update vdisktime while on service tree and cfqg->vfraction
  1463 + * is valid only while on it. Cache vfr, leave the service tree,
  1464 + * update vdisktime and go back on. The re-addition to the tree
  1465 + * will also update the weights as necessary.
  1466 + */
  1467 + vfr = cfqg->vfraction;
1297 1468 cfq_group_service_tree_del(st, cfqg);
1298   - cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
1299   - /* If a new weight was requested, update now, off tree */
  1469 + cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1300 1470 cfq_group_service_tree_add(st, cfqg);
1301 1471  
1302 1472 /* This group is being expired. Save the context */
1303 1473 if (time_after(cfqd->workload_expires, jiffies)) {
1304   - cfqg->saved_workload_slice = cfqd->workload_expires
  1474 + cfqg->saved_wl_slice = cfqd->workload_expires
1305 1475 - jiffies;
1306   - cfqg->saved_workload = cfqd->serving_type;
1307   - cfqg->saved_serving_prio = cfqd->serving_prio;
  1476 + cfqg->saved_wl_type = cfqd->serving_wl_type;
  1477 + cfqg->saved_wl_class = cfqd->serving_wl_class;
1308 1478 } else
1309   - cfqg->saved_workload_slice = 0;
  1479 + cfqg->saved_wl_slice = 0;
1310 1480  
1311 1481 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1312 1482 st->min_vdisktime);
1313 1483  
... ... @@ -1344,8 +1514,54 @@
1344 1514  
1345 1515 cfq_init_cfqg_base(cfqg);
1346 1516 cfqg->weight = blkg->blkcg->cfq_weight;
  1517 + cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1347 1518 }
1348 1519  
  1520 +static void cfq_pd_offline(struct blkcg_gq *blkg)
  1521 +{
  1522 + /*
  1523 + * @blkg is going offline and will be ignored by
  1524 + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
  1525 + * that they don't get lost. If IOs complete after this point, the
  1526 + * stats for them will be lost. Oh well...
  1527 + */
  1528 + cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
  1529 +}
  1530 +
  1531 +/* offset delta from cfqg->stats to cfqg->dead_stats */
  1532 +static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
  1533 + offsetof(struct cfq_group, stats);
  1534 +
  1535 +/* to be used by recursive prfill, sums live and dead stats recursively */
  1536 +static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
  1537 +{
  1538 + u64 sum = 0;
  1539 +
  1540 + sum += blkg_stat_recursive_sum(pd, off);
  1541 + sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
  1542 + return sum;
  1543 +}
  1544 +
  1545 +/* to be used by recursive prfill, sums live and dead rwstats recursively */
  1546 +static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
  1547 + int off)
  1548 +{
  1549 + struct blkg_rwstat a, b;
  1550 +
  1551 + a = blkg_rwstat_recursive_sum(pd, off);
  1552 + b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
  1553 + blkg_rwstat_merge(&a, &b);
  1554 + return a;
  1555 +}
  1556 +
  1557 +static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
  1558 +{
  1559 + struct cfq_group *cfqg = blkg_to_cfqg(blkg);
  1560 +
  1561 + cfqg_stats_reset(&cfqg->stats);
  1562 + cfqg_stats_reset(&cfqg->dead_stats);
  1563 +}
  1564 +
1349 1565 /*
1350 1566 * Search for the cfq group current task belongs to. request_queue lock must
1351 1567 * be held.
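The dead_stats_off_delta constant above lets one prfill callback read either the live stats or the dead_stats of a cfq_group by adding a fixed offsetof() delta. A toy user-space illustration of the same pattern follows; struct group, its fields and read_stat() are made-up names for illustration, not kernel APIs.

#include <stdio.h>
#include <stddef.h>

struct stats {
        unsigned long time;
        unsigned long sectors;
};

struct group {
        struct stats stats;             /* stats for the group itself */
        struct stats dead_stats;        /* stats transferred from dead children */
};

/* delta from group->stats to group->dead_stats, like dead_stats_off_delta */
static const int dead_off_delta =
        offsetof(struct group, dead_stats) - offsetof(struct group, stats);

/* one accessor serves both blocks, given an offset into the live block */
static unsigned long read_stat(struct group *grp, int off)
{
        return *(unsigned long *)((char *)grp + off);
}

int main(void)
{
        struct group g = { .stats = { 10, 0 }, .dead_stats = { 3, 0 } };
        int off = offsetof(struct group, stats.time);

        /* live + dead, the way cfqg_stat_pd_recursive_sum() sums both */
        printf("time = %lu\n",
               read_stat(&g, off) + read_stat(&g, off + dead_off_delta));
        return 0;
}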
... ... @@ -1400,6 +1616,26 @@
1400 1616 return 0;
1401 1617 }
1402 1618  
  1619 +static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
  1620 + struct blkg_policy_data *pd, int off)
  1621 +{
  1622 + struct cfq_group *cfqg = pd_to_cfqg(pd);
  1623 +
  1624 + if (!cfqg->dev_leaf_weight)
  1625 + return 0;
  1626 + return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
  1627 +}
  1628 +
  1629 +static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
  1630 + struct cftype *cft,
  1631 + struct seq_file *sf)
  1632 +{
  1633 + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
  1634 + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
  1635 + false);
  1636 + return 0;
  1637 +}
  1638 +
1403 1639 static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1404 1640 struct seq_file *sf)
1405 1641 {
1406 1642  
... ... @@ -1407,9 +1643,17 @@
1407 1643 return 0;
1408 1644 }
1409 1645  
1410   -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1411   - const char *buf)
  1646 +static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
  1647 + struct seq_file *sf)
1412 1648 {
  1649 + seq_printf(sf, "%u\n",
  1650 + cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
  1651 + return 0;
  1652 +}
  1653 +
  1654 +static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
  1655 + const char *buf, bool is_leaf_weight)
  1656 +{
1413 1657 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1414 1658 struct blkg_conf_ctx ctx;
1415 1659 struct cfq_group *cfqg;
... ... @@ -1422,8 +1666,13 @@
1422 1666 ret = -EINVAL;
1423 1667 cfqg = blkg_to_cfqg(ctx.blkg);
1424 1668 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425   - cfqg->dev_weight = ctx.v;
1426   - cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
  1669 + if (!is_leaf_weight) {
  1670 + cfqg->dev_weight = ctx.v;
  1671 + cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
  1672 + } else {
  1673 + cfqg->dev_leaf_weight = ctx.v;
  1674 + cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
  1675 + }
1427 1676 ret = 0;
1428 1677 }
1429 1678  
1430 1679  
... ... @@ -1431,8 +1680,21 @@
1431 1680 return ret;
1432 1681 }
1433 1682  
1434   -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
  1683 +static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
  1684 + const char *buf)
1435 1685 {
  1686 + return __cfqg_set_weight_device(cgrp, cft, buf, false);
  1687 +}
  1688 +
  1689 +static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
  1690 + const char *buf)
  1691 +{
  1692 + return __cfqg_set_weight_device(cgrp, cft, buf, true);
  1693 +}
  1694 +
  1695 +static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
  1696 + bool is_leaf_weight)
  1697 +{
1436 1698 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1437 1699 struct blkcg_gq *blkg;
1438 1700 struct hlist_node *n;
1439 1701  
1440 1702  
1441 1703  
... ... @@ -1441,19 +1703,41 @@
1441 1703 return -EINVAL;
1442 1704  
1443 1705 spin_lock_irq(&blkcg->lock);
1444   - blkcg->cfq_weight = (unsigned int)val;
1445 1706  
  1707 + if (!is_leaf_weight)
  1708 + blkcg->cfq_weight = val;
  1709 + else
  1710 + blkcg->cfq_leaf_weight = val;
  1711 +
1446 1712 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1447 1713 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448 1714  
1449   - if (cfqg && !cfqg->dev_weight)
1450   - cfqg->new_weight = blkcg->cfq_weight;
  1715 + if (!cfqg)
  1716 + continue;
  1717 +
  1718 + if (!is_leaf_weight) {
  1719 + if (!cfqg->dev_weight)
  1720 + cfqg->new_weight = blkcg->cfq_weight;
  1721 + } else {
  1722 + if (!cfqg->dev_leaf_weight)
  1723 + cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
  1724 + }
1451 1725 }
1452 1726  
1453 1727 spin_unlock_irq(&blkcg->lock);
1454 1728 return 0;
1455 1729 }
1456 1730  
  1731 +static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
  1732 +{
  1733 + return __cfq_set_weight(cgrp, cft, val, false);
  1734 +}
  1735 +
  1736 +static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
  1737 +{
  1738 + return __cfq_set_weight(cgrp, cft, val, true);
  1739 +}
  1740 +
1457 1741 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1458 1742 struct seq_file *sf)
1459 1743 {
... ... @@ -1474,6 +1758,42 @@
1474 1758 return 0;
1475 1759 }
1476 1760  
  1761 +static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
  1762 + struct blkg_policy_data *pd, int off)
  1763 +{
  1764 + u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
  1765 +
  1766 + return __blkg_prfill_u64(sf, pd, sum);
  1767 +}
  1768 +
  1769 +static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
  1770 + struct blkg_policy_data *pd, int off)
  1771 +{
  1772 + struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
  1773 +
  1774 + return __blkg_prfill_rwstat(sf, pd, &sum);
  1775 +}
  1776 +
  1777 +static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
  1778 + struct seq_file *sf)
  1779 +{
  1780 + struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
  1781 +
  1782 + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
  1783 + &blkcg_policy_cfq, cft->private, false);
  1784 + return 0;
  1785 +}
  1786 +
  1787 +static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
  1788 + struct seq_file *sf)
  1789 +{
  1790 + struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
  1791 +
  1792 + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
  1793 + &blkcg_policy_cfq, cft->private, true);
  1794 + return 0;
  1795 +}
  1796 +
1477 1797 #ifdef CONFIG_DEBUG_BLK_CGROUP
1478 1798 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 1799 struct blkg_policy_data *pd, int off)
1480 1800  
1481 1801  
1482 1802  
1483 1803  
... ... @@ -1503,18 +1823,50 @@
1503 1823 #endif /* CONFIG_DEBUG_BLK_CGROUP */
1504 1824  
1505 1825 static struct cftype cfq_blkcg_files[] = {
  1826 + /* on root, weight is mapped to leaf_weight */
1506 1827 {
1507 1828 .name = "weight_device",
  1829 + .flags = CFTYPE_ONLY_ON_ROOT,
  1830 + .read_seq_string = cfqg_print_leaf_weight_device,
  1831 + .write_string = cfqg_set_leaf_weight_device,
  1832 + .max_write_len = 256,
  1833 + },
  1834 + {
  1835 + .name = "weight",
  1836 + .flags = CFTYPE_ONLY_ON_ROOT,
  1837 + .read_seq_string = cfq_print_leaf_weight,
  1838 + .write_u64 = cfq_set_leaf_weight,
  1839 + },
  1840 +
  1841 + /* no such mapping necessary for !roots */
  1842 + {
  1843 + .name = "weight_device",
  1844 + .flags = CFTYPE_NOT_ON_ROOT,
1508 1845 .read_seq_string = cfqg_print_weight_device,
1509 1846 .write_string = cfqg_set_weight_device,
1510 1847 .max_write_len = 256,
1511 1848 },
1512 1849 {
1513 1850 .name = "weight",
  1851 + .flags = CFTYPE_NOT_ON_ROOT,
1514 1852 .read_seq_string = cfq_print_weight,
1515 1853 .write_u64 = cfq_set_weight,
1516 1854 },
  1855 +
1517 1856 {
  1857 + .name = "leaf_weight_device",
  1858 + .read_seq_string = cfqg_print_leaf_weight_device,
  1859 + .write_string = cfqg_set_leaf_weight_device,
  1860 + .max_write_len = 256,
  1861 + },
  1862 + {
  1863 + .name = "leaf_weight",
  1864 + .read_seq_string = cfq_print_leaf_weight,
  1865 + .write_u64 = cfq_set_leaf_weight,
  1866 + },
  1867 +
  1868 + /* statistics, covers only the tasks in the cfqg */
  1869 + {
1518 1870 .name = "time",
1519 1871 .private = offsetof(struct cfq_group, stats.time),
1520 1872 .read_seq_string = cfqg_print_stat,
... ... @@ -1554,6 +1906,48 @@
1554 1906 .private = offsetof(struct cfq_group, stats.queued),
1555 1907 .read_seq_string = cfqg_print_rwstat,
1556 1908 },
  1909 +
  1910 + /* the same statistics which cover the cfqg and its descendants */
  1911 + {
  1912 + .name = "time_recursive",
  1913 + .private = offsetof(struct cfq_group, stats.time),
  1914 + .read_seq_string = cfqg_print_stat_recursive,
  1915 + },
  1916 + {
  1917 + .name = "sectors_recursive",
  1918 + .private = offsetof(struct cfq_group, stats.sectors),
  1919 + .read_seq_string = cfqg_print_stat_recursive,
  1920 + },
  1921 + {
  1922 + .name = "io_service_bytes_recursive",
  1923 + .private = offsetof(struct cfq_group, stats.service_bytes),
  1924 + .read_seq_string = cfqg_print_rwstat_recursive,
  1925 + },
  1926 + {
  1927 + .name = "io_serviced_recursive",
  1928 + .private = offsetof(struct cfq_group, stats.serviced),
  1929 + .read_seq_string = cfqg_print_rwstat_recursive,
  1930 + },
  1931 + {
  1932 + .name = "io_service_time_recursive",
  1933 + .private = offsetof(struct cfq_group, stats.service_time),
  1934 + .read_seq_string = cfqg_print_rwstat_recursive,
  1935 + },
  1936 + {
  1937 + .name = "io_wait_time_recursive",
  1938 + .private = offsetof(struct cfq_group, stats.wait_time),
  1939 + .read_seq_string = cfqg_print_rwstat_recursive,
  1940 + },
  1941 + {
  1942 + .name = "io_merged_recursive",
  1943 + .private = offsetof(struct cfq_group, stats.merged),
  1944 + .read_seq_string = cfqg_print_rwstat_recursive,
  1945 + },
  1946 + {
  1947 + .name = "io_queued_recursive",
  1948 + .private = offsetof(struct cfq_group, stats.queued),
  1949 + .read_seq_string = cfqg_print_rwstat_recursive,
  1950 + },
1557 1951 #ifdef CONFIG_DEBUG_BLK_CGROUP
1558 1952 {
1559 1953 .name = "avg_queue_size",
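The *_recursive files registered above report the same counters as their non-recursive siblings, except that each value covers the cfq_group and all of its descendants rather than only the group's own tasks. A hedged userspace sketch of reading one of them, assuming the same /sys/fs/cgroup/blkio mount point and an existing cgroup named grp_A (both assumptions):

    /* Hedged sketch: dump one of the new recursive statistics.  The reported
     * values cover the cgroup and every descendant cgroup.  Path is assumed. */
    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/fs/cgroup/blkio/grp_A/blkio.time_recursive";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
            perror(path);
            return 1;
        }
        /* blkio.time_recursive prints one "major:minor <disk time>" line per
         * block device the subtree has been charged on. */
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }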
... ... @@ -1612,15 +2006,14 @@
1612 2006 struct rb_node **p, *parent;
1613 2007 struct cfq_queue *__cfqq;
1614 2008 unsigned long rb_key;
1615   - struct cfq_rb_root *service_tree;
  2009 + struct cfq_rb_root *st;
1616 2010 int left;
1617 2011 int new_cfqq = 1;
1618 2012  
1619   - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1620   - cfqq_type(cfqq));
  2013 + st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1621 2014 if (cfq_class_idle(cfqq)) {
1622 2015 rb_key = CFQ_IDLE_DELAY;
1623   - parent = rb_last(&service_tree->rb);
  2016 + parent = rb_last(&st->rb);
1624 2017 if (parent && parent != &cfqq->rb_node) {
1625 2018 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1626 2019 rb_key += __cfqq->rb_key;
... ... @@ -1638,7 +2031,7 @@
1638 2031 cfqq->slice_resid = 0;
1639 2032 } else {
1640 2033 rb_key = -HZ;
1641   - __cfqq = cfq_rb_first(service_tree);
  2034 + __cfqq = cfq_rb_first(st);
1642 2035 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1643 2036 }
1644 2037  
... ... @@ -1647,8 +2040,7 @@
1647 2040 /*
1648 2041 * same position, nothing more to do
1649 2042 */
1650   - if (rb_key == cfqq->rb_key &&
1651   - cfqq->service_tree == service_tree)
  2043 + if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1652 2044 return;
1653 2045  
1654 2046 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1655 2047  
... ... @@ -1657,11 +2049,9 @@
1657 2049  
1658 2050 left = 1;
1659 2051 parent = NULL;
1660   - cfqq->service_tree = service_tree;
1661   - p = &service_tree->rb.rb_node;
  2052 + cfqq->service_tree = st;
  2053 + p = &st->rb.rb_node;
1662 2054 while (*p) {
1663   - struct rb_node **n;
1664   -
1665 2055 parent = *p;
1666 2056 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1667 2057  
... ... @@ -1669,22 +2059,20 @@
1669 2059 * sort by key, that represents service time.
1670 2060 */
1671 2061 if (time_before(rb_key, __cfqq->rb_key))
1672   - n = &(*p)->rb_left;
  2062 + p = &parent->rb_left;
1673 2063 else {
1674   - n = &(*p)->rb_right;
  2064 + p = &parent->rb_right;
1675 2065 left = 0;
1676 2066 }
1677   -
1678   - p = n;
1679 2067 }
1680 2068  
1681 2069 if (left)
1682   - service_tree->left = &cfqq->rb_node;
  2070 + st->left = &cfqq->rb_node;
1683 2071  
1684 2072 cfqq->rb_key = rb_key;
1685 2073 rb_link_node(&cfqq->rb_node, parent, p);
1686   - rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1687   - service_tree->count++;
  2074 + rb_insert_color(&cfqq->rb_node, &st->rb);
  2075 + st->count++;
1688 2076 if (add_front || !new_cfqq)
1689 2077 return;
1690 2078 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
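The insertion loop above keeps the kernel's usual pattern: walk the rbtree by rb_key and remember whether the new entry ever descended to the right, so the cached leftmost pointer (st->left) stays valid and the next queue can be found without a full descent. The change itself only renames service_tree to st and drops the temporary n pointer in favour of updating p directly. A simplified, self-contained sketch of the same leftmost-caching idea, using a plain binary search tree instead of the kernel rbtree API (plain < stands in for time_before(); names and structure are illustrative only):

    /* Hedged sketch: insert into a search tree keyed like rb_key while caching
     * the leftmost (smallest-key) node, mirroring the "left" flag and st->left
     * in the code above.  Plain BST instead of the kernel rbtree. */
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        unsigned long key;
        struct node *left, *right;
    };

    struct tree {
        struct node *root;
        struct node *leftmost;      /* analogue of st->left */
    };

    static void insert(struct tree *t, struct node *n)
    {
        struct node **p = &t->root;
        int went_left_only = 1;     /* analogue of the "left" flag */

        while (*p) {
            if (n->key < (*p)->key) {
                p = &(*p)->left;
            } else {
                p = &(*p)->right;
                went_left_only = 0; /* descended right at least once */
            }
        }
        n->left = n->right = NULL;
        *p = n;
        if (went_left_only)
            t->leftmost = n;        /* new smallest key */
    }

    int main(void)
    {
        struct tree t = { NULL, NULL };
        unsigned long keys[] = { 30, 10, 50, 5 };

        for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
            struct node *n = malloc(sizeof(*n));

            n->key = keys[i];
            insert(&t, n);
        }
        printf("leftmost key: %lu\n", t.leftmost->key);  /* prints 5 */
        return 0;   /* nodes deliberately leaked; this is a sketch */
    }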
... ... @@ -2030,8 +2418,8 @@
2030 2418 struct cfq_queue *cfqq)
2031 2419 {
2032 2420 if (cfqq) {
2033   - cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
2034   - cfqd->serving_prio, cfqd->serving_type);
  2421 + cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
  2422 + cfqd->serving_wl_class, cfqd->serving_wl_type);
2035 2423 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2036 2424 cfqq->slice_start = 0;
2037 2425 cfqq->dispatch_start = jiffies;
... ... @@ -2117,19 +2505,18 @@
2117 2505 */
2118 2506 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2119 2507 {
2120   - struct cfq_rb_root *service_tree =
2121   - service_tree_for(cfqd->serving_group, cfqd->serving_prio,
2122   - cfqd->serving_type);
  2508 + struct cfq_rb_root *st = st_for(cfqd->serving_group,
  2509 + cfqd->serving_wl_class, cfqd->serving_wl_type);
2123 2510  
2124 2511 if (!cfqd->rq_queued)
2125 2512 return NULL;
2126 2513  
2127 2514 /* There is nothing to dispatch */
2128   - if (!service_tree)
  2515 + if (!st)
2129 2516 return NULL;
2130   - if (RB_EMPTY_ROOT(&service_tree->rb))
  2517 + if (RB_EMPTY_ROOT(&st->rb))
2131 2518 return NULL;
2132   - return cfq_rb_first(service_tree);
  2519 + return cfq_rb_first(st);
2133 2520 }
2134 2521  
2135 2522 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
... ... @@ -2285,17 +2672,17 @@
2285 2672  
2286 2673 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2287 2674 {
2288   - enum wl_prio_t prio = cfqq_prio(cfqq);
2289   - struct cfq_rb_root *service_tree = cfqq->service_tree;
  2675 + enum wl_class_t wl_class = cfqq_class(cfqq);
  2676 + struct cfq_rb_root *st = cfqq->service_tree;
2290 2677  
2291   - BUG_ON(!service_tree);
2292   - BUG_ON(!service_tree->count);
  2678 + BUG_ON(!st);
  2679 + BUG_ON(!st->count);
2293 2680  
2294 2681 if (!cfqd->cfq_slice_idle)
2295 2682 return false;
2296 2683  
2297 2684 /* We never do for idle class queues. */
2298   - if (prio == IDLE_WORKLOAD)
  2685 + if (wl_class == IDLE_WORKLOAD)
2299 2686 return false;
2300 2687  
2301 2688 /* We do for queues that were marked with idle window flag. */
2302 2689  
... ... @@ -2307,11 +2694,10 @@
2307 2694 * Otherwise, we do only if they are the last ones
2308 2695 * in their service tree.
2309 2696 */
2310   - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
2311   - !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
  2697 + if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
  2698 + !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2312 2699 return true;
2313   - cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
2314   - service_tree->count);
  2700 + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2315 2701 return false;
2316 2702 }
2317 2703  
... ... @@ -2494,8 +2880,8 @@
2494 2880 }
2495 2881 }
2496 2882  
2497   -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2498   - struct cfq_group *cfqg, enum wl_prio_t prio)
  2883 +static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
  2884 + struct cfq_group *cfqg, enum wl_class_t wl_class)
2499 2885 {
2500 2886 struct cfq_queue *queue;
2501 2887 int i;
... ... @@ -2505,7 +2891,7 @@
2505 2891  
2506 2892 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2507 2893 /* select the one with lowest rb_key */
2508   - queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
  2894 + queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2509 2895 if (queue &&
2510 2896 (!key_valid || time_before(queue->rb_key, lowest_key))) {
2511 2897 lowest_key = queue->rb_key;
... ... @@ -2517,26 +2903,27 @@
2517 2903 return cur_best;
2518 2904 }
2519 2905  
2520   -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
  2906 +static void
  2907 +choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2521 2908 {
2522 2909 unsigned slice;
2523 2910 unsigned count;
2524 2911 struct cfq_rb_root *st;
2525 2912 unsigned group_slice;
2526   - enum wl_prio_t original_prio = cfqd->serving_prio;
  2913 + enum wl_class_t original_class = cfqd->serving_wl_class;
2527 2914  
2528 2915 /* Choose next priority. RT > BE > IDLE */
2529 2916 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2530   - cfqd->serving_prio = RT_WORKLOAD;
  2917 + cfqd->serving_wl_class = RT_WORKLOAD;
2531 2918 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2532   - cfqd->serving_prio = BE_WORKLOAD;
  2919 + cfqd->serving_wl_class = BE_WORKLOAD;
2533 2920 else {
2534   - cfqd->serving_prio = IDLE_WORKLOAD;
  2921 + cfqd->serving_wl_class = IDLE_WORKLOAD;
2535 2922 cfqd->workload_expires = jiffies + 1;
2536 2923 return;
2537 2924 }
2538 2925  
2539   - if (original_prio != cfqd->serving_prio)
  2926 + if (original_class != cfqd->serving_wl_class)
2540 2927 goto new_workload;
2541 2928  
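The class selection above is a strict priority order: any busy RT queue is served before best-effort, and the idle class only runs when nothing else is pending (with a one-jiffy expiry so the decision is revisited almost immediately). A hedged restatement of just that ordering as standalone code (the enum names mirror the kernel's workload classes, but this is illustrative, not kernel code):

    /* Hedged sketch of the RT > BE > IDLE ordering used above. */
    #include <stdio.h>

    enum wl_class { RT_WORKLOAD, BE_WORKLOAD, IDLE_WORKLOAD };

    static enum wl_class choose_class(int rt_busy, int be_busy)
    {
        if (rt_busy)
            return RT_WORKLOAD;   /* real-time work is served first */
        if (be_busy)
            return BE_WORKLOAD;   /* best-effort when no RT queue is busy */
        return IDLE_WORKLOAD;     /* idle class only when nothing else is */
    }

    int main(void)
    {
        printf("%d\n", choose_class(0, 3));  /* prints 1 (BE_WORKLOAD) */
        printf("%d\n", choose_class(2, 3));  /* prints 0 (RT_WORKLOAD) */
        return 0;
    }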
2542 2929 /*
... ... @@ -2544,7 +2931,7 @@
2544 2931 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2545 2932 * expiration time
2546 2933 */
2547   - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
  2934 + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2548 2935 count = st->count;
2549 2936  
2550 2937 /*
... ... @@ -2555,9 +2942,9 @@
2555 2942  
2556 2943 new_workload:
2557 2944 /* otherwise select new workload type */
2558   - cfqd->serving_type =
2559   - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2560   - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
  2945 + cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
  2946 + cfqd->serving_wl_class);
  2947 + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2561 2948 count = st->count;
2562 2949  
2563 2950 /*
2564 2951  
... ... @@ -2568,10 +2955,11 @@
2568 2955 group_slice = cfq_group_slice(cfqd, cfqg);
2569 2956  
2570 2957 slice = group_slice * count /
2571   - max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2572   - cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
  2958 + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
  2959 + cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
  2960 + cfqg));
2573 2961  
2574   - if (cfqd->serving_type == ASYNC_WORKLOAD) {
  2962 + if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2575 2963 unsigned int tmp;
2576 2964  
2577 2965 /*
2578 2966  
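The slice computation above gives the chosen service tree a share of the group slice proportional to its queue count relative to the busy queues of the serving class, dividing by whichever of the smoothed average or the current busy count is larger so the denominator never undercounts; the ASYNC branch that follows adjusts the result further for async workloads. A hedged standalone restatement of just the proportional step, with assumed numbers:

    /* Hedged sketch of the scaling above:
     * slice = group_slice * count / max(busy_queues_avg[class], busy_now). */
    #include <stdio.h>

    static unsigned max_u(unsigned a, unsigned b)
    {
        return a > b ? a : b;
    }

    int main(void)
    {
        unsigned group_slice = 100; /* assumed group slice, in ms */
        unsigned count = 2;         /* queues on the chosen service tree */
        unsigned busy_avg = 4;      /* smoothed busy-queue count for the class */
        unsigned busy_now = 3;      /* current busy queues of the class */

        unsigned slice = group_slice * count / max_u(busy_avg, busy_now);

        printf("slice = %u ms\n", slice);   /* 100 * 2 / 4 = 50 ms */
        return 0;
    }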
... ... @@ -2617,14 +3005,14 @@
2617 3005 cfqd->serving_group = cfqg;
2618 3006  
2619 3007 /* Restore the workload type data */
2620   - if (cfqg->saved_workload_slice) {
2621   - cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2622   - cfqd->serving_type = cfqg->saved_workload;
2623   - cfqd->serving_prio = cfqg->saved_serving_prio;
  3008 + if (cfqg->saved_wl_slice) {
  3009 + cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
  3010 + cfqd->serving_wl_type = cfqg->saved_wl_type;
  3011 + cfqd->serving_wl_class = cfqg->saved_wl_class;
2624 3012 } else
2625 3013 cfqd->workload_expires = jiffies - 1;
2626 3014  
2627   - choose_service_tree(cfqd, cfqg);
  3015 + choose_wl_class_and_type(cfqd, cfqg);
2628 3016 }
2629 3017  
2630 3018 /*
... ... @@ -3403,7 +3791,7 @@
3403 3791 return true;
3404 3792  
3405 3793 /* Allow preemption only if we are idling on sync-noidle tree */
3406   - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
  3794 + if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3407 3795 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3408 3796 new_cfqq->service_tree->count == 2 &&
3409 3797 RB_EMPTY_ROOT(&cfqq->sort_list))
... ... @@ -3455,7 +3843,7 @@
3455 3843 * doesn't happen
3456 3844 */
3457 3845 if (old_type != cfqq_type(cfqq))
3458   - cfqq->cfqg->saved_workload_slice = 0;
  3846 + cfqq->cfqg->saved_wl_slice = 0;
3459 3847  
3460 3848 /*
3460 3848 * Put the new queue at the front of the current list,
... ... @@ -3637,16 +4025,17 @@
3637 4025 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3638 4026  
3639 4027 if (sync) {
3640   - struct cfq_rb_root *service_tree;
  4028 + struct cfq_rb_root *st;
3641 4029  
3642 4030 RQ_CIC(rq)->ttime.last_end_request = now;
3643 4031  
3644 4032 if (cfq_cfqq_on_rr(cfqq))
3645   - service_tree = cfqq->service_tree;
  4033 + st = cfqq->service_tree;
3646 4034 else
3647   - service_tree = service_tree_for(cfqq->cfqg,
3648   - cfqq_prio(cfqq), cfqq_type(cfqq));
3649   - service_tree->ttime.last_end_request = now;
  4035 + st = st_for(cfqq->cfqg, cfqq_class(cfqq),
  4036 + cfqq_type(cfqq));
  4037 +
  4038 + st->ttime.last_end_request = now;
3650 4039 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3651 4040 cfqd->last_delayed_sync = now;
3652 4041 }
... ... @@ -3993,6 +4382,7 @@
3993 4382 cfq_init_cfqg_base(cfqd->root_group);
3994 4383 #endif
3995 4384 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
  4385 + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
3996 4386  
3997 4387 /*
3998 4388 * Not strictly needed (since RB_ROOT just clears the node and we
... ... @@ -4177,6 +4567,7 @@
4177 4567 .cftypes = cfq_blkcg_files,
4178 4568  
4179 4569 .pd_init_fn = cfq_pd_init,
  4570 + .pd_offline_fn = cfq_pd_offline,
4180 4571 .pd_reset_stats_fn = cfq_pd_reset_stats,
4181 4572 };
4182 4573 #endif
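With cfq_pd_offline() now registered alongside cfq_pd_init(), per-group state is torn down when a blkio cgroup goes offline. Exercising the hierarchical behaviour from userspace only requires nesting blkio cgroups and placing tasks in them; a hedged sketch, assuming a cgroup-v1 style blkio mount at /sys/fs/cgroup/blkio and illustrative group names (none of which come from the patch):

    /* Hedged sketch: create a two-level blkio hierarchy and move the calling
     * process into the child group.  Requires root; error handling for mkdir
     * is omitted for brevity, and all paths are assumptions. */
    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);
            return -1;
        }
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        char pid[32];

        /* Internal node with one child; the parent's own tasks compete with
         * the child according to the parent's blkio.leaf_weight. */
        mkdir("/sys/fs/cgroup/blkio/grp_parent", 0755);
        mkdir("/sys/fs/cgroup/blkio/grp_parent/grp_child", 0755);

        /* Place this process in the child group (cgroup v1 "tasks" file). */
        snprintf(pid, sizeof(pid), "%d\n", (int)getpid());
        return write_str("/sys/fs/cgroup/blkio/grp_parent/grp_child/tasks", pid)
               ? 1 : 0;
    }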
include/linux/blkdev.h
... ... @@ -19,6 +19,7 @@
19 19 #include <linux/gfp.h>
20 20 #include <linux/bsg.h>
21 21 #include <linux/smp.h>
  22 +#include <linux/rcupdate.h>
22 23  
23 24 #include <asm/scatterlist.h>
24 25  
... ... @@ -437,6 +438,7 @@
437 438 /* Throttle data */
438 439 struct throtl_data *td;
439 440 #endif
  441 + struct rcu_head rcu_head;
440 442 };
441 443  
442 444 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */