Commit 36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b

Authored by Linus Torvalds

Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block

Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags.  But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan.  Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun.  This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, the whole family is
  flying out tomorrow.  Hence I will be AWOL for the remainder of this week,
  but back at work again on Monday the 15th.  CC'ing Tejun, since any
  potential "surprises" will most likely be from the blk-cgroup work.
  But it's been brewing for a while and sitting in my tree and
  linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...

Showing 12 changed files

Documentation/cgroups/blkio-controller.txt
... ... @@ -94,32 +94,33 @@
94 94  
95 95 Hierarchical Cgroups
96 96 ====================
97   -- Currently only CFQ supports hierarchical groups. For throttling,
98   - cgroup interface does allow creation of hierarchical cgroups and
99   - internally it treats them as flat hierarchy.
100 97  
101   - If somebody created a hierarchy like as follows.
  98 +Both CFQ and throttling implement hierarchy support; however,
  99 +throttling's hierarchy support is enabled iff "sane_behavior" is
  100 +enabled from cgroup side, which currently is a development option and
  101 +not publicly available.
102 102  
  103 +If somebody created a hierarchy like as follows.
  104 +
103 105 root
104 106 / \
105 107 test1 test2
106 108 |
107 109 test3
108 110  
109   - CFQ will handle the hierarchy correctly but and throttling will
110   - practically treat all groups at same level. For details on CFQ
111   - hierarchy support, refer to Documentation/block/cfq-iosched.txt.
112   - Throttling will treat the hierarchy as if it looks like the
113   - following.
  111 +CFQ by default and throttling with "sane_behavior" will handle the
  112 +hierarchy correctly. For details on CFQ hierarchy support, refer to
  113 +Documentation/block/cfq-iosched.txt. For throttling, all limits apply
  114 +to the whole subtree while all statistics are local to the IOs
  115 +directly generated by tasks in that cgroup.
114 116  
  117 +Throttling without "sane_behavior" enabled from cgroup side will
  118 +practically treat all groups at same level as if it looks like the
  119 +following.
  120 +
115 121 pivot
116 122 / / \ \
117 123 root test1 test2 test3
118   -
119   - Nesting cgroups, while allowed, isn't officially supported and blkio
120   - genereates warning when cgroups nest. Once throttling implements
121   - hierarchy support, hierarchy will be supported and the warning will
122   - be removed.
123 124  
124 125 Various user visible config options
125 126 ===================================
block/blk-cgroup.c
... ... @@ -32,26 +32,6 @@
32 32  
33 33 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
34 34  
35   -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36   - struct request_queue *q, bool update_hint);
37   -
38   -/**
39   - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40   - * @d_blkg: loop cursor pointing to the current descendant
41   - * @pos_cgrp: used for iteration
42   - * @p_blkg: target blkg to walk descendants of
43   - *
44   - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
45   - * read locked. If called under either blkcg or queue lock, the iteration
46   - * is guaranteed to include all and only online blkgs. The caller may
47   - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48   - * subtree.
49   - */
50   -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51   - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52   - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53   - (p_blkg)->q, false)))
54   -
55 35 static bool blkcg_policy_enabled(struct request_queue *q,
56 36 const struct blkcg_policy *pol)
57 37 {
58 38  
... ... @@ -71,19 +51,9 @@
71 51 if (!blkg)
72 52 return;
73 53  
74   - for (i = 0; i < BLKCG_MAX_POLS; i++) {
75   - struct blkcg_policy *pol = blkcg_policy[i];
76   - struct blkg_policy_data *pd = blkg->pd[i];
  54 + for (i = 0; i < BLKCG_MAX_POLS; i++)
  55 + kfree(blkg->pd[i]);
77 56  
78   - if (!pd)
79   - continue;
80   -
81   - if (pol && pol->pd_exit_fn)
82   - pol->pd_exit_fn(blkg);
83   -
84   - kfree(pd);
85   - }
86   -
87 57 blk_exit_rl(&blkg->rl);
88 58 kfree(blkg);
89 59 }
... ... @@ -134,10 +104,6 @@
134 104 blkg->pd[i] = pd;
135 105 pd->blkg = blkg;
136 106 pd->plid = i;
137   -
138   - /* invoke per-policy init */
139   - if (pol->pd_init_fn)
140   - pol->pd_init_fn(blkg);
141 107 }
142 108  
143 109 return blkg;
... ... @@ -158,8 +124,8 @@
158 124 * @q's bypass state. If @update_hint is %true, the caller should be
159 125 * holding @q->queue_lock and lookup hint is updated on success.
160 126 */
161   -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
162   - struct request_queue *q, bool update_hint)
  127 +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
  128 + bool update_hint)
163 129 {
164 130 struct blkcg_gq *blkg;
165 131  
166 132  
167 133  
... ... @@ -234,16 +200,25 @@
234 200 }
235 201 blkg = new_blkg;
236 202  
237   - /* link parent and insert */
  203 + /* link parent */
238 204 if (blkcg_parent(blkcg)) {
239 205 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 206 if (WARN_ON_ONCE(!blkg->parent)) {
241   - blkg = ERR_PTR(-EINVAL);
  207 + ret = -EINVAL;
242 208 goto err_put_css;
243 209 }
244 210 blkg_get(blkg->parent);
245 211 }
246 212  
  213 + /* invoke per-policy init */
  214 + for (i = 0; i < BLKCG_MAX_POLS; i++) {
  215 + struct blkcg_policy *pol = blkcg_policy[i];
  216 +
  217 + if (blkg->pd[i] && pol->pd_init_fn)
  218 + pol->pd_init_fn(blkg);
  219 + }
  220 +
  221 + /* insert */
247 222 spin_lock(&blkcg->lock);
248 223 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
249 224 if (likely(!ret)) {
250 225  
251 226  
252 227  
253 228  
254 229  
255 230  
... ... @@ -394,30 +369,38 @@
394 369 q->root_rl.blkg = NULL;
395 370 }
396 371  
397   -static void blkg_rcu_free(struct rcu_head *rcu_head)
  372 +/*
  373 + * A group is RCU protected, but having an rcu lock does not mean that one
  374 + * can access all the fields of blkg and assume these are valid. For
  375 + * example, don't try to follow throtl_data and request queue links.
  376 + *
  377 + * Having a reference to blkg under an rcu allows accesses to only values
  378 + * local to groups like group stats and group rate limits.
  379 + */
  380 +void __blkg_release_rcu(struct rcu_head *rcu_head)
398 381 {
399   - blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
400   -}
  382 + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
  383 + int i;
401 384  
402   -void __blkg_release(struct blkcg_gq *blkg)
403   -{
  385 + /* tell policies that this one is being freed */
  386 + for (i = 0; i < BLKCG_MAX_POLS; i++) {
  387 + struct blkcg_policy *pol = blkcg_policy[i];
  388 +
  389 + if (blkg->pd[i] && pol->pd_exit_fn)
  390 + pol->pd_exit_fn(blkg);
  391 + }
  392 +
404 393 /* release the blkcg and parent blkg refs this blkg has been holding */
405 394 css_put(&blkg->blkcg->css);
406   - if (blkg->parent)
  395 + if (blkg->parent) {
  396 + spin_lock_irq(blkg->q->queue_lock);
407 397 blkg_put(blkg->parent);
  398 + spin_unlock_irq(blkg->q->queue_lock);
  399 + }
408 400  
409   - /*
410   - * A group is freed in rcu manner. But having an rcu lock does not
411   - * mean that one can access all the fields of blkg and assume these
412   - * are valid. For example, don't try to follow throtl_data and
413   - * request queue links.
414   - *
415   - * Having a reference to blkg under an rcu allows acess to only
416   - * values local to groups like group stats and group rate limits
417   - */
418   - call_rcu(&blkg->rcu_head, blkg_rcu_free);
  401 + blkg_free(blkg);
419 402 }
420   -EXPORT_SYMBOL_GPL(__blkg_release);
  403 +EXPORT_SYMBOL_GPL(__blkg_release_rcu);
421 404  
422 405 /*
423 406 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
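
A note on the new release path above: the final blkg_put() now defers to call_rcu(__blkg_release_rcu), which tells each policy via pd_exit_fn(), drops the parent reference under the queue lock, and only then calls blkg_free(). Below is a small user-space sketch of that ordering only; the names are stand-ins, and neither the kernel locking nor a real RCU grace period is modelled.

    /*
     * Illustrative model of the new blkg release ordering (hypothetical
     * names; the real path is in the hunk above): last blkg_put() ->
     * call_rcu(__blkg_release_rcu) -> per-policy pd_exit_fn() ->
     * put parent ref -> blkg_free().
     */
    #include <stdio.h>

    struct pol { void (*pd_exit_fn)(int idx); };

    static void pd_exit(int idx) { printf("pd_exit_fn for policy %d\n", idx); }

    static struct pol policies[2] = { { pd_exit }, { pd_exit } };

    /* stands in for __blkg_release_rcu(); assumes the grace period passed */
    static void release_rcu(void)
    {
            int i;

            for (i = 0; i < 2; i++)         /* tell policies first */
                    policies[i].pd_exit_fn(i);
            printf("put parent blkg ref (under queue_lock)\n");
            printf("blkg_free()\n");        /* finally free pd[] and the blkg */
    }

    static void blkg_put(int *refcnt)
    {
            if (!--*refcnt)
                    release_rcu();          /* kernel: call_rcu(..., __blkg_release_rcu) */
    }

    int main(void)
    {
            int refcnt = 1;

            blkg_put(&refcnt);
            return 0;
    }

Note, from the diff itself, that the parent blkg_put() is now done under the queue lock, since that put may in turn trigger the parent's own release.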
... ... @@ -928,14 +911,6 @@
928 911 .subsys_id = blkio_subsys_id,
929 912 .base_cftypes = blkcg_files,
930 913 .module = THIS_MODULE,
931   -
932   - /*
933   - * blkio subsystem is utterly broken in terms of hierarchy support.
934   - * It treats all cgroups equally regardless of where they're
935   - * located in the hierarchy - all cgroups are treated as if they're
936   - * right below the root. Fix it and remove the following.
937   - */
938   - .broken_hierarchy = true,
939 914 };
940 915 EXPORT_SYMBOL_GPL(blkio_subsys);
941 916  
block/blk-cgroup.h
... ... @@ -266,7 +266,7 @@
266 266 blkg->refcnt++;
267 267 }
268 268  
269   -void __blkg_release(struct blkcg_gq *blkg);
  269 +void __blkg_release_rcu(struct rcu_head *rcu);
270 270  
271 271 /**
272 272 * blkg_put - put a blkg reference
273 273  
... ... @@ -279,8 +279,42 @@
279 279 lockdep_assert_held(blkg->q->queue_lock);
280 280 WARN_ON_ONCE(blkg->refcnt <= 0);
281 281 if (!--blkg->refcnt)
282   - __blkg_release(blkg);
  282 + call_rcu(&blkg->rcu_head, __blkg_release_rcu);
283 283 }
  284 +
  285 +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
  286 + bool update_hint);
  287 +
  288 +/**
  289 + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  290 + * @d_blkg: loop cursor pointing to the current descendant
  291 + * @pos_cgrp: used for iteration
  292 + * @p_blkg: target blkg to walk descendants of
  293 + *
  294 + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
  295 + * read locked. If called under either blkcg or queue lock, the iteration
  296 + * is guaranteed to include all and only online blkgs. The caller may
  297 + * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
  298 + * subtree.
  299 + */
  300 +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
  301 + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
  302 + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
  303 + (p_blkg)->q, false)))
  304 +
  305 +/**
  306 + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
  307 + * @d_blkg: loop cursor pointing to the current descendant
  308 + * @pos_cgrp: used for iteration
  309 + * @p_blkg: target blkg to walk descendants of
  310 + *
  311 + * Similar to blkg_for_each_descendant_pre() but performs post-order
  312 + * traversal instead. Synchronization rules are the same.
  313 + */
  314 +#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
  315 + cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
  316 + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
  317 + (p_blkg)->q, false)))
284 318  
285 319 /**
286 320 * blk_get_rl - get request_list to use
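
The header now exposes both pre-order and post-order descendant walks. As a plain illustration of the difference (a toy tree reusing the cgroup names from the documentation hunk; not the kernel macros or data structures): pre-order visits a parent before its children, which suits top-down propagation, while post-order visits children before the parent, which suits bottom-up teardown.

    /*
     * Minimal user-space illustration of pre-order vs post-order walks,
     * mirroring what blkg_for_each_descendant_pre/_post provide for the
     * blkg tree.  Toy tree only.
     */
    #include <stdio.h>

    struct node { const char *name; struct node *child[2]; };

    static void walk_pre(struct node *n)
    {
            if (!n)
                    return;
            printf("%s ", n->name);         /* visit parent before children */
            walk_pre(n->child[0]);
            walk_pre(n->child[1]);
    }

    static void walk_post(struct node *n)
    {
            if (!n)
                    return;
            walk_post(n->child[0]);         /* visit children before parent */
            walk_post(n->child[1]);
            printf("%s ", n->name);
    }

    int main(void)
    {
            struct node test3 = { "test3", { NULL, NULL } };
            struct node test1 = { "test1", { &test3, NULL } };
            struct node test2 = { "test2", { NULL, NULL } };
            struct node root  = { "root",  { &test1, &test2 } };

            walk_pre(&root);        /* root test1 test3 test2: top-down setup */
            printf("\n");
            walk_post(&root);       /* test3 test1 test2 root: bottom-up teardown */
            printf("\n");
            return 0;
    }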
block/blk-tag.c
... ... @@ -348,9 +348,16 @@
348 348 */
349 349 max_depth = bqt->max_depth;
350 350 if (!rq_is_sync(rq) && max_depth > 1) {
351   - max_depth -= 2;
352   - if (!max_depth)
  351 + switch (max_depth) {
  352 + case 2:
353 353 max_depth = 1;
  354 + break;
  355 + case 3:
  356 + max_depth = 2;
  357 + break;
  358 + default:
  359 + max_depth -= 2;
  360 + }
354 361 if (q->in_flight[BLK_RW_ASYNC] > max_depth)
355 362 return 1;
356 363 }
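
To see what the reserved-tag tweak buys on a small tag map, here is a standalone model of the old and new async depth calculations (illustrative only, not the kernel function): with a max_depth of 3, async IO can now use two tags instead of one, while a single tag stays reserved for sync IO.

    /*
     * Quick model of the async depth clamp above: for tiny tag maps the
     * new code reserves exactly one tag for sync IO instead of two, so a
     * 3-tag device can still use 2 tags for async writes.
     */
    #include <stdio.h>

    static int async_depth_old(int max_depth)
    {
            max_depth -= 2;
            if (!max_depth)
                    max_depth = 1;          /* old code only special-cased depth 2 */
            return max_depth;
    }

    static int async_depth_new(int max_depth)
    {
            switch (max_depth) {
            case 2:
                    return 1;
            case 3:
                    return 2;               /* reserve one tag, not two */
            default:
                    return max_depth - 2;
            }
    }

    int main(void)
    {
            int d;

            for (d = 2; d <= 5; d++)
                    printf("max_depth=%d old=%d new=%d\n",
                           d, async_depth_old(d), async_depth_new(d));
            return 0;
    }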
block/blk-throttle.c
... ... @@ -25,19 +25,62 @@
25 25  
26 26 /* A workqueue to queue throttle related work */
27 27 static struct workqueue_struct *kthrotld_workqueue;
28   -static void throtl_schedule_delayed_work(struct throtl_data *td,
29   - unsigned long delay);
30 28  
31   -struct throtl_rb_root {
32   - struct rb_root rb;
33   - struct rb_node *left;
34   - unsigned int count;
35   - unsigned long min_disptime;
  29 +/*
  30 + * To implement hierarchical throttling, throtl_grps form a tree and bios
  31 + * are dispatched upwards level by level until they reach the top and get
  32 + * issued. When dispatching bios from the children and local group at each
  33 + * level, if the bios are dispatched into a single bio_list, there's a risk
  34 + * of a local or child group which can queue many bios at once filling up
  35 + * the list starving others.
  36 + *
  37 + * To avoid such starvation, dispatched bios are queued separately
  38 + * according to where they came from. When they are again dispatched to
  39 + * the parent, they're popped in round-robin order so that no single source
  40 + * hogs the dispatch window.
  41 + *
  42 + * throtl_qnode is used to keep the queued bios separated by their sources.
  43 + * Bios are queued to throtl_qnode which in turn is queued to
  44 + * throtl_service_queue and then dispatched in round-robin order.
  45 + *
  46 + * It's also used to track the reference counts on blkg's. A qnode always
  47 + * belongs to a throtl_grp and gets queued on itself or the parent, so
  48 + * incrementing the reference of the associated throtl_grp when a qnode is
  49 + * queued and decrementing when dequeued is enough to keep the whole blkg
  50 + * tree pinned while bios are in flight.
  51 + */
  52 +struct throtl_qnode {
  53 + struct list_head node; /* service_queue->queued[] */
  54 + struct bio_list bios; /* queued bios */
  55 + struct throtl_grp *tg; /* tg this qnode belongs to */
36 56 };
37 57  
38   -#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
39   - .count = 0, .min_disptime = 0}
  58 +struct throtl_service_queue {
  59 + struct throtl_service_queue *parent_sq; /* the parent service_queue */
40 60  
  61 + /*
  62 + * Bios queued directly to this service_queue or dispatched from
  63 + * children throtl_grp's.
  64 + */
  65 + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
  66 + unsigned int nr_queued[2]; /* number of queued bios */
  67 +
  68 + /*
  69 + * RB tree of active children throtl_grp's, which are sorted by
  70 + * their ->disptime.
  71 + */
  72 + struct rb_root pending_tree; /* RB tree of active tgs */
  73 + struct rb_node *first_pending; /* first node in the tree */
  74 + unsigned int nr_pending; /* # queued in the tree */
  75 + unsigned long first_pending_disptime; /* disptime of the first tg */
  76 + struct timer_list pending_timer; /* fires on first_pending_disptime */
  77 +};
  78 +
  79 +enum tg_state_flags {
  80 + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
  81 + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
  82 +};
  83 +
41 84 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 85  
43 86 /* Per-cpu group stats */
44 87  
45 88  
... ... @@ -52,10 +95,27 @@
52 95 /* must be the first member */
53 96 struct blkg_policy_data pd;
54 97  
55   - /* active throtl group service_tree member */
  98 + /* active throtl group service_queue member */
56 99 struct rb_node rb_node;
57 100  
  101 + /* throtl_data this group belongs to */
  102 + struct throtl_data *td;
  103 +
  104 + /* this group's service queue */
  105 + struct throtl_service_queue service_queue;
  106 +
58 107 /*
  108 + * qnode_on_self is used when bios are directly queued to this
  109 + * throtl_grp so that local bios compete fairly with bios
  110 + * dispatched from children. qnode_on_parent is used when bios are
  111 + * dispatched from this throtl_grp into its parent and will compete
  112 + * with the sibling qnode_on_parents and the parent's
  113 + * qnode_on_self.
  114 + */
  115 + struct throtl_qnode qnode_on_self[2];
  116 + struct throtl_qnode qnode_on_parent[2];
  117 +
  118 + /*
59 119 * Dispatch time in jiffies. This is the estimated time when group
60 120 * will unthrottle and is ready to dispatch more bio. It is used as
61 121 * key to sort active groups in service tree.
62 122  
... ... @@ -64,12 +124,9 @@
64 124  
65 125 unsigned int flags;
66 126  
67   - /* Two lists for READ and WRITE */
68   - struct bio_list bio_lists[2];
  127 + /* are there any throtl rules between this group and td? */
  128 + bool has_rules[2];
69 129  
70   - /* Number of queued bios on READ and WRITE lists */
71   - unsigned int nr_queued[2];
72   -
73 130 /* bytes per second rate limits */
74 131 uint64_t bps[2];
75 132  
... ... @@ -85,9 +142,6 @@
85 142 unsigned long slice_start[2];
86 143 unsigned long slice_end[2];
87 144  
88   - /* Some throttle limits got updated for the group */
89   - int limits_changed;
90   -
91 145 /* Per cpu stats pointer */
92 146 struct tg_stats_cpu __percpu *stats_cpu;
93 147  
... ... @@ -98,7 +152,7 @@
98 152 struct throtl_data
99 153 {
100 154 /* service tree for active throtl groups */
101   - struct throtl_rb_root tg_service_tree;
  155 + struct throtl_service_queue service_queue;
102 156  
103 157 struct request_queue *queue;
104 158  
... ... @@ -111,9 +165,7 @@
111 165 unsigned int nr_undestroyed_grps;
112 166  
113 167 /* Work for dispatching throttled bios */
114   - struct delayed_work throtl_work;
115   -
116   - int limits_changed;
  168 + struct work_struct dispatch_work;
117 169 };
118 170  
119 171 /* list and work item to allocate percpu group stats */
... ... @@ -123,6 +175,8 @@
123 175 static void tg_stats_alloc_fn(struct work_struct *);
124 176 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 177  
  178 +static void throtl_pending_timer_fn(unsigned long arg);
  179 +
126 180 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127 181 {
128 182 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129 183  
130 184  
131 185  
132 186  
... ... @@ -143,41 +197,65 @@
143 197 return blkg_to_tg(td->queue->root_blkg);
144 198 }
145 199  
146   -enum tg_state_flags {
147   - THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
148   -};
149   -
150   -#define THROTL_TG_FNS(name) \
151   -static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
152   -{ \
153   - (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
154   -} \
155   -static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
156   -{ \
157   - (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
158   -} \
159   -static inline int throtl_tg_##name(const struct throtl_grp *tg) \
160   -{ \
161   - return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
  200 +/**
  201 + * sq_to_tg - return the throl_grp the specified service queue belongs to
  202 + * @sq: the throtl_service_queue of interest
  203 + *
  204 + * Return the throtl_grp @sq belongs to. If @sq is the top-level one
  205 + * embedded in throtl_data, %NULL is returned.
  206 + */
  207 +static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  208 +{
  209 + if (sq && sq->parent_sq)
  210 + return container_of(sq, struct throtl_grp, service_queue);
  211 + else
  212 + return NULL;
162 213 }
163 214  
164   -THROTL_TG_FNS(on_rr);
  215 +/**
  216 + * sq_to_td - return throtl_data the specified service queue belongs to
  217 + * @sq: the throtl_service_queue of interest
  218 + *
  219 + * A service_queue can be embeded in either a throtl_grp or throtl_data.
  220 + * Determine the associated throtl_data accordingly and return it.
  221 + */
  222 +static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  223 +{
  224 + struct throtl_grp *tg = sq_to_tg(sq);
165 225  
166   -#define throtl_log_tg(td, tg, fmt, args...) do { \
167   - char __pbuf[128]; \
  226 + if (tg)
  227 + return tg->td;
  228 + else
  229 + return container_of(sq, struct throtl_data, service_queue);
  230 +}
  231 +
  232 +/**
  233 + * throtl_log - log debug message via blktrace
  234 + * @sq: the service_queue being reported
  235 + * @fmt: printf format string
  236 + * @args: printf args
  237 + *
  238 + * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
  239 + * throtl_grp; otherwise, just "throtl".
  240 + *
  241 + * TODO: this should be made a function and name formatting should happen
  242 + * after testing whether blktrace is enabled.
  243 + */
  244 +#define throtl_log(sq, fmt, args...) do { \
  245 + struct throtl_grp *__tg = sq_to_tg((sq)); \
  246 + struct throtl_data *__td = sq_to_td((sq)); \
168 247 \
169   - blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170   - blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
  248 + (void)__td; \
  249 + if ((__tg)) { \
  250 + char __pbuf[128]; \
  251 + \
  252 + blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
  253 + blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
  254 + } else { \
  255 + blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
  256 + } \
171 257 } while (0)
172 258  
173   -#define throtl_log(td, fmt, args...) \
174   - blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175   -
176   -static inline unsigned int total_nr_queued(struct throtl_data *td)
177   -{
178   - return td->nr_queued[0] + td->nr_queued[1];
179   -}
180   -
181 259 /*
182 260 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 261 * system_wq once there are some groups on the alloc_list waiting for
184 262  
185 263  
186 264  
187 265  
... ... @@ -215,15 +293,141 @@
215 293 goto alloc_stats;
216 294 }
217 295  
  296 +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  297 +{
  298 + INIT_LIST_HEAD(&qn->node);
  299 + bio_list_init(&qn->bios);
  300 + qn->tg = tg;
  301 +}
  302 +
  303 +/**
  304 + * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
  305 + * @bio: bio being added
  306 + * @qn: qnode to add bio to
  307 + * @queued: the service_queue->queued[] list @qn belongs to
  308 + *
  309 + * Add @bio to @qn and put @qn on @queued if it's not already on.
  310 + * @qn->tg's reference count is bumped when @qn is activated. See the
  311 + * comment on top of throtl_qnode definition for details.
  312 + */
  313 +static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  314 + struct list_head *queued)
  315 +{
  316 + bio_list_add(&qn->bios, bio);
  317 + if (list_empty(&qn->node)) {
  318 + list_add_tail(&qn->node, queued);
  319 + blkg_get(tg_to_blkg(qn->tg));
  320 + }
  321 +}
  322 +
  323 +/**
  324 + * throtl_peek_queued - peek the first bio on a qnode list
  325 + * @queued: the qnode list to peek
  326 + */
  327 +static struct bio *throtl_peek_queued(struct list_head *queued)
  328 +{
  329 + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  330 + struct bio *bio;
  331 +
  332 + if (list_empty(queued))
  333 + return NULL;
  334 +
  335 + bio = bio_list_peek(&qn->bios);
  336 + WARN_ON_ONCE(!bio);
  337 + return bio;
  338 +}
  339 +
  340 +/**
  341 + * throtl_pop_queued - pop the first bio form a qnode list
  342 + * @queued: the qnode list to pop a bio from
  343 + * @tg_to_put: optional out argument for throtl_grp to put
  344 + *
  345 + * Pop the first bio from the qnode list @queued. After popping, the first
  346 + * qnode is removed from @queued if empty or moved to the end of @queued so
  347 + * that the popping order is round-robin.
  348 + *
  349 + * When the first qnode is removed, its associated throtl_grp should be put
  350 + * too. If @tg_to_put is NULL, this function automatically puts it;
  351 + * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
  352 + * responsible for putting it.
  353 + */
  354 +static struct bio *throtl_pop_queued(struct list_head *queued,
  355 + struct throtl_grp **tg_to_put)
  356 +{
  357 + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  358 + struct bio *bio;
  359 +
  360 + if (list_empty(queued))
  361 + return NULL;
  362 +
  363 + bio = bio_list_pop(&qn->bios);
  364 + WARN_ON_ONCE(!bio);
  365 +
  366 + if (bio_list_empty(&qn->bios)) {
  367 + list_del_init(&qn->node);
  368 + if (tg_to_put)
  369 + *tg_to_put = qn->tg;
  370 + else
  371 + blkg_put(tg_to_blkg(qn->tg));
  372 + } else {
  373 + list_move_tail(&qn->node, queued);
  374 + }
  375 +
  376 + return bio;
  377 +}
  378 +
  379 +/* init a service_queue, assumes the caller zeroed it */
  380 +static void throtl_service_queue_init(struct throtl_service_queue *sq,
  381 + struct throtl_service_queue *parent_sq)
  382 +{
  383 + INIT_LIST_HEAD(&sq->queued[0]);
  384 + INIT_LIST_HEAD(&sq->queued[1]);
  385 + sq->pending_tree = RB_ROOT;
  386 + sq->parent_sq = parent_sq;
  387 + setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
  388 + (unsigned long)sq);
  389 +}
  390 +
  391 +static void throtl_service_queue_exit(struct throtl_service_queue *sq)
  392 +{
  393 + del_timer_sync(&sq->pending_timer);
  394 +}
  395 +
218 396 static void throtl_pd_init(struct blkcg_gq *blkg)
219 397 {
220 398 struct throtl_grp *tg = blkg_to_tg(blkg);
  399 + struct throtl_data *td = blkg->q->td;
  400 + struct throtl_service_queue *parent_sq;
221 401 unsigned long flags;
  402 + int rw;
222 403  
  404 + /*
  405 + * If sane_hierarchy is enabled, we switch to properly hierarchical
  406 + * behavior where limits on a given throtl_grp are applied to the
  407 + * whole subtree rather than just the group itself. e.g. If 16M
  408 + * read_bps limit is set on the root group, the whole system can't
  409 + * exceed 16M for the device.
  410 + *
  411 + * If sane_hierarchy is not enabled, the broken flat hierarchy
  412 + * behavior is retained where all throtl_grps are treated as if
  413 + * they're all separate root groups right below throtl_data.
  414 + * Limits of a group don't interact with limits of other groups
  415 + * regardless of the position of the group in the hierarchy.
  416 + */
  417 + parent_sq = &td->service_queue;
  418 +
  419 + if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
  420 + parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
  421 +
  422 + throtl_service_queue_init(&tg->service_queue, parent_sq);
  423 +
  424 + for (rw = READ; rw <= WRITE; rw++) {
  425 + throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  426 + throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  427 + }
  428 +
223 429 RB_CLEAR_NODE(&tg->rb_node);
224   - bio_list_init(&tg->bio_lists[0]);
225   - bio_list_init(&tg->bio_lists[1]);
226   - tg->limits_changed = false;
  430 + tg->td = td;
227 431  
228 432 tg->bps[READ] = -1;
229 433 tg->bps[WRITE] = -1;
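
The dispatch fairness provided by the qnode machinery above can be seen with a tiny user-space model (simplified types, no locking, hypothetical names; an array stands in for the service_queue->queued[] list): bios from different sources sit on separate qnodes, and popping one bio rotates that source to the tail, so a source with many queued bios cannot starve its siblings.

    /*
     * Toy model of throtl_pop_queued() round-robin: "self" has many bios
     * queued, "child" has a few, yet dispatch alternates between them.
     */
    #include <stdio.h>

    struct qnode { const char *src; int bios[8]; int nr, head; };

    int main(void)
    {
            struct qnode self  = { "self",  { 1, 2, 3, 4 }, 4, 0 };
            struct qnode child = { "child", { 5, 6 },       2, 0 };
            struct qnode *queued[2] = { &self, &child };
            int nqn = 2;

            /* pop one bio, then move the qnode to the tail (or drop it
             * when empty) so the sources alternate fairly */
            while (nqn) {
                    struct qnode *qn = queued[0];
                    int i;

                    printf("dispatch bio %d from %s\n",
                           qn->bios[qn->head++], qn->src);

                    for (i = 1; i < nqn; i++)       /* rotate the rest up */
                            queued[i - 1] = queued[i];
                    if (qn->head < qn->nr)
                            queued[nqn - 1] = qn;   /* move to tail */
                    else
                            nqn--;                  /* empty: drop it */
            }
            return 0;
    }

Running the model dispatches 1, 5, 2, 6, 3, 4: the two sources interleave even though "self" queued twice as many bios.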
... ... @@ -241,6 +445,30 @@
241 445 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242 446 }
243 447  
  448 +/*
  449 + * Set has_rules[] if @tg or any of its parents have limits configured.
  450 + * This doesn't require walking up to the top of the hierarchy as the
  451 + * parent's has_rules[] is guaranteed to be correct.
  452 + */
  453 +static void tg_update_has_rules(struct throtl_grp *tg)
  454 +{
  455 + struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
  456 + int rw;
  457 +
  458 + for (rw = READ; rw <= WRITE; rw++)
  459 + tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
  460 + (tg->bps[rw] != -1 || tg->iops[rw] != -1);
  461 +}
  462 +
  463 +static void throtl_pd_online(struct blkcg_gq *blkg)
  464 +{
  465 + /*
  466 + * We don't want new groups to escape the limits of its ancestors.
  467 + * Update has_rules[] after a new group is brought online.
  468 + */
  469 + tg_update_has_rules(blkg_to_tg(blkg));
  470 +}
  471 +
244 472 static void throtl_pd_exit(struct blkcg_gq *blkg)
245 473 {
246 474 struct throtl_grp *tg = blkg_to_tg(blkg);
... ... @@ -251,6 +479,8 @@
251 479 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
252 480  
253 481 free_percpu(tg->stats_cpu);
  482 +
  483 + throtl_service_queue_exit(&tg->service_queue);
254 484 }
255 485  
256 486 static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
257 487  
258 488  
259 489  
... ... @@ -309,17 +539,18 @@
309 539 return tg;
310 540 }
311 541  
312   -static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
  542 +static struct throtl_grp *
  543 +throtl_rb_first(struct throtl_service_queue *parent_sq)
313 544 {
314 545 /* Service tree is empty */
315   - if (!root->count)
  546 + if (!parent_sq->nr_pending)
316 547 return NULL;
317 548  
318   - if (!root->left)
319   - root->left = rb_first(&root->rb);
  549 + if (!parent_sq->first_pending)
  550 + parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
320 551  
321   - if (root->left)
322   - return rb_entry_tg(root->left);
  552 + if (parent_sq->first_pending)
  553 + return rb_entry_tg(parent_sq->first_pending);
323 554  
324 555 return NULL;
325 556 }
326 557  
327 558  
328 559  
329 560  
330 561  
331 562  
... ... @@ -330,29 +561,30 @@
330 561 RB_CLEAR_NODE(n);
331 562 }
332 563  
333   -static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
  564 +static void throtl_rb_erase(struct rb_node *n,
  565 + struct throtl_service_queue *parent_sq)
334 566 {
335   - if (root->left == n)
336   - root->left = NULL;
337   - rb_erase_init(n, &root->rb);
338   - --root->count;
  567 + if (parent_sq->first_pending == n)
  568 + parent_sq->first_pending = NULL;
  569 + rb_erase_init(n, &parent_sq->pending_tree);
  570 + --parent_sq->nr_pending;
339 571 }
340 572  
341   -static void update_min_dispatch_time(struct throtl_rb_root *st)
  573 +static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
342 574 {
343 575 struct throtl_grp *tg;
344 576  
345   - tg = throtl_rb_first(st);
  577 + tg = throtl_rb_first(parent_sq);
346 578 if (!tg)
347 579 return;
348 580  
349   - st->min_disptime = tg->disptime;
  581 + parent_sq->first_pending_disptime = tg->disptime;
350 582 }
351 583  
352   -static void
353   -tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
  584 +static void tg_service_queue_add(struct throtl_grp *tg)
354 585 {
355   - struct rb_node **node = &st->rb.rb_node;
  586 + struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
  587 + struct rb_node **node = &parent_sq->pending_tree.rb_node;
356 588 struct rb_node *parent = NULL;
357 589 struct throtl_grp *__tg;
358 590 unsigned long key = tg->disptime;
359 591  
360 592  
361 593  
362 594  
363 595  
364 596  
365 597  
366 598  
367 599  
368 600  
369 601  
370 602  
371 603  
372 604  
373 605  
374 606  
375 607  
376 608  
377 609  
378 610  
379 611  
380 612  
... ... @@ -371,89 +603,135 @@
371 603 }
372 604  
373 605 if (left)
374   - st->left = &tg->rb_node;
  606 + parent_sq->first_pending = &tg->rb_node;
375 607  
376 608 rb_link_node(&tg->rb_node, parent, node);
377   - rb_insert_color(&tg->rb_node, &st->rb);
  609 + rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
378 610 }
379 611  
380   -static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
  612 +static void __throtl_enqueue_tg(struct throtl_grp *tg)
381 613 {
382   - struct throtl_rb_root *st = &td->tg_service_tree;
  614 + tg_service_queue_add(tg);
  615 + tg->flags |= THROTL_TG_PENDING;
  616 + tg->service_queue.parent_sq->nr_pending++;
  617 +}
383 618  
384   - tg_service_tree_add(st, tg);
385   - throtl_mark_tg_on_rr(tg);
386   - st->count++;
  619 +static void throtl_enqueue_tg(struct throtl_grp *tg)
  620 +{
  621 + if (!(tg->flags & THROTL_TG_PENDING))
  622 + __throtl_enqueue_tg(tg);
387 623 }
388 624  
389   -static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
  625 +static void __throtl_dequeue_tg(struct throtl_grp *tg)
390 626 {
391   - if (!throtl_tg_on_rr(tg))
392   - __throtl_enqueue_tg(td, tg);
  627 + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
  628 + tg->flags &= ~THROTL_TG_PENDING;
393 629 }
394 630  
395   -static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
  631 +static void throtl_dequeue_tg(struct throtl_grp *tg)
396 632 {
397   - throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
398   - throtl_clear_tg_on_rr(tg);
  633 + if (tg->flags & THROTL_TG_PENDING)
  634 + __throtl_dequeue_tg(tg);
399 635 }
400 636  
401   -static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
  637 +/* Call with queue lock held */
  638 +static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  639 + unsigned long expires)
402 640 {
403   - if (throtl_tg_on_rr(tg))
404   - __throtl_dequeue_tg(td, tg);
  641 + mod_timer(&sq->pending_timer, expires);
  642 + throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  643 + expires - jiffies, jiffies);
405 644 }
406 645  
407   -static void throtl_schedule_next_dispatch(struct throtl_data *td)
  646 +/**
  647 + * throtl_schedule_next_dispatch - schedule the next dispatch cycle
  648 + * @sq: the service_queue to schedule dispatch for
  649 + * @force: force scheduling
  650 + *
  651 + * Arm @sq->pending_timer so that the next dispatch cycle starts on the
  652 + * dispatch time of the first pending child. Returns %true if either timer
  653 + * is armed or there's no pending child left. %false if the current
  654 + * dispatch window is still open and the caller should continue
  655 + * dispatching.
  656 + *
  657 + * If @force is %true, the dispatch timer is always scheduled and this
  658 + * function is guaranteed to return %true. This is to be used when the
  659 + * caller can't dispatch itself and needs to invoke pending_timer
  660 + * unconditionally. Note that forced scheduling is likely to induce short
  661 + * delay before dispatch starts even if @sq->first_pending_disptime is not
  662 + * in the future and thus shouldn't be used in hot paths.
  663 + */
  664 +static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  665 + bool force)
408 666 {
409   - struct throtl_rb_root *st = &td->tg_service_tree;
  667 + /* any pending children left? */
  668 + if (!sq->nr_pending)
  669 + return true;
410 670  
411   - /*
412   - * If there are more bios pending, schedule more work.
413   - */
414   - if (!total_nr_queued(td))
415   - return;
  671 + update_min_dispatch_time(sq);
416 672  
417   - BUG_ON(!st->count);
  673 + /* is the next dispatch time in the future? */
  674 + if (force || time_after(sq->first_pending_disptime, jiffies)) {
  675 + throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  676 + return true;
  677 + }
418 678  
419   - update_min_dispatch_time(st);
  679 + /* tell the caller to continue dispatching */
  680 + return false;
  681 +}
420 682  
421   - if (time_before_eq(st->min_disptime, jiffies))
422   - throtl_schedule_delayed_work(td, 0);
423   - else
424   - throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
  683 +static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  684 + bool rw, unsigned long start)
  685 +{
  686 + tg->bytes_disp[rw] = 0;
  687 + tg->io_disp[rw] = 0;
  688 +
  689 + /*
  690 + * Previous slice has expired. We must have trimmed it after last
  691 + * bio dispatch. That means since start of last slice, we never used
  692 + * that bandwidth. Do try to make use of that bandwidth while giving
  693 + * credit.
  694 + */
  695 + if (time_after_eq(start, tg->slice_start[rw]))
  696 + tg->slice_start[rw] = start;
  697 +
  698 + tg->slice_end[rw] = jiffies + throtl_slice;
  699 + throtl_log(&tg->service_queue,
  700 + "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  701 + rw == READ ? 'R' : 'W', tg->slice_start[rw],
  702 + tg->slice_end[rw], jiffies);
425 703 }
426 704  
427   -static inline void
428   -throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  705 +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
429 706 {
430 707 tg->bytes_disp[rw] = 0;
431 708 tg->io_disp[rw] = 0;
432 709 tg->slice_start[rw] = jiffies;
433 710 tg->slice_end[rw] = jiffies + throtl_slice;
434   - throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
435   - rw == READ ? 'R' : 'W', tg->slice_start[rw],
436   - tg->slice_end[rw], jiffies);
  711 + throtl_log(&tg->service_queue,
  712 + "[%c] new slice start=%lu end=%lu jiffies=%lu",
  713 + rw == READ ? 'R' : 'W', tg->slice_start[rw],
  714 + tg->slice_end[rw], jiffies);
437 715 }
438 716  
439   -static inline void throtl_set_slice_end(struct throtl_data *td,
440   - struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
  717 +static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  718 + unsigned long jiffy_end)
441 719 {
442 720 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
443 721 }
444 722  
445   -static inline void throtl_extend_slice(struct throtl_data *td,
446   - struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
  723 +static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  724 + unsigned long jiffy_end)
447 725 {
448 726 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
449   - throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
450   - rw == READ ? 'R' : 'W', tg->slice_start[rw],
451   - tg->slice_end[rw], jiffies);
  727 + throtl_log(&tg->service_queue,
  728 + "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  729 + rw == READ ? 'R' : 'W', tg->slice_start[rw],
  730 + tg->slice_end[rw], jiffies);
452 731 }
453 732  
454 733 /* Determine if previously allocated or extended slice is complete or not */
455   -static bool
456   -throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  734 +static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
457 735 {
458 736 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
459 737 return 0;
... ... @@ -462,8 +740,7 @@
462 740 }
463 741  
464 742 /* Trim the used slices and adjust slice start accordingly */
465   -static inline void
466   -throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  743 +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
467 744 {
468 745 unsigned long nr_slices, time_elapsed, io_trim;
469 746 u64 bytes_trim, tmp;
... ... @@ -475,7 +752,7 @@
475 752 * renewed. Don't try to trim the slice if slice is used. A new
476 753 * slice will start when appropriate.
477 754 */
478   - if (throtl_slice_used(td, tg, rw))
  755 + if (throtl_slice_used(tg, rw))
479 756 return;
480 757  
481 758 /*
... ... @@ -486,7 +763,7 @@
486 763 * is bad because it does not allow new slice to start.
487 764 */
488 765  
489   - throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
  766 + throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
490 767  
491 768 time_elapsed = jiffies - tg->slice_start[rw];
492 769  
493 770  
... ... @@ -515,14 +792,14 @@
515 792  
516 793 tg->slice_start[rw] += nr_slices * throtl_slice;
517 794  
518   - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
519   - " start=%lu end=%lu jiffies=%lu",
520   - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
521   - tg->slice_start[rw], tg->slice_end[rw], jiffies);
  795 + throtl_log(&tg->service_queue,
  796 + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
  797 + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  798 + tg->slice_start[rw], tg->slice_end[rw], jiffies);
522 799 }
523 800  
524   -static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
525   - struct bio *bio, unsigned long *wait)
  801 +static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
  802 + unsigned long *wait)
526 803 {
527 804 bool rw = bio_data_dir(bio);
528 805 unsigned int io_allowed;
... ... @@ -571,8 +848,8 @@
571 848 return 0;
572 849 }
573 850  
574   -static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
575   - struct bio *bio, unsigned long *wait)
  851 +static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
  852 + unsigned long *wait)
576 853 {
577 854 bool rw = bio_data_dir(bio);
578 855 u64 bytes_allowed, extra_bytes, tmp;
579 856  
... ... @@ -613,18 +890,12 @@
613 890 return 0;
614 891 }
615 892  
616   -static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
617   - if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
618   - return 1;
619   - return 0;
620   -}
621   -
622 893 /*
623 894 * Returns whether one can dispatch a bio or not. Also returns approx number
624 895 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
625 896 */
626   -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
627   - struct bio *bio, unsigned long *wait)
  897 +static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  898 + unsigned long *wait)
628 899 {
629 900 bool rw = bio_data_dir(bio);
630 901 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
... ... @@ -635,7 +906,8 @@
635 906 * this function with a different bio if there are other bios
636 907 * queued.
637 908 */
638   - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
  909 + BUG_ON(tg->service_queue.nr_queued[rw] &&
  910 + bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
639 911  
640 912 /* If tg->bps = -1, then BW is unlimited */
641 913 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
642 914  
643 915  
... ... @@ -649,15 +921,15 @@
649 921 * existing slice to make sure it is at least throtl_slice interval
650 922 * long since now.
651 923 */
652   - if (throtl_slice_used(td, tg, rw))
653   - throtl_start_new_slice(td, tg, rw);
  924 + if (throtl_slice_used(tg, rw))
  925 + throtl_start_new_slice(tg, rw);
654 926 else {
655 927 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
656   - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
  928 + throtl_extend_slice(tg, rw, jiffies + throtl_slice);
657 929 }
658 930  
659   - if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
660   - && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
  931 + if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
  932 + tg_with_in_iops_limit(tg, bio, &iops_wait)) {
661 933 if (wait)
662 934 *wait = 0;
663 935 return 1;
... ... @@ -669,7 +941,7 @@
669 941 *wait = max_wait;
670 942  
671 943 if (time_before(tg->slice_end[rw], jiffies + max_wait))
672   - throtl_extend_slice(td, tg, rw, jiffies + max_wait);
  944 + throtl_extend_slice(tg, rw, jiffies + max_wait);
673 945  
674 946 return 0;
675 947 }
676 948  
677 949  
678 950  
679 951  
680 952  
681 953  
682 954  
683 955  
684 956  
685 957  
686 958  
687 959  
688 960  
689 961  
690 962  
691 963  
692 964  
693 965  
... ... @@ -708,65 +980,136 @@
708 980 tg->bytes_disp[rw] += bio->bi_size;
709 981 tg->io_disp[rw]++;
710 982  
711   - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
  983 + /*
  984 + * REQ_THROTTLED is used to prevent the same bio to be throttled
  985 + * more than once as a throttled bio will go through blk-throtl the
  986 + * second time when it eventually gets issued. Set it when a bio
  987 + * is being charged to a tg.
  988 + *
  989 + * Dispatch stats aren't recursive and each @bio should only be
  990 + * accounted by the @tg it was originally associated with. Let's
  991 + * update the stats when setting REQ_THROTTLED for the first time
  992 + * which is guaranteed to be for the @bio's original tg.
  993 + */
  994 + if (!(bio->bi_rw & REQ_THROTTLED)) {
  995 + bio->bi_rw |= REQ_THROTTLED;
  996 + throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
  997 + bio->bi_rw);
  998 + }
712 999 }
713 1000  
714   -static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
715   - struct bio *bio)
  1001 +/**
  1002 + * throtl_add_bio_tg - add a bio to the specified throtl_grp
  1003 + * @bio: bio to add
  1004 + * @qn: qnode to use
  1005 + * @tg: the target throtl_grp
  1006 + *
  1007 + * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
  1008 + * tg->qnode_on_self[] is used.
  1009 + */
  1010 +static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  1011 + struct throtl_grp *tg)
716 1012 {
  1013 + struct throtl_service_queue *sq = &tg->service_queue;
717 1014 bool rw = bio_data_dir(bio);
718 1015  
719   - bio_list_add(&tg->bio_lists[rw], bio);
720   - /* Take a bio reference on tg */
721   - blkg_get(tg_to_blkg(tg));
722   - tg->nr_queued[rw]++;
723   - td->nr_queued[rw]++;
724   - throtl_enqueue_tg(td, tg);
  1016 + if (!qn)
  1017 + qn = &tg->qnode_on_self[rw];
  1018 +
  1019 + /*
  1020 + * If @tg doesn't currently have any bios queued in the same
  1021 + * direction, queueing @bio can change when @tg should be
  1022 + * dispatched. Mark that @tg was empty. This is automatically
  1023 + * cleaered on the next tg_update_disptime().
  1024 + */
  1025 + if (!sq->nr_queued[rw])
  1026 + tg->flags |= THROTL_TG_WAS_EMPTY;
  1027 +
  1028 + throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
  1029 +
  1030 + sq->nr_queued[rw]++;
  1031 + throtl_enqueue_tg(tg);
725 1032 }
726 1033  
727   -static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
  1034 +static void tg_update_disptime(struct throtl_grp *tg)
728 1035 {
  1036 + struct throtl_service_queue *sq = &tg->service_queue;
729 1037 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
730 1038 struct bio *bio;
731 1039  
732   - if ((bio = bio_list_peek(&tg->bio_lists[READ])))
733   - tg_may_dispatch(td, tg, bio, &read_wait);
  1040 + if ((bio = throtl_peek_queued(&sq->queued[READ])))
  1041 + tg_may_dispatch(tg, bio, &read_wait);
734 1042  
735   - if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
736   - tg_may_dispatch(td, tg, bio, &write_wait);
  1043 + if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
  1044 + tg_may_dispatch(tg, bio, &write_wait);
737 1045  
738 1046 min_wait = min(read_wait, write_wait);
739 1047 disptime = jiffies + min_wait;
740 1048  
741 1049 /* Update dispatch time */
742   - throtl_dequeue_tg(td, tg);
  1050 + throtl_dequeue_tg(tg);
743 1051 tg->disptime = disptime;
744   - throtl_enqueue_tg(td, tg);
  1052 + throtl_enqueue_tg(tg);
  1053 +
  1054 + /* see throtl_add_bio_tg() */
  1055 + tg->flags &= ~THROTL_TG_WAS_EMPTY;
745 1056 }
746 1057  
747   -static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
748   - bool rw, struct bio_list *bl)
  1058 +static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  1059 + struct throtl_grp *parent_tg, bool rw)
749 1060 {
750   - struct bio *bio;
  1061 + if (throtl_slice_used(parent_tg, rw)) {
  1062 + throtl_start_new_slice_with_credit(parent_tg, rw,
  1063 + child_tg->slice_start[rw]);
  1064 + }
751 1065  
752   - bio = bio_list_pop(&tg->bio_lists[rw]);
753   - tg->nr_queued[rw]--;
754   - /* Drop bio reference on blkg */
755   - blkg_put(tg_to_blkg(tg));
  1066 +}
756 1067  
757   - BUG_ON(td->nr_queued[rw] <= 0);
758   - td->nr_queued[rw]--;
  1068 +static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  1069 +{
  1070 + struct throtl_service_queue *sq = &tg->service_queue;
  1071 + struct throtl_service_queue *parent_sq = sq->parent_sq;
  1072 + struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  1073 + struct throtl_grp *tg_to_put = NULL;
  1074 + struct bio *bio;
759 1075  
  1076 + /*
  1077 + * @bio is being transferred from @tg to @parent_sq. Popping a bio
  1078 + * from @tg may put its reference and @parent_sq might end up
  1079 + * getting released prematurely. Remember the tg to put and put it
  1080 + * after @bio is transferred to @parent_sq.
  1081 + */
  1082 + bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
  1083 + sq->nr_queued[rw]--;
  1084 +
760 1085 throtl_charge_bio(tg, bio);
761   - bio_list_add(bl, bio);
762   - bio->bi_rw |= REQ_THROTTLED;
763 1086  
764   - throtl_trim_slice(td, tg, rw);
  1087 + /*
  1088 + * If our parent is another tg, we just need to transfer @bio to
  1089 + * the parent using throtl_add_bio_tg(). If our parent is
  1090 + * @td->service_queue, @bio is ready to be issued. Put it on its
  1091 + * bio_lists[] and decrease total number queued. The caller is
  1092 + * responsible for issuing these bios.
  1093 + */
  1094 + if (parent_tg) {
  1095 + throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
  1096 + start_parent_slice_with_credit(tg, parent_tg, rw);
  1097 + } else {
  1098 + throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
  1099 + &parent_sq->queued[rw]);
  1100 + BUG_ON(tg->td->nr_queued[rw] <= 0);
  1101 + tg->td->nr_queued[rw]--;
  1102 + }
  1103 +
  1104 + throtl_trim_slice(tg, rw);
  1105 +
  1106 + if (tg_to_put)
  1107 + blkg_put(tg_to_blkg(tg_to_put));
765 1108 }
766 1109  
767   -static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
768   - struct bio_list *bl)
  1110 +static int throtl_dispatch_tg(struct throtl_grp *tg)
769 1111 {
  1112 + struct throtl_service_queue *sq = &tg->service_queue;
770 1113 unsigned int nr_reads = 0, nr_writes = 0;
771 1114 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
772 1115 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
773 1116  
774 1117  
775 1118  
... ... @@ -774,20 +1117,20 @@
774 1117  
775 1118 /* Try to dispatch 75% READS and 25% WRITES */
776 1119  
777   - while ((bio = bio_list_peek(&tg->bio_lists[READ]))
778   - && tg_may_dispatch(td, tg, bio, NULL)) {
  1120 + while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
  1121 + tg_may_dispatch(tg, bio, NULL)) {
779 1122  
780   - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
  1123 + tg_dispatch_one_bio(tg, bio_data_dir(bio));
781 1124 nr_reads++;
782 1125  
783 1126 if (nr_reads >= max_nr_reads)
784 1127 break;
785 1128 }
786 1129  
787   - while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
788   - && tg_may_dispatch(td, tg, bio, NULL)) {
  1130 + while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
  1131 + tg_may_dispatch(tg, bio, NULL)) {
789 1132  
790   - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
  1133 + tg_dispatch_one_bio(tg, bio_data_dir(bio));
791 1134 nr_writes++;
792 1135  
793 1136 if (nr_writes >= max_nr_writes)
794 1137  
795 1138  
... ... @@ -797,14 +1140,13 @@
797 1140 return nr_reads + nr_writes;
798 1141 }
799 1142  
800   -static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
  1143 +static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
801 1144 {
802 1145 unsigned int nr_disp = 0;
803   - struct throtl_grp *tg;
804   - struct throtl_rb_root *st = &td->tg_service_tree;
805 1146  
806 1147 while (1) {
807   - tg = throtl_rb_first(st);
  1148 + struct throtl_grp *tg = throtl_rb_first(parent_sq);
  1149 + struct throtl_service_queue *sq = &tg->service_queue;
808 1150  
809 1151 if (!tg)
810 1152 break;
811 1153  
812 1154  
... ... @@ -812,14 +1154,12 @@
812 1154 if (time_before(jiffies, tg->disptime))
813 1155 break;
814 1156  
815   - throtl_dequeue_tg(td, tg);
  1157 + throtl_dequeue_tg(tg);
816 1158  
817   - nr_disp += throtl_dispatch_tg(td, tg, bl);
  1159 + nr_disp += throtl_dispatch_tg(tg);
818 1160  
819   - if (tg->nr_queued[0] || tg->nr_queued[1]) {
820   - tg_update_disptime(td, tg);
821   - throtl_enqueue_tg(td, tg);
822   - }
  1161 + if (sq->nr_queued[0] || sq->nr_queued[1])
  1162 + tg_update_disptime(tg);
823 1163  
824 1164 if (nr_disp >= throtl_quantum)
825 1165 break;
826 1166  
827 1167  
828 1168  
829 1169  
830 1170  
831 1171  
832 1172  
833 1173  
834 1174  
835 1175  
836 1176  
837 1177  
838 1178  
839 1179  
840 1180  
841 1181  
842 1182  
843 1183  
... ... @@ -828,113 +1168,113 @@
828 1168 return nr_disp;
829 1169 }
830 1170  
831   -static void throtl_process_limit_change(struct throtl_data *td)
  1171 +/**
  1172 + * throtl_pending_timer_fn - timer function for service_queue->pending_timer
  1173 + * @arg: the throtl_service_queue being serviced
  1174 + *
  1175 + * This timer is armed when a child throtl_grp with active bio's become
  1176 + * pending and queued on the service_queue's pending_tree and expires when
  1177 + * the first child throtl_grp should be dispatched. This function
  1178 + * dispatches bio's from the children throtl_grps to the parent
  1179 + * service_queue.
  1180 + *
  1181 + * If the parent's parent is another throtl_grp, dispatching is propagated
  1182 + * by either arming its pending_timer or repeating dispatch directly. If
  1183 + * the top-level service_tree is reached, throtl_data->dispatch_work is
  1184 + * kicked so that the ready bio's are issued.
  1185 + */
  1186 +static void throtl_pending_timer_fn(unsigned long arg)
832 1187 {
  1188 + struct throtl_service_queue *sq = (void *)arg;
  1189 + struct throtl_grp *tg = sq_to_tg(sq);
  1190 + struct throtl_data *td = sq_to_td(sq);
833 1191 struct request_queue *q = td->queue;
834   - struct blkcg_gq *blkg, *n;
  1192 + struct throtl_service_queue *parent_sq;
  1193 + bool dispatched;
  1194 + int ret;
835 1195  
836   - if (!td->limits_changed)
837   - return;
  1196 + spin_lock_irq(q->queue_lock);
  1197 +again:
  1198 + parent_sq = sq->parent_sq;
  1199 + dispatched = false;
838 1200  
839   - xchg(&td->limits_changed, false);
  1201 + while (true) {
  1202 + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
  1203 + sq->nr_queued[READ] + sq->nr_queued[WRITE],
  1204 + sq->nr_queued[READ], sq->nr_queued[WRITE]);
840 1205  
841   - throtl_log(td, "limits changed");
  1206 + ret = throtl_select_dispatch(sq);
  1207 + if (ret) {
  1208 + throtl_log(sq, "bios disp=%u", ret);
  1209 + dispatched = true;
  1210 + }
842 1211  
843   - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
844   - struct throtl_grp *tg = blkg_to_tg(blkg);
  1212 + if (throtl_schedule_next_dispatch(sq, false))
  1213 + break;
845 1214  
846   - if (!tg->limits_changed)
847   - continue;
  1215 + /* this dispatch windows is still open, relax and repeat */
  1216 + spin_unlock_irq(q->queue_lock);
  1217 + cpu_relax();
  1218 + spin_lock_irq(q->queue_lock);
  1219 + }
848 1220  
849   - if (!xchg(&tg->limits_changed, false))
850   - continue;
  1221 + if (!dispatched)
  1222 + goto out_unlock;
851 1223  
852   - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
853   - " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
854   - tg->iops[READ], tg->iops[WRITE]);
855   -
856   - /*
857   - * Restart the slices for both READ and WRITES. It
858   - * might happen that a group's limit are dropped
859   - * suddenly and we don't want to account recently
860   - * dispatched IO with new low rate
861   - */
862   - throtl_start_new_slice(td, tg, 0);
863   - throtl_start_new_slice(td, tg, 1);
864   -
865   - if (throtl_tg_on_rr(tg))
866   - tg_update_disptime(td, tg);
  1224 + if (parent_sq) {
  1225 + /* @parent_sq is another throl_grp, propagate dispatch */
  1226 + if (tg->flags & THROTL_TG_WAS_EMPTY) {
  1227 + tg_update_disptime(tg);
  1228 + if (!throtl_schedule_next_dispatch(parent_sq, false)) {
  1229 + /* window is already open, repeat dispatching */
  1230 + sq = parent_sq;
  1231 + tg = sq_to_tg(sq);
  1232 + goto again;
  1233 + }
  1234 + }
  1235 + } else {
  1236 + /* reached the top-level, queue issueing */
  1237 + queue_work(kthrotld_workqueue, &td->dispatch_work);
867 1238 }
  1239 +out_unlock:
  1240 + spin_unlock_irq(q->queue_lock);
868 1241 }
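
To make the propagation described above concrete, here is a minimal userspace model, not kernel code: each struct level stands in for a throtl_grp/throtl_service_queue pair, and the names level, timer_fn and issue_ready_bios are invented purely for illustration.

/*
 * Minimal sketch of the upward dispatch propagation: bios queued at a
 * level are moved one level up per "timer expiry"; once the top-level
 * service queue is reached, a worker issues them.
 */
#include <stdio.h>

struct level {
        int pending;            /* bios queued at this level */
        struct level *parent;   /* NULL for the top-level service queue */
};

/* stand-in for blk_throtl_dispatch_work_fn(): issue everything at the root */
static void issue_ready_bios(struct level *root)
{
        printf("worker: issuing %d bio(s)\n", root->pending);
        root->pending = 0;
}

/* stand-in for one pending_timer expiry on @lv */
static void timer_fn(struct level *lv)
{
        while (lv->parent) {
                /* dispatch: move queued bios one level up */
                lv->parent->pending += lv->pending;
                printf("moved %d bio(s) up one level\n", lv->pending);
                lv->pending = 0;

                /*
                 * The kernel either arms the parent's pending_timer or
                 * repeats the dispatch directly; this sketch just repeats.
                 */
                lv = lv->parent;
        }
        /* reached the top-level service queue: kick the issue worker */
        issue_ready_bios(lv);
}

int main(void)
{
        struct level td = { 0, NULL };
        struct level parent = { 0, &td };
        struct level leaf = { 4, &parent };

        timer_fn(&leaf);
        return 0;
}
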
869 1242  
870   -/* Dispatch throttled bios. Should be called without queue lock held. */
871   -static int throtl_dispatch(struct request_queue *q)
  1243 +/**
  1244 + * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
  1245 + * @work: work item being executed
  1246 + *
   1247 + * This function is queued for execution when bio's reach the queued[] lists
  1248 + * of throtl_data->service_queue. Those bio's are ready and issued by this
  1249 + * function.
  1250 + */
  1251 +void blk_throtl_dispatch_work_fn(struct work_struct *work)
872 1252 {
873   - struct throtl_data *td = q->td;
874   - unsigned int nr_disp = 0;
  1253 + struct throtl_data *td = container_of(work, struct throtl_data,
  1254 + dispatch_work);
  1255 + struct throtl_service_queue *td_sq = &td->service_queue;
  1256 + struct request_queue *q = td->queue;
875 1257 struct bio_list bio_list_on_stack;
876 1258 struct bio *bio;
877 1259 struct blk_plug plug;
  1260 + int rw;
878 1261  
879   - spin_lock_irq(q->queue_lock);
880   -
881   - throtl_process_limit_change(td);
882   -
883   - if (!total_nr_queued(td))
884   - goto out;
885   -
886 1262 bio_list_init(&bio_list_on_stack);
887 1263  
888   - throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
889   - total_nr_queued(td), td->nr_queued[READ],
890   - td->nr_queued[WRITE]);
891   -
892   - nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
893   -
894   - if (nr_disp)
895   - throtl_log(td, "bios disp=%u", nr_disp);
896   -
897   - throtl_schedule_next_dispatch(td);
898   -out:
  1264 + spin_lock_irq(q->queue_lock);
  1265 + for (rw = READ; rw <= WRITE; rw++)
  1266 + while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
  1267 + bio_list_add(&bio_list_on_stack, bio);
899 1268 spin_unlock_irq(q->queue_lock);
900 1269  
901   - /*
902   - * If we dispatched some requests, unplug the queue to make sure
903   - * immediate dispatch
904   - */
905   - if (nr_disp) {
  1270 + if (!bio_list_empty(&bio_list_on_stack)) {
906 1271 blk_start_plug(&plug);
907 1272 while((bio = bio_list_pop(&bio_list_on_stack)))
908 1273 generic_make_request(bio);
909 1274 blk_finish_plug(&plug);
910 1275 }
911   - return nr_disp;
912 1276 }
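
The work function above follows a common pattern: splice everything queued onto an on-stack list while holding the queue lock, then issue with the lock dropped. A small userspace sketch of that pattern follows; the node type and issue() helper are invented, a pthread mutex stands in for queue_lock, and the kernel version additionally wraps the submissions in a blk_plug.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        int id;
        struct node *next;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *queued;             /* protected by queue_lock */

static void issue(struct node *n)
{
        printf("issuing bio %d\n", n->id);
        free(n);
}

static void dispatch_work_fn(void)
{
        struct node *on_stack = NULL;

        /* splice everything onto a private list while holding the lock */
        pthread_mutex_lock(&queue_lock);
        on_stack = queued;
        queued = NULL;
        pthread_mutex_unlock(&queue_lock);

        /* issue with the lock dropped so submission can't deadlock on it */
        while (on_stack) {
                struct node *n = on_stack;

                on_stack = n->next;
                issue(n);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));

                n->id = i;
                n->next = queued;
                queued = n;
        }
        dispatch_work_fn();
        return 0;
}
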
913 1277  
914   -void blk_throtl_work(struct work_struct *work)
915   -{
916   - struct throtl_data *td = container_of(work, struct throtl_data,
917   - throtl_work.work);
918   - struct request_queue *q = td->queue;
919   -
920   - throtl_dispatch(q);
921   -}
922   -
923   -/* Call with queue lock held */
924   -static void
925   -throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
926   -{
927   -
928   - struct delayed_work *dwork = &td->throtl_work;
929   -
930   - /* schedule work if limits changed even if no bio is queued */
931   - if (total_nr_queued(td) || td->limits_changed) {
932   - mod_delayed_work(kthrotld_workqueue, dwork, delay);
933   - throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934   - delay, jiffies);
935   - }
936   -}
937   -
938 1278 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
939 1279 struct blkg_policy_data *pd, int off)
940 1280 {
... ... @@ -1007,7 +1347,9 @@
1007 1347 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1008 1348 struct blkg_conf_ctx ctx;
1009 1349 struct throtl_grp *tg;
1010   - struct throtl_data *td;
  1350 + struct throtl_service_queue *sq;
  1351 + struct blkcg_gq *blkg;
  1352 + struct cgroup *pos_cgrp;
1011 1353 int ret;
1012 1354  
1013 1355 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
... ... @@ -1015,7 +1357,7 @@
1015 1357 return ret;
1016 1358  
1017 1359 tg = blkg_to_tg(ctx.blkg);
1018   - td = ctx.blkg->q->td;
  1360 + sq = &tg->service_queue;
1019 1361  
1020 1362 if (!ctx.v)
1021 1363 ctx.v = -1;
1022 1364  
... ... @@ -1025,11 +1367,38 @@
1025 1367 else
1026 1368 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027 1369  
1028   - /* XXX: we don't need the following deferred processing */
1029   - xchg(&tg->limits_changed, true);
1030   - xchg(&td->limits_changed, true);
1031   - throtl_schedule_delayed_work(td, 0);
  1370 + throtl_log(&tg->service_queue,
  1371 + "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
  1372 + tg->bps[READ], tg->bps[WRITE],
  1373 + tg->iops[READ], tg->iops[WRITE]);
1032 1374  
  1375 + /*
  1376 + * Update has_rules[] flags for the updated tg's subtree. A tg is
  1377 + * considered to have rules if either the tg itself or any of its
  1378 + * ancestors has rules. This identifies groups without any
  1379 + * restrictions in the whole hierarchy and allows them to bypass
  1380 + * blk-throttle.
  1381 + */
  1382 + tg_update_has_rules(tg);
  1383 + blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
  1384 + tg_update_has_rules(blkg_to_tg(blkg));
  1385 +
  1386 + /*
  1387 + * We're already holding queue_lock and know @tg is valid. Let's
  1388 + * apply the new config directly.
  1389 + *
   1390 + * Restart the slices for both READ and WRITE. It might happen
   1391 + * that a group's limits are dropped suddenly and we don't want to
   1392 + * account recently dispatched IO against the new, lower rate.
  1393 + */
  1394 + throtl_start_new_slice(tg, 0);
  1395 + throtl_start_new_slice(tg, 1);
  1396 +
  1397 + if (tg->flags & THROTL_TG_PENDING) {
  1398 + tg_update_disptime(tg);
  1399 + throtl_schedule_next_dispatch(sq->parent_sq, true);
  1400 + }
  1401 +
1033 1402 blkg_conf_finish(&ctx);
1034 1403 return 0;
1035 1404 }
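
The has_rules[] comment above boils down to a one-line recurrence: a group has rules for a direction iff it or any ancestor has a limit configured, which is why the update runs in pre-order (parents before descendants). Below is a hedged userspace sketch with a simplified tg where 0 means "unlimited" (the kernel uses -1); the struct layout and names are illustrative only.

#include <stdbool.h>
#include <stdio.h>

enum { READ_DIR, WRITE_DIR };

struct tg {
        struct tg *parent;
        unsigned long long bps[2];      /* 0 == unlimited in this sketch */
        unsigned int iops[2];           /* 0 == unlimited in this sketch */
        bool has_rules[2];
};

static void tg_update_has_rules(struct tg *tg)
{
        for (int rw = READ_DIR; rw <= WRITE_DIR; rw++)
                tg->has_rules[rw] =
                        (tg->parent && tg->parent->has_rules[rw]) ||
                        tg->bps[rw] || tg->iops[rw];
}

int main(void)
{
        struct tg root = { 0 }, child = { .parent = &root };

        root.bps[WRITE_DIR] = 1 << 20;          /* 1 MB/s write limit */
        tg_update_has_rules(&root);
        tg_update_has_rules(&child);            /* pre-order: parent first */

        printf("child has_rules: read=%d write=%d\n",
               child.has_rules[READ_DIR], child.has_rules[WRITE_DIR]);
        return 0;
}
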
... ... @@ -1092,7 +1461,7 @@
1092 1461 {
1093 1462 struct throtl_data *td = q->td;
1094 1463  
1095   - cancel_delayed_work_sync(&td->throtl_work);
  1464 + cancel_work_sync(&td->dispatch_work);
1096 1465 }
1097 1466  
1098 1467 static struct blkcg_policy blkcg_policy_throtl = {
... ... @@ -1100,6 +1469,7 @@
1100 1469 .cftypes = throtl_files,
1101 1470  
1102 1471 .pd_init_fn = throtl_pd_init,
  1472 + .pd_online_fn = throtl_pd_online,
1103 1473 .pd_exit_fn = throtl_pd_exit,
1104 1474 .pd_reset_stats_fn = throtl_pd_reset_stats,
1105 1475 };
1106 1476  
1107 1477  
1108 1478  
... ... @@ -1107,15 +1477,16 @@
1107 1477 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1108 1478 {
1109 1479 struct throtl_data *td = q->td;
  1480 + struct throtl_qnode *qn = NULL;
1110 1481 struct throtl_grp *tg;
1111   - bool rw = bio_data_dir(bio), update_disptime = true;
  1482 + struct throtl_service_queue *sq;
  1483 + bool rw = bio_data_dir(bio);
1112 1484 struct blkcg *blkcg;
1113 1485 bool throttled = false;
1114 1486  
1115   - if (bio->bi_rw & REQ_THROTTLED) {
1116   - bio->bi_rw &= ~REQ_THROTTLED;
  1487 + /* see throtl_charge_bio() */
  1488 + if (bio->bi_rw & REQ_THROTTLED)
1117 1489 goto out;
1118   - }
1119 1490  
1120 1491 /*
1121 1492 * A throtl_grp pointer retrieved under rcu can be used to access
... ... @@ -1126,7 +1497,7 @@
1126 1497 blkcg = bio_blkcg(bio);
1127 1498 tg = throtl_lookup_tg(td, blkcg);
1128 1499 if (tg) {
1129   - if (tg_no_rule_group(tg, rw)) {
  1500 + if (!tg->has_rules[rw]) {
1130 1501 throtl_update_dispatch_stats(tg_to_blkg(tg),
1131 1502 bio->bi_size, bio->bi_rw);
1132 1503 goto out_unlock_rcu;
1133 1504  
1134 1505  
... ... @@ -1142,18 +1513,18 @@
1142 1513 if (unlikely(!tg))
1143 1514 goto out_unlock;
1144 1515  
1145   - if (tg->nr_queued[rw]) {
1146   - /*
1147   - * There is already another bio queued in same dir. No
1148   - * need to update dispatch time.
1149   - */
1150   - update_disptime = false;
1151   - goto queue_bio;
  1516 + sq = &tg->service_queue;
1152 1517  
1153   - }
  1518 + while (true) {
   1519 + /* throtl is FIFO - if bios are already queued, this one must queue too */
  1520 + if (sq->nr_queued[rw])
  1521 + break;
1154 1522  
1155   - /* Bio is with-in rate limit of group */
1156   - if (tg_may_dispatch(td, tg, bio, NULL)) {
  1523 + /* if above limits, break to queue */
  1524 + if (!tg_may_dispatch(tg, bio, NULL))
  1525 + break;
  1526 +
  1527 + /* within limits, let's charge and dispatch directly */
1157 1528 throtl_charge_bio(tg, bio);
1158 1529  
1159 1530 /*
1160 1531  
1161 1532  
1162 1533  
... ... @@ -1167,25 +1538,41 @@
1167 1538 *
1168 1539 * So keep on trimming slice even if bio is not queued.
1169 1540 */
1170   - throtl_trim_slice(td, tg, rw);
1171   - goto out_unlock;
  1541 + throtl_trim_slice(tg, rw);
  1542 +
  1543 + /*
  1544 + * @bio passed through this layer without being throttled.
  1545 + * Climb up the ladder. If we''re already at the top, it
  1546 + * can be executed directly.
  1547 + */
  1548 + qn = &tg->qnode_on_parent[rw];
  1549 + sq = sq->parent_sq;
  1550 + tg = sq_to_tg(sq);
  1551 + if (!tg)
  1552 + goto out_unlock;
1172 1553 }
1173 1554  
1174   -queue_bio:
1175   - throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1176   - " iodisp=%u iops=%u queued=%d/%d",
1177   - rw == READ ? 'R' : 'W',
1178   - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1179   - tg->io_disp[rw], tg->iops[rw],
1180   - tg->nr_queued[READ], tg->nr_queued[WRITE]);
  1555 + /* out-of-limit, queue to @tg */
  1556 + throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
  1557 + rw == READ ? 'R' : 'W',
  1558 + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
  1559 + tg->io_disp[rw], tg->iops[rw],
  1560 + sq->nr_queued[READ], sq->nr_queued[WRITE]);
1181 1561  
1182 1562 bio_associate_current(bio);
1183   - throtl_add_bio_tg(q->td, tg, bio);
  1563 + tg->td->nr_queued[rw]++;
  1564 + throtl_add_bio_tg(bio, qn, tg);
1184 1565 throttled = true;
1185 1566  
1186   - if (update_disptime) {
1187   - tg_update_disptime(td, tg);
1188   - throtl_schedule_next_dispatch(td);
  1567 + /*
  1568 + * Update @tg's dispatch time and force schedule dispatch if @tg
  1569 + * was empty before @bio. The forced scheduling isn't likely to
  1570 + * cause undue delay as @bio is likely to be dispatched directly if
  1571 + * its @tg's disptime is not in the future.
  1572 + */
  1573 + if (tg->flags & THROTL_TG_WAS_EMPTY) {
  1574 + tg_update_disptime(tg);
  1575 + throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
1189 1576 }
1190 1577  
1191 1578 out_unlock:
1192 1579  
... ... @@ -1193,9 +1580,38 @@
1193 1580 out_unlock_rcu:
1194 1581 rcu_read_unlock();
1195 1582 out:
  1583 + /*
  1584 + * As multiple blk-throtls may stack in the same issue path, we
  1585 + * don't want bios to leave with the flag set. Clear the flag if
  1586 + * being issued.
  1587 + */
  1588 + if (!throttled)
  1589 + bio->bi_rw &= ~REQ_THROTTLED;
1196 1590 return throttled;
1197 1591 }
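
The climb in blk_throtl_bio() above can be modelled in a few lines of userspace C: a bio charges every level whose budget it fits within and is queued at the first level that is FIFO-blocked or over its limit; only if it clears the top-level group is it issued directly. All names and the budget accounting below are simplified stand-ins, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct tg {
        struct tg *parent;              /* NULL above the top-level group */
        unsigned long long budget;      /* bytes still allowed this slice */
        unsigned long long bytes_disp;
        int nr_queued;
};

static bool tg_may_dispatch(struct tg *tg, unsigned long long size)
{
        return tg->bytes_disp + size <= tg->budget;
}

/* returns the group the bio must be queued on, or NULL if it may be issued */
static struct tg *throttle_bio(struct tg *tg, unsigned long long size)
{
        while (tg) {
                /* FIFO: anything already queued here means we queue too */
                if (tg->nr_queued || !tg_may_dispatch(tg, size))
                        return tg;

                tg->bytes_disp += size;         /* charge this level */
                tg = tg->parent;                /* and climb the ladder */
        }
        return NULL;                            /* passed the top, issue it */
}

int main(void)
{
        struct tg parent = { .budget = 1024 };
        struct tg leaf = { .parent = &parent, .budget = 4096 };
        struct tg *stop = throttle_bio(&leaf, 2048);

        if (stop)
                printf("queued at level with budget %llu\n", stop->budget);
        else
                printf("issued directly\n");
        return 0;
}
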
1198 1592  
  1593 +/*
  1594 + * Dispatch all bios from all children tg's queued on @parent_sq. On
  1595 + * return, @parent_sq is guaranteed to not have any active children tg's
   1596 + * and all bios from previously active tg's are on @parent_sq->queued[].
  1597 + */
  1598 +static void tg_drain_bios(struct throtl_service_queue *parent_sq)
  1599 +{
  1600 + struct throtl_grp *tg;
  1601 +
  1602 + while ((tg = throtl_rb_first(parent_sq))) {
  1603 + struct throtl_service_queue *sq = &tg->service_queue;
  1604 + struct bio *bio;
  1605 +
  1606 + throtl_dequeue_tg(tg);
  1607 +
  1608 + while ((bio = throtl_peek_queued(&sq->queued[READ])))
  1609 + tg_dispatch_one_bio(tg, bio_data_dir(bio));
  1610 + while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
  1611 + tg_dispatch_one_bio(tg, bio_data_dir(bio));
  1612 + }
  1613 +}
  1614 +
1199 1615 /**
1200 1616 * blk_throtl_drain - drain throttled bios
1201 1617 * @q: request_queue to drain throttled bios for
1202 1618  
1203 1619  
1204 1620  
1205 1621  
1206 1622  
1207 1623  
... ... @@ -1206,27 +1622,36 @@
1206 1622 __releases(q->queue_lock) __acquires(q->queue_lock)
1207 1623 {
1208 1624 struct throtl_data *td = q->td;
1209   - struct throtl_rb_root *st = &td->tg_service_tree;
1210   - struct throtl_grp *tg;
1211   - struct bio_list bl;
  1625 + struct blkcg_gq *blkg;
  1626 + struct cgroup *pos_cgrp;
1212 1627 struct bio *bio;
  1628 + int rw;
1213 1629  
1214 1630 queue_lockdep_assert_held(q);
  1631 + rcu_read_lock();
1215 1632  
1216   - bio_list_init(&bl);
  1633 + /*
   1634 + * Drain each tg while doing a post-order walk on the blkg tree, so
   1635 + * that all bios are propagated to td->service_queue. It'd be
   1636 + * better to walk the service_queue tree directly, but the blkg
   1637 + * walk is easier.
  1638 + */
  1639 + blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
  1640 + tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
1217 1641  
1218   - while ((tg = throtl_rb_first(st))) {
1219   - throtl_dequeue_tg(td, tg);
  1642 + tg_drain_bios(&td_root_tg(td)->service_queue);
1220 1643  
1221   - while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1222   - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223   - while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224   - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225   - }
  1644 + /* finally, transfer bios from top-level tg's into the td */
  1645 + tg_drain_bios(&td->service_queue);
  1646 +
  1647 + rcu_read_unlock();
1226 1648 spin_unlock_irq(q->queue_lock);
1227 1649  
1228   - while ((bio = bio_list_pop(&bl)))
1229   - generic_make_request(bio);
  1650 + /* all bios now should be in td->service_queue, issue them */
  1651 + for (rw = READ; rw <= WRITE; rw++)
  1652 + while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
  1653 + NULL)))
  1654 + generic_make_request(bio);
1230 1655  
1231 1656 spin_lock_irq(q->queue_lock);
1232 1657 }
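
The post-order walk matters because draining a child before its parent guarantees that, by the time a group is visited, everything below it has already been pushed into it, so all bios end up at the root. A tiny userspace sketch of that invariant; the tree shape and the nr_queued counter are invented for the example.

#include <stdio.h>

struct sq {
        struct sq *children[2];         /* tiny fixed-fanout tree */
        struct sq *parent;
        int nr_queued;
};

static void drain_post_order(struct sq *sq)
{
        for (int i = 0; i < 2; i++)
                if (sq->children[i])
                        drain_post_order(sq->children[i]);

        if (sq->parent) {               /* tg_drain_bios() equivalent */
                sq->parent->nr_queued += sq->nr_queued;
                sq->nr_queued = 0;
        }
}

int main(void)
{
        struct sq root = { 0 };
        struct sq a = { .parent = &root, .nr_queued = 2 };
        struct sq b = { .parent = &a, .nr_queued = 3 };

        root.children[0] = &a;
        a.children[0] = &b;

        drain_post_order(&root);
        printf("root now holds %d queued bio(s)\n", root.nr_queued);   /* 5 */
        return 0;
}
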
... ... @@ -1240,9 +1665,8 @@
1240 1665 if (!td)
1241 1666 return -ENOMEM;
1242 1667  
1243   - td->tg_service_tree = THROTL_RB_ROOT;
1244   - td->limits_changed = false;
1245   - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
  1668 + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
  1669 + throtl_service_queue_init(&td->service_queue, NULL);
1246 1670  
1247 1671 q->td = td;
1248 1672 td->queue = q;
... ... @@ -4347,18 +4347,28 @@
4347 4347 kfree(cfqd);
4348 4348 }
4349 4349  
4350   -static int cfq_init_queue(struct request_queue *q)
  4350 +static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4351 4351 {
4352 4352 struct cfq_data *cfqd;
4353 4353 struct blkcg_gq *blkg __maybe_unused;
4354 4354 int i, ret;
  4355 + struct elevator_queue *eq;
4355 4356  
  4357 + eq = elevator_alloc(q, e);
  4358 + if (!eq)
  4359 + return -ENOMEM;
  4360 +
4356 4361 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4357   - if (!cfqd)
  4362 + if (!cfqd) {
  4363 + kobject_put(&eq->kobj);
4358 4364 return -ENOMEM;
  4365 + }
  4366 + eq->elevator_data = cfqd;
4359 4367  
4360 4368 cfqd->queue = q;
4361   - q->elevator->elevator_data = cfqd;
  4369 + spin_lock_irq(q->queue_lock);
  4370 + q->elevator = eq;
  4371 + spin_unlock_irq(q->queue_lock);
4362 4372  
4363 4373 /* Init root service tree */
4364 4374 cfqd->grp_service_tree = CFQ_RB_ROOT;
... ... @@ -4433,6 +4443,7 @@
4433 4443  
4434 4444 out_free:
4435 4445 kfree(cfqd);
  4446 + kobject_put(&eq->kobj);
4436 4447 return ret;
4437 4448 }
4438 4449  
block/deadline-iosched.c
... ... @@ -337,13 +337,21 @@
337 337 /*
338 338 * initialize elevator private data (deadline_data).
339 339 */
340   -static int deadline_init_queue(struct request_queue *q)
  340 +static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
341 341 {
342 342 struct deadline_data *dd;
  343 + struct elevator_queue *eq;
343 344  
  345 + eq = elevator_alloc(q, e);
  346 + if (!eq)
  347 + return -ENOMEM;
  348 +
344 349 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345   - if (!dd)
  350 + if (!dd) {
  351 + kobject_put(&eq->kobj);
346 352 return -ENOMEM;
  353 + }
  354 + eq->elevator_data = dd;
347 355  
348 356 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 357 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
... ... @@ -355,7 +363,9 @@
355 363 dd->front_merges = 1;
356 364 dd->fifo_batch = fifo_batch;
357 365  
358   - q->elevator->elevator_data = dd;
  366 + spin_lock_irq(q->queue_lock);
  367 + q->elevator = eq;
  368 + spin_unlock_irq(q->queue_lock);
359 369 return 0;
360 370 }
361 371  
... ... @@ -150,7 +150,7 @@
150 150  
151 151 static struct kobj_type elv_ktype;
152 152  
153   -static struct elevator_queue *elevator_alloc(struct request_queue *q,
  153 +struct elevator_queue *elevator_alloc(struct request_queue *q,
154 154 struct elevator_type *e)
155 155 {
156 156 struct elevator_queue *eq;
... ... @@ -170,6 +170,7 @@
170 170 elevator_put(e);
171 171 return NULL;
172 172 }
  173 +EXPORT_SYMBOL(elevator_alloc);
173 174  
174 175 static void elevator_release(struct kobject *kobj)
175 176 {
... ... @@ -221,16 +222,7 @@
221 222 }
222 223 }
223 224  
224   - q->elevator = elevator_alloc(q, e);
225   - if (!q->elevator)
226   - return -ENOMEM;
227   -
228   - err = e->ops.elevator_init_fn(q);
229   - if (err) {
230   - kobject_put(&q->elevator->kobj);
231   - return err;
232   - }
233   -
  225 + err = e->ops.elevator_init_fn(q, e);
234 226 return 0;
235 227 }
236 228 EXPORT_SYMBOL(elevator_init);
237 229  
... ... @@ -935,16 +927,9 @@
935 927 spin_unlock_irq(q->queue_lock);
936 928  
937 929 /* allocate, init and register new elevator */
938   - err = -ENOMEM;
939   - q->elevator = elevator_alloc(q, new_e);
940   - if (!q->elevator)
  930 + err = new_e->ops.elevator_init_fn(q, new_e);
  931 + if (err)
941 932 goto fail_init;
942   -
943   - err = new_e->ops.elevator_init_fn(q);
944   - if (err) {
945   - kobject_put(&q->elevator->kobj);
946   - goto fail_init;
947   - }
948 933  
949 934 if (registered) {
950 935 err = elv_register_queue(q);
block/noop-iosched.c
... ... @@ -59,16 +59,27 @@
59 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60 60 }
61 61  
62   -static int noop_init_queue(struct request_queue *q)
  62 +static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63 63 {
64 64 struct noop_data *nd;
  65 + struct elevator_queue *eq;
65 66  
  67 + eq = elevator_alloc(q, e);
  68 + if (!eq)
  69 + return -ENOMEM;
  70 +
66 71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67   - if (!nd)
  72 + if (!nd) {
  73 + kobject_put(&eq->kobj);
68 74 return -ENOMEM;
  75 + }
  76 + eq->elevator_data = nd;
69 77  
70 78 INIT_LIST_HEAD(&nd->queue);
71   - q->elevator->elevator_data = nd;
  79 +
  80 + spin_lock_irq(q->queue_lock);
  81 + q->elevator = eq;
  82 + spin_unlock_irq(q->queue_lock);
72 83 return 0;
73 84 }
74 85  
... ... @@ -58,17 +58,24 @@
58 58 struct backing_dev_info *dst)
59 59 {
60 60 struct backing_dev_info *old = inode->i_data.backing_dev_info;
  61 + bool wakeup_bdi = false;
61 62  
62 63 if (unlikely(dst == old)) /* deadlock avoidance */
63 64 return;
64 65 bdi_lock_two(&old->wb, &dst->wb);
65 66 spin_lock(&inode->i_lock);
66 67 inode->i_data.backing_dev_info = dst;
67   - if (inode->i_state & I_DIRTY)
  68 + if (inode->i_state & I_DIRTY) {
  69 + if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
  70 + wakeup_bdi = true;
68 71 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
  72 + }
69 73 spin_unlock(&inode->i_lock);
70 74 spin_unlock(&old->wb.list_lock);
71 75 spin_unlock(&dst->wb.list_lock);
  76 +
  77 + if (wakeup_bdi)
  78 + bdi_wakeup_thread_delayed(dst);
72 79 }
73 80  
74 81 /* Kill _all_ buffers and pagecache , dirty or not.. */
include/linux/cgroup.h
... ... @@ -278,6 +278,8 @@
278 278 *
279 279 * - memcg: use_hierarchy is on by default and the cgroup file for
280 280 * the flag is not created.
  281 + *
  282 + * - blkcg: blk-throttle becomes properly hierarchical.
281 283 */
282 284 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
283 285  
include/linux/elevator.h
... ... @@ -7,6 +7,7 @@
7 7 #ifdef CONFIG_BLOCK
8 8  
9 9 struct io_cq;
  10 +struct elevator_type;
10 11  
11 12 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
12 13 struct bio *);
... ... @@ -35,7 +36,8 @@
35 36 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
36 37 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
37 38  
38   -typedef int (elevator_init_fn) (struct request_queue *);
  39 +typedef int (elevator_init_fn) (struct request_queue *,
  40 + struct elevator_type *e);
39 41 typedef void (elevator_exit_fn) (struct elevator_queue *);
40 42  
41 43 struct elevator_ops
... ... @@ -155,6 +157,8 @@
155 157 extern void elevator_exit(struct elevator_queue *);
156 158 extern int elevator_change(struct request_queue *, const char *);
157 159 extern bool elv_rq_merge_ok(struct request *, struct bio *);
  160 +extern struct elevator_queue *elevator_alloc(struct request_queue *,
  161 + struct elevator_type *);
158 162  
159 163 /*
160 164 * Helper functions.