Commit 36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b
Exists in smarc-imx_3.14.28_1.0.0_ga and in 1 other branch
Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block
Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags. But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan. Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun. This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow. Hence I will be awol the remainder of this week,
  but back at work again on Monday the 15th. CC'ing Tejun, since any
  potential "surprises" will most likely be from the blk-cgroup work.
  But it's been brewing for a while and sitting in my tree and
  linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
Showing 12 changed files
Documentation/cgroups/blkio-controller.txt
... | ... | @@ -94,32 +94,33 @@ |
94 | 94 | |
95 | 95 | Hierarchical Cgroups |
96 | 96 | ==================== |
97 | -- Currently only CFQ supports hierarchical groups. For throttling, | |
98 | - cgroup interface does allow creation of hierarchical cgroups and | |
99 | - internally it treats them as flat hierarchy. | |
100 | 97 | |
101 | - If somebody created a hierarchy like as follows. | |
98 | +Both CFQ and throttling implement hierarchy support; however, | |
99 | +throttling's hierarchy support is enabled iff "sane_behavior" is | |
100 | +enabled from cgroup side, which currently is a development option and | |
101 | +not publicly available. | |
102 | 102 | |
103 | +If somebody created a hierarchy like as follows. | |
104 | + | |
103 | 105 | root |
104 | 106 | / \ |
105 | 107 | test1 test2 |
106 | 108 | | |
107 | 109 | test3 |
108 | 110 | |
109 | - CFQ will handle the hierarchy correctly but and throttling will | |
110 | - practically treat all groups at same level. For details on CFQ | |
111 | - hierarchy support, refer to Documentation/block/cfq-iosched.txt. | |
112 | - Throttling will treat the hierarchy as if it looks like the | |
113 | - following. | |
111 | +CFQ by default and throttling with "sane_behavior" will handle the | |
112 | +hierarchy correctly. For details on CFQ hierarchy support, refer to | |
113 | +Documentation/block/cfq-iosched.txt. For throttling, all limits apply | |
114 | +to the whole subtree while all statistics are local to the IOs | |
115 | +directly generated by tasks in that cgroup. | |
114 | 116 | |
117 | +Throttling without "sane_behavior" enabled from cgroup side will | |
118 | +practically treat all groups at same level as if it looks like the | |
119 | +following. | |
120 | + | |
115 | 121 | pivot |
116 | 122 | / / \ \ |
117 | 123 | root test1 test2 test3 |
118 | - | |
119 | - Nesting cgroups, while allowed, isn't officially supported and blkio | |
120 | - genereates warning when cgroups nest. Once throttling implements | |
121 | - hierarchy support, hierarchy will be supported and the warning will | |
122 | - be removed. | |
123 | 124 | |
124 | 125 | Various user visible config options |
125 | 126 | =================================== |
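The documentation hunk above distinguishes two semantics: with "sane_behavior", a group can never exceed any ancestor's limit, while the legacy flat behavior only honors the group's own limit. The following is a rough user-space sketch of that accounting rule only; the struct, function names, and the 16M figure (taken from the throtl_pd_init() comment further down in this commit) are invented for illustration and are not kernel code.

/* Illustrative model of hierarchical vs. flat throttling semantics. */
#include <stdio.h>
#include <stdint.h>

struct group {
	const char *name;
	uint64_t read_bps;		/* bytes/s, 0 == no limit */
	const struct group *parent;
};

/* sane_behavior: the effective cap is the tightest limit on the path
 * from the group up to the root */
static uint64_t effective_cap_hierarchical(const struct group *g)
{
	uint64_t cap = 0;

	for (; g; g = g->parent)
		if (g->read_bps && (!cap || g->read_bps < cap))
			cap = g->read_bps;
	return cap;
}

/* legacy flat behavior: only the group's own limit matters, as if every
 * group sat directly below the "pivot" node in the text above */
static uint64_t effective_cap_flat(const struct group *g)
{
	return g->read_bps;
}

static void print_cap(const char *label, uint64_t cap)
{
	if (cap)
		printf("%s: %llu bytes/s\n", label, (unsigned long long)cap);
	else
		printf("%s: unlimited\n", label);
}

int main(void)
{
	const struct group root  = { "root",  16ULL << 20, NULL };
	const struct group test1 = { "test1", 0,           &root };
	const struct group test3 = { "test3", 0,           &test1 };

	print_cap("test3, hierarchical", effective_cap_hierarchical(&test3));
	print_cap("test3, flat legacy ", effective_cap_flat(&test3));
	return 0;
}

With a 16M read limit on root and no limit on test3, the hierarchical model caps test3 at 16M while the flat model leaves it unlimited, which is exactly the behavioral difference the rewritten documentation describes. Statistics, by contrast, stay local to the cgroup that issued the IO.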
block/blk-cgroup.c
... | ... | @@ -32,26 +32,6 @@ |
32 | 32 | |
33 | 33 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; |
34 | 34 | |
35 | -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | |
36 | - struct request_queue *q, bool update_hint); | |
37 | - | |
38 | -/** | |
39 | - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | |
40 | - * @d_blkg: loop cursor pointing to the current descendant | |
41 | - * @pos_cgrp: used for iteration | |
42 | - * @p_blkg: target blkg to walk descendants of | |
43 | - * | |
44 | - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU | |
45 | - * read locked. If called under either blkcg or queue lock, the iteration | |
46 | - * is guaranteed to include all and only online blkgs. The caller may | |
47 | - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip | |
48 | - * subtree. | |
49 | - */ | |
50 | -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ | |
51 | - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | |
52 | - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | |
53 | - (p_blkg)->q, false))) | |
54 | - | |
55 | 35 | static bool blkcg_policy_enabled(struct request_queue *q, |
56 | 36 | const struct blkcg_policy *pol) |
57 | 37 | { |
58 | 38 | |
... | ... | @@ -71,19 +51,9 @@ |
71 | 51 | if (!blkg) |
72 | 52 | return; |
73 | 53 | |
74 | - for (i = 0; i < BLKCG_MAX_POLS; i++) { | |
75 | - struct blkcg_policy *pol = blkcg_policy[i]; | |
76 | - struct blkg_policy_data *pd = blkg->pd[i]; | |
54 | + for (i = 0; i < BLKCG_MAX_POLS; i++) | |
55 | + kfree(blkg->pd[i]); | |
77 | 56 | |
78 | - if (!pd) | |
79 | - continue; | |
80 | - | |
81 | - if (pol && pol->pd_exit_fn) | |
82 | - pol->pd_exit_fn(blkg); | |
83 | - | |
84 | - kfree(pd); | |
85 | - } | |
86 | - | |
87 | 57 | blk_exit_rl(&blkg->rl); |
88 | 58 | kfree(blkg); |
89 | 59 | } |
... | ... | @@ -134,10 +104,6 @@ |
134 | 104 | blkg->pd[i] = pd; |
135 | 105 | pd->blkg = blkg; |
136 | 106 | pd->plid = i; |
137 | - | |
138 | - /* invoke per-policy init */ | |
139 | - if (pol->pd_init_fn) | |
140 | - pol->pd_init_fn(blkg); | |
141 | 107 | } |
142 | 108 | |
143 | 109 | return blkg; |
... | ... | @@ -158,8 +124,8 @@ |
158 | 124 | * @q's bypass state. If @update_hint is %true, the caller should be |
159 | 125 | * holding @q->queue_lock and lookup hint is updated on success. |
160 | 126 | */ |
161 | -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | |
162 | - struct request_queue *q, bool update_hint) | |
127 | +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | |
128 | + bool update_hint) | |
163 | 129 | { |
164 | 130 | struct blkcg_gq *blkg; |
165 | 131 | |
166 | 132 | |
167 | 133 | |
... | ... | @@ -234,16 +200,25 @@ |
234 | 200 | } |
235 | 201 | blkg = new_blkg; |
236 | 202 | |
237 | - /* link parent and insert */ | |
203 | + /* link parent */ | |
238 | 204 | if (blkcg_parent(blkcg)) { |
239 | 205 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); |
240 | 206 | if (WARN_ON_ONCE(!blkg->parent)) { |
241 | - blkg = ERR_PTR(-EINVAL); | |
207 | + ret = -EINVAL; | |
242 | 208 | goto err_put_css; |
243 | 209 | } |
244 | 210 | blkg_get(blkg->parent); |
245 | 211 | } |
246 | 212 | |
213 | + /* invoke per-policy init */ | |
214 | + for (i = 0; i < BLKCG_MAX_POLS; i++) { | |
215 | + struct blkcg_policy *pol = blkcg_policy[i]; | |
216 | + | |
217 | + if (blkg->pd[i] && pol->pd_init_fn) | |
218 | + pol->pd_init_fn(blkg); | |
219 | + } | |
220 | + | |
221 | + /* insert */ | |
247 | 222 | spin_lock(&blkcg->lock); |
248 | 223 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); |
249 | 224 | if (likely(!ret)) { |
250 | 225 | |
251 | 226 | |
252 | 227 | |
253 | 228 | |
254 | 229 | |
255 | 230 | |
... | ... | @@ -394,30 +369,38 @@ |
394 | 369 | q->root_rl.blkg = NULL; |
395 | 370 | } |
396 | 371 | |
397 | -static void blkg_rcu_free(struct rcu_head *rcu_head) | |
372 | +/* | |
373 | + * A group is RCU protected, but having an rcu lock does not mean that one | |
374 | + * can access all the fields of blkg and assume these are valid. For | |
375 | + * example, don't try to follow throtl_data and request queue links. | |
376 | + * | |
377 | + * Having a reference to blkg under an rcu allows accesses to only values | |
378 | + * local to groups like group stats and group rate limits. | |
379 | + */ | |
380 | +void __blkg_release_rcu(struct rcu_head *rcu_head) | |
398 | 381 | { |
399 | - blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); | |
400 | -} | |
382 | + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); | |
383 | + int i; | |
401 | 384 | |
402 | -void __blkg_release(struct blkcg_gq *blkg) | |
403 | -{ | |
385 | + /* tell policies that this one is being freed */ | |
386 | + for (i = 0; i < BLKCG_MAX_POLS; i++) { | |
387 | + struct blkcg_policy *pol = blkcg_policy[i]; | |
388 | + | |
389 | + if (blkg->pd[i] && pol->pd_exit_fn) | |
390 | + pol->pd_exit_fn(blkg); | |
391 | + } | |
392 | + | |
404 | 393 | /* release the blkcg and parent blkg refs this blkg has been holding */ |
405 | 394 | css_put(&blkg->blkcg->css); |
406 | - if (blkg->parent) | |
395 | + if (blkg->parent) { | |
396 | + spin_lock_irq(blkg->q->queue_lock); | |
407 | 397 | blkg_put(blkg->parent); |
398 | + spin_unlock_irq(blkg->q->queue_lock); | |
399 | + } | |
408 | 400 | |
409 | - /* | |
410 | - * A group is freed in rcu manner. But having an rcu lock does not | |
411 | - * mean that one can access all the fields of blkg and assume these | |
412 | - * are valid. For example, don't try to follow throtl_data and | |
413 | - * request queue links. | |
414 | - * | |
415 | - * Having a reference to blkg under an rcu allows acess to only | |
416 | - * values local to groups like group stats and group rate limits | |
417 | - */ | |
418 | - call_rcu(&blkg->rcu_head, blkg_rcu_free); | |
401 | + blkg_free(blkg); | |
419 | 402 | } |
420 | -EXPORT_SYMBOL_GPL(__blkg_release); | |
403 | +EXPORT_SYMBOL_GPL(__blkg_release_rcu); | |
421 | 404 | |
422 | 405 | /* |
423 | 406 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky |
... | ... | @@ -928,14 +911,6 @@ |
928 | 911 | .subsys_id = blkio_subsys_id, |
929 | 912 | .base_cftypes = blkcg_files, |
930 | 913 | .module = THIS_MODULE, |
931 | - | |
932 | - /* | |
933 | - * blkio subsystem is utterly broken in terms of hierarchy support. | |
934 | - * It treats all cgroups equally regardless of where they're | |
935 | - * located in the hierarchy - all cgroups are treated as if they're | |
936 | - * right below the root. Fix it and remove the following. | |
937 | - */ | |
938 | - .broken_hierarchy = true, | |
939 | 914 | }; |
940 | 915 | EXPORT_SYMBOL_GPL(blkio_subsys); |
941 | 916 |
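Two ordering changes stand out in the blk-cgroup.c hunks: per-policy init now runs in blkg_create() only after blkg->parent has been linked (so an init callback may consult parent state, as throtl_pd_init() does below), and pd_exit_fn() now runs from the RCU release path before the blkg is freed. The snippet below is a minimal user-space model of the first point only; the node/policy types and pd_init() are invented stand-ins, not the kernel API.

/* Model of "link parent first, then run per-policy init". */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct node;

struct policy_data {
	int depth;			/* derived from the parent at init time */
};

struct node {
	struct node *parent;
	struct policy_data pd;
};

/* analogous to a pd_init_fn(): requires node->parent to be valid */
static void pd_init(struct node *n)
{
	n->pd.depth = n->parent ? n->parent->pd.depth + 1 : 0;
}

static struct node *node_create(struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));

	assert(n);
	n->parent = parent;	/* 1. link parent          */
	pd_init(n);		/* 2. then per-policy init */
	return n;		/* 3. caller inserts n     */
}

int main(void)
{
	struct node *root = node_create(NULL);
	struct node *child = node_create(root);

	printf("child depth = %d\n", child->pd.depth);	/* prints 1 */
	free(child);
	free(root);
	return 0;
}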
block/blk-cgroup.h
... | ... | @@ -266,7 +266,7 @@ |
266 | 266 | blkg->refcnt++; |
267 | 267 | } |
268 | 268 | |
269 | -void __blkg_release(struct blkcg_gq *blkg); | |
269 | +void __blkg_release_rcu(struct rcu_head *rcu); | |
270 | 270 | |
271 | 271 | /** |
272 | 272 | * blkg_put - put a blkg reference |
273 | 273 | |
... | ... | @@ -279,8 +279,42 @@ |
279 | 279 | lockdep_assert_held(blkg->q->queue_lock); |
280 | 280 | WARN_ON_ONCE(blkg->refcnt <= 0); |
281 | 281 | if (!--blkg->refcnt) |
282 | - __blkg_release(blkg); | |
282 | + call_rcu(&blkg->rcu_head, __blkg_release_rcu); | |
283 | 283 | } |
284 | + | |
285 | +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | |
286 | + bool update_hint); | |
287 | + | |
288 | +/** | |
289 | + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | |
290 | + * @d_blkg: loop cursor pointing to the current descendant | |
291 | + * @pos_cgrp: used for iteration | |
292 | + * @p_blkg: target blkg to walk descendants of | |
293 | + * | |
294 | + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU | |
295 | + * read locked. If called under either blkcg or queue lock, the iteration | |
296 | + * is guaranteed to include all and only online blkgs. The caller may | |
297 | + * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip | |
298 | + * subtree. | |
299 | + */ | |
300 | +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ | |
301 | + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | |
302 | + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | |
303 | + (p_blkg)->q, false))) | |
304 | + | |
305 | +/** | |
306 | + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants | |
307 | + * @d_blkg: loop cursor pointing to the current descendant | |
308 | + * @pos_cgrp: used for iteration | |
309 | + * @p_blkg: target blkg to walk descendants of | |
310 | + * | |
311 | + * Similar to blkg_for_each_descendant_pre() but performs post-order | |
312 | + * traversal instead. Synchronization rules are the same. | |
313 | + */ | |
314 | +#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ | |
315 | + cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | |
316 | + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | |
317 | + (p_blkg)->q, false))) | |
284 | 318 | |
285 | 319 | /** |
286 | 320 | * blk_get_rl - get request_list to use |
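The header now exports both a pre-order and a post-order descendant iterator. As a quick reminder of the difference, here is a self-contained user-space sketch on the root/test1/test2/test3 tree from the documentation above; the tnode type and recursive helpers are invented for illustration and have nothing to do with the cgroup iterators' implementation.

/* Pre-order visits a parent before its children; post-order visits the
 * children first, which is typically what teardown-style walks want. */
#include <stdio.h>

struct tnode {
	const char *name;
	struct tnode *child[2];
};

static void walk_pre(const struct tnode *n)
{
	if (!n)
		return;
	printf("%s ", n->name);
	walk_pre(n->child[0]);
	walk_pre(n->child[1]);
}

static void walk_post(const struct tnode *n)
{
	if (!n)
		return;
	walk_post(n->child[0]);
	walk_post(n->child[1]);
	printf("%s ", n->name);
}

int main(void)
{
	struct tnode test3 = { "test3", { NULL, NULL } };
	struct tnode test1 = { "test1", { &test3, NULL } };
	struct tnode test2 = { "test2", { NULL, NULL } };
	struct tnode root  = { "root",  { &test1, &test2 } };

	walk_pre(&root);	/* root test1 test3 test2 */
	printf("\n");
	walk_post(&root);	/* test3 test1 test2 root */
	printf("\n");
	return 0;
}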
block/blk-tag.c
... | ... | @@ -348,9 +348,16 @@ |
348 | 348 | */ |
349 | 349 | max_depth = bqt->max_depth; |
350 | 350 | if (!rq_is_sync(rq) && max_depth > 1) { |
351 | - max_depth -= 2; | |
352 | - if (!max_depth) | |
351 | + switch (max_depth) { | |
352 | + case 2: | |
353 | 353 | max_depth = 1; |
354 | + break; | |
355 | + case 3: | |
356 | + max_depth = 2; | |
357 | + break; | |
358 | + default: | |
359 | + max_depth -= 2; | |
360 | + } | |
354 | 361 | if (q->in_flight[BLK_RW_ASYNC] > max_depth) |
355 | 362 | return 1; |
356 | 363 | } |
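The blk-tag.c hunk changes how many tags async IO may consume out of the queue's total. The stand-alone program below replays both the old and the new calculation for small depths; it mirrors the switch statement above but runs in user space, and both helpers assume max_depth >= 2 because the kernel only takes this path when max_depth > 1.

/* Compare old vs. new async tag-depth policy. */
#include <stdio.h>

/* old policy: always hold back two tags for sync IO, minimum of one */
static unsigned int async_depth_old(unsigned int max_depth)
{
	max_depth -= 2;
	if (!max_depth)
		max_depth = 1;
	return max_depth;
}

/* new policy: with only 3 tags, reserve just one for sync IO */
static unsigned int async_depth_new(unsigned int max_depth)
{
	switch (max_depth) {
	case 2:
		return 1;
	case 3:
		return 2;
	default:
		return max_depth - 2;
	}
}

int main(void)
{
	unsigned int depth;

	printf("queue depth  old async limit  new async limit\n");
	for (depth = 2; depth <= 6; depth++)
		printf("%11u  %15u  %15u\n", depth,
		       async_depth_old(depth), async_depth_new(depth));
	return 0;
}

Only depth 3 changes (1 -> 2 async tags), which is the "weirdo devices with just 3 free tags" case called out in the pull message.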
block/blk-throttle.c
... | ... | @@ -25,19 +25,62 @@ |
25 | 25 | |
26 | 26 | /* A workqueue to queue throttle related work */ |
27 | 27 | static struct workqueue_struct *kthrotld_workqueue; |
28 | -static void throtl_schedule_delayed_work(struct throtl_data *td, | |
29 | - unsigned long delay); | |
30 | 28 | |
31 | -struct throtl_rb_root { | |
32 | - struct rb_root rb; | |
33 | - struct rb_node *left; | |
34 | - unsigned int count; | |
35 | - unsigned long min_disptime; | |
29 | +/* | |
30 | + * To implement hierarchical throttling, throtl_grps form a tree and bios | |
31 | + * are dispatched upwards level by level until they reach the top and get | |
32 | + * issued. When dispatching bios from the children and local group at each | |
33 | + * level, if the bios are dispatched into a single bio_list, there's a risk | |
34 | + * of a local or child group which can queue many bios at once filling up | |
35 | + * the list starving others. | |
36 | + * | |
37 | + * To avoid such starvation, dispatched bios are queued separately | |
38 | + * according to where they came from. When they are again dispatched to | |
39 | + * the parent, they're popped in round-robin order so that no single source | |
40 | + * hogs the dispatch window. | |
41 | + * | |
42 | + * throtl_qnode is used to keep the queued bios separated by their sources. | |
43 | + * Bios are queued to throtl_qnode which in turn is queued to | |
44 | + * throtl_service_queue and then dispatched in round-robin order. | |
45 | + * | |
46 | + * It's also used to track the reference counts on blkg's. A qnode always | |
47 | + * belongs to a throtl_grp and gets queued on itself or the parent, so | |
48 | + * incrementing the reference of the associated throtl_grp when a qnode is | |
49 | + * queued and decrementing when dequeued is enough to keep the whole blkg | |
50 | + * tree pinned while bios are in flight. | |
51 | + */ | |
52 | +struct throtl_qnode { | |
53 | + struct list_head node; /* service_queue->queued[] */ | |
54 | + struct bio_list bios; /* queued bios */ | |
55 | + struct throtl_grp *tg; /* tg this qnode belongs to */ | |
36 | 56 | }; |
37 | 57 | |
38 | -#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ | |
39 | - .count = 0, .min_disptime = 0} | |
58 | +struct throtl_service_queue { | |
59 | + struct throtl_service_queue *parent_sq; /* the parent service_queue */ | |
40 | 60 | |
61 | + /* | |
62 | + * Bios queued directly to this service_queue or dispatched from | |
63 | + * children throtl_grp's. | |
64 | + */ | |
65 | + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ | |
66 | + unsigned int nr_queued[2]; /* number of queued bios */ | |
67 | + | |
68 | + /* | |
69 | + * RB tree of active children throtl_grp's, which are sorted by | |
70 | + * their ->disptime. | |
71 | + */ | |
72 | + struct rb_root pending_tree; /* RB tree of active tgs */ | |
73 | + struct rb_node *first_pending; /* first node in the tree */ | |
74 | + unsigned int nr_pending; /* # queued in the tree */ | |
75 | + unsigned long first_pending_disptime; /* disptime of the first tg */ | |
76 | + struct timer_list pending_timer; /* fires on first_pending_disptime */ | |
77 | +}; | |
78 | + | |
79 | +enum tg_state_flags { | |
80 | + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ | |
81 | + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ | |
82 | +}; | |
83 | + | |
41 | 84 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
42 | 85 | |
43 | 86 | /* Per-cpu group stats */ |
44 | 87 | |
45 | 88 | |
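The comment block introducing throtl_qnode above explains why dispatched bios are kept per source and popped in round-robin order. The following is a minimal user-space model of just that popping rule; the fixed-size arrays and integer "bios" are simplifications and bear no relation to the kernel's bio_list/list_head machinery.

/* Round-robin popping: one bio per qnode per turn, so a source with many
 * queued bios cannot starve a source with few. */
#include <stdio.h>
#include <string.h>

#define MAX_BIOS 16

struct qnode {
	const char *source;
	int bios[MAX_BIOS];	/* queued "bios" (just ids) */
	int head, tail;
};

static int qnode_empty(const struct qnode *qn)
{
	return qn->head == qn->tail;
}

static void qnode_add(struct qnode *qn, int bio)
{
	qn->bios[qn->tail++] = bio;
}

/* the parent services active qnodes in FIFO order, rotating each
 * non-empty qnode to the tail after taking one bio from it */
static int parent_pop(struct qnode **active, int *nr_active)
{
	struct qnode *qn;
	int bio;

	if (!*nr_active)
		return -1;

	qn = active[0];
	bio = qn->bios[qn->head++];

	memmove(&active[0], &active[1], (*nr_active - 1) * sizeof(*active));
	if (qnode_empty(qn))
		(*nr_active)--;		/* drop the drained qnode */
	else
		active[*nr_active - 1] = qn;	/* rotate to the tail */

	return bio;
}

int main(void)
{
	struct qnode greedy = { "greedy", {0}, 0, 0 };
	struct qnode small  = { "small",  {0}, 0, 0 };
	struct qnode *active[2];
	int nr_active = 0, i, bio;

	for (i = 1; i <= 8; i++)
		qnode_add(&greedy, i);
	qnode_add(&small, 101);
	qnode_add(&small, 102);

	active[nr_active++] = &greedy;
	active[nr_active++] = &small;

	/* dispatch order: 1 101 2 102 3 4 5 6 7 8 */
	while ((bio = parent_pop(active, &nr_active)) >= 0)
		printf("dispatched bio %d\n", bio);
	return 0;
}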
... | ... | @@ -52,10 +95,27 @@ |
52 | 95 | /* must be the first member */ |
53 | 96 | struct blkg_policy_data pd; |
54 | 97 | |
55 | - /* active throtl group service_tree member */ | |
98 | + /* active throtl group service_queue member */ | |
56 | 99 | struct rb_node rb_node; |
57 | 100 | |
101 | + /* throtl_data this group belongs to */ | |
102 | + struct throtl_data *td; | |
103 | + | |
104 | + /* this group's service queue */ | |
105 | + struct throtl_service_queue service_queue; | |
106 | + | |
58 | 107 | /* |
108 | + * qnode_on_self is used when bios are directly queued to this | |
109 | + * throtl_grp so that local bios compete fairly with bios | |
110 | + * dispatched from children. qnode_on_parent is used when bios are | |
111 | + * dispatched from this throtl_grp into its parent and will compete | |
112 | + * with the sibling qnode_on_parents and the parent's | |
113 | + * qnode_on_self. | |
114 | + */ | |
115 | + struct throtl_qnode qnode_on_self[2]; | |
116 | + struct throtl_qnode qnode_on_parent[2]; | |
117 | + | |
118 | + /* | |
59 | 119 | * Dispatch time in jiffies. This is the estimated time when group |
60 | 120 | * will unthrottle and is ready to dispatch more bio. It is used as |
61 | 121 | * key to sort active groups in service tree. |
62 | 122 | |
... | ... | @@ -64,12 +124,9 @@ |
64 | 124 | |
65 | 125 | unsigned int flags; |
66 | 126 | |
67 | - /* Two lists for READ and WRITE */ | |
68 | - struct bio_list bio_lists[2]; | |
127 | + /* are there any throtl rules between this group and td? */ | |
128 | + bool has_rules[2]; | |
69 | 129 | |
70 | - /* Number of queued bios on READ and WRITE lists */ | |
71 | - unsigned int nr_queued[2]; | |
72 | - | |
73 | 130 | /* bytes per second rate limits */ |
74 | 131 | uint64_t bps[2]; |
75 | 132 | |
... | ... | @@ -85,9 +142,6 @@ |
85 | 142 | unsigned long slice_start[2]; |
86 | 143 | unsigned long slice_end[2]; |
87 | 144 | |
88 | - /* Some throttle limits got updated for the group */ | |
89 | - int limits_changed; | |
90 | - | |
91 | 145 | /* Per cpu stats pointer */ |
92 | 146 | struct tg_stats_cpu __percpu *stats_cpu; |
93 | 147 | |
... | ... | @@ -98,7 +152,7 @@ |
98 | 152 | struct throtl_data |
99 | 153 | { |
100 | 154 | /* service tree for active throtl groups */ |
101 | - struct throtl_rb_root tg_service_tree; | |
155 | + struct throtl_service_queue service_queue; | |
102 | 156 | |
103 | 157 | struct request_queue *queue; |
104 | 158 | |
... | ... | @@ -111,9 +165,7 @@ |
111 | 165 | unsigned int nr_undestroyed_grps; |
112 | 166 | |
113 | 167 | /* Work for dispatching throttled bios */ |
114 | - struct delayed_work throtl_work; | |
115 | - | |
116 | - int limits_changed; | |
168 | + struct work_struct dispatch_work; | |
117 | 169 | }; |
118 | 170 | |
119 | 171 | /* list and work item to allocate percpu group stats */ |
... | ... | @@ -123,6 +175,8 @@ |
123 | 175 | static void tg_stats_alloc_fn(struct work_struct *); |
124 | 176 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); |
125 | 177 | |
178 | +static void throtl_pending_timer_fn(unsigned long arg); | |
179 | + | |
126 | 180 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) |
127 | 181 | { |
128 | 182 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; |
129 | 183 | |
130 | 184 | |
131 | 185 | |
132 | 186 | |
... | ... | @@ -143,41 +197,65 @@ |
143 | 197 | return blkg_to_tg(td->queue->root_blkg); |
144 | 198 | } |
145 | 199 | |
146 | -enum tg_state_flags { | |
147 | - THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | |
148 | -}; | |
149 | - | |
150 | -#define THROTL_TG_FNS(name) \ | |
151 | -static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ | |
152 | -{ \ | |
153 | - (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ | |
154 | -} \ | |
155 | -static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ | |
156 | -{ \ | |
157 | - (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ | |
158 | -} \ | |
159 | -static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | |
160 | -{ \ | |
161 | - return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ | |
200 | +/** | |
201 | + * sq_to_tg - return the throl_grp the specified service queue belongs to | |
202 | + * @sq: the throtl_service_queue of interest | |
203 | + * | |
204 | + * Return the throtl_grp @sq belongs to. If @sq is the top-level one | |
205 | + * embedded in throtl_data, %NULL is returned. | |
206 | + */ | |
207 | +static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) | |
208 | +{ | |
209 | + if (sq && sq->parent_sq) | |
210 | + return container_of(sq, struct throtl_grp, service_queue); | |
211 | + else | |
212 | + return NULL; | |
162 | 213 | } |
163 | 214 | |
164 | -THROTL_TG_FNS(on_rr); | |
215 | +/** | |
216 | + * sq_to_td - return throtl_data the specified service queue belongs to | |
217 | + * @sq: the throtl_service_queue of interest | |
218 | + * | |
219 | + * A service_queue can be embeded in either a throtl_grp or throtl_data. | |
220 | + * Determine the associated throtl_data accordingly and return it. | |
221 | + */ | |
222 | +static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) | |
223 | +{ | |
224 | + struct throtl_grp *tg = sq_to_tg(sq); | |
165 | 225 | |
166 | -#define throtl_log_tg(td, tg, fmt, args...) do { \ | |
167 | - char __pbuf[128]; \ | |
226 | + if (tg) | |
227 | + return tg->td; | |
228 | + else | |
229 | + return container_of(sq, struct throtl_data, service_queue); | |
230 | +} | |
231 | + | |
232 | +/** | |
233 | + * throtl_log - log debug message via blktrace | |
234 | + * @sq: the service_queue being reported | |
235 | + * @fmt: printf format string | |
236 | + * @args: printf args | |
237 | + * | |
238 | + * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a | |
239 | + * throtl_grp; otherwise, just "throtl". | |
240 | + * | |
241 | + * TODO: this should be made a function and name formatting should happen | |
242 | + * after testing whether blktrace is enabled. | |
243 | + */ | |
244 | +#define throtl_log(sq, fmt, args...) do { \ | |
245 | + struct throtl_grp *__tg = sq_to_tg((sq)); \ | |
246 | + struct throtl_data *__td = sq_to_td((sq)); \ | |
168 | 247 | \ |
169 | - blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ | |
170 | - blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ | |
248 | + (void)__td; \ | |
249 | + if ((__tg)) { \ | |
250 | + char __pbuf[128]; \ | |
251 | + \ | |
252 | + blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ | |
253 | + blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ | |
254 | + } else { \ | |
255 | + blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ | |
256 | + } \ | |
171 | 257 | } while (0) |
172 | 258 | |
173 | -#define throtl_log(td, fmt, args...) \ | |
174 | - blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | |
175 | - | |
176 | -static inline unsigned int total_nr_queued(struct throtl_data *td) | |
177 | -{ | |
178 | - return td->nr_queued[0] + td->nr_queued[1]; | |
179 | -} | |
180 | - | |
181 | 259 | /* |
182 | 260 | * Worker for allocating per cpu stat for tgs. This is scheduled on the |
183 | 261 | * system_wq once there are some groups on the alloc_list waiting for |
184 | 262 | |
185 | 263 | |
186 | 264 | |
187 | 265 | |
... | ... | @@ -215,15 +293,141 @@ |
215 | 293 | goto alloc_stats; |
216 | 294 | } |
217 | 295 | |
296 | +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) | |
297 | +{ | |
298 | + INIT_LIST_HEAD(&qn->node); | |
299 | + bio_list_init(&qn->bios); | |
300 | + qn->tg = tg; | |
301 | +} | |
302 | + | |
303 | +/** | |
304 | + * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it | |
305 | + * @bio: bio being added | |
306 | + * @qn: qnode to add bio to | |
307 | + * @queued: the service_queue->queued[] list @qn belongs to | |
308 | + * | |
309 | + * Add @bio to @qn and put @qn on @queued if it's not already on. | |
310 | + * @qn->tg's reference count is bumped when @qn is activated. See the | |
311 | + * comment on top of throtl_qnode definition for details. | |
312 | + */ | |
313 | +static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, | |
314 | + struct list_head *queued) | |
315 | +{ | |
316 | + bio_list_add(&qn->bios, bio); | |
317 | + if (list_empty(&qn->node)) { | |
318 | + list_add_tail(&qn->node, queued); | |
319 | + blkg_get(tg_to_blkg(qn->tg)); | |
320 | + } | |
321 | +} | |
322 | + | |
323 | +/** | |
324 | + * throtl_peek_queued - peek the first bio on a qnode list | |
325 | + * @queued: the qnode list to peek | |
326 | + */ | |
327 | +static struct bio *throtl_peek_queued(struct list_head *queued) | |
328 | +{ | |
329 | + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); | |
330 | + struct bio *bio; | |
331 | + | |
332 | + if (list_empty(queued)) | |
333 | + return NULL; | |
334 | + | |
335 | + bio = bio_list_peek(&qn->bios); | |
336 | + WARN_ON_ONCE(!bio); | |
337 | + return bio; | |
338 | +} | |
339 | + | |
340 | +/** | |
341 | + * throtl_pop_queued - pop the first bio form a qnode list | |
342 | + * @queued: the qnode list to pop a bio from | |
343 | + * @tg_to_put: optional out argument for throtl_grp to put | |
344 | + * | |
345 | + * Pop the first bio from the qnode list @queued. After popping, the first | |
346 | + * qnode is removed from @queued if empty or moved to the end of @queued so | |
347 | + * that the popping order is round-robin. | |
348 | + * | |
349 | + * When the first qnode is removed, its associated throtl_grp should be put | |
350 | + * too. If @tg_to_put is NULL, this function automatically puts it; | |
351 | + * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is | |
352 | + * responsible for putting it. | |
353 | + */ | |
354 | +static struct bio *throtl_pop_queued(struct list_head *queued, | |
355 | + struct throtl_grp **tg_to_put) | |
356 | +{ | |
357 | + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); | |
358 | + struct bio *bio; | |
359 | + | |
360 | + if (list_empty(queued)) | |
361 | + return NULL; | |
362 | + | |
363 | + bio = bio_list_pop(&qn->bios); | |
364 | + WARN_ON_ONCE(!bio); | |
365 | + | |
366 | + if (bio_list_empty(&qn->bios)) { | |
367 | + list_del_init(&qn->node); | |
368 | + if (tg_to_put) | |
369 | + *tg_to_put = qn->tg; | |
370 | + else | |
371 | + blkg_put(tg_to_blkg(qn->tg)); | |
372 | + } else { | |
373 | + list_move_tail(&qn->node, queued); | |
374 | + } | |
375 | + | |
376 | + return bio; | |
377 | +} | |
378 | + | |
379 | +/* init a service_queue, assumes the caller zeroed it */ | |
380 | +static void throtl_service_queue_init(struct throtl_service_queue *sq, | |
381 | + struct throtl_service_queue *parent_sq) | |
382 | +{ | |
383 | + INIT_LIST_HEAD(&sq->queued[0]); | |
384 | + INIT_LIST_HEAD(&sq->queued[1]); | |
385 | + sq->pending_tree = RB_ROOT; | |
386 | + sq->parent_sq = parent_sq; | |
387 | + setup_timer(&sq->pending_timer, throtl_pending_timer_fn, | |
388 | + (unsigned long)sq); | |
389 | +} | |
390 | + | |
391 | +static void throtl_service_queue_exit(struct throtl_service_queue *sq) | |
392 | +{ | |
393 | + del_timer_sync(&sq->pending_timer); | |
394 | +} | |
395 | + | |
218 | 396 | static void throtl_pd_init(struct blkcg_gq *blkg) |
219 | 397 | { |
220 | 398 | struct throtl_grp *tg = blkg_to_tg(blkg); |
399 | + struct throtl_data *td = blkg->q->td; | |
400 | + struct throtl_service_queue *parent_sq; | |
221 | 401 | unsigned long flags; |
402 | + int rw; | |
222 | 403 | |
404 | + /* | |
405 | + * If sane_hierarchy is enabled, we switch to properly hierarchical | |
406 | + * behavior where limits on a given throtl_grp are applied to the | |
407 | + * whole subtree rather than just the group itself. e.g. If 16M | |
408 | + * read_bps limit is set on the root group, the whole system can't | |
409 | + * exceed 16M for the device. | |
410 | + * | |
411 | + * If sane_hierarchy is not enabled, the broken flat hierarchy | |
412 | + * behavior is retained where all throtl_grps are treated as if | |
413 | + * they're all separate root groups right below throtl_data. | |
414 | + * Limits of a group don't interact with limits of other groups | |
415 | + * regardless of the position of the group in the hierarchy. | |
416 | + */ | |
417 | + parent_sq = &td->service_queue; | |
418 | + | |
419 | + if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) | |
420 | + parent_sq = &blkg_to_tg(blkg->parent)->service_queue; | |
421 | + | |
422 | + throtl_service_queue_init(&tg->service_queue, parent_sq); | |
423 | + | |
424 | + for (rw = READ; rw <= WRITE; rw++) { | |
425 | + throtl_qnode_init(&tg->qnode_on_self[rw], tg); | |
426 | + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); | |
427 | + } | |
428 | + | |
223 | 429 | RB_CLEAR_NODE(&tg->rb_node); |
224 | - bio_list_init(&tg->bio_lists[0]); | |
225 | - bio_list_init(&tg->bio_lists[1]); | |
226 | - tg->limits_changed = false; | |
430 | + tg->td = td; | |
227 | 431 | |
228 | 432 | tg->bps[READ] = -1; |
229 | 433 | tg->bps[WRITE] = -1; |
... | ... | @@ -241,6 +445,30 @@ |
241 | 445 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
242 | 446 | } |
243 | 447 | |
448 | +/* | |
449 | + * Set has_rules[] if @tg or any of its parents have limits configured. | |
450 | + * This doesn't require walking up to the top of the hierarchy as the | |
451 | + * parent's has_rules[] is guaranteed to be correct. | |
452 | + */ | |
453 | +static void tg_update_has_rules(struct throtl_grp *tg) | |
454 | +{ | |
455 | + struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); | |
456 | + int rw; | |
457 | + | |
458 | + for (rw = READ; rw <= WRITE; rw++) | |
459 | + tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || | |
460 | + (tg->bps[rw] != -1 || tg->iops[rw] != -1); | |
461 | +} | |
462 | + | |
463 | +static void throtl_pd_online(struct blkcg_gq *blkg) | |
464 | +{ | |
465 | + /* | |
466 | + * We don't want new groups to escape the limits of its ancestors. | |
467 | + * Update has_rules[] after a new group is brought online. | |
468 | + */ | |
469 | + tg_update_has_rules(blkg_to_tg(blkg)); | |
470 | +} | |
471 | + | |
244 | 472 | static void throtl_pd_exit(struct blkcg_gq *blkg) |
245 | 473 | { |
246 | 474 | struct throtl_grp *tg = blkg_to_tg(blkg); |
... | ... | @@ -251,6 +479,8 @@ |
251 | 479 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
252 | 480 | |
253 | 481 | free_percpu(tg->stats_cpu); |
482 | + | |
483 | + throtl_service_queue_exit(&tg->service_queue); | |
254 | 484 | } |
255 | 485 | |
256 | 486 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) |
257 | 487 | |
258 | 488 | |
259 | 489 | |
... | ... | @@ -309,17 +539,18 @@ |
309 | 539 | return tg; |
310 | 540 | } |
311 | 541 | |
312 | -static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) | |
542 | +static struct throtl_grp * | |
543 | +throtl_rb_first(struct throtl_service_queue *parent_sq) | |
313 | 544 | { |
314 | 545 | /* Service tree is empty */ |
315 | - if (!root->count) | |
546 | + if (!parent_sq->nr_pending) | |
316 | 547 | return NULL; |
317 | 548 | |
318 | - if (!root->left) | |
319 | - root->left = rb_first(&root->rb); | |
549 | + if (!parent_sq->first_pending) | |
550 | + parent_sq->first_pending = rb_first(&parent_sq->pending_tree); | |
320 | 551 | |
321 | - if (root->left) | |
322 | - return rb_entry_tg(root->left); | |
552 | + if (parent_sq->first_pending) | |
553 | + return rb_entry_tg(parent_sq->first_pending); | |
323 | 554 | |
324 | 555 | return NULL; |
325 | 556 | } |
326 | 557 | |
327 | 558 | |
328 | 559 | |
329 | 560 | |
330 | 561 | |
331 | 562 | |
... | ... | @@ -330,29 +561,30 @@ |
330 | 561 | RB_CLEAR_NODE(n); |
331 | 562 | } |
332 | 563 | |
333 | -static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) | |
564 | +static void throtl_rb_erase(struct rb_node *n, | |
565 | + struct throtl_service_queue *parent_sq) | |
334 | 566 | { |
335 | - if (root->left == n) | |
336 | - root->left = NULL; | |
337 | - rb_erase_init(n, &root->rb); | |
338 | - --root->count; | |
567 | + if (parent_sq->first_pending == n) | |
568 | + parent_sq->first_pending = NULL; | |
569 | + rb_erase_init(n, &parent_sq->pending_tree); | |
570 | + --parent_sq->nr_pending; | |
339 | 571 | } |
340 | 572 | |
341 | -static void update_min_dispatch_time(struct throtl_rb_root *st) | |
573 | +static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) | |
342 | 574 | { |
343 | 575 | struct throtl_grp *tg; |
344 | 576 | |
345 | - tg = throtl_rb_first(st); | |
577 | + tg = throtl_rb_first(parent_sq); | |
346 | 578 | if (!tg) |
347 | 579 | return; |
348 | 580 | |
349 | - st->min_disptime = tg->disptime; | |
581 | + parent_sq->first_pending_disptime = tg->disptime; | |
350 | 582 | } |
351 | 583 | |
352 | -static void | |
353 | -tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | |
584 | +static void tg_service_queue_add(struct throtl_grp *tg) | |
354 | 585 | { |
355 | - struct rb_node **node = &st->rb.rb_node; | |
586 | + struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; | |
587 | + struct rb_node **node = &parent_sq->pending_tree.rb_node; | |
356 | 588 | struct rb_node *parent = NULL; |
357 | 589 | struct throtl_grp *__tg; |
358 | 590 | unsigned long key = tg->disptime; |
359 | 591 | |
360 | 592 | |
361 | 593 | |
362 | 594 | |
363 | 595 | |
364 | 596 | |
365 | 597 | |
366 | 598 | |
367 | 599 | |
368 | 600 | |
369 | 601 | |
370 | 602 | |
371 | 603 | |
372 | 604 | |
373 | 605 | |
374 | 606 | |
375 | 607 | |
376 | 608 | |
377 | 609 | |
378 | 610 | |
379 | 611 | |
380 | 612 | |
... | ... | @@ -371,89 +603,135 @@ |
371 | 603 | } |
372 | 604 | |
373 | 605 | if (left) |
374 | - st->left = &tg->rb_node; | |
606 | + parent_sq->first_pending = &tg->rb_node; | |
375 | 607 | |
376 | 608 | rb_link_node(&tg->rb_node, parent, node); |
377 | - rb_insert_color(&tg->rb_node, &st->rb); | |
609 | + rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); | |
378 | 610 | } |
379 | 611 | |
380 | -static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
612 | +static void __throtl_enqueue_tg(struct throtl_grp *tg) | |
381 | 613 | { |
382 | - struct throtl_rb_root *st = &td->tg_service_tree; | |
614 | + tg_service_queue_add(tg); | |
615 | + tg->flags |= THROTL_TG_PENDING; | |
616 | + tg->service_queue.parent_sq->nr_pending++; | |
617 | +} | |
383 | 618 | |
384 | - tg_service_tree_add(st, tg); | |
385 | - throtl_mark_tg_on_rr(tg); | |
386 | - st->count++; | |
619 | +static void throtl_enqueue_tg(struct throtl_grp *tg) | |
620 | +{ | |
621 | + if (!(tg->flags & THROTL_TG_PENDING)) | |
622 | + __throtl_enqueue_tg(tg); | |
387 | 623 | } |
388 | 624 | |
389 | -static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
625 | +static void __throtl_dequeue_tg(struct throtl_grp *tg) | |
390 | 626 | { |
391 | - if (!throtl_tg_on_rr(tg)) | |
392 | - __throtl_enqueue_tg(td, tg); | |
627 | + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); | |
628 | + tg->flags &= ~THROTL_TG_PENDING; | |
393 | 629 | } |
394 | 630 | |
395 | -static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
631 | +static void throtl_dequeue_tg(struct throtl_grp *tg) | |
396 | 632 | { |
397 | - throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); | |
398 | - throtl_clear_tg_on_rr(tg); | |
633 | + if (tg->flags & THROTL_TG_PENDING) | |
634 | + __throtl_dequeue_tg(tg); | |
399 | 635 | } |
400 | 636 | |
401 | -static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
637 | +/* Call with queue lock held */ | |
638 | +static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, | |
639 | + unsigned long expires) | |
402 | 640 | { |
403 | - if (throtl_tg_on_rr(tg)) | |
404 | - __throtl_dequeue_tg(td, tg); | |
641 | + mod_timer(&sq->pending_timer, expires); | |
642 | + throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", | |
643 | + expires - jiffies, jiffies); | |
405 | 644 | } |
406 | 645 | |
407 | -static void throtl_schedule_next_dispatch(struct throtl_data *td) | |
646 | +/** | |
647 | + * throtl_schedule_next_dispatch - schedule the next dispatch cycle | |
648 | + * @sq: the service_queue to schedule dispatch for | |
649 | + * @force: force scheduling | |
650 | + * | |
651 | + * Arm @sq->pending_timer so that the next dispatch cycle starts on the | |
652 | + * dispatch time of the first pending child. Returns %true if either timer | |
653 | + * is armed or there's no pending child left. %false if the current | |
654 | + * dispatch window is still open and the caller should continue | |
655 | + * dispatching. | |
656 | + * | |
657 | + * If @force is %true, the dispatch timer is always scheduled and this | |
658 | + * function is guaranteed to return %true. This is to be used when the | |
659 | + * caller can't dispatch itself and needs to invoke pending_timer | |
660 | + * unconditionally. Note that forced scheduling is likely to induce short | |
661 | + * delay before dispatch starts even if @sq->first_pending_disptime is not | |
662 | + * in the future and thus shouldn't be used in hot paths. | |
663 | + */ | |
664 | +static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, | |
665 | + bool force) | |
408 | 666 | { |
409 | - struct throtl_rb_root *st = &td->tg_service_tree; | |
667 | + /* any pending children left? */ | |
668 | + if (!sq->nr_pending) | |
669 | + return true; | |
410 | 670 | |
411 | - /* | |
412 | - * If there are more bios pending, schedule more work. | |
413 | - */ | |
414 | - if (!total_nr_queued(td)) | |
415 | - return; | |
671 | + update_min_dispatch_time(sq); | |
416 | 672 | |
417 | - BUG_ON(!st->count); | |
673 | + /* is the next dispatch time in the future? */ | |
674 | + if (force || time_after(sq->first_pending_disptime, jiffies)) { | |
675 | + throtl_schedule_pending_timer(sq, sq->first_pending_disptime); | |
676 | + return true; | |
677 | + } | |
418 | 678 | |
419 | - update_min_dispatch_time(st); | |
679 | + /* tell the caller to continue dispatching */ | |
680 | + return false; | |
681 | +} | |
420 | 682 | |
421 | - if (time_before_eq(st->min_disptime, jiffies)) | |
422 | - throtl_schedule_delayed_work(td, 0); | |
423 | - else | |
424 | - throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); | |
683 | +static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, | |
684 | + bool rw, unsigned long start) | |
685 | +{ | |
686 | + tg->bytes_disp[rw] = 0; | |
687 | + tg->io_disp[rw] = 0; | |
688 | + | |
689 | + /* | |
690 | + * Previous slice has expired. We must have trimmed it after last | |
691 | + * bio dispatch. That means since start of last slice, we never used | |
692 | + * that bandwidth. Do try to make use of that bandwidth while giving | |
693 | + * credit. | |
694 | + */ | |
695 | + if (time_after_eq(start, tg->slice_start[rw])) | |
696 | + tg->slice_start[rw] = start; | |
697 | + | |
698 | + tg->slice_end[rw] = jiffies + throtl_slice; | |
699 | + throtl_log(&tg->service_queue, | |
700 | + "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", | |
701 | + rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
702 | + tg->slice_end[rw], jiffies); | |
425 | 703 | } |
426 | 704 | |
427 | -static inline void | |
428 | -throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
705 | +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) | |
429 | 706 | { |
430 | 707 | tg->bytes_disp[rw] = 0; |
431 | 708 | tg->io_disp[rw] = 0; |
432 | 709 | tg->slice_start[rw] = jiffies; |
433 | 710 | tg->slice_end[rw] = jiffies + throtl_slice; |
434 | - throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", | |
435 | - rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
436 | - tg->slice_end[rw], jiffies); | |
711 | + throtl_log(&tg->service_queue, | |
712 | + "[%c] new slice start=%lu end=%lu jiffies=%lu", | |
713 | + rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
714 | + tg->slice_end[rw], jiffies); | |
437 | 715 | } |
438 | 716 | |
439 | -static inline void throtl_set_slice_end(struct throtl_data *td, | |
440 | - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | |
717 | +static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, | |
718 | + unsigned long jiffy_end) | |
441 | 719 | { |
442 | 720 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); |
443 | 721 | } |
444 | 722 | |
445 | -static inline void throtl_extend_slice(struct throtl_data *td, | |
446 | - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | |
723 | +static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, | |
724 | + unsigned long jiffy_end) | |
447 | 725 | { |
448 | 726 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); |
449 | - throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", | |
450 | - rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
451 | - tg->slice_end[rw], jiffies); | |
727 | + throtl_log(&tg->service_queue, | |
728 | + "[%c] extend slice start=%lu end=%lu jiffies=%lu", | |
729 | + rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
730 | + tg->slice_end[rw], jiffies); | |
452 | 731 | } |
453 | 732 | |
454 | 733 | /* Determine if previously allocated or extended slice is complete or not */ |
455 | -static bool | |
456 | -throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
734 | +static bool throtl_slice_used(struct throtl_grp *tg, bool rw) | |
457 | 735 | { |
458 | 736 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) |
459 | 737 | return 0; |
... | ... | @@ -462,8 +740,7 @@ |
462 | 740 | } |
463 | 741 | |
464 | 742 | /* Trim the used slices and adjust slice start accordingly */ |
465 | -static inline void | |
466 | -throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
743 | +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) | |
467 | 744 | { |
468 | 745 | unsigned long nr_slices, time_elapsed, io_trim; |
469 | 746 | u64 bytes_trim, tmp; |
... | ... | @@ -475,7 +752,7 @@ |
475 | 752 | * renewed. Don't try to trim the slice if slice is used. A new |
476 | 753 | * slice will start when appropriate. |
477 | 754 | */ |
478 | - if (throtl_slice_used(td, tg, rw)) | |
755 | + if (throtl_slice_used(tg, rw)) | |
479 | 756 | return; |
480 | 757 | |
481 | 758 | /* |
... | ... | @@ -486,7 +763,7 @@ |
486 | 763 | * is bad because it does not allow new slice to start. |
487 | 764 | */ |
488 | 765 | |
489 | - throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); | |
766 | + throtl_set_slice_end(tg, rw, jiffies + throtl_slice); | |
490 | 767 | |
491 | 768 | time_elapsed = jiffies - tg->slice_start[rw]; |
492 | 769 | |
493 | 770 | |
... | ... | @@ -515,14 +792,14 @@ |
515 | 792 | |
516 | 793 | tg->slice_start[rw] += nr_slices * throtl_slice; |
517 | 794 | |
518 | - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" | |
519 | - " start=%lu end=%lu jiffies=%lu", | |
520 | - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | |
521 | - tg->slice_start[rw], tg->slice_end[rw], jiffies); | |
795 | + throtl_log(&tg->service_queue, | |
796 | + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", | |
797 | + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | |
798 | + tg->slice_start[rw], tg->slice_end[rw], jiffies); | |
522 | 799 | } |
523 | 800 | |
524 | -static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | |
525 | - struct bio *bio, unsigned long *wait) | |
801 | +static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |
802 | + unsigned long *wait) | |
526 | 803 | { |
527 | 804 | bool rw = bio_data_dir(bio); |
528 | 805 | unsigned int io_allowed; |
... | ... | @@ -571,8 +848,8 @@ |
571 | 848 | return 0; |
572 | 849 | } |
573 | 850 | |
574 | -static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | |
575 | - struct bio *bio, unsigned long *wait) | |
851 | +static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, | |
852 | + unsigned long *wait) | |
576 | 853 | { |
577 | 854 | bool rw = bio_data_dir(bio); |
578 | 855 | u64 bytes_allowed, extra_bytes, tmp; |
579 | 856 | |
... | ... | @@ -613,18 +890,12 @@ |
613 | 890 | return 0; |
614 | 891 | } |
615 | 892 | |
616 | -static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { | |
617 | - if (tg->bps[rw] == -1 && tg->iops[rw] == -1) | |
618 | - return 1; | |
619 | - return 0; | |
620 | -} | |
621 | - | |
622 | 893 | /* |
623 | 894 | * Returns whether one can dispatch a bio or not. Also returns approx number |
624 | 895 | * of jiffies to wait before this bio is with-in IO rate and can be dispatched |
625 | 896 | */ |
626 | -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |
627 | - struct bio *bio, unsigned long *wait) | |
897 | +static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, | |
898 | + unsigned long *wait) | |
628 | 899 | { |
629 | 900 | bool rw = bio_data_dir(bio); |
630 | 901 | unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; |
... | ... | @@ -635,7 +906,8 @@ |
635 | 906 | * this function with a different bio if there are other bios |
636 | 907 | * queued. |
637 | 908 | */ |
638 | - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); | |
909 | + BUG_ON(tg->service_queue.nr_queued[rw] && | |
910 | + bio != throtl_peek_queued(&tg->service_queue.queued[rw])); | |
639 | 911 | |
640 | 912 | /* If tg->bps = -1, then BW is unlimited */ |
641 | 913 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { |
642 | 914 | |
643 | 915 | |
... | ... | @@ -649,15 +921,15 @@ |
649 | 921 | * existing slice to make sure it is at least throtl_slice interval |
650 | 922 | * long since now. |
651 | 923 | */ |
652 | - if (throtl_slice_used(td, tg, rw)) | |
653 | - throtl_start_new_slice(td, tg, rw); | |
924 | + if (throtl_slice_used(tg, rw)) | |
925 | + throtl_start_new_slice(tg, rw); | |
654 | 926 | else { |
655 | 927 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) |
656 | - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); | |
928 | + throtl_extend_slice(tg, rw, jiffies + throtl_slice); | |
657 | 929 | } |
658 | 930 | |
659 | - if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) | |
660 | - && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { | |
931 | + if (tg_with_in_bps_limit(tg, bio, &bps_wait) && | |
932 | + tg_with_in_iops_limit(tg, bio, &iops_wait)) { | |
661 | 933 | if (wait) |
662 | 934 | *wait = 0; |
663 | 935 | return 1; |
... | ... | @@ -669,7 +941,7 @@ |
669 | 941 | *wait = max_wait; |
670 | 942 | |
671 | 943 | if (time_before(tg->slice_end[rw], jiffies + max_wait)) |
672 | - throtl_extend_slice(td, tg, rw, jiffies + max_wait); | |
944 | + throtl_extend_slice(tg, rw, jiffies + max_wait); | |
673 | 945 | |
674 | 946 | return 0; |
675 | 947 | } |
676 | 948 | |
677 | 949 | |
678 | 950 | |
679 | 951 | |
680 | 952 | |
681 | 953 | |
682 | 954 | |
683 | 955 | |
684 | 956 | |
685 | 957 | |
686 | 958 | |
687 | 959 | |
688 | 960 | |
689 | 961 | |
690 | 962 | |
691 | 963 | |
692 | 964 | |
693 | 965 | |
... | ... | @@ -708,65 +980,136 @@ |
708 | 980 | tg->bytes_disp[rw] += bio->bi_size; |
709 | 981 | tg->io_disp[rw]++; |
710 | 982 | |
711 | - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); | |
983 | + /* | |
984 | + * REQ_THROTTLED is used to prevent the same bio to be throttled | |
985 | + * more than once as a throttled bio will go through blk-throtl the | |
986 | + * second time when it eventually gets issued. Set it when a bio | |
987 | + * is being charged to a tg. | |
988 | + * | |
989 | + * Dispatch stats aren't recursive and each @bio should only be | |
990 | + * accounted by the @tg it was originally associated with. Let's | |
991 | + * update the stats when setting REQ_THROTTLED for the first time | |
992 | + * which is guaranteed to be for the @bio's original tg. | |
993 | + */ | |
994 | + if (!(bio->bi_rw & REQ_THROTTLED)) { | |
995 | + bio->bi_rw |= REQ_THROTTLED; | |
996 | + throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, | |
997 | + bio->bi_rw); | |
998 | + } | |
712 | 999 | } |
713 | 1000 | |
714 | -static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | |
715 | - struct bio *bio) | |
1001 | +/** | |
1002 | + * throtl_add_bio_tg - add a bio to the specified throtl_grp | |
1003 | + * @bio: bio to add | |
1004 | + * @qn: qnode to use | |
1005 | + * @tg: the target throtl_grp | |
1006 | + * | |
1007 | + * Add @bio to @tg's service_queue using @qn. If @qn is not specified, | |
1008 | + * tg->qnode_on_self[] is used. | |
1009 | + */ | |
1010 | +static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, | |
1011 | + struct throtl_grp *tg) | |
716 | 1012 | { |
1013 | + struct throtl_service_queue *sq = &tg->service_queue; | |
717 | 1014 | bool rw = bio_data_dir(bio); |
718 | 1015 | |
719 | - bio_list_add(&tg->bio_lists[rw], bio); | |
720 | - /* Take a bio reference on tg */ | |
721 | - blkg_get(tg_to_blkg(tg)); | |
722 | - tg->nr_queued[rw]++; | |
723 | - td->nr_queued[rw]++; | |
724 | - throtl_enqueue_tg(td, tg); | |
1016 | + if (!qn) | |
1017 | + qn = &tg->qnode_on_self[rw]; | |
1018 | + | |
1019 | + /* | |
1020 | + * If @tg doesn't currently have any bios queued in the same | |
1021 | + * direction, queueing @bio can change when @tg should be | |
1022 | + * dispatched. Mark that @tg was empty. This is automatically | |
1023 | + * cleaered on the next tg_update_disptime(). | |
1024 | + */ | |
1025 | + if (!sq->nr_queued[rw]) | |
1026 | + tg->flags |= THROTL_TG_WAS_EMPTY; | |
1027 | + | |
1028 | + throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); | |
1029 | + | |
1030 | + sq->nr_queued[rw]++; | |
1031 | + throtl_enqueue_tg(tg); | |
725 | 1032 | } |
726 | 1033 | |
727 | -static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) | |
1034 | +static void tg_update_disptime(struct throtl_grp *tg) | |
728 | 1035 | { |
1036 | + struct throtl_service_queue *sq = &tg->service_queue; | |
729 | 1037 | unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; |
730 | 1038 | struct bio *bio; |
731 | 1039 | |
732 | - if ((bio = bio_list_peek(&tg->bio_lists[READ]))) | |
733 | - tg_may_dispatch(td, tg, bio, &read_wait); | |
1040 | + if ((bio = throtl_peek_queued(&sq->queued[READ]))) | |
1041 | + tg_may_dispatch(tg, bio, &read_wait); | |
734 | 1042 | |
735 | - if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | |
736 | - tg_may_dispatch(td, tg, bio, &write_wait); | |
1043 | + if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) | |
1044 | + tg_may_dispatch(tg, bio, &write_wait); | |
737 | 1045 | |
738 | 1046 | min_wait = min(read_wait, write_wait); |
739 | 1047 | disptime = jiffies + min_wait; |
740 | 1048 | |
741 | 1049 | /* Update dispatch time */ |
742 | - throtl_dequeue_tg(td, tg); | |
1050 | + throtl_dequeue_tg(tg); | |
743 | 1051 | tg->disptime = disptime; |
744 | - throtl_enqueue_tg(td, tg); | |
1052 | + throtl_enqueue_tg(tg); | |
1053 | + | |
1054 | + /* see throtl_add_bio_tg() */ | |
1055 | + tg->flags &= ~THROTL_TG_WAS_EMPTY; | |
745 | 1056 | } |
746 | 1057 | |
747 | -static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | |
748 | - bool rw, struct bio_list *bl) | |
1058 | +static void start_parent_slice_with_credit(struct throtl_grp *child_tg, | |
1059 | + struct throtl_grp *parent_tg, bool rw) | |
749 | 1060 | { |
750 | - struct bio *bio; | |
1061 | + if (throtl_slice_used(parent_tg, rw)) { | |
1062 | + throtl_start_new_slice_with_credit(parent_tg, rw, | |
1063 | + child_tg->slice_start[rw]); | |
1064 | + } | |
751 | 1065 | |
752 | - bio = bio_list_pop(&tg->bio_lists[rw]); | |
753 | - tg->nr_queued[rw]--; | |
754 | - /* Drop bio reference on blkg */ | |
755 | - blkg_put(tg_to_blkg(tg)); | |
1066 | +} | |
756 | 1067 | |
757 | - BUG_ON(td->nr_queued[rw] <= 0); | |
758 | - td->nr_queued[rw]--; | |
1068 | +static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) | |
1069 | +{ | |
1070 | + struct throtl_service_queue *sq = &tg->service_queue; | |
1071 | + struct throtl_service_queue *parent_sq = sq->parent_sq; | |
1072 | + struct throtl_grp *parent_tg = sq_to_tg(parent_sq); | |
1073 | + struct throtl_grp *tg_to_put = NULL; | |
1074 | + struct bio *bio; | |
759 | 1075 | |
1076 | + /* | |
1077 | + * @bio is being transferred from @tg to @parent_sq. Popping a bio | |
1078 | + * from @tg may put its reference and @parent_sq might end up | |
1079 | + * getting released prematurely. Remember the tg to put and put it | |
1080 | + * after @bio is transferred to @parent_sq. | |
1081 | + */ | |
1082 | + bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); | |
1083 | + sq->nr_queued[rw]--; | |
1084 | + | |
760 | 1085 | throtl_charge_bio(tg, bio); |
761 | - bio_list_add(bl, bio); | |
762 | - bio->bi_rw |= REQ_THROTTLED; | |
763 | 1086 | |
764 | - throtl_trim_slice(td, tg, rw); | |
1087 | + /* | |
1088 | + * If our parent is another tg, we just need to transfer @bio to | |
1089 | + * the parent using throtl_add_bio_tg(). If our parent is | |
1090 | + * @td->service_queue, @bio is ready to be issued. Put it on its | |
1091 | + * bio_lists[] and decrease total number queued. The caller is | |
1092 | + * responsible for issuing these bios. | |
1093 | + */ | |
1094 | + if (parent_tg) { | |
1095 | + throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); | |
1096 | + start_parent_slice_with_credit(tg, parent_tg, rw); | |
1097 | + } else { | |
1098 | + throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], | |
1099 | + &parent_sq->queued[rw]); | |
1100 | + BUG_ON(tg->td->nr_queued[rw] <= 0); | |
1101 | + tg->td->nr_queued[rw]--; | |
1102 | + } | |
1103 | + | |
1104 | + throtl_trim_slice(tg, rw); | |
1105 | + | |
1106 | + if (tg_to_put) | |
1107 | + blkg_put(tg_to_blkg(tg_to_put)); | |
765 | 1108 | } |
766 | 1109 | |
767 | -static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | |
768 | - struct bio_list *bl) | |
1110 | +static int throtl_dispatch_tg(struct throtl_grp *tg) | |
769 | 1111 | { |
1112 | + struct throtl_service_queue *sq = &tg->service_queue; | |
770 | 1113 | unsigned int nr_reads = 0, nr_writes = 0; |
771 | 1114 | unsigned int max_nr_reads = throtl_grp_quantum*3/4; |
772 | 1115 | unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; |
773 | 1116 | |
774 | 1117 | |
775 | 1118 | |
... | ... | @@ -774,20 +1117,20 @@ |
774 | 1117 | |
775 | 1118 | /* Try to dispatch 75% READS and 25% WRITES */ |
776 | 1119 | |
777 | - while ((bio = bio_list_peek(&tg->bio_lists[READ])) | |
778 | - && tg_may_dispatch(td, tg, bio, NULL)) { | |
1120 | + while ((bio = throtl_peek_queued(&sq->queued[READ])) && | |
1121 | + tg_may_dispatch(tg, bio, NULL)) { | |
779 | 1122 | |
780 | - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | |
1123 | + tg_dispatch_one_bio(tg, bio_data_dir(bio)); | |
781 | 1124 | nr_reads++; |
782 | 1125 | |
783 | 1126 | if (nr_reads >= max_nr_reads) |
784 | 1127 | break; |
785 | 1128 | } |
786 | 1129 | |
787 | - while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) | |
788 | - && tg_may_dispatch(td, tg, bio, NULL)) { | |
1130 | + while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && | |
1131 | + tg_may_dispatch(tg, bio, NULL)) { | |
789 | 1132 | |
790 | - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | |
1133 | + tg_dispatch_one_bio(tg, bio_data_dir(bio)); | |
791 | 1134 | nr_writes++; |
792 | 1135 | |
793 | 1136 | if (nr_writes >= max_nr_writes) |
794 | 1137 | |
795 | 1138 | |
... | ... | @@ -797,14 +1140,13 @@ |
797 | 1140 | return nr_reads + nr_writes; |
798 | 1141 | } |
799 | 1142 | |
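throtl_dispatch_tg() keeps the old budget split, only the queue it peeks at changed (sq->queued[] instead of tg->bio_lists[]): with throtl_grp_quantum at 8, up to 6 slots of the per-group quantum go to reads and the remaining 2 to writes. A standalone sketch of that split, using made-up backlog numbers:

    /* quantum_split.c - the 75/25 read/write budgeting of throtl_dispatch_tg() */
    #include <stdio.h>

    #define THROTL_GRP_QUANTUM 8

    int main(void)
    {
        unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;         /* 6 */
        unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; /* 2 */
        unsigned int queued_reads = 10, queued_writes = 10;  /* invented backlog */
        unsigned int nr_reads = 0, nr_writes = 0;

        /* dispatch reads until the read share of the quantum is used up */
        while (queued_reads && nr_reads < max_nr_reads) {
            queued_reads--;
            nr_reads++;
        }
        /* then writes, limited to the remaining share */
        while (queued_writes && nr_writes < max_nr_writes) {
            queued_writes--;
            nr_writes++;
        }
        printf("dispatched %u reads, %u writes of a quantum of %d\n",
               nr_reads, nr_writes, THROTL_GRP_QUANTUM);
        return 0;
    }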
800 | -static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |
1143 | +static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) | |
801 | 1144 | { |
802 | 1145 | unsigned int nr_disp = 0; |
803 | - struct throtl_grp *tg; | |
804 | - struct throtl_rb_root *st = &td->tg_service_tree; | |
805 | 1146 | |
806 | 1147 | while (1) { |
807 | - tg = throtl_rb_first(st); | |
1148 | + struct throtl_grp *tg = throtl_rb_first(parent_sq); | |
1149 | + struct throtl_service_queue *sq = &tg->service_queue; | |
808 | 1150 | |
809 | 1151 | if (!tg) |
810 | 1152 | break; |
811 | 1153 | |
812 | 1154 | |
... | ... | @@ -812,14 +1154,12 @@ |
812 | 1154 | if (time_before(jiffies, tg->disptime)) |
813 | 1155 | break; |
814 | 1156 | |
815 | - throtl_dequeue_tg(td, tg); | |
1157 | + throtl_dequeue_tg(tg); | |
816 | 1158 | |
817 | - nr_disp += throtl_dispatch_tg(td, tg, bl); | |
1159 | + nr_disp += throtl_dispatch_tg(tg); | |
818 | 1160 | |
819 | - if (tg->nr_queued[0] || tg->nr_queued[1]) { | |
820 | - tg_update_disptime(td, tg); | |
821 | - throtl_enqueue_tg(td, tg); | |
822 | - } | |
1161 | + if (sq->nr_queued[0] || sq->nr_queued[1]) | |
1162 | + tg_update_disptime(tg); | |
823 | 1163 | |
824 | 1164 | if (nr_disp >= throtl_quantum) |
825 | 1165 | break; |
... | ... | @@ -828,113 +1168,113 @@ |
828 | 1168 | return nr_disp; |
829 | 1169 | } |
830 | 1170 | |
831 | -static void throtl_process_limit_change(struct throtl_data *td) | |
1171 | +/** | |
1172 | + * throtl_pending_timer_fn - timer function for service_queue->pending_timer | |
1173 | + * @arg: the throtl_service_queue being serviced | |
1174 | + * | |
1175 | + * This timer is armed when a child throtl_grp with active bio's becomes | 
1176 | + * pending and queued on the service_queue's pending_tree and expires when | |
1177 | + * the first child throtl_grp should be dispatched. This function | |
1178 | + * dispatches bio's from the children throtl_grps to the parent | |
1179 | + * service_queue. | |
1180 | + * | |
1181 | + * If the parent's parent is another throtl_grp, dispatching is propagated | |
1182 | + * by either arming its pending_timer or repeating dispatch directly. If | |
1183 | + * the top-level service_tree is reached, throtl_data->dispatch_work is | |
1184 | + * kicked so that the ready bio's are issued. | |
1185 | + */ | |
1186 | +static void throtl_pending_timer_fn(unsigned long arg) | |
832 | 1187 | { |
1188 | + struct throtl_service_queue *sq = (void *)arg; | |
1189 | + struct throtl_grp *tg = sq_to_tg(sq); | |
1190 | + struct throtl_data *td = sq_to_td(sq); | |
833 | 1191 | struct request_queue *q = td->queue; |
834 | - struct blkcg_gq *blkg, *n; | |
1192 | + struct throtl_service_queue *parent_sq; | |
1193 | + bool dispatched; | |
1194 | + int ret; | |
835 | 1195 | |
836 | - if (!td->limits_changed) | |
837 | - return; | |
1196 | + spin_lock_irq(q->queue_lock); | |
1197 | +again: | |
1198 | + parent_sq = sq->parent_sq; | |
1199 | + dispatched = false; | |
838 | 1200 | |
839 | - xchg(&td->limits_changed, false); | |
1201 | + while (true) { | |
1202 | + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", | |
1203 | + sq->nr_queued[READ] + sq->nr_queued[WRITE], | |
1204 | + sq->nr_queued[READ], sq->nr_queued[WRITE]); | |
840 | 1205 | |
841 | - throtl_log(td, "limits changed"); | |
1206 | + ret = throtl_select_dispatch(sq); | |
1207 | + if (ret) { | |
1208 | + throtl_log(sq, "bios disp=%u", ret); | |
1209 | + dispatched = true; | |
1210 | + } | |
842 | 1211 | |
843 | - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { | |
844 | - struct throtl_grp *tg = blkg_to_tg(blkg); | |
1212 | + if (throtl_schedule_next_dispatch(sq, false)) | |
1213 | + break; | |
845 | 1214 | |
846 | - if (!tg->limits_changed) | |
847 | - continue; | |
1215 | + /* this dispatch window is still open, relax and repeat */ | 
1216 | + spin_unlock_irq(q->queue_lock); | |
1217 | + cpu_relax(); | |
1218 | + spin_lock_irq(q->queue_lock); | |
1219 | + } | |
848 | 1220 | |
849 | - if (!xchg(&tg->limits_changed, false)) | |
850 | - continue; | |
1221 | + if (!dispatched) | |
1222 | + goto out_unlock; | |
851 | 1223 | |
852 | - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | |
853 | - " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], | |
854 | - tg->iops[READ], tg->iops[WRITE]); | |
855 | - | |
856 | - /* | |
857 | - * Restart the slices for both READ and WRITES. It | |
858 | - * might happen that a group's limit are dropped | |
859 | - * suddenly and we don't want to account recently | |
860 | - * dispatched IO with new low rate | |
861 | - */ | |
862 | - throtl_start_new_slice(td, tg, 0); | |
863 | - throtl_start_new_slice(td, tg, 1); | |
864 | - | |
865 | - if (throtl_tg_on_rr(tg)) | |
866 | - tg_update_disptime(td, tg); | |
1224 | + if (parent_sq) { | |
1225 | + /* @parent_sq is another throtl_grp, propagate dispatch */ | 
1226 | + if (tg->flags & THROTL_TG_WAS_EMPTY) { | |
1227 | + tg_update_disptime(tg); | |
1228 | + if (!throtl_schedule_next_dispatch(parent_sq, false)) { | |
1229 | + /* window is already open, repeat dispatching */ | |
1230 | + sq = parent_sq; | |
1231 | + tg = sq_to_tg(sq); | |
1232 | + goto again; | |
1233 | + } | |
1234 | + } | |
1235 | + } else { | |
1236 | + /* reached the top-level, queue issuing */ | 
1237 | + queue_work(kthrotld_workqueue, &td->dispatch_work); | |
867 | 1238 | } |
1239 | +out_unlock: | |
1240 | + spin_unlock_irq(q->queue_lock); | |
868 | 1241 | } |
869 | 1242 | |
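The kernel-doc above sums up the new control flow: dispatch children into @sq, then either re-arm the timer, climb to the parent and repeat immediately if the parent's dispatch window is already open, or kick dispatch_work once the top level is reached. Below is a simplified, non-kernel model of that climb; the level array, names, and the fixed budget are invented, and timers and locking are left out.

    /* propagate_dispatch.c - control-flow model of throtl_pending_timer_fn() */
    #include <stdbool.h>
    #include <stdio.h>

    struct level {
        const char *name;
        int queued;          /* bios waiting at this level */
        bool window_open;    /* true: dispatch now; false: (re)arm the timer */
    };

    /* move up to @budget bios from @child into @parent, return how many moved */
    static int dispatch_level(struct level *child, struct level *parent, int budget)
    {
        int moved = 0;

        while (child->queued && moved < budget) {
            child->queued--;
            parent->queued++;
            moved++;
        }
        return moved;
    }

    int main(void)
    {
        /* leaf tg -> intermediate tg -> td->service_queue */
        struct level levels[3] = {
            { "leaf", 4, true },
            { "mid",  0, true },   /* window already open: keep climbing */
            { "top",  0, true },
        };
        int i;

        for (i = 0; i < 2; i++) {
            int disp = dispatch_level(&levels[i], &levels[i + 1], 8);

            printf("%s: moved %d bios up to %s\n",
                   levels[i].name, disp, levels[i + 1].name);
            if (!levels[i + 1].window_open) {
                printf("%s: timer armed, stop here\n", levels[i + 1].name);
                return 0;
            }
        }
        printf("top: %d bios ready, kick dispatch_work to issue them\n",
               levels[2].queued);
        return 0;
    }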
870 | -/* Dispatch throttled bios. Should be called without queue lock held. */ | |
871 | -static int throtl_dispatch(struct request_queue *q) | |
1243 | +/** | |
1244 | + * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work | |
1245 | + * @work: work item being executed | |
1246 | + * | |
1247 | + * This function is queued for execution when bio's reach the bio_lists[] | |
1248 | + * of throtl_data->service_queue. Those bio's are ready and issued by this | |
1249 | + * function. | |
1250 | + */ | |
1251 | +void blk_throtl_dispatch_work_fn(struct work_struct *work) | |
872 | 1252 | { |
873 | - struct throtl_data *td = q->td; | |
874 | - unsigned int nr_disp = 0; | |
1253 | + struct throtl_data *td = container_of(work, struct throtl_data, | |
1254 | + dispatch_work); | |
1255 | + struct throtl_service_queue *td_sq = &td->service_queue; | |
1256 | + struct request_queue *q = td->queue; | |
875 | 1257 | struct bio_list bio_list_on_stack; |
876 | 1258 | struct bio *bio; |
877 | 1259 | struct blk_plug plug; |
1260 | + int rw; | |
878 | 1261 | |
879 | - spin_lock_irq(q->queue_lock); | |
880 | - | |
881 | - throtl_process_limit_change(td); | |
882 | - | |
883 | - if (!total_nr_queued(td)) | |
884 | - goto out; | |
885 | - | |
886 | 1262 | bio_list_init(&bio_list_on_stack); |
887 | 1263 | |
888 | - throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", | |
889 | - total_nr_queued(td), td->nr_queued[READ], | |
890 | - td->nr_queued[WRITE]); | |
891 | - | |
892 | - nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); | |
893 | - | |
894 | - if (nr_disp) | |
895 | - throtl_log(td, "bios disp=%u", nr_disp); | |
896 | - | |
897 | - throtl_schedule_next_dispatch(td); | |
898 | -out: | |
1264 | + spin_lock_irq(q->queue_lock); | |
1265 | + for (rw = READ; rw <= WRITE; rw++) | |
1266 | + while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) | |
1267 | + bio_list_add(&bio_list_on_stack, bio); | |
899 | 1268 | spin_unlock_irq(q->queue_lock); |
900 | 1269 | |
901 | - /* | |
902 | - * If we dispatched some requests, unplug the queue to make sure | |
903 | - * immediate dispatch | |
904 | - */ | |
905 | - if (nr_disp) { | |
1270 | + if (!bio_list_empty(&bio_list_on_stack)) { | |
906 | 1271 | blk_start_plug(&plug); |
907 | 1272 | while((bio = bio_list_pop(&bio_list_on_stack))) |
908 | 1273 | generic_make_request(bio); |
909 | 1274 | blk_finish_plug(&plug); |
910 | 1275 | } |
911 | - return nr_disp; | |
912 | 1276 | } |
913 | 1277 | |
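blk_throtl_dispatch_work_fn() holds the queue lock only long enough to move everything from the top-level queued[] lists onto an on-stack bio list, then issues with the lock dropped (and wrapped in a block plug). A userspace sketch of that detach-under-lock, issue-unlocked pattern; the per-direction counters stand in for the bio lists:

    /* drain_then_issue.c - detach work under a lock, process it outside */
    #include <pthread.h>
    #include <stdio.h>

    #define NDIR 2                              /* READ and WRITE */

    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
    static int queued[NDIR] = { 3, 1 };         /* bios ready at the top level */

    static void issue_bio(int rw)
    {
        printf("issuing %s bio\n", rw ? "write" : "read");
    }

    static void dispatch_work_fn(void)
    {
        int on_stack[NDIR];
        int rw;

        /* grab everything while holding the lock ... */
        pthread_mutex_lock(&queue_lock);
        for (rw = 0; rw < NDIR; rw++) {
            on_stack[rw] = queued[rw];
            queued[rw] = 0;
        }
        pthread_mutex_unlock(&queue_lock);

        /*
         * ... and issue with the lock dropped, so submission never runs
         * under the lock that new bios need in order to get queued
         */
        for (rw = 0; rw < NDIR; rw++)
            while (on_stack[rw]--)
                issue_bio(rw);
    }

    int main(void)
    {
        dispatch_work_fn();
        return 0;
    }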
914 | -void blk_throtl_work(struct work_struct *work) | |
915 | -{ | |
916 | - struct throtl_data *td = container_of(work, struct throtl_data, | |
917 | - throtl_work.work); | |
918 | - struct request_queue *q = td->queue; | |
919 | - | |
920 | - throtl_dispatch(q); | |
921 | -} | |
922 | - | |
923 | -/* Call with queue lock held */ | |
924 | -static void | |
925 | -throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | |
926 | -{ | |
927 | - | |
928 | - struct delayed_work *dwork = &td->throtl_work; | |
929 | - | |
930 | - /* schedule work if limits changed even if no bio is queued */ | |
931 | - if (total_nr_queued(td) || td->limits_changed) { | |
932 | - mod_delayed_work(kthrotld_workqueue, dwork, delay); | |
933 | - throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | |
934 | - delay, jiffies); | |
935 | - } | |
936 | -} | |
937 | - | |
938 | 1278 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, |
939 | 1279 | struct blkg_policy_data *pd, int off) |
940 | 1280 | { |
... | ... | @@ -1007,7 +1347,9 @@ |
1007 | 1347 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1008 | 1348 | struct blkg_conf_ctx ctx; |
1009 | 1349 | struct throtl_grp *tg; |
1010 | - struct throtl_data *td; | |
1350 | + struct throtl_service_queue *sq; | |
1351 | + struct blkcg_gq *blkg; | |
1352 | + struct cgroup *pos_cgrp; | |
1011 | 1353 | int ret; |
1012 | 1354 | |
1013 | 1355 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); |
... | ... | @@ -1015,7 +1357,7 @@ |
1015 | 1357 | return ret; |
1016 | 1358 | |
1017 | 1359 | tg = blkg_to_tg(ctx.blkg); |
1018 | - td = ctx.blkg->q->td; | |
1360 | + sq = &tg->service_queue; | |
1019 | 1361 | |
1020 | 1362 | if (!ctx.v) |
1021 | 1363 | ctx.v = -1; |
1022 | 1364 | |
... | ... | @@ -1025,11 +1367,38 @@ |
1025 | 1367 | else |
1026 | 1368 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; |
1027 | 1369 | |
1028 | - /* XXX: we don't need the following deferred processing */ | |
1029 | - xchg(&tg->limits_changed, true); | |
1030 | - xchg(&td->limits_changed, true); | |
1031 | - throtl_schedule_delayed_work(td, 0); | |
1370 | + throtl_log(&tg->service_queue, | |
1371 | + "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", | |
1372 | + tg->bps[READ], tg->bps[WRITE], | |
1373 | + tg->iops[READ], tg->iops[WRITE]); | |
1032 | 1374 | |
1375 | + /* | |
1376 | + * Update has_rules[] flags for the updated tg's subtree. A tg is | |
1377 | + * considered to have rules if either the tg itself or any of its | |
1378 | + * ancestors has rules. This identifies groups without any | |
1379 | + * restrictions in the whole hierarchy and allows them to bypass | |
1380 | + * blk-throttle. | |
1381 | + */ | |
1382 | + tg_update_has_rules(tg); | |
1383 | + blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) | |
1384 | + tg_update_has_rules(blkg_to_tg(blkg)); | |
1385 | + | |
1386 | + /* | |
1387 | + * We're already holding queue_lock and know @tg is valid. Let's | |
1388 | + * apply the new config directly. | |
1389 | + * | |
1390 | + * Restart the slices for both READ and WRITES. It might happen | |
1391 | + * that a group's limits are dropped suddenly and we don't want to | 
1392 | + * account recently dispatched IO with new low rate. | |
1393 | + */ | |
1394 | + throtl_start_new_slice(tg, 0); | |
1395 | + throtl_start_new_slice(tg, 1); | |
1396 | + | |
1397 | + if (tg->flags & THROTL_TG_PENDING) { | |
1398 | + tg_update_disptime(tg); | |
1399 | + throtl_schedule_next_dispatch(sq->parent_sq, true); | |
1400 | + } | |
1401 | + | |
1033 | 1402 | blkg_conf_finish(&ctx); |
1034 | 1403 | return 0; |
1035 | 1404 | } |
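After a limit write, tg_set_conf() now refreshes has_rules[] for the changed group and, through blkg_for_each_descendant_pre(), for its whole subtree, so a group counts as having rules iff it or any ancestor carries a limit. A small recursive model of that pre-order update; the tree layout and field names are invented:

    /* has_rules.c - propagate "has rules" down a throttle-group subtree */
    #include <stdbool.h>
    #include <stdio.h>

    struct grp {
        const char *name;
        bool own_limit;        /* this group has a bps/iops limit set */
        bool has_rules;        /* a limit exists somewhere up to the root */
        struct grp *parent;
        struct grp *children[4];
        int nr_children;
    };

    /* pre-order walk: a node's flag depends only on itself and its parent */
    static void update_has_rules(struct grp *g)
    {
        int i;

        g->has_rules = g->own_limit || (g->parent && g->parent->has_rules);
        for (i = 0; i < g->nr_children; i++)
            update_has_rules(g->children[i]);
    }

    int main(void)
    {
        struct grp root = { "root", false, false, NULL,  { 0 }, 0 };
        struct grp a    = { "a",    true,  false, &root, { 0 }, 0 };
        struct grp b    = { "a/b",  false, false, &a,    { 0 }, 0 };

        root.children[0] = &a; root.nr_children = 1;
        a.children[0] = &b;    a.nr_children = 1;

        update_has_rules(&root);
        printf("root=%d a=%d a/b=%d\n",
               root.has_rules, a.has_rules, b.has_rules);
        return 0;              /* prints root=0 a=1 a/b=1 */
    }

Groups with has_rules[] clear for a direction bypass blk-throttle entirely in blk_throtl_bio(), which is why the whole subtree has to be refreshed on every config change.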
... | ... | @@ -1092,7 +1461,7 @@ |
1092 | 1461 | { |
1093 | 1462 | struct throtl_data *td = q->td; |
1094 | 1463 | |
1095 | - cancel_delayed_work_sync(&td->throtl_work); | |
1464 | + cancel_work_sync(&td->dispatch_work); | |
1096 | 1465 | } |
1097 | 1466 | |
1098 | 1467 | static struct blkcg_policy blkcg_policy_throtl = { |
... | ... | @@ -1100,6 +1469,7 @@ |
1100 | 1469 | .cftypes = throtl_files, |
1101 | 1470 | |
1102 | 1471 | .pd_init_fn = throtl_pd_init, |
1472 | + .pd_online_fn = throtl_pd_online, | |
1103 | 1473 | .pd_exit_fn = throtl_pd_exit, |
1104 | 1474 | .pd_reset_stats_fn = throtl_pd_reset_stats, |
1105 | 1475 | }; |
1106 | 1476 | |
1107 | 1477 | |
1108 | 1478 | |
... | ... | @@ -1107,15 +1477,16 @@ |
1107 | 1477 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
1108 | 1478 | { |
1109 | 1479 | struct throtl_data *td = q->td; |
1480 | + struct throtl_qnode *qn = NULL; | |
1110 | 1481 | struct throtl_grp *tg; |
1111 | - bool rw = bio_data_dir(bio), update_disptime = true; | |
1482 | + struct throtl_service_queue *sq; | |
1483 | + bool rw = bio_data_dir(bio); | |
1112 | 1484 | struct blkcg *blkcg; |
1113 | 1485 | bool throttled = false; |
1114 | 1486 | |
1115 | - if (bio->bi_rw & REQ_THROTTLED) { | |
1116 | - bio->bi_rw &= ~REQ_THROTTLED; | |
1487 | + /* see throtl_charge_bio() */ | |
1488 | + if (bio->bi_rw & REQ_THROTTLED) | |
1117 | 1489 | goto out; |
1118 | - } | |
1119 | 1490 | |
1120 | 1491 | /* |
1121 | 1492 | * A throtl_grp pointer retrieved under rcu can be used to access |
... | ... | @@ -1126,7 +1497,7 @@ |
1126 | 1497 | blkcg = bio_blkcg(bio); |
1127 | 1498 | tg = throtl_lookup_tg(td, blkcg); |
1128 | 1499 | if (tg) { |
1129 | - if (tg_no_rule_group(tg, rw)) { | |
1500 | + if (!tg->has_rules[rw]) { | |
1130 | 1501 | throtl_update_dispatch_stats(tg_to_blkg(tg), |
1131 | 1502 | bio->bi_size, bio->bi_rw); |
1132 | 1503 | goto out_unlock_rcu; |
1133 | 1504 | |
1134 | 1505 | |
... | ... | @@ -1142,18 +1513,18 @@ |
1142 | 1513 | if (unlikely(!tg)) |
1143 | 1514 | goto out_unlock; |
1144 | 1515 | |
1145 | - if (tg->nr_queued[rw]) { | |
1146 | - /* | |
1147 | - * There is already another bio queued in same dir. No | |
1148 | - * need to update dispatch time. | |
1149 | - */ | |
1150 | - update_disptime = false; | |
1151 | - goto queue_bio; | |
1516 | + sq = &tg->service_queue; | |
1152 | 1517 | |
1153 | - } | |
1518 | + while (true) { | |
1519 | + /* throtl is FIFO - if bios are already queued, should queue */ | |
1520 | + if (sq->nr_queued[rw]) | |
1521 | + break; | |
1154 | 1522 | |
1155 | - /* Bio is with-in rate limit of group */ | |
1156 | - if (tg_may_dispatch(td, tg, bio, NULL)) { | |
1523 | + /* if above limits, break to queue */ | |
1524 | + if (!tg_may_dispatch(tg, bio, NULL)) | |
1525 | + break; | |
1526 | + | |
1527 | + /* within limits, let's charge and dispatch directly */ | |
1157 | 1528 | throtl_charge_bio(tg, bio); |
1158 | 1529 | |
1159 | 1530 | /* |
1160 | 1531 | |
1161 | 1532 | |
1162 | 1533 | |
... | ... | @@ -1167,25 +1538,41 @@ |
1167 | 1538 | * |
1168 | 1539 | * So keep on trimming slice even if bio is not queued. |
1169 | 1540 | */ |
1170 | - throtl_trim_slice(td, tg, rw); | |
1171 | - goto out_unlock; | |
1541 | + throtl_trim_slice(tg, rw); | |
1542 | + | |
1543 | + /* | |
1544 | + * @bio passed through this layer without being throttled. | |
1545 | + * Climb up the ladder. If we''re already at the top, it | |
1546 | + * can be executed directly. | |
1547 | + */ | |
1548 | + qn = &tg->qnode_on_parent[rw]; | |
1549 | + sq = sq->parent_sq; | |
1550 | + tg = sq_to_tg(sq); | |
1551 | + if (!tg) | |
1552 | + goto out_unlock; | |
1172 | 1553 | } |
1173 | 1554 | |
1174 | -queue_bio: | |
1175 | - throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" | |
1176 | - " iodisp=%u iops=%u queued=%d/%d", | |
1177 | - rw == READ ? 'R' : 'W', | |
1178 | - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | |
1179 | - tg->io_disp[rw], tg->iops[rw], | |
1180 | - tg->nr_queued[READ], tg->nr_queued[WRITE]); | |
1555 | + /* out-of-limit, queue to @tg */ | |
1556 | + throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", | |
1557 | + rw == READ ? 'R' : 'W', | |
1558 | + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | |
1559 | + tg->io_disp[rw], tg->iops[rw], | |
1560 | + sq->nr_queued[READ], sq->nr_queued[WRITE]); | |
1181 | 1561 | |
1182 | 1562 | bio_associate_current(bio); |
1183 | - throtl_add_bio_tg(q->td, tg, bio); | |
1563 | + tg->td->nr_queued[rw]++; | |
1564 | + throtl_add_bio_tg(bio, qn, tg); | |
1184 | 1565 | throttled = true; |
1185 | 1566 | |
1186 | - if (update_disptime) { | |
1187 | - tg_update_disptime(td, tg); | |
1188 | - throtl_schedule_next_dispatch(td); | |
1567 | + /* | |
1568 | + * Update @tg's dispatch time and force schedule dispatch if @tg | |
1569 | + * was empty before @bio. The forced scheduling isn't likely to | |
1570 | + * cause undue delay as @bio is likely to be dispatched directly if | |
1571 | + * its @tg's disptime is not in the future. | |
1572 | + */ | |
1573 | + if (tg->flags & THROTL_TG_WAS_EMPTY) { | |
1574 | + tg_update_disptime(tg); | |
1575 | + throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); | |
1189 | 1576 | } |
1190 | 1577 | |
1191 | 1578 | out_unlock: |
1192 | 1579 | |
... | ... | @@ -1193,9 +1580,38 @@ |
1193 | 1580 | out_unlock_rcu: |
1194 | 1581 | rcu_read_unlock(); |
1195 | 1582 | out: |
1583 | + /* | |
1584 | + * As multiple blk-throtls may stack in the same issue path, we | |
1585 | + * don't want bios to leave with the flag set. Clear the flag if | |
1586 | + * being issued. | |
1587 | + */ | |
1588 | + if (!throttled) | |
1589 | + bio->bi_rw &= ~REQ_THROTTLED; | |
1196 | 1590 | return throttled; |
1197 | 1591 | } |
1198 | 1592 | |
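The rewritten blk_throtl_bio() climbs the hierarchy one level at a time: the bio is queued as soon as a level already has bios waiting (throttling stays FIFO) or would exceed its limit; otherwise it is charged at that level and the walk moves to the parent, and a bio that clears every level is issued directly. A condensed model of that loop with invented byte budgets:

    /* climb_ladder.c - per-level charge-or-queue walk as in blk_throtl_bio() */
    #include <stdio.h>

    struct grp {
        const char *name;
        int nr_queued;          /* bios already waiting at this level */
        long budget;            /* bytes still allowed in the current slice */
        struct grp *parent;     /* NULL above the top-level group */
    };

    /* return the group the bio gets queued on, or NULL if it may be issued */
    static struct grp *throttle_bio(struct grp *tg, long bio_bytes)
    {
        while (tg) {
            if (tg->nr_queued)           /* FIFO: someone is already waiting */
                break;
            if (bio_bytes > tg->budget)  /* over this level's limit */
                break;
            tg->budget -= bio_bytes;     /* charge here and climb one level */
            tg = tg->parent;
        }
        if (tg)
            tg->nr_queued++;             /* stopped somewhere: queue there */
        return tg;
    }

    int main(void)
    {
        struct grp root  = { "root",  0, 1 << 20, NULL };
        struct grp child = { "child", 0, 4096,    &root };
        struct grp *where = throttle_bio(&child, 8192);

        printf("%s\n", where ? where->name : "issued directly");
        return 0;           /* 8 KiB exceeds the child's budget -> "child" */
    }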
1593 | +/* | |
1594 | + * Dispatch all bios from all children tg's queued on @parent_sq. On | |
1595 | + * return, @parent_sq is guaranteed to not have any active children tg's | |
1596 | + * and all bios from previously active tg's are on @parent_sq->bio_lists[]. | |
1597 | + */ | |
1598 | +static void tg_drain_bios(struct throtl_service_queue *parent_sq) | |
1599 | +{ | |
1600 | + struct throtl_grp *tg; | |
1601 | + | |
1602 | + while ((tg = throtl_rb_first(parent_sq))) { | |
1603 | + struct throtl_service_queue *sq = &tg->service_queue; | |
1604 | + struct bio *bio; | |
1605 | + | |
1606 | + throtl_dequeue_tg(tg); | |
1607 | + | |
1608 | + while ((bio = throtl_peek_queued(&sq->queued[READ]))) | |
1609 | + tg_dispatch_one_bio(tg, bio_data_dir(bio)); | |
1610 | + while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) | |
1611 | + tg_dispatch_one_bio(tg, bio_data_dir(bio)); | |
1612 | + } | |
1613 | +} | |
1614 | + | |
1199 | 1615 | /** |
1200 | 1616 | * blk_throtl_drain - drain throttled bios |
1201 | 1617 | * @q: request_queue to drain throttled bios for |
1202 | 1618 | |
1203 | 1619 | |
1204 | 1620 | |
1205 | 1621 | |
1206 | 1622 | |
1207 | 1623 | |
... | ... | @@ -1206,27 +1622,36 @@ |
1206 | 1622 | __releases(q->queue_lock) __acquires(q->queue_lock) |
1207 | 1623 | { |
1208 | 1624 | struct throtl_data *td = q->td; |
1209 | - struct throtl_rb_root *st = &td->tg_service_tree; | |
1210 | - struct throtl_grp *tg; | |
1211 | - struct bio_list bl; | |
1625 | + struct blkcg_gq *blkg; | |
1626 | + struct cgroup *pos_cgrp; | |
1212 | 1627 | struct bio *bio; |
1628 | + int rw; | |
1213 | 1629 | |
1214 | 1630 | queue_lockdep_assert_held(q); |
1631 | + rcu_read_lock(); | |
1215 | 1632 | |
1216 | - bio_list_init(&bl); | |
1633 | + /* | |
1634 | + * Drain each tg while doing post-order walk on the blkg tree, so | |
1635 | + * that all bios are propagated to td->service_queue. It'd be | |
1636 | + * better to walk service_queue tree directly but blkg walk is | |
1637 | + * easier. | |
1638 | + */ | |
1639 | + blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) | |
1640 | + tg_drain_bios(&blkg_to_tg(blkg)->service_queue); | |
1217 | 1641 | |
1218 | - while ((tg = throtl_rb_first(st))) { | |
1219 | - throtl_dequeue_tg(td, tg); | |
1642 | + tg_drain_bios(&td_root_tg(td)->service_queue); | |
1220 | 1643 | |
1221 | - while ((bio = bio_list_peek(&tg->bio_lists[READ]))) | |
1222 | - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | |
1223 | - while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | |
1224 | - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | |
1225 | - } | |
1644 | + /* finally, transfer bios from top-level tg's into the td */ | |
1645 | + tg_drain_bios(&td->service_queue); | |
1646 | + | |
1647 | + rcu_read_unlock(); | |
1226 | 1648 | spin_unlock_irq(q->queue_lock); |
1227 | 1649 | |
1228 | - while ((bio = bio_list_pop(&bl))) | |
1229 | - generic_make_request(bio); | |
1650 | + /* all bios now should be in td->service_queue, issue them */ | |
1651 | + for (rw = READ; rw <= WRITE; rw++) | |
1652 | + while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], | |
1653 | + NULL))) | |
1654 | + generic_make_request(bio); | |
1230 | 1655 | |
1231 | 1656 | spin_lock_irq(q->queue_lock); |
1232 | 1657 | } |
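blk_throtl_drain() now drains every group into its parent during a post-order walk of the blkg tree, so all throttled bios end up on td->service_queue before being issued. A recursive post-order sketch over a toy three-level tree (the structure and counts are invented):

    /* drain_postorder.c - drain children into parents, then issue at the top */
    #include <stdio.h>

    struct grp {
        const char *name;
        int queued;
        struct grp *children[4];
        int nr_children;
    };

    /* post-order: every child is drained before its parent is drained */
    static void drain_into(struct grp *g, struct grp *parent)
    {
        int i;

        for (i = 0; i < g->nr_children; i++)
            drain_into(g->children[i], g);
        if (parent) {
            parent->queued += g->queued;
            g->queued = 0;
        }
    }

    int main(void)
    {
        struct grp leaf = { "leaf", 2, { 0 },     0 };
        struct grp mid  = { "mid",  1, { &leaf }, 1 };
        struct grp top  = { "top",  0, { &mid },  1 };  /* td->service_queue */

        drain_into(&top, NULL);
        printf("issuing %d bios collected at the top level\n", top.queued);
        return 0;                                       /* prints 3 */
    }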
... | ... | @@ -1240,9 +1665,8 @@ |
1240 | 1665 | if (!td) |
1241 | 1666 | return -ENOMEM; |
1242 | 1667 | |
1243 | - td->tg_service_tree = THROTL_RB_ROOT; | |
1244 | - td->limits_changed = false; | |
1245 | - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | |
1668 | + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); | |
1669 | + throtl_service_queue_init(&td->service_queue, NULL); | |
1246 | 1670 | |
1247 | 1671 | q->td = td; |
1248 | 1672 | td->queue = q; |
block/cfq-iosched.c
... | ... | @@ -4347,18 +4347,28 @@ |
4347 | 4347 | kfree(cfqd); |
4348 | 4348 | } |
4349 | 4349 | |
4350 | -static int cfq_init_queue(struct request_queue *q) | |
4350 | +static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) | |
4351 | 4351 | { |
4352 | 4352 | struct cfq_data *cfqd; |
4353 | 4353 | struct blkcg_gq *blkg __maybe_unused; |
4354 | 4354 | int i, ret; |
4355 | + struct elevator_queue *eq; | |
4355 | 4356 | |
4357 | + eq = elevator_alloc(q, e); | |
4358 | + if (!eq) | |
4359 | + return -ENOMEM; | |
4360 | + | |
4356 | 4361 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
4357 | - if (!cfqd) | |
4362 | + if (!cfqd) { | |
4363 | + kobject_put(&eq->kobj); | |
4358 | 4364 | return -ENOMEM; |
4365 | + } | |
4366 | + eq->elevator_data = cfqd; | |
4359 | 4367 | |
4360 | 4368 | cfqd->queue = q; |
4361 | - q->elevator->elevator_data = cfqd; | |
4369 | + spin_lock_irq(q->queue_lock); | |
4370 | + q->elevator = eq; | |
4371 | + spin_unlock_irq(q->queue_lock); | |
4362 | 4372 | |
4363 | 4373 | /* Init root service tree */ |
4364 | 4374 | cfqd->grp_service_tree = CFQ_RB_ROOT; |
... | ... | @@ -4433,6 +4443,7 @@ |
4433 | 4443 | |
4434 | 4444 | out_free: |
4435 | 4445 | kfree(cfqd); |
4446 | + kobject_put(&eq->kobj); | |
4436 | 4447 | return ret; |
4437 | 4448 | } |
4438 | 4449 |
block/deadline-iosched.c
... | ... | @@ -337,13 +337,21 @@ |
337 | 337 | /* |
338 | 338 | * initialize elevator private data (deadline_data). |
339 | 339 | */ |
340 | -static int deadline_init_queue(struct request_queue *q) | |
340 | +static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) | |
341 | 341 | { |
342 | 342 | struct deadline_data *dd; |
343 | + struct elevator_queue *eq; | |
343 | 344 | |
345 | + eq = elevator_alloc(q, e); | |
346 | + if (!eq) | |
347 | + return -ENOMEM; | |
348 | + | |
344 | 349 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); |
345 | - if (!dd) | |
350 | + if (!dd) { | |
351 | + kobject_put(&eq->kobj); | |
346 | 352 | return -ENOMEM; |
353 | + } | |
354 | + eq->elevator_data = dd; | |
347 | 355 | |
348 | 356 | INIT_LIST_HEAD(&dd->fifo_list[READ]); |
349 | 357 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); |
... | ... | @@ -355,7 +363,9 @@ |
355 | 363 | dd->front_merges = 1; |
356 | 364 | dd->fifo_batch = fifo_batch; |
357 | 365 | |
358 | - q->elevator->elevator_data = dd; | |
366 | + spin_lock_irq(q->queue_lock); | |
367 | + q->elevator = eq; | |
368 | + spin_unlock_irq(q->queue_lock); | |
359 | 369 | return 0; |
360 | 370 | } |
361 | 371 |
block/elevator.c
... | ... | @@ -150,7 +150,7 @@ |
150 | 150 | |
151 | 151 | static struct kobj_type elv_ktype; |
152 | 152 | |
153 | -static struct elevator_queue *elevator_alloc(struct request_queue *q, | |
153 | +struct elevator_queue *elevator_alloc(struct request_queue *q, | |
154 | 154 | struct elevator_type *e) |
155 | 155 | { |
156 | 156 | struct elevator_queue *eq; |
... | ... | @@ -170,6 +170,7 @@ |
170 | 170 | elevator_put(e); |
171 | 171 | return NULL; |
172 | 172 | } |
173 | +EXPORT_SYMBOL(elevator_alloc); | |
173 | 174 | |
174 | 175 | static void elevator_release(struct kobject *kobj) |
175 | 176 | { |
... | ... | @@ -221,16 +222,7 @@ |
221 | 222 | } |
222 | 223 | } |
223 | 224 | |
224 | - q->elevator = elevator_alloc(q, e); | |
225 | - if (!q->elevator) | |
226 | - return -ENOMEM; | |
227 | - | |
228 | - err = e->ops.elevator_init_fn(q); | |
229 | - if (err) { | |
230 | - kobject_put(&q->elevator->kobj); | |
231 | - return err; | |
232 | - } | |
233 | - | |
225 | + err = e->ops.elevator_init_fn(q, e); | |
234 | 226 | return 0; |
235 | 227 | } |
236 | 228 | EXPORT_SYMBOL(elevator_init); |
237 | 229 | |
... | ... | @@ -935,16 +927,9 @@ |
935 | 927 | spin_unlock_irq(q->queue_lock); |
936 | 928 | |
937 | 929 | /* allocate, init and register new elevator */ |
938 | - err = -ENOMEM; | |
939 | - q->elevator = elevator_alloc(q, new_e); | |
940 | - if (!q->elevator) | |
930 | + err = new_e->ops.elevator_init_fn(q, new_e); | |
931 | + if (err) | |
941 | 932 | goto fail_init; |
942 | - | |
943 | - err = new_e->ops.elevator_init_fn(q); | |
944 | - if (err) { | |
945 | - kobject_put(&q->elevator->kobj); | |
946 | - goto fail_init; | |
947 | - } | |
948 | 933 | |
949 | 934 | if (registered) { |
950 | 935 | err = elv_register_queue(q); |
block/noop-iosched.c
... | ... | @@ -59,16 +59,27 @@ |
59 | 59 | return list_entry(rq->queuelist.next, struct request, queuelist); |
60 | 60 | } |
61 | 61 | |
62 | -static int noop_init_queue(struct request_queue *q) | |
62 | +static int noop_init_queue(struct request_queue *q, struct elevator_type *e) | |
63 | 63 | { |
64 | 64 | struct noop_data *nd; |
65 | + struct elevator_queue *eq; | |
65 | 66 | |
67 | + eq = elevator_alloc(q, e); | |
68 | + if (!eq) | |
69 | + return -ENOMEM; | |
70 | + | |
66 | 71 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); |
67 | - if (!nd) | |
72 | + if (!nd) { | |
73 | + kobject_put(&eq->kobj); | |
68 | 74 | return -ENOMEM; |
75 | + } | |
76 | + eq->elevator_data = nd; | |
69 | 77 | |
70 | 78 | INIT_LIST_HEAD(&nd->queue); |
71 | - q->elevator->elevator_data = nd; | |
79 | + | |
80 | + spin_lock_irq(q->queue_lock); | |
81 | + q->elevator = eq; | |
82 | + spin_unlock_irq(q->queue_lock); | |
72 | 83 | return 0; |
73 | 84 | } |
74 | 85 |
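All three schedulers now share the same init shape: the init function receives the elevator_type, allocates its own elevator_queue with the newly exported elevator_alloc(), hooks up elevator_data, and only then publishes q->elevator under the queue lock, dropping the half-built queue with kobject_put() on failure. A rough userspace analogue of that construct-then-publish pattern; the types and the mutex below are stand-ins, not the block-layer API:

    /* init_then_publish.c - build the new object fully before publishing it */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct elv_queue { void *elevator_data; };

    struct queue {
        pthread_mutex_t lock;
        struct elv_queue *elevator;  /* readers see old or new, never half-built */
    };

    struct noop_data { int dummy; };

    static int noop_init_queue(struct queue *q)
    {
        struct elv_queue *eq = calloc(1, sizeof(*eq));
        struct noop_data *nd;

        if (!eq)
            return -1;
        nd = calloc(1, sizeof(*nd));
        if (!nd) {
            free(eq);                 /* kobject_put() in the kernel version */
            return -1;
        }
        eq->elevator_data = nd;       /* finish construction first ... */

        pthread_mutex_lock(&q->lock);
        q->elevator = eq;             /* ... then publish under the lock */
        pthread_mutex_unlock(&q->lock);
        return 0;
    }

    int main(void)
    {
        struct queue q = { PTHREAD_MUTEX_INITIALIZER, NULL };

        if (noop_init_queue(&q) == 0)
            printf("elevator published with private data %p\n",
                   q.elevator->elevator_data);
        free(q.elevator ? q.elevator->elevator_data : NULL);
        free(q.elevator);
        return 0;
    }

Previously elevator_init()/elevator_change() allocated the queue and only afterwards called the per-scheduler init, leaving a window where q->elevator existed without its private data; that is the window the elevator-switch race fix closes.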
fs/block_dev.c
... | ... | @@ -58,17 +58,24 @@ |
58 | 58 | struct backing_dev_info *dst) |
59 | 59 | { |
60 | 60 | struct backing_dev_info *old = inode->i_data.backing_dev_info; |
61 | + bool wakeup_bdi = false; | |
61 | 62 | |
62 | 63 | if (unlikely(dst == old)) /* deadlock avoidance */ |
63 | 64 | return; |
64 | 65 | bdi_lock_two(&old->wb, &dst->wb); |
65 | 66 | spin_lock(&inode->i_lock); |
66 | 67 | inode->i_data.backing_dev_info = dst; |
67 | - if (inode->i_state & I_DIRTY) | |
68 | + if (inode->i_state & I_DIRTY) { | |
69 | + if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) | |
70 | + wakeup_bdi = true; | |
68 | 71 | list_move(&inode->i_wb_list, &dst->wb.b_dirty); |
72 | + } | |
69 | 73 | spin_unlock(&inode->i_lock); |
70 | 74 | spin_unlock(&old->wb.list_lock); |
71 | 75 | spin_unlock(&dst->wb.list_lock); |
76 | + | |
77 | + if (wakeup_bdi) | |
78 | + bdi_wakeup_thread_delayed(dst); | |
72 | 79 | } |
73 | 80 | |
74 | 81 | /* Kill _all_ buffers and pagecache , dirty or not.. */ |
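The fs/block_dev.c change is the periodic-writeback fix: when a dirty inode is moved to a bdi that currently has no dirty IO, the destination's flusher has to be woken, otherwise its periodic writeback never gets going for the freshly moved inode. A tiny model of that wake-only-on-the-idle-to-busy transition; the fields and helpers are invented:

    /* wakeup_on_first_dirty.c - wake the flusher only when the target was idle */
    #include <stdbool.h>
    #include <stdio.h>

    struct bdi {
        const char *name;
        int nr_dirty;                 /* dirty inodes on this backing device */
        bool supports_writeback;
    };

    static void wake_flusher(struct bdi *b)
    {
        printf("waking flusher for %s\n", b->name);
    }

    static void move_dirty_inode(struct bdi *src, struct bdi *dst)
    {
        /*
         * Decide before the move: wake only if @dst had no dirty IO yet.
         * If it already had some, its periodic writeback is running anyway.
         */
        bool wakeup = dst->supports_writeback && dst->nr_dirty == 0;

        src->nr_dirty--;
        dst->nr_dirty++;
        if (wakeup)
            wake_flusher(dst);
    }

    int main(void)
    {
        struct bdi old_bdi = { "old", 1, true };
        struct bdi new_bdi = { "new", 0, true };

        move_dirty_inode(&old_bdi, &new_bdi);   /* first dirty inode: wakes */
        return 0;
    }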
include/linux/cgroup.h
include/linux/elevator.h
... | ... | @@ -7,6 +7,7 @@ |
7 | 7 | #ifdef CONFIG_BLOCK |
8 | 8 | |
9 | 9 | struct io_cq; |
10 | +struct elevator_type; | |
10 | 11 | |
11 | 12 | typedef int (elevator_merge_fn) (struct request_queue *, struct request **, |
12 | 13 | struct bio *); |
... | ... | @@ -35,7 +36,8 @@ |
35 | 36 | typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); |
36 | 37 | typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); |
37 | 38 | |
38 | -typedef int (elevator_init_fn) (struct request_queue *); | |
39 | +typedef int (elevator_init_fn) (struct request_queue *, | |
40 | + struct elevator_type *e); | |
39 | 41 | typedef void (elevator_exit_fn) (struct elevator_queue *); |
40 | 42 | |
41 | 43 | struct elevator_ops |
... | ... | @@ -155,6 +157,8 @@ |
155 | 157 | extern void elevator_exit(struct elevator_queue *); |
156 | 158 | extern int elevator_change(struct request_queue *, const char *); |
157 | 159 | extern bool elv_rq_merge_ok(struct request *, struct bio *); |
160 | +extern struct elevator_queue *elevator_alloc(struct request_queue *, | |
161 | + struct elevator_type *); | |
158 | 162 | |
159 | 163 | /* |
160 | 164 | * Helper functions. |