Commit 0049af73bb4b74d1407db59caefc5fe057ee434a

Authored by Tejun Heo
1 parent e2d57e6019

blk-throttle: reorganize throtl_service_queue passed around as argument

throtl_service_queue will be the building block of hierarchy support
and will form a tree.  This patch updates how it is passed around as an
argument to reduce confusion.

* When a service queue is used as the parent role - the host of the
  rbtree - use @parent_sq instead of @sq.

* For functions taking both @tg and @parent_sq, reorder them so that
  the order is (@tg, @parent_sq), not the other way around.  This makes
  the code follow the usual convention of specifying the primary
  target of the operation as the first argument, as illustrated below.
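
For example, throtl_enqueue_tg()'s prototype changes as follows (a minimal
before/after sketch lifted from the diff below, shown only to illustrate
the convention):

  /* before: the rbtree-hosting service queue came first */
  static void throtl_enqueue_tg(struct throtl_service_queue *sq,
                                struct throtl_grp *tg);

  /* after: the primary target @tg comes first, the host is @parent_sq */
  static void throtl_enqueue_tg(struct throtl_grp *tg,
                                struct throtl_service_queue *parent_sq);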

This patch doesn't make any functional differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

Showing 1 changed file with 51 additions and 49 deletions

block/blk-throttle.c
1 /* 1 /*
2 * Interface for controlling IO bandwidth on a request queue 2 * Interface for controlling IO bandwidth on a request queue
3 * 3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */ 5 */
6 6
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/bio.h> 10 #include <linux/bio.h>
11 #include <linux/blktrace_api.h> 11 #include <linux/blktrace_api.h>
12 #include "blk-cgroup.h" 12 #include "blk-cgroup.h"
13 #include "blk.h" 13 #include "blk.h"
14 14
15 /* Max dispatch from a group in 1 round */ 15 /* Max dispatch from a group in 1 round */
16 static int throtl_grp_quantum = 8; 16 static int throtl_grp_quantum = 8;
17 17
18 /* Total max dispatch from all groups in one round */ 18 /* Total max dispatch from all groups in one round */
19 static int throtl_quantum = 32; 19 static int throtl_quantum = 32;
20 20
21 /* Throttling is performed over 100ms slice and after that slice is renewed */ 21 /* Throttling is performed over 100ms slice and after that slice is renewed */
22 static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22 static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 23
24 static struct blkcg_policy blkcg_policy_throtl; 24 static struct blkcg_policy blkcg_policy_throtl;
25 25
26 /* A workqueue to queue throttle related work */ 26 /* A workqueue to queue throttle related work */
27 static struct workqueue_struct *kthrotld_workqueue; 27 static struct workqueue_struct *kthrotld_workqueue;
28 28
29 struct throtl_service_queue { 29 struct throtl_service_queue {
30 struct rb_root pending_tree; /* RB tree of active tgs */ 30 struct rb_root pending_tree; /* RB tree of active tgs */
31 struct rb_node *first_pending; /* first node in the tree */ 31 struct rb_node *first_pending; /* first node in the tree */
32 unsigned int nr_pending; /* # queued in the tree */ 32 unsigned int nr_pending; /* # queued in the tree */
33 unsigned long first_pending_disptime; /* disptime of the first tg */ 33 unsigned long first_pending_disptime; /* disptime of the first tg */
34 }; 34 };
35 35
36 #define THROTL_SERVICE_QUEUE_INITIALIZER \ 36 #define THROTL_SERVICE_QUEUE_INITIALIZER \
37 (struct throtl_service_queue){ .pending_tree = RB_ROOT } 37 (struct throtl_service_queue){ .pending_tree = RB_ROOT }
38 38
39 enum tg_state_flags { 39 enum tg_state_flags {
40 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 40 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
41 }; 41 };
42 42
43 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 43 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
44 44
45 /* Per-cpu group stats */ 45 /* Per-cpu group stats */
46 struct tg_stats_cpu { 46 struct tg_stats_cpu {
47 /* total bytes transferred */ 47 /* total bytes transferred */
48 struct blkg_rwstat service_bytes; 48 struct blkg_rwstat service_bytes;
49 /* total IOs serviced, post merge */ 49 /* total IOs serviced, post merge */
50 struct blkg_rwstat serviced; 50 struct blkg_rwstat serviced;
51 }; 51 };
52 52
53 struct throtl_grp { 53 struct throtl_grp {
54 /* must be the first member */ 54 /* must be the first member */
55 struct blkg_policy_data pd; 55 struct blkg_policy_data pd;
56 56
57 /* active throtl group service_queue member */ 57 /* active throtl group service_queue member */
58 struct rb_node rb_node; 58 struct rb_node rb_node;
59 59
60 /* throtl_data this group belongs to */ 60 /* throtl_data this group belongs to */
61 struct throtl_data *td; 61 struct throtl_data *td;
62 62
63 /* 63 /*
64 * Dispatch time in jiffies. This is the estimated time when group 64 * Dispatch time in jiffies. This is the estimated time when group
65 * will unthrottle and is ready to dispatch more bio. It is used as 65 * will unthrottle and is ready to dispatch more bio. It is used as
66 * key to sort active groups in service tree. 66 * key to sort active groups in service tree.
67 */ 67 */
68 unsigned long disptime; 68 unsigned long disptime;
69 69
70 unsigned int flags; 70 unsigned int flags;
71 71
72 /* Two lists for READ and WRITE */ 72 /* Two lists for READ and WRITE */
73 struct bio_list bio_lists[2]; 73 struct bio_list bio_lists[2];
74 74
75 /* Number of queued bios on READ and WRITE lists */ 75 /* Number of queued bios on READ and WRITE lists */
76 unsigned int nr_queued[2]; 76 unsigned int nr_queued[2];
77 77
78 /* bytes per second rate limits */ 78 /* bytes per second rate limits */
79 uint64_t bps[2]; 79 uint64_t bps[2];
80 80
81 /* IOPS limits */ 81 /* IOPS limits */
82 unsigned int iops[2]; 82 unsigned int iops[2];
83 83
84 /* Number of bytes disptached in current slice */ 84 /* Number of bytes disptached in current slice */
85 uint64_t bytes_disp[2]; 85 uint64_t bytes_disp[2];
86 /* Number of bio's dispatched in current slice */ 86 /* Number of bio's dispatched in current slice */
87 unsigned int io_disp[2]; 87 unsigned int io_disp[2];
88 88
89 /* When did we start a new slice */ 89 /* When did we start a new slice */
90 unsigned long slice_start[2]; 90 unsigned long slice_start[2];
91 unsigned long slice_end[2]; 91 unsigned long slice_end[2];
92 92
93 /* Per cpu stats pointer */ 93 /* Per cpu stats pointer */
94 struct tg_stats_cpu __percpu *stats_cpu; 94 struct tg_stats_cpu __percpu *stats_cpu;
95 95
96 /* List of tgs waiting for per cpu stats memory to be allocated */ 96 /* List of tgs waiting for per cpu stats memory to be allocated */
97 struct list_head stats_alloc_node; 97 struct list_head stats_alloc_node;
98 }; 98 };
99 99
100 struct throtl_data 100 struct throtl_data
101 { 101 {
102 /* service tree for active throtl groups */ 102 /* service tree for active throtl groups */
103 struct throtl_service_queue service_queue; 103 struct throtl_service_queue service_queue;
104 104
105 struct request_queue *queue; 105 struct request_queue *queue;
106 106
107 /* Total Number of queued bios on READ and WRITE lists */ 107 /* Total Number of queued bios on READ and WRITE lists */
108 unsigned int nr_queued[2]; 108 unsigned int nr_queued[2];
109 109
110 /* 110 /*
111 * number of total undestroyed groups 111 * number of total undestroyed groups
112 */ 112 */
113 unsigned int nr_undestroyed_grps; 113 unsigned int nr_undestroyed_grps;
114 114
115 /* Work for dispatching throttled bios */ 115 /* Work for dispatching throttled bios */
116 struct delayed_work dispatch_work; 116 struct delayed_work dispatch_work;
117 }; 117 };
118 118
119 /* list and work item to allocate percpu group stats */ 119 /* list and work item to allocate percpu group stats */
120 static DEFINE_SPINLOCK(tg_stats_alloc_lock); 120 static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121 static LIST_HEAD(tg_stats_alloc_list); 121 static LIST_HEAD(tg_stats_alloc_list);
122 122
123 static void tg_stats_alloc_fn(struct work_struct *); 123 static void tg_stats_alloc_fn(struct work_struct *);
124 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 124 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 125
126 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 126 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127 { 127 {
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129 } 129 }
130 130
131 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) 131 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132 { 132 {
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); 133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134 } 134 }
135 135
136 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 136 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137 { 137 {
138 return pd_to_blkg(&tg->pd); 138 return pd_to_blkg(&tg->pd);
139 } 139 }
140 140
141 static inline struct throtl_grp *td_root_tg(struct throtl_data *td) 141 static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142 { 142 {
143 return blkg_to_tg(td->queue->root_blkg); 143 return blkg_to_tg(td->queue->root_blkg);
144 } 144 }
145 145
146 #define throtl_log_tg(tg, fmt, args...) do { \ 146 #define throtl_log_tg(tg, fmt, args...) do { \
147 char __pbuf[128]; \ 147 char __pbuf[128]; \
148 \ 148 \
149 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ 149 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
150 blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \ 150 blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
151 } while (0) 151 } while (0)
152 152
153 #define throtl_log(td, fmt, args...) \ 153 #define throtl_log(td, fmt, args...) \
154 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 154 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
155 155
156 /* 156 /*
157 * Worker for allocating per cpu stat for tgs. This is scheduled on the 157 * Worker for allocating per cpu stat for tgs. This is scheduled on the
158 * system_wq once there are some groups on the alloc_list waiting for 158 * system_wq once there are some groups on the alloc_list waiting for
159 * allocation. 159 * allocation.
160 */ 160 */
161 static void tg_stats_alloc_fn(struct work_struct *work) 161 static void tg_stats_alloc_fn(struct work_struct *work)
162 { 162 {
163 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 163 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
164 struct delayed_work *dwork = to_delayed_work(work); 164 struct delayed_work *dwork = to_delayed_work(work);
165 bool empty = false; 165 bool empty = false;
166 166
167 alloc_stats: 167 alloc_stats:
168 if (!stats_cpu) { 168 if (!stats_cpu) {
169 stats_cpu = alloc_percpu(struct tg_stats_cpu); 169 stats_cpu = alloc_percpu(struct tg_stats_cpu);
170 if (!stats_cpu) { 170 if (!stats_cpu) {
171 /* allocation failed, try again after some time */ 171 /* allocation failed, try again after some time */
172 schedule_delayed_work(dwork, msecs_to_jiffies(10)); 172 schedule_delayed_work(dwork, msecs_to_jiffies(10));
173 return; 173 return;
174 } 174 }
175 } 175 }
176 176
177 spin_lock_irq(&tg_stats_alloc_lock); 177 spin_lock_irq(&tg_stats_alloc_lock);
178 178
179 if (!list_empty(&tg_stats_alloc_list)) { 179 if (!list_empty(&tg_stats_alloc_list)) {
180 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 180 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
181 struct throtl_grp, 181 struct throtl_grp,
182 stats_alloc_node); 182 stats_alloc_node);
183 swap(tg->stats_cpu, stats_cpu); 183 swap(tg->stats_cpu, stats_cpu);
184 list_del_init(&tg->stats_alloc_node); 184 list_del_init(&tg->stats_alloc_node);
185 } 185 }
186 186
187 empty = list_empty(&tg_stats_alloc_list); 187 empty = list_empty(&tg_stats_alloc_list);
188 spin_unlock_irq(&tg_stats_alloc_lock); 188 spin_unlock_irq(&tg_stats_alloc_lock);
189 if (!empty) 189 if (!empty)
190 goto alloc_stats; 190 goto alloc_stats;
191 } 191 }
192 192
193 static void throtl_pd_init(struct blkcg_gq *blkg) 193 static void throtl_pd_init(struct blkcg_gq *blkg)
194 { 194 {
195 struct throtl_grp *tg = blkg_to_tg(blkg); 195 struct throtl_grp *tg = blkg_to_tg(blkg);
196 unsigned long flags; 196 unsigned long flags;
197 197
198 RB_CLEAR_NODE(&tg->rb_node); 198 RB_CLEAR_NODE(&tg->rb_node);
199 tg->td = blkg->q->td; 199 tg->td = blkg->q->td;
200 bio_list_init(&tg->bio_lists[0]); 200 bio_list_init(&tg->bio_lists[0]);
201 bio_list_init(&tg->bio_lists[1]); 201 bio_list_init(&tg->bio_lists[1]);
202 202
203 tg->bps[READ] = -1; 203 tg->bps[READ] = -1;
204 tg->bps[WRITE] = -1; 204 tg->bps[WRITE] = -1;
205 tg->iops[READ] = -1; 205 tg->iops[READ] = -1;
206 tg->iops[WRITE] = -1; 206 tg->iops[WRITE] = -1;
207 207
208 /* 208 /*
209 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 209 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
210 * but percpu allocator can't be called from IO path. Queue tg on 210 * but percpu allocator can't be called from IO path. Queue tg on
211 * tg_stats_alloc_list and allocate from work item. 211 * tg_stats_alloc_list and allocate from work item.
212 */ 212 */
213 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 213 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
214 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); 214 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
215 schedule_delayed_work(&tg_stats_alloc_work, 0); 215 schedule_delayed_work(&tg_stats_alloc_work, 0);
216 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 216 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
217 } 217 }
218 218
219 static void throtl_pd_exit(struct blkcg_gq *blkg) 219 static void throtl_pd_exit(struct blkcg_gq *blkg)
220 { 220 {
221 struct throtl_grp *tg = blkg_to_tg(blkg); 221 struct throtl_grp *tg = blkg_to_tg(blkg);
222 unsigned long flags; 222 unsigned long flags;
223 223
224 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 224 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
225 list_del_init(&tg->stats_alloc_node); 225 list_del_init(&tg->stats_alloc_node);
226 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 226 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
227 227
228 free_percpu(tg->stats_cpu); 228 free_percpu(tg->stats_cpu);
229 } 229 }
230 230
231 static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 231 static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
232 { 232 {
233 struct throtl_grp *tg = blkg_to_tg(blkg); 233 struct throtl_grp *tg = blkg_to_tg(blkg);
234 int cpu; 234 int cpu;
235 235
236 if (tg->stats_cpu == NULL) 236 if (tg->stats_cpu == NULL)
237 return; 237 return;
238 238
239 for_each_possible_cpu(cpu) { 239 for_each_possible_cpu(cpu) {
240 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 240 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
241 241
242 blkg_rwstat_reset(&sc->service_bytes); 242 blkg_rwstat_reset(&sc->service_bytes);
243 blkg_rwstat_reset(&sc->serviced); 243 blkg_rwstat_reset(&sc->serviced);
244 } 244 }
245 } 245 }
246 246
247 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 247 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
248 struct blkcg *blkcg) 248 struct blkcg *blkcg)
249 { 249 {
250 /* 250 /*
251 * This is the common case when there are no blkcgs. Avoid lookup 251 * This is the common case when there are no blkcgs. Avoid lookup
252 * in this case 252 * in this case
253 */ 253 */
254 if (blkcg == &blkcg_root) 254 if (blkcg == &blkcg_root)
255 return td_root_tg(td); 255 return td_root_tg(td);
256 256
257 return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 257 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
258 } 258 }
259 259
260 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 260 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
261 struct blkcg *blkcg) 261 struct blkcg *blkcg)
262 { 262 {
263 struct request_queue *q = td->queue; 263 struct request_queue *q = td->queue;
264 struct throtl_grp *tg = NULL; 264 struct throtl_grp *tg = NULL;
265 265
266 /* 266 /*
267 * This is the common case when there are no blkcgs. Avoid lookup 267 * This is the common case when there are no blkcgs. Avoid lookup
268 * in this case 268 * in this case
269 */ 269 */
270 if (blkcg == &blkcg_root) { 270 if (blkcg == &blkcg_root) {
271 tg = td_root_tg(td); 271 tg = td_root_tg(td);
272 } else { 272 } else {
273 struct blkcg_gq *blkg; 273 struct blkcg_gq *blkg;
274 274
275 blkg = blkg_lookup_create(blkcg, q); 275 blkg = blkg_lookup_create(blkcg, q);
276 276
277 /* if %NULL and @q is alive, fall back to root_tg */ 277 /* if %NULL and @q is alive, fall back to root_tg */
278 if (!IS_ERR(blkg)) 278 if (!IS_ERR(blkg))
279 tg = blkg_to_tg(blkg); 279 tg = blkg_to_tg(blkg);
280 else if (!blk_queue_dying(q)) 280 else if (!blk_queue_dying(q))
281 tg = td_root_tg(td); 281 tg = td_root_tg(td);
282 } 282 }
283 283
284 return tg; 284 return tg;
285 } 285 }
286 286
287 static struct throtl_grp *throtl_rb_first(struct throtl_service_queue *sq) 287 static struct throtl_grp *
                                                                               288 throtl_rb_first(struct throtl_service_queue *parent_sq)
288 { 289 {
289 /* Service tree is empty */ 290 /* Service tree is empty */
290 if (!sq->nr_pending) 291 if (!parent_sq->nr_pending)
291 return NULL; 292 return NULL;
292 293
293 if (!sq->first_pending) 294 if (!parent_sq->first_pending)
294 sq->first_pending = rb_first(&sq->pending_tree); 295 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
295 296
296 if (sq->first_pending) 297 if (parent_sq->first_pending)
297 return rb_entry_tg(sq->first_pending); 298 return rb_entry_tg(parent_sq->first_pending);
298 299
299 return NULL; 300 return NULL;
300 } 301 }
301 302
302 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 303 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
303 { 304 {
304 rb_erase(n, root); 305 rb_erase(n, root);
305 RB_CLEAR_NODE(n); 306 RB_CLEAR_NODE(n);
306 } 307 }
307 308
308 static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *sq) 309 static void throtl_rb_erase(struct rb_node *n,
                                                                                    310 struct throtl_service_queue *parent_sq)
309 { 311 {
310 if (sq->first_pending == n) 312 if (parent_sq->first_pending == n)
311 sq->first_pending = NULL; 313 parent_sq->first_pending = NULL;
312 rb_erase_init(n, &sq->pending_tree); 314 rb_erase_init(n, &parent_sq->pending_tree);
313 --sq->nr_pending; 315 --parent_sq->nr_pending;
314 } 316 }
315 317
316 static void update_min_dispatch_time(struct throtl_service_queue *sq) 318 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
317 { 319 {
318 struct throtl_grp *tg; 320 struct throtl_grp *tg;
319 321
320 tg = throtl_rb_first(sq); 322 tg = throtl_rb_first(parent_sq);
321 if (!tg) 323 if (!tg)
322 return; 324 return;
323 325
324 sq->first_pending_disptime = tg->disptime; 326 parent_sq->first_pending_disptime = tg->disptime;
325 } 327 }
326 328
327 static void tg_service_queue_add(struct throtl_service_queue *sq, 329 static void tg_service_queue_add(struct throtl_grp *tg,
328 struct throtl_grp *tg) 330 struct throtl_service_queue *parent_sq)
329 { 331 {
330 struct rb_node **node = &sq->pending_tree.rb_node; 332 struct rb_node **node = &parent_sq->pending_tree.rb_node;
331 struct rb_node *parent = NULL; 333 struct rb_node *parent = NULL;
332 struct throtl_grp *__tg; 334 struct throtl_grp *__tg;
333 unsigned long key = tg->disptime; 335 unsigned long key = tg->disptime;
334 int left = 1; 336 int left = 1;
335 337
336 while (*node != NULL) { 338 while (*node != NULL) {
337 parent = *node; 339 parent = *node;
338 __tg = rb_entry_tg(parent); 340 __tg = rb_entry_tg(parent);
339 341
340 if (time_before(key, __tg->disptime)) 342 if (time_before(key, __tg->disptime))
341 node = &parent->rb_left; 343 node = &parent->rb_left;
342 else { 344 else {
343 node = &parent->rb_right; 345 node = &parent->rb_right;
344 left = 0; 346 left = 0;
345 } 347 }
346 } 348 }
347 349
348 if (left) 350 if (left)
349 sq->first_pending = &tg->rb_node; 351 parent_sq->first_pending = &tg->rb_node;
350 352
351 rb_link_node(&tg->rb_node, parent, node); 353 rb_link_node(&tg->rb_node, parent, node);
352 rb_insert_color(&tg->rb_node, &sq->pending_tree); 354 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
353 } 355 }
354 356
355 static void __throtl_enqueue_tg(struct throtl_service_queue *sq, 357 static void __throtl_enqueue_tg(struct throtl_grp *tg,
356 struct throtl_grp *tg) 358 struct throtl_service_queue *parent_sq)
357 { 359 {
358 tg_service_queue_add(sq, tg); 360 tg_service_queue_add(tg, parent_sq);
359 tg->flags |= THROTL_TG_PENDING; 361 tg->flags |= THROTL_TG_PENDING;
360 sq->nr_pending++; 362 parent_sq->nr_pending++;
361 } 363 }
362 364
363 static void throtl_enqueue_tg(struct throtl_service_queue *sq, 365 static void throtl_enqueue_tg(struct throtl_grp *tg,
364 struct throtl_grp *tg) 366 struct throtl_service_queue *parent_sq)
365 { 367 {
366 if (!(tg->flags & THROTL_TG_PENDING)) 368 if (!(tg->flags & THROTL_TG_PENDING))
367 __throtl_enqueue_tg(sq, tg); 369 __throtl_enqueue_tg(tg, parent_sq);
368 } 370 }
369 371
370 static void __throtl_dequeue_tg(struct throtl_service_queue *sq, 372 static void __throtl_dequeue_tg(struct throtl_grp *tg,
371 struct throtl_grp *tg) 373 struct throtl_service_queue *parent_sq)
372 { 374 {
373 throtl_rb_erase(&tg->rb_node, sq); 375 throtl_rb_erase(&tg->rb_node, parent_sq);
374 tg->flags &= ~THROTL_TG_PENDING; 376 tg->flags &= ~THROTL_TG_PENDING;
375 } 377 }
376 378
377 static void throtl_dequeue_tg(struct throtl_service_queue *sq, 379 static void throtl_dequeue_tg(struct throtl_grp *tg,
378 struct throtl_grp *tg) 380 struct throtl_service_queue *parent_sq)
379 { 381 {
380 if (tg->flags & THROTL_TG_PENDING) 382 if (tg->flags & THROTL_TG_PENDING)
381 __throtl_dequeue_tg(sq, tg); 383 __throtl_dequeue_tg(tg, parent_sq);
382 } 384 }
383 385
384 /* Call with queue lock held */ 386 /* Call with queue lock held */
385 static void throtl_schedule_delayed_work(struct throtl_data *td, 387 static void throtl_schedule_delayed_work(struct throtl_data *td,
386 unsigned long delay) 388 unsigned long delay)
387 { 389 {
388 struct delayed_work *dwork = &td->dispatch_work; 390 struct delayed_work *dwork = &td->dispatch_work;
389 391
390 mod_delayed_work(kthrotld_workqueue, dwork, delay); 392 mod_delayed_work(kthrotld_workqueue, dwork, delay);
391 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies); 393 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
392 } 394 }
393 395
394 static void throtl_schedule_next_dispatch(struct throtl_data *td) 396 static void throtl_schedule_next_dispatch(struct throtl_data *td)
395 { 397 {
396 struct throtl_service_queue *sq = &td->service_queue; 398 struct throtl_service_queue *sq = &td->service_queue;
397 399
398 /* any pending children left? */ 400 /* any pending children left? */
399 if (!sq->nr_pending) 401 if (!sq->nr_pending)
400 return; 402 return;
401 403
402 update_min_dispatch_time(sq); 404 update_min_dispatch_time(sq);
403 405
404 if (time_before_eq(sq->first_pending_disptime, jiffies)) 406 if (time_before_eq(sq->first_pending_disptime, jiffies))
405 throtl_schedule_delayed_work(td, 0); 407 throtl_schedule_delayed_work(td, 0);
406 else 408 else
407 throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies); 409 throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
408 } 410 }
409 411
410 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 412 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
411 { 413 {
412 tg->bytes_disp[rw] = 0; 414 tg->bytes_disp[rw] = 0;
413 tg->io_disp[rw] = 0; 415 tg->io_disp[rw] = 0;
414 tg->slice_start[rw] = jiffies; 416 tg->slice_start[rw] = jiffies;
415 tg->slice_end[rw] = jiffies + throtl_slice; 417 tg->slice_end[rw] = jiffies + throtl_slice;
416 throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 418 throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
417 rw == READ ? 'R' : 'W', tg->slice_start[rw], 419 rw == READ ? 'R' : 'W', tg->slice_start[rw],
418 tg->slice_end[rw], jiffies); 420 tg->slice_end[rw], jiffies);
419 } 421 }
420 422
421 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 423 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
422 unsigned long jiffy_end) 424 unsigned long jiffy_end)
423 { 425 {
424 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 426 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
425 } 427 }
426 428
427 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 429 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
428 unsigned long jiffy_end) 430 unsigned long jiffy_end)
429 { 431 {
430 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 432 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
431 throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", 433 throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
432 rw == READ ? 'R' : 'W', tg->slice_start[rw], 434 rw == READ ? 'R' : 'W', tg->slice_start[rw],
433 tg->slice_end[rw], jiffies); 435 tg->slice_end[rw], jiffies);
434 } 436 }
435 437
436 /* Determine if previously allocated or extended slice is complete or not */ 438 /* Determine if previously allocated or extended slice is complete or not */
437 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 439 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
438 { 440 {
439 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 441 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
440 return 0; 442 return 0;
441 443
442 return 1; 444 return 1;
443 } 445 }
444 446
445 /* Trim the used slices and adjust slice start accordingly */ 447 /* Trim the used slices and adjust slice start accordingly */
446 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 448 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
447 { 449 {
448 unsigned long nr_slices, time_elapsed, io_trim; 450 unsigned long nr_slices, time_elapsed, io_trim;
449 u64 bytes_trim, tmp; 451 u64 bytes_trim, tmp;
450 452
451 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 453 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
452 454
453 /* 455 /*
454 * If bps are unlimited (-1), then time slice don't get 456 * If bps are unlimited (-1), then time slice don't get
455 * renewed. Don't try to trim the slice if slice is used. A new 457 * renewed. Don't try to trim the slice if slice is used. A new
456 * slice will start when appropriate. 458 * slice will start when appropriate.
457 */ 459 */
458 if (throtl_slice_used(tg, rw)) 460 if (throtl_slice_used(tg, rw))
459 return; 461 return;
460 462
461 /* 463 /*
462 * A bio has been dispatched. Also adjust slice_end. It might happen 464 * A bio has been dispatched. Also adjust slice_end. It might happen
463 * that initially cgroup limit was very low resulting in high 465 * that initially cgroup limit was very low resulting in high
464 * slice_end, but later limit was bumped up and bio was dispached 466 * slice_end, but later limit was bumped up and bio was dispached
465 * sooner, then we need to reduce slice_end. A high bogus slice_end 467 * sooner, then we need to reduce slice_end. A high bogus slice_end
466 * is bad because it does not allow new slice to start. 468 * is bad because it does not allow new slice to start.
467 */ 469 */
468 470
469 throtl_set_slice_end(tg, rw, jiffies + throtl_slice); 471 throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
470 472
471 time_elapsed = jiffies - tg->slice_start[rw]; 473 time_elapsed = jiffies - tg->slice_start[rw];
472 474
473 nr_slices = time_elapsed / throtl_slice; 475 nr_slices = time_elapsed / throtl_slice;
474 476
475 if (!nr_slices) 477 if (!nr_slices)
476 return; 478 return;
477 tmp = tg->bps[rw] * throtl_slice * nr_slices; 479 tmp = tg->bps[rw] * throtl_slice * nr_slices;
478 do_div(tmp, HZ); 480 do_div(tmp, HZ);
479 bytes_trim = tmp; 481 bytes_trim = tmp;
480 482
481 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; 483 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
482 484
483 if (!bytes_trim && !io_trim) 485 if (!bytes_trim && !io_trim)
484 return; 486 return;
485 487
486 if (tg->bytes_disp[rw] >= bytes_trim) 488 if (tg->bytes_disp[rw] >= bytes_trim)
487 tg->bytes_disp[rw] -= bytes_trim; 489 tg->bytes_disp[rw] -= bytes_trim;
488 else 490 else
489 tg->bytes_disp[rw] = 0; 491 tg->bytes_disp[rw] = 0;
490 492
491 if (tg->io_disp[rw] >= io_trim) 493 if (tg->io_disp[rw] >= io_trim)
492 tg->io_disp[rw] -= io_trim; 494 tg->io_disp[rw] -= io_trim;
493 else 495 else
494 tg->io_disp[rw] = 0; 496 tg->io_disp[rw] = 0;
495 497
496 tg->slice_start[rw] += nr_slices * throtl_slice; 498 tg->slice_start[rw] += nr_slices * throtl_slice;
497 499
498 throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" 500 throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
499 " start=%lu end=%lu jiffies=%lu", 501 " start=%lu end=%lu jiffies=%lu",
500 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 502 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
501 tg->slice_start[rw], tg->slice_end[rw], jiffies); 503 tg->slice_start[rw], tg->slice_end[rw], jiffies);
502 } 504 }
503 505
504 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 506 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
505 unsigned long *wait) 507 unsigned long *wait)
506 { 508 {
507 bool rw = bio_data_dir(bio); 509 bool rw = bio_data_dir(bio);
508 unsigned int io_allowed; 510 unsigned int io_allowed;
509 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 511 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
510 u64 tmp; 512 u64 tmp;
511 513
512 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 514 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
513 515
514 /* Slice has just started. Consider one slice interval */ 516 /* Slice has just started. Consider one slice interval */
515 if (!jiffy_elapsed) 517 if (!jiffy_elapsed)
516 jiffy_elapsed_rnd = throtl_slice; 518 jiffy_elapsed_rnd = throtl_slice;
517 519
518 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 520 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
519 521
520 /* 522 /*
521 * jiffy_elapsed_rnd should not be a big value as minimum iops can be 523 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
522 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 524 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
523 * will allow dispatch after 1 second and after that slice should 525 * will allow dispatch after 1 second and after that slice should
524 * have been trimmed. 526 * have been trimmed.
525 */ 527 */
526 528
527 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 529 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
528 do_div(tmp, HZ); 530 do_div(tmp, HZ);
529 531
530 if (tmp > UINT_MAX) 532 if (tmp > UINT_MAX)
531 io_allowed = UINT_MAX; 533 io_allowed = UINT_MAX;
532 else 534 else
533 io_allowed = tmp; 535 io_allowed = tmp;
534 536
535 if (tg->io_disp[rw] + 1 <= io_allowed) { 537 if (tg->io_disp[rw] + 1 <= io_allowed) {
536 if (wait) 538 if (wait)
537 *wait = 0; 539 *wait = 0;
538 return 1; 540 return 1;
539 } 541 }
540 542
541 /* Calc approx time to dispatch */ 543 /* Calc approx time to dispatch */
542 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 544 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
543 545
544 if (jiffy_wait > jiffy_elapsed) 546 if (jiffy_wait > jiffy_elapsed)
545 jiffy_wait = jiffy_wait - jiffy_elapsed; 547 jiffy_wait = jiffy_wait - jiffy_elapsed;
546 else 548 else
547 jiffy_wait = 1; 549 jiffy_wait = 1;
548 550
549 if (wait) 551 if (wait)
550 *wait = jiffy_wait; 552 *wait = jiffy_wait;
551 return 0; 553 return 0;
552 } 554 }
553 555
554 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 556 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
555 unsigned long *wait) 557 unsigned long *wait)
556 { 558 {
557 bool rw = bio_data_dir(bio); 559 bool rw = bio_data_dir(bio);
558 u64 bytes_allowed, extra_bytes, tmp; 560 u64 bytes_allowed, extra_bytes, tmp;
559 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 561 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
560 562
561 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 563 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
562 564
563 /* Slice has just started. Consider one slice interval */ 565 /* Slice has just started. Consider one slice interval */
564 if (!jiffy_elapsed) 566 if (!jiffy_elapsed)
565 jiffy_elapsed_rnd = throtl_slice; 567 jiffy_elapsed_rnd = throtl_slice;
566 568
567 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 569 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
568 570
569 tmp = tg->bps[rw] * jiffy_elapsed_rnd; 571 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
570 do_div(tmp, HZ); 572 do_div(tmp, HZ);
571 bytes_allowed = tmp; 573 bytes_allowed = tmp;
572 574
573 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { 575 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
574 if (wait) 576 if (wait)
575 *wait = 0; 577 *wait = 0;
576 return 1; 578 return 1;
577 } 579 }
578 580
579 /* Calc approx time to dispatch */ 581 /* Calc approx time to dispatch */
580 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; 582 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
581 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 583 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
582 584
583 if (!jiffy_wait) 585 if (!jiffy_wait)
584 jiffy_wait = 1; 586 jiffy_wait = 1;
585 587
586 /* 588 /*
587 * This wait time is without taking into consideration the rounding 589 * This wait time is without taking into consideration the rounding
588 * up we did. Add that time also. 590 * up we did. Add that time also.
589 */ 591 */
590 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 592 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
591 if (wait) 593 if (wait)
592 *wait = jiffy_wait; 594 *wait = jiffy_wait;
593 return 0; 595 return 0;
594 } 596 }
595 597
596 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { 598 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
597 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) 599 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
598 return 1; 600 return 1;
599 return 0; 601 return 0;
600 } 602 }
601 603
602 /* 604 /*
603 * Returns whether one can dispatch a bio or not. Also returns approx number 605 * Returns whether one can dispatch a bio or not. Also returns approx number
604 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 606 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
605 */ 607 */
606 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 608 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
607 unsigned long *wait) 609 unsigned long *wait)
608 { 610 {
609 bool rw = bio_data_dir(bio); 611 bool rw = bio_data_dir(bio);
610 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 612 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
611 613
612 /* 614 /*
613 * Currently whole state machine of group depends on first bio 615 * Currently whole state machine of group depends on first bio
614 * queued in the group bio list. So one should not be calling 616 * queued in the group bio list. So one should not be calling
615 * this function with a different bio if there are other bios 617 * this function with a different bio if there are other bios
616 * queued. 618 * queued.
617 */ 619 */
618 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); 620 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
619 621
620 /* If tg->bps = -1, then BW is unlimited */ 622 /* If tg->bps = -1, then BW is unlimited */
621 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 623 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
622 if (wait) 624 if (wait)
623 *wait = 0; 625 *wait = 0;
624 return 1; 626 return 1;
625 } 627 }
626 628
627 /* 629 /*
628 * If previous slice expired, start a new one otherwise renew/extend 630 * If previous slice expired, start a new one otherwise renew/extend
629 * existing slice to make sure it is at least throtl_slice interval 631 * existing slice to make sure it is at least throtl_slice interval
630 * long since now. 632 * long since now.
631 */ 633 */
632 if (throtl_slice_used(tg, rw)) 634 if (throtl_slice_used(tg, rw))
633 throtl_start_new_slice(tg, rw); 635 throtl_start_new_slice(tg, rw);
634 else { 636 else {
635 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 637 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
636 throtl_extend_slice(tg, rw, jiffies + throtl_slice); 638 throtl_extend_slice(tg, rw, jiffies + throtl_slice);
637 } 639 }
638 640
639 if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 641 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
640 tg_with_in_iops_limit(tg, bio, &iops_wait)) { 642 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
641 if (wait) 643 if (wait)
642 *wait = 0; 644 *wait = 0;
643 return 1; 645 return 1;
644 } 646 }
645 647
646 max_wait = max(bps_wait, iops_wait); 648 max_wait = max(bps_wait, iops_wait);
647 649
648 if (wait) 650 if (wait)
649 *wait = max_wait; 651 *wait = max_wait;
650 652
651 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 653 if (time_before(tg->slice_end[rw], jiffies + max_wait))
652 throtl_extend_slice(tg, rw, jiffies + max_wait); 654 throtl_extend_slice(tg, rw, jiffies + max_wait);
653 655
654 return 0; 656 return 0;
655 } 657 }
656 658
657 static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, 659 static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
658 int rw) 660 int rw)
659 { 661 {
660 struct throtl_grp *tg = blkg_to_tg(blkg); 662 struct throtl_grp *tg = blkg_to_tg(blkg);
661 struct tg_stats_cpu *stats_cpu; 663 struct tg_stats_cpu *stats_cpu;
662 unsigned long flags; 664 unsigned long flags;
663 665
664 /* If per cpu stats are not allocated yet, don't do any accounting. */ 666 /* If per cpu stats are not allocated yet, don't do any accounting. */
665 if (tg->stats_cpu == NULL) 667 if (tg->stats_cpu == NULL)
666 return; 668 return;
667 669
668 /* 670 /*
669 * Disabling interrupts to provide mutual exclusion between two 671 * Disabling interrupts to provide mutual exclusion between two
670 * writes on same cpu. It probably is not needed for 64bit. Not 672 * writes on same cpu. It probably is not needed for 64bit. Not
671 * optimizing that case yet. 673 * optimizing that case yet.
672 */ 674 */
673 local_irq_save(flags); 675 local_irq_save(flags);
674 676
675 stats_cpu = this_cpu_ptr(tg->stats_cpu); 677 stats_cpu = this_cpu_ptr(tg->stats_cpu);
676 678
677 blkg_rwstat_add(&stats_cpu->serviced, rw, 1); 679 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
678 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); 680 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
679 681
680 local_irq_restore(flags); 682 local_irq_restore(flags);
681 } 683 }
682 684
683 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 685 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
684 { 686 {
685 bool rw = bio_data_dir(bio); 687 bool rw = bio_data_dir(bio);
686 688
687 /* Charge the bio to the group */ 689 /* Charge the bio to the group */
688 tg->bytes_disp[rw] += bio->bi_size; 690 tg->bytes_disp[rw] += bio->bi_size;
689 tg->io_disp[rw]++; 691 tg->io_disp[rw]++;
690 692
691 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 693 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
692 } 694 }
693 695
694 static void throtl_add_bio_tg(struct throtl_service_queue *sq, 696 static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
695 struct throtl_grp *tg, struct bio *bio) 697 struct throtl_service_queue *parent_sq)
696 { 698 {
697 bool rw = bio_data_dir(bio); 699 bool rw = bio_data_dir(bio);
698 700
699 bio_list_add(&tg->bio_lists[rw], bio); 701 bio_list_add(&tg->bio_lists[rw], bio);
700 /* Take a bio reference on tg */ 702 /* Take a bio reference on tg */
701 blkg_get(tg_to_blkg(tg)); 703 blkg_get(tg_to_blkg(tg));
702 tg->nr_queued[rw]++; 704 tg->nr_queued[rw]++;
703 tg->td->nr_queued[rw]++; 705 tg->td->nr_queued[rw]++;
704 throtl_enqueue_tg(sq, tg); 706 throtl_enqueue_tg(tg, parent_sq);
705 } 707 }
706 708
707 static void tg_update_disptime(struct throtl_service_queue *sq, 709 static void tg_update_disptime(struct throtl_grp *tg,
708 struct throtl_grp *tg) 710 struct throtl_service_queue *parent_sq)
709 { 711 {
710 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 712 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
711 struct bio *bio; 713 struct bio *bio;
712 714
713 if ((bio = bio_list_peek(&tg->bio_lists[READ]))) 715 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
714 tg_may_dispatch(tg, bio, &read_wait); 716 tg_may_dispatch(tg, bio, &read_wait);
715 717
716 if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 718 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
717 tg_may_dispatch(tg, bio, &write_wait); 719 tg_may_dispatch(tg, bio, &write_wait);
718 720
719 min_wait = min(read_wait, write_wait); 721 min_wait = min(read_wait, write_wait);
720 disptime = jiffies + min_wait; 722 disptime = jiffies + min_wait;
721 723
722 /* Update dispatch time */ 724 /* Update dispatch time */
723 throtl_dequeue_tg(sq, tg); 725 throtl_dequeue_tg(tg, parent_sq);
724 tg->disptime = disptime; 726 tg->disptime = disptime;
725 throtl_enqueue_tg(sq, tg); 727 throtl_enqueue_tg(tg, parent_sq);
726 } 728 }
727 729
728 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw, 730 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
729 struct bio_list *bl) 731 struct bio_list *bl)
730 { 732 {
731 struct bio *bio; 733 struct bio *bio;
732 734
733 bio = bio_list_pop(&tg->bio_lists[rw]); 735 bio = bio_list_pop(&tg->bio_lists[rw]);
734 tg->nr_queued[rw]--; 736 tg->nr_queued[rw]--;
735 /* Drop bio reference on blkg */ 737 /* Drop bio reference on blkg */
736 blkg_put(tg_to_blkg(tg)); 738 blkg_put(tg_to_blkg(tg));
737 739
738 BUG_ON(tg->td->nr_queued[rw] <= 0); 740 BUG_ON(tg->td->nr_queued[rw] <= 0);
739 tg->td->nr_queued[rw]--; 741 tg->td->nr_queued[rw]--;
740 742
741 throtl_charge_bio(tg, bio); 743 throtl_charge_bio(tg, bio);
742 bio_list_add(bl, bio); 744 bio_list_add(bl, bio);
743 bio->bi_rw |= REQ_THROTTLED; 745 bio->bi_rw |= REQ_THROTTLED;
744 746
745 throtl_trim_slice(tg, rw); 747 throtl_trim_slice(tg, rw);
746 } 748 }
747 749
748 static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl) 750 static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
749 { 751 {
750 unsigned int nr_reads = 0, nr_writes = 0; 752 unsigned int nr_reads = 0, nr_writes = 0;
751 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 753 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
752 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 754 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
753 struct bio *bio; 755 struct bio *bio;
754 756
755 /* Try to dispatch 75% READS and 25% WRITES */ 757 /* Try to dispatch 75% READS and 25% WRITES */
756 758
757 while ((bio = bio_list_peek(&tg->bio_lists[READ])) && 759 while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
758 tg_may_dispatch(tg, bio, NULL)) { 760 tg_may_dispatch(tg, bio, NULL)) {
759 761
760 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl); 762 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
761 nr_reads++; 763 nr_reads++;
762 764
763 if (nr_reads >= max_nr_reads) 765 if (nr_reads >= max_nr_reads)
764 break; 766 break;
765 } 767 }
766 768
767 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) && 769 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
768 tg_may_dispatch(tg, bio, NULL)) { 770 tg_may_dispatch(tg, bio, NULL)) {
769 771
770 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl); 772 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
771 nr_writes++; 773 nr_writes++;
772 774
773 if (nr_writes >= max_nr_writes) 775 if (nr_writes >= max_nr_writes)
774 break; 776 break;
775 } 777 }
776 778
777 return nr_reads + nr_writes; 779 return nr_reads + nr_writes;
778 } 780 }
779 781
780 static int throtl_select_dispatch(struct throtl_service_queue *sq, 782 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
781 struct bio_list *bl) 783 struct bio_list *bl)
782 { 784 {
783 unsigned int nr_disp = 0; 785 unsigned int nr_disp = 0;
784 struct throtl_grp *tg; 786 struct throtl_grp *tg;
785 787
786 while (1) { 788 while (1) {
787 tg = throtl_rb_first(sq); 789 tg = throtl_rb_first(parent_sq);
788 790
789 if (!tg) 791 if (!tg)
790 break; 792 break;
791 793
792 if (time_before(jiffies, tg->disptime)) 794 if (time_before(jiffies, tg->disptime))
793 break; 795 break;
794 796
795 throtl_dequeue_tg(sq, tg); 797 throtl_dequeue_tg(tg, parent_sq);
796 798
797 nr_disp += throtl_dispatch_tg(tg, bl); 799 nr_disp += throtl_dispatch_tg(tg, bl);
798 800
799 if (tg->nr_queued[0] || tg->nr_queued[1]) 801 if (tg->nr_queued[0] || tg->nr_queued[1])
800 tg_update_disptime(sq, tg); 802 tg_update_disptime(tg, parent_sq);
801 803
802 if (nr_disp >= throtl_quantum) 804 if (nr_disp >= throtl_quantum)
803 break; 805 break;
804 } 806 }
805 807
806 return nr_disp; 808 return nr_disp;
807 } 809 }
808 810
809 /* work function to dispatch throttled bios */ 811 /* work function to dispatch throttled bios */
810 void blk_throtl_dispatch_work_fn(struct work_struct *work) 812 void blk_throtl_dispatch_work_fn(struct work_struct *work)
811 { 813 {
812 struct throtl_data *td = container_of(to_delayed_work(work), 814 struct throtl_data *td = container_of(to_delayed_work(work),
813 struct throtl_data, dispatch_work); 815 struct throtl_data, dispatch_work);
814 struct request_queue *q = td->queue; 816 struct request_queue *q = td->queue;
815 unsigned int nr_disp = 0; 817 unsigned int nr_disp = 0;
816 struct bio_list bio_list_on_stack; 818 struct bio_list bio_list_on_stack;
817 struct bio *bio; 819 struct bio *bio;
818 struct blk_plug plug; 820 struct blk_plug plug;
819 821
820 spin_lock_irq(q->queue_lock); 822 spin_lock_irq(q->queue_lock);
821 823
822 bio_list_init(&bio_list_on_stack); 824 bio_list_init(&bio_list_on_stack);
823 825
824 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 826 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
825 td->nr_queued[READ] + td->nr_queued[WRITE], 827 td->nr_queued[READ] + td->nr_queued[WRITE],
826 td->nr_queued[READ], td->nr_queued[WRITE]); 828 td->nr_queued[READ], td->nr_queued[WRITE]);
827 829
828 nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack); 830 nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
829 831
830 if (nr_disp) 832 if (nr_disp)
831 throtl_log(td, "bios disp=%u", nr_disp); 833 throtl_log(td, "bios disp=%u", nr_disp);
832 834
833 throtl_schedule_next_dispatch(td); 835 throtl_schedule_next_dispatch(td);
834 836
835 spin_unlock_irq(q->queue_lock); 837 spin_unlock_irq(q->queue_lock);
836 838
837 /* 839 /*
838 * If we dispatched some requests, unplug the queue to make sure 840 * If we dispatched some requests, unplug the queue to make sure
839 * immediate dispatch 841 * immediate dispatch
840 */ 842 */
841 if (nr_disp) { 843 if (nr_disp) {
842 blk_start_plug(&plug); 844 blk_start_plug(&plug);
843 while((bio = bio_list_pop(&bio_list_on_stack))) 845 while((bio = bio_list_pop(&bio_list_on_stack)))
844 generic_make_request(bio); 846 generic_make_request(bio);
845 blk_finish_plug(&plug); 847 blk_finish_plug(&plug);
846 } 848 }
847 } 849 }
848 850
849 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 851 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
850 struct blkg_policy_data *pd, int off) 852 struct blkg_policy_data *pd, int off)
851 { 853 {
852 struct throtl_grp *tg = pd_to_tg(pd); 854 struct throtl_grp *tg = pd_to_tg(pd);
853 struct blkg_rwstat rwstat = { }, tmp; 855 struct blkg_rwstat rwstat = { }, tmp;
854 int i, cpu; 856 int i, cpu;
855 857
856 for_each_possible_cpu(cpu) { 858 for_each_possible_cpu(cpu) {
857 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 859 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
858 860
859 tmp = blkg_rwstat_read((void *)sc + off); 861 tmp = blkg_rwstat_read((void *)sc + off);
860 for (i = 0; i < BLKG_RWSTAT_NR; i++) 862 for (i = 0; i < BLKG_RWSTAT_NR; i++)
861 rwstat.cnt[i] += tmp.cnt[i]; 863 rwstat.cnt[i] += tmp.cnt[i];
862 } 864 }
863 865
864 return __blkg_prfill_rwstat(sf, pd, &rwstat); 866 return __blkg_prfill_rwstat(sf, pd, &rwstat);
865 } 867 }
866 868
867 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, 869 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
868 struct seq_file *sf) 870 struct seq_file *sf)
869 { 871 {
870 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 872 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
871 873
872 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 874 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
873 cft->private, true); 875 cft->private, true);
874 return 0; 876 return 0;
875 } 877 }
876 878
877 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 879 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
878 int off) 880 int off)
879 { 881 {
880 struct throtl_grp *tg = pd_to_tg(pd); 882 struct throtl_grp *tg = pd_to_tg(pd);
881 u64 v = *(u64 *)((void *)tg + off); 883 u64 v = *(u64 *)((void *)tg + off);
882 884
883 if (v == -1) 885 if (v == -1)
884 return 0; 886 return 0;
885 return __blkg_prfill_u64(sf, pd, v); 887 return __blkg_prfill_u64(sf, pd, v);
886 } 888 }
887 889
888 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 890 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
889 int off) 891 int off)
890 { 892 {
891 struct throtl_grp *tg = pd_to_tg(pd); 893 struct throtl_grp *tg = pd_to_tg(pd);
892 unsigned int v = *(unsigned int *)((void *)tg + off); 894 unsigned int v = *(unsigned int *)((void *)tg + off);
893 895
894 if (v == -1) 896 if (v == -1)
895 return 0; 897 return 0;
896 return __blkg_prfill_u64(sf, pd, v); 898 return __blkg_prfill_u64(sf, pd, v);
897 } 899 }
898 900
899 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, 901 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
900 struct seq_file *sf) 902 struct seq_file *sf)
901 { 903 {
902 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, 904 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
903 &blkcg_policy_throtl, cft->private, false); 905 &blkcg_policy_throtl, cft->private, false);
904 return 0; 906 return 0;
905 } 907 }
906 908
907 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, 909 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
908 struct seq_file *sf) 910 struct seq_file *sf)
909 { 911 {
910 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, 912 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
911 &blkcg_policy_throtl, cft->private, false); 913 &blkcg_policy_throtl, cft->private, false);
912 return 0; 914 return 0;
913 } 915 }
914 916
915 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, 917 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
916 bool is_u64) 918 bool is_u64)
917 { 919 {
918 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 920 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
919 struct blkg_conf_ctx ctx; 921 struct blkg_conf_ctx ctx;
920 struct throtl_grp *tg; 922 struct throtl_grp *tg;
921 struct throtl_data *td; 923 struct throtl_data *td;
922 int ret; 924 int ret;
923 925
924 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 926 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
925 if (ret) 927 if (ret)
926 return ret; 928 return ret;
927 929
928 tg = blkg_to_tg(ctx.blkg); 930 tg = blkg_to_tg(ctx.blkg);
929 td = ctx.blkg->q->td; 931 td = ctx.blkg->q->td;
930 932
931 if (!ctx.v) 933 if (!ctx.v)
932 ctx.v = -1; 934 ctx.v = -1;
933 935
934 if (is_u64) 936 if (is_u64)
935 *(u64 *)((void *)tg + cft->private) = ctx.v; 937 *(u64 *)((void *)tg + cft->private) = ctx.v;
936 else 938 else
937 *(unsigned int *)((void *)tg + cft->private) = ctx.v; 939 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
938 940
939 throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 941 throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
940 tg->bps[READ], tg->bps[WRITE], 942 tg->bps[READ], tg->bps[WRITE],
941 tg->iops[READ], tg->iops[WRITE]); 943 tg->iops[READ], tg->iops[WRITE]);
942 944
943 /* 945 /*
944 * We're already holding queue_lock and know @tg is valid. Let's 946 * We're already holding queue_lock and know @tg is valid. Let's
945 * apply the new config directly. 947 * apply the new config directly.
946 * 948 *
947 * Restart the slices for both READ and WRITES. It might happen 949 * Restart the slices for both READ and WRITES. It might happen
948 * that a group's limit are dropped suddenly and we don't want to 950 * that a group's limit are dropped suddenly and we don't want to
949 * account recently dispatched IO with new low rate. 951 * account recently dispatched IO with new low rate.
950 */ 952 */
951 throtl_start_new_slice(tg, 0); 953 throtl_start_new_slice(tg, 0);
952 throtl_start_new_slice(tg, 1); 954 throtl_start_new_slice(tg, 1);
953 955
954 if (tg->flags & THROTL_TG_PENDING) { 956 if (tg->flags & THROTL_TG_PENDING) {
955 tg_update_disptime(&td->service_queue, tg); 957 tg_update_disptime(tg, &td->service_queue);
956 throtl_schedule_next_dispatch(td); 958 throtl_schedule_next_dispatch(td);
957 } 959 }
958 960
959 blkg_conf_finish(&ctx); 961 blkg_conf_finish(&ctx);
960 return 0; 962 return 0;
961 } 963 }
962 964
963 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, 965 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
964 const char *buf) 966 const char *buf)
965 { 967 {
966 return tg_set_conf(cgrp, cft, buf, true); 968 return tg_set_conf(cgrp, cft, buf, true);
967 } 969 }
968 970
969 static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, 971 static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
970 const char *buf) 972 const char *buf)
971 { 973 {
972 return tg_set_conf(cgrp, cft, buf, false); 974 return tg_set_conf(cgrp, cft, buf, false);
973 } 975 }
974 976
975 static struct cftype throtl_files[] = { 977 static struct cftype throtl_files[] = {
976 { 978 {
977 .name = "throttle.read_bps_device", 979 .name = "throttle.read_bps_device",
978 .private = offsetof(struct throtl_grp, bps[READ]), 980 .private = offsetof(struct throtl_grp, bps[READ]),
979 .read_seq_string = tg_print_conf_u64, 981 .read_seq_string = tg_print_conf_u64,
980 .write_string = tg_set_conf_u64, 982 .write_string = tg_set_conf_u64,
981 .max_write_len = 256, 983 .max_write_len = 256,
982 }, 984 },
983 { 985 {
984 .name = "throttle.write_bps_device", 986 .name = "throttle.write_bps_device",
985 .private = offsetof(struct throtl_grp, bps[WRITE]), 987 .private = offsetof(struct throtl_grp, bps[WRITE]),
986 .read_seq_string = tg_print_conf_u64, 988 .read_seq_string = tg_print_conf_u64,
987 .write_string = tg_set_conf_u64, 989 .write_string = tg_set_conf_u64,
988 .max_write_len = 256, 990 .max_write_len = 256,
989 }, 991 },
990 { 992 {
991 .name = "throttle.read_iops_device", 993 .name = "throttle.read_iops_device",
992 .private = offsetof(struct throtl_grp, iops[READ]), 994 .private = offsetof(struct throtl_grp, iops[READ]),
993 .read_seq_string = tg_print_conf_uint, 995 .read_seq_string = tg_print_conf_uint,
994 .write_string = tg_set_conf_uint, 996 .write_string = tg_set_conf_uint,
995 .max_write_len = 256, 997 .max_write_len = 256,
996 }, 998 },
997 { 999 {
998 .name = "throttle.write_iops_device", 1000 .name = "throttle.write_iops_device",
999 .private = offsetof(struct throtl_grp, iops[WRITE]), 1001 .private = offsetof(struct throtl_grp, iops[WRITE]),
1000 .read_seq_string = tg_print_conf_uint, 1002 .read_seq_string = tg_print_conf_uint,
1001 .write_string = tg_set_conf_uint, 1003 .write_string = tg_set_conf_uint,
1002 .max_write_len = 256, 1004 .max_write_len = 256,
1003 }, 1005 },
1004 { 1006 {
1005 .name = "throttle.io_service_bytes", 1007 .name = "throttle.io_service_bytes",
1006 .private = offsetof(struct tg_stats_cpu, service_bytes), 1008 .private = offsetof(struct tg_stats_cpu, service_bytes),
1007 .read_seq_string = tg_print_cpu_rwstat, 1009 .read_seq_string = tg_print_cpu_rwstat,
1008 }, 1010 },
1009 { 1011 {
1010 .name = "throttle.io_serviced", 1012 .name = "throttle.io_serviced",
1011 .private = offsetof(struct tg_stats_cpu, serviced), 1013 .private = offsetof(struct tg_stats_cpu, serviced),
1012 .read_seq_string = tg_print_cpu_rwstat, 1014 .read_seq_string = tg_print_cpu_rwstat,
1013 }, 1015 },
1014 { } /* terminate */ 1016 { } /* terminate */
1015 }; 1017 };
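
Each bps/iops entry above shares one read and one write handler and stashes an offsetof() into throtl_grp (or tg_stats_cpu for the stats files) in ->private, so a single handler pair can serve all four limit files. A stand-alone sketch of that offset trick, under the assumption that the handlers simply add ->private to the group pointer (the real helpers sit earlier in this file):

#include <stddef.h>
#include <stdio.h>

/* Not kernel code: field names merely mirror throtl_grp's bps[]/iops[]. */
struct tg_like {
        unsigned long long bps[2];      /* [0] = READ, [1] = WRITE */
        unsigned int iops[2];
};

static unsigned long long read_u64_at(const struct tg_like *tg, size_t off)
{
        return *(const unsigned long long *)((const char *)tg + off);
}

int main(void)
{
        struct tg_like tg = { .bps = { 1048576ULL, 2097152ULL } };

        printf("read_bps  = %llu\n",
               read_u64_at(&tg, offsetof(struct tg_like, bps[0])));
        printf("write_bps = %llu\n",
               read_u64_at(&tg, offsetof(struct tg_like, bps[1])));
        return 0;
}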
1016 1018
1017 static void throtl_shutdown_wq(struct request_queue *q) 1019 static void throtl_shutdown_wq(struct request_queue *q)
1018 { 1020 {
1019 struct throtl_data *td = q->td; 1021 struct throtl_data *td = q->td;
1020 1022
1021 cancel_delayed_work_sync(&td->dispatch_work); 1023 cancel_delayed_work_sync(&td->dispatch_work);
1022 } 1024 }
1023 1025
1024 static struct blkcg_policy blkcg_policy_throtl = { 1026 static struct blkcg_policy blkcg_policy_throtl = {
1025 .pd_size = sizeof(struct throtl_grp), 1027 .pd_size = sizeof(struct throtl_grp),
1026 .cftypes = throtl_files, 1028 .cftypes = throtl_files,
1027 1029
1028 .pd_init_fn = throtl_pd_init, 1030 .pd_init_fn = throtl_pd_init,
1029 .pd_exit_fn = throtl_pd_exit, 1031 .pd_exit_fn = throtl_pd_exit,
1030 .pd_reset_stats_fn = throtl_pd_reset_stats, 1032 .pd_reset_stats_fn = throtl_pd_reset_stats,
1031 }; 1033 };
1032 1034
1033 bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1035 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1034 { 1036 {
1035 struct throtl_data *td = q->td; 1037 struct throtl_data *td = q->td;
1036 struct throtl_grp *tg; 1038 struct throtl_grp *tg;
1037 bool rw = bio_data_dir(bio), update_disptime = true; 1039 bool rw = bio_data_dir(bio), update_disptime = true;
1038 struct blkcg *blkcg; 1040 struct blkcg *blkcg;
1039 bool throttled = false; 1041 bool throttled = false;
1040 1042
1041 if (bio->bi_rw & REQ_THROTTLED) { 1043 if (bio->bi_rw & REQ_THROTTLED) {
1042 bio->bi_rw &= ~REQ_THROTTLED; 1044 bio->bi_rw &= ~REQ_THROTTLED;
1043 goto out; 1045 goto out;
1044 } 1046 }
1045 1047
1046 /* 1048 /*
1047 * A throtl_grp pointer retrieved under rcu can be used to access 1049 * A throtl_grp pointer retrieved under rcu can be used to access
1048 * basic fields like stats and io rates. If a group has no rules, 1050 * basic fields like stats and io rates. If a group has no rules,
1049 * just update the dispatch stats in lockless manner and return. 1051 * just update the dispatch stats in lockless manner and return.
1050 */ 1052 */
1051 rcu_read_lock(); 1053 rcu_read_lock();
1052 blkcg = bio_blkcg(bio); 1054 blkcg = bio_blkcg(bio);
1053 tg = throtl_lookup_tg(td, blkcg); 1055 tg = throtl_lookup_tg(td, blkcg);
1054 if (tg) { 1056 if (tg) {
1055 if (tg_no_rule_group(tg, rw)) { 1057 if (tg_no_rule_group(tg, rw)) {
1056 throtl_update_dispatch_stats(tg_to_blkg(tg), 1058 throtl_update_dispatch_stats(tg_to_blkg(tg),
1057 bio->bi_size, bio->bi_rw); 1059 bio->bi_size, bio->bi_rw);
1058 goto out_unlock_rcu; 1060 goto out_unlock_rcu;
1059 } 1061 }
1060 } 1062 }
1061 1063
1062 /* 1064 /*
1063 * Either the group has not been allocated yet or it is not an unlimited 1065 * Either the group has not been allocated yet or it is not an unlimited
1064 * IO group. 1066 * IO group.
1065 */ 1067 */
1066 spin_lock_irq(q->queue_lock); 1068 spin_lock_irq(q->queue_lock);
1067 tg = throtl_lookup_create_tg(td, blkcg); 1069 tg = throtl_lookup_create_tg(td, blkcg);
1068 if (unlikely(!tg)) 1070 if (unlikely(!tg))
1069 goto out_unlock; 1071 goto out_unlock;
1070 1072
1071 if (tg->nr_queued[rw]) { 1073 if (tg->nr_queued[rw]) {
1072 /* 1074 /*
1073 * There is already another bio queued in the same direction. No 1075 * There is already another bio queued in the same direction. No
1074 * need to update dispatch time. 1076 * need to update dispatch time.
1075 */ 1077 */
1076 update_disptime = false; 1078 update_disptime = false;
1077 goto queue_bio; 1079 goto queue_bio;
1078 1080
1079 } 1081 }
1080 1082
1081 /* Bio is within the rate limit of the group */ 1083 /* Bio is within the rate limit of the group */
1082 if (tg_may_dispatch(tg, bio, NULL)) { 1084 if (tg_may_dispatch(tg, bio, NULL)) {
1083 throtl_charge_bio(tg, bio); 1085 throtl_charge_bio(tg, bio);
1084 1086
1085 /* 1087 /*
1086 * We need to trim the slice even when bios are not being queued; 1088 * We need to trim the slice even when bios are not being queued;
1087 * otherwise a bio might not be queued for 1089 * otherwise a bio might not be queued for
1088 * a long time, the slice keeps on extending, and trim is not 1090 * a long time, the slice keeps on extending, and trim is not
1089 * called for a long time. Now if limits are reduced suddenly, 1091 * called for a long time. Now if limits are reduced suddenly,
1090 * we take into account all the IO dispatched so far at the new 1092 * we take into account all the IO dispatched so far at the new
1091 * low rate and newly queued IO gets a really long dispatch 1093 * low rate and newly queued IO gets a really long dispatch
1092 * time. 1094 * time.
1093 * 1095 *
1094 * So keep on trimming the slice even if the bio is not queued. 1096 * So keep on trimming the slice even if the bio is not queued.
1095 */ 1097 */
1096 throtl_trim_slice(tg, rw); 1098 throtl_trim_slice(tg, rw);
1097 goto out_unlock; 1099 goto out_unlock;
1098 } 1100 }
1099 1101
1100 queue_bio: 1102 queue_bio:
1101 throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" 1103 throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1102 " iodisp=%u iops=%u queued=%d/%d", 1104 " iodisp=%u iops=%u queued=%d/%d",
1103 rw == READ ? 'R' : 'W', 1105 rw == READ ? 'R' : 'W',
1104 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 1106 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1105 tg->io_disp[rw], tg->iops[rw], 1107 tg->io_disp[rw], tg->iops[rw],
1106 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1108 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1107 1109
1108 bio_associate_current(bio); 1110 bio_associate_current(bio);
1109 throtl_add_bio_tg(&q->td->service_queue, tg, bio); 1111 throtl_add_bio_tg(bio, tg, &q->td->service_queue);
1110 throttled = true; 1112 throttled = true;
1111 1113
1112 if (update_disptime) { 1114 if (update_disptime) {
1113 tg_update_disptime(&td->service_queue, tg); 1115 tg_update_disptime(tg, &td->service_queue);
1114 throtl_schedule_next_dispatch(td); 1116 throtl_schedule_next_dispatch(td);
1115 } 1117 }
1116 1118
1117 out_unlock: 1119 out_unlock:
1118 spin_unlock_irq(q->queue_lock); 1120 spin_unlock_irq(q->queue_lock);
1119 out_unlock_rcu: 1121 out_unlock_rcu:
1120 rcu_read_unlock(); 1122 rcu_read_unlock();
1121 out: 1123 out:
1122 return throttled; 1124 return throttled;
1123 } 1125 }
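
The early REQ_THROTTLED check keeps a bio from being throttled twice: a bio that was held back and later re-issued arrives with the flag set, so blk_throtl_bio() clears it and lets the bio pass. A toy model of that handshake, assuming (it is not shown in this hunk) that the dispatch side marks the bio before re-issuing it:

#include <stdbool.h>
#include <stdio.h>

#define TOY_REQ_THROTTLED (1u << 0)     /* stand-in for REQ_THROTTLED */

struct toy_bio { unsigned int bi_rw; };

/* Mirrors the check at the top of blk_throtl_bio(): a re-issued bio carries
 * the flag, gets it cleared, and is passed through untouched. */
static bool toy_throtl_bio(struct toy_bio *bio)
{
        if (bio->bi_rw & TOY_REQ_THROTTLED) {
                bio->bi_rw &= ~TOY_REQ_THROTTLED;
                return false;                   /* not throttled again */
        }
        return true;                            /* pretend it was queued */
}

/* Assumed behaviour of the dispatch side: mark the bio before re-issuing. */
static void toy_dispatch(struct toy_bio *bio)
{
        bio->bi_rw |= TOY_REQ_THROTTLED;
}

int main(void)
{
        struct toy_bio bio = { .bi_rw = 0 };

        printf("first submit throttled: %d\n", toy_throtl_bio(&bio)); /* 1 */
        toy_dispatch(&bio);
        printf("re-issue throttled:     %d\n", toy_throtl_bio(&bio)); /* 0 */
        return 0;
}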
1124 1126
1125 /** 1127 /**
1126 * blk_throtl_drain - drain throttled bios 1128 * blk_throtl_drain - drain throttled bios
1127 * @q: request_queue to drain throttled bios for 1129 * @q: request_queue to drain throttled bios for
1128 * 1130 *
1129 * Dispatch all currently throttled bios on @q through ->make_request_fn(). 1131 * Dispatch all currently throttled bios on @q through ->make_request_fn().
1130 */ 1132 */
1131 void blk_throtl_drain(struct request_queue *q) 1133 void blk_throtl_drain(struct request_queue *q)
1132 __releases(q->queue_lock) __acquires(q->queue_lock) 1134 __releases(q->queue_lock) __acquires(q->queue_lock)
1133 { 1135 {
1134 struct throtl_data *td = q->td; 1136 struct throtl_data *td = q->td;
1135 struct throtl_service_queue *sq = &td->service_queue; 1137 struct throtl_service_queue *parent_sq = &td->service_queue;
1136 struct throtl_grp *tg; 1138 struct throtl_grp *tg;
1137 struct bio_list bl; 1139 struct bio_list bl;
1138 struct bio *bio; 1140 struct bio *bio;
1139 1141
1140 queue_lockdep_assert_held(q); 1142 queue_lockdep_assert_held(q);
1141 1143
1142 bio_list_init(&bl); 1144 bio_list_init(&bl);
1143 1145
1144 while ((tg = throtl_rb_first(sq))) { 1146 while ((tg = throtl_rb_first(parent_sq))) {
1145 throtl_dequeue_tg(sq, tg); 1147 throtl_dequeue_tg(tg, parent_sq);
1146 1148
1147 while ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1149 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1148 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl); 1150 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
1149 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 1151 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1150 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl); 1152 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
1151 } 1153 }
1152 spin_unlock_irq(q->queue_lock); 1154 spin_unlock_irq(q->queue_lock);
1153 1155
1154 while ((bio = bio_list_pop(&bl))) 1156 while ((bio = bio_list_pop(&bl)))
1155 generic_make_request(bio); 1157 generic_make_request(bio);
1156 1158
1157 spin_lock_irq(q->queue_lock); 1159 spin_lock_irq(q->queue_lock);
1158 } 1160 }
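
blk_throtl_drain() walks the pending tree under queue_lock, moves every queued bio onto a private bio_list, and only then drops the lock to push the bios through generic_make_request(). A generic sketch of that "collect under the lock, issue after dropping it" pattern, with plain pthreads and a toy list standing in for queue_lock and bio_list:

#include <pthread.h>
#include <stdio.h>

struct item { int id; struct item *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pending;                    /* guarded by @lock */

/* Called with @lock held; drops it while issuing and re-takes it before
 * returning -- the contract blk_throtl_drain() documents with its
 * __releases()/__acquires() annotations. */
static void drain_locked(void)
{
        struct item *it, *local = pending;      /* steal the whole list */

        pending = NULL;
        pthread_mutex_unlock(&lock);            /* don't issue under the lock */

        for (it = local; it; it = it->next)
                printf("issuing item %d\n", it->id);

        pthread_mutex_lock(&lock);
}

int main(void)
{
        struct item b = { .id = 2 }, a = { .id = 1, .next = &b };

        pthread_mutex_lock(&lock);
        pending = &a;
        drain_locked();
        pthread_mutex_unlock(&lock);
        return 0;
}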
1159 1161
1160 int blk_throtl_init(struct request_queue *q) 1162 int blk_throtl_init(struct request_queue *q)
1161 { 1163 {
1162 struct throtl_data *td; 1164 struct throtl_data *td;
1163 int ret; 1165 int ret;
1164 1166
1165 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1167 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1166 if (!td) 1168 if (!td)
1167 return -ENOMEM; 1169 return -ENOMEM;
1168 1170
1169 td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER; 1171 td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER;
1170 INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1172 INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1171 1173
1172 q->td = td; 1174 q->td = td;
1173 td->queue = q; 1175 td->queue = q;
1174 1176
1175 /* activate policy */ 1177 /* activate policy */
1176 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 1178 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1177 if (ret) 1179 if (ret)
1178 kfree(td); 1180 kfree(td);
1179 return ret; 1181 return ret;
1180 } 1182 }
1181 1183
1182 void blk_throtl_exit(struct request_queue *q) 1184 void blk_throtl_exit(struct request_queue *q)
1183 { 1185 {
1184 BUG_ON(!q->td); 1186 BUG_ON(!q->td);
1185 throtl_shutdown_wq(q); 1187 throtl_shutdown_wq(q);
1186 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1188 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1187 kfree(q->td); 1189 kfree(q->td);
1188 } 1190 }
1189 1191
1190 static int __init throtl_init(void) 1192 static int __init throtl_init(void)
1191 { 1193 {
1192 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1194 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1193 if (!kthrotld_workqueue) 1195 if (!kthrotld_workqueue)
1194 panic("Failed to create kthrotld\n"); 1196 panic("Failed to create kthrotld\n");
1195 1197
1196 return blkcg_policy_register(&blkcg_policy_throtl); 1198 return blkcg_policy_register(&blkcg_policy_throtl);
1197 } 1199 }
1198 1200
1199 module_init(throtl_init); 1201 module_init(throtl_init);
1200 1202