block/blk-throttle.c 45.7 KB
  /*
   * Interface for controlling IO bandwidth on a request queue
   *
   * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   */
  
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/blktrace_api.h>
  #include "blk-cgroup.h"
  #include "blk.h"
  
  /* Max dispatch from a group in 1 round */
  static int throtl_grp_quantum = 8;
  
  /* Total max dispatch from all groups in one round */
  static int throtl_quantum = 32;
  
  /* Throttling is performed over 100ms slice and after that slice is renewed */
  static unsigned long throtl_slice = HZ/10;	/* 100 ms */
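  /*
   * Illustrative numbers (assuming HZ == 1000, which is not stated here):
   * the 100ms slice above is then 100 jiffies, and with the defaults above
   * a single group may dispatch at most 8 bios in one round while all
   * groups together may dispatch at most 32 in that round.
   */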
  static struct blkcg_policy blkcg_policy_throtl;

  /* A workqueue to queue throttle related work */
  static struct workqueue_struct *kthrotld_workqueue;

  /*
   * To implement hierarchical throttling, throtl_grps form a tree and bios
   * are dispatched upwards level by level until they reach the top and get
   * issued.  When dispatching bios from the children and local group at each
   * level, if the bios are dispatched into a single bio_list, there's a risk
   * of a local or child group which can queue many bios at once filling up
   * the list starving others.
   *
   * To avoid such starvation, dispatched bios are queued separately
   * according to where they came from.  When they are again dispatched to
   * the parent, they're popped in round-robin order so that no single source
   * hogs the dispatch window.
   *
   * throtl_qnode is used to keep the queued bios separated by their sources.
   * Bios are queued to throtl_qnode which in turn is queued to
   * throtl_service_queue and then dispatched in round-robin order.
   *
   * It's also used to track the reference counts on blkg's.  A qnode always
   * belongs to a throtl_grp and gets queued on itself or the parent, so
   * incrementing the reference of the associated throtl_grp when a qnode is
   * queued and decrementing when dequeued is enough to keep the whole blkg
   * tree pinned while bios are in flight.
   */
  struct throtl_qnode {
  	struct list_head	node;		/* service_queue->queued[] */
  	struct bio_list		bios;		/* queued bios */
  	struct throtl_grp	*tg;		/* tg this qnode belongs to */
  };
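  /*
   * Example (illustrative): while a child's qnode_on_parent sits on its
   * parent's queued[] list, the qnode holds a reference on the child's
   * blkg (taken when the qnode was activated), so the child and the path
   * above it stay pinned until the queued bios have moved on.
   */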
  struct throtl_service_queue {
  	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
  	/*
  	 * Bios queued directly to this service_queue or dispatched from
  	 * children throtl_grp's.
  	 */
  	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
  	unsigned int		nr_queued[2];	/* number of queued bios */
  
  	/*
  	 * RB tree of active children throtl_grp's, which are sorted by
  	 * their ->disptime.
  	 */
  	struct rb_root		pending_tree;	/* RB tree of active tgs */
  	struct rb_node		*first_pending;	/* first node in the tree */
  	unsigned int		nr_pending;	/* # queued in the tree */
  	unsigned long		first_pending_disptime;	/* disptime of the first tg */
  	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
  };
  enum tg_state_flags {
  	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
  	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
  };
  #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
  /* Per-cpu group stats */
  struct tg_stats_cpu {
  	/* total bytes transferred */
  	struct blkg_rwstat		service_bytes;
  	/* total IOs serviced, post merge */
  	struct blkg_rwstat		serviced;
  };
  struct throtl_grp {
  	/* must be the first member */
  	struct blkg_policy_data pd;
  	/* active throtl group service_queue member */
  	struct rb_node rb_node;
  	/* throtl_data this group belongs to */
  	struct throtl_data *td;
  	/* this group's service queue */
  	struct throtl_service_queue service_queue;
  	/*
  	 * qnode_on_self is used when bios are directly queued to this
  	 * throtl_grp so that local bios compete fairly with bios
  	 * dispatched from children.  qnode_on_parent is used when bios are
  	 * dispatched from this throtl_grp into its parent and will compete
  	 * with the sibling qnode_on_parents and the parent's
  	 * qnode_on_self.
  	 */
  	struct throtl_qnode qnode_on_self[2];
  	struct throtl_qnode qnode_on_parent[2];
  
  	/*
  	 * Dispatch time in jiffies. This is the estimated time when the group
  	 * will unthrottle and be ready to dispatch more bios. It is used as the
  	 * key to sort active groups in the service tree.
  	 */
  	unsigned long disptime;
  	unsigned int flags;
  	/* are there any throtl rules between this group and td? */
  	bool has_rules[2];
  	/* bytes per second rate limits */
  	uint64_t bps[2];
  	/* IOPS limits */
  	unsigned int iops[2];
  	/* Number of bytes dispatched in the current slice */
  	uint64_t bytes_disp[2];
  	/* Number of bio's dispatched in current slice */
  	unsigned int io_disp[2];
  
  	/* When did we start a new slice */
  	unsigned long slice_start[2];
  	unsigned long slice_end[2];

  	/* Per cpu stats pointer */
  	struct tg_stats_cpu __percpu *stats_cpu;
  
  	/* List of tgs waiting for per cpu stats memory to be allocated */
  	struct list_head stats_alloc_node;
  };
  
  struct throtl_data
  {
  	/* service tree for active throtl groups */
  	struct throtl_service_queue service_queue;

  	struct request_queue *queue;
  
  	/* Total Number of queued bios on READ and WRITE lists */
  	unsigned int nr_queued[2];
  
  	/*
  	 * number of total undestroyed groups
  	 */
  	unsigned int nr_undestroyed_grps;
  
  	/* Work for dispatching throttled bios */
  	struct work_struct dispatch_work;
  };
  /* list and work item to allocate percpu group stats */
  static DEFINE_SPINLOCK(tg_stats_alloc_lock);
  static LIST_HEAD(tg_stats_alloc_list);
  
  static void tg_stats_alloc_fn(struct work_struct *);
  static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
  static void throtl_pending_timer_fn(unsigned long arg);
  static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
  {
  	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
  }
  static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
  {
  	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
  }
  static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
  {
  	return pd_to_blkg(&tg->pd);
  }
  static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
  {
  	return blkg_to_tg(td->queue->root_blkg);
  }
  /**
   * sq_to_tg - return the throtl_grp the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
   * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
   * embedded in throtl_data, %NULL is returned.
   */
  static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  {
  	if (sq && sq->parent_sq)
  		return container_of(sq, struct throtl_grp, service_queue);
  	else
  		return NULL;
  }
  
  /**
   * sq_to_td - return throtl_data the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
   * A service_queue can be embedded in either a throtl_grp or throtl_data.
   * Determine the associated throtl_data accordingly and return it.
   */
  static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  {
  	struct throtl_grp *tg = sq_to_tg(sq);
  
  	if (tg)
  		return tg->td;
  	else
  		return container_of(sq, struct throtl_data, service_queue);
  }
  
  /**
   * throtl_log - log debug message via blktrace
   * @sq: the service_queue being reported
   * @fmt: printf format string
   * @args: printf args
   *
   * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
   * throtl_grp; otherwise, just "throtl".
   *
   * TODO: this should be made a function and name formatting should happen
   * after testing whether blktrace is enabled.
   */
  #define throtl_log(sq, fmt, args...)	do {				\
  	struct throtl_grp *__tg = sq_to_tg((sq));			\
  	struct throtl_data *__td = sq_to_td((sq));			\
  									\
  	(void)__td;							\
  	if ((__tg)) {							\
  		char __pbuf[128];					\
  									\
  		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
  		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
  	} else {							\
  		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
  	}								\
  } while (0)

  static void tg_stats_init(struct tg_stats_cpu *tg_stats)
  {
  	blkg_rwstat_init(&tg_stats->service_bytes);
  	blkg_rwstat_init(&tg_stats->serviced);
  }
  /*
   * Worker for allocating per cpu stat for tgs. This is scheduled on the
   * system_wq once there are some groups on the alloc_list waiting for
   * allocation.
   */
  static void tg_stats_alloc_fn(struct work_struct *work)
  {
  	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
  	struct delayed_work *dwork = to_delayed_work(work);
  	bool empty = false;
  
  alloc_stats:
  	if (!stats_cpu) {
  		int cpu;
  		stats_cpu = alloc_percpu(struct tg_stats_cpu);
  		if (!stats_cpu) {
  			/* allocation failed, try again after some time */
  			schedule_delayed_work(dwork, msecs_to_jiffies(10));
  			return;
  		}
  		for_each_possible_cpu(cpu)
  			tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
  	}
  
  	spin_lock_irq(&tg_stats_alloc_lock);
  
  	if (!list_empty(&tg_stats_alloc_list)) {
  		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
  							 struct throtl_grp,
  							 stats_alloc_node);
  		swap(tg->stats_cpu, stats_cpu);
  		list_del_init(&tg->stats_alloc_node);
  	}
  
  	empty = list_empty(&tg_stats_alloc_list);
  	spin_unlock_irq(&tg_stats_alloc_lock);
  	if (!empty)
  		goto alloc_stats;
  }
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
  	INIT_LIST_HEAD(&qn->node);
  	bio_list_init(&qn->bios);
  	qn->tg = tg;
  }
  
  /**
   * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
   * @bio: bio being added
   * @qn: qnode to add bio to
   * @queued: the service_queue->queued[] list @qn belongs to
   *
   * Add @bio to @qn and put @qn on @queued if it's not already on.
   * @qn->tg's reference count is bumped when @qn is activated.  See the
   * comment on top of throtl_qnode definition for details.
   */
  static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  				 struct list_head *queued)
  {
  	bio_list_add(&qn->bios, bio);
  	if (list_empty(&qn->node)) {
  		list_add_tail(&qn->node, queued);
  		blkg_get(tg_to_blkg(qn->tg));
  	}
  }
  
  /**
   * throtl_peek_queued - peek the first bio on a qnode list
   * @queued: the qnode list to peek
   */
  static struct bio *throtl_peek_queued(struct list_head *queued)
  {
  	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	bio = bio_list_peek(&qn->bios);
  	WARN_ON_ONCE(!bio);
  	return bio;
  }
  
  /**
   * throtl_pop_queued - pop the first bio form a qnode list
   * @queued: the qnode list to pop a bio from
   * @tg_to_put: optional out argument for throtl_grp to put
   *
   * Pop the first bio from the qnode list @queued.  After popping, the first
   * qnode is removed from @queued if empty or moved to the end of @queued so
   * that the popping order is round-robin.
   *
   * When the first qnode is removed, its associated throtl_grp should be put
   * too.  If @tg_to_put is NULL, this function automatically puts it;
   * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
   * responsible for putting it.
   */
  static struct bio *throtl_pop_queued(struct list_head *queued,
  				     struct throtl_grp **tg_to_put)
  {
  	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	bio = bio_list_pop(&qn->bios);
  	WARN_ON_ONCE(!bio);
  
  	if (bio_list_empty(&qn->bios)) {
  		list_del_init(&qn->node);
  		if (tg_to_put)
  			*tg_to_put = qn->tg;
  		else
  			blkg_put(tg_to_blkg(qn->tg));
  	} else {
  		list_move_tail(&qn->node, queued);
  	}
  
  	return bio;
  }
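  /*
   * Illustrative walk-through: with two qnodes A and B on @queued, each
   * still holding bios, successive calls pop A's first bio (A moves to
   * the tail), then B's first bio (B moves to the tail), then A's next
   * bio, and so on.  A qnode is unlinked, and its throtl_grp reference
   * dropped, only once its bio list becomes empty.
   */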
  /* init a service_queue, assumes the caller zeroed it */
  static void throtl_service_queue_init(struct throtl_service_queue *sq,
  				      struct throtl_service_queue *parent_sq)
  {
  	INIT_LIST_HEAD(&sq->queued[0]);
  	INIT_LIST_HEAD(&sq->queued[1]);
  	sq->pending_tree = RB_ROOT;
  	sq->parent_sq = parent_sq;
  	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
  		    (unsigned long)sq);
  }
  
  static void throtl_service_queue_exit(struct throtl_service_queue *sq)
  {
  	del_timer_sync(&sq->pending_timer);
  }
  static void throtl_pd_init(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	struct throtl_data *td = blkg->q->td;
  	struct throtl_service_queue *parent_sq;
  	unsigned long flags;
  	int rw;

  	/*
  	 * If sane_hierarchy is enabled, we switch to properly hierarchical
  	 * behavior where limits on a given throtl_grp are applied to the
  	 * whole subtree rather than just the group itself.  e.g. If 16M
  	 * read_bps limit is set on the root group, the whole system can't
  	 * exceed 16M for the device.
  	 *
  	 * If sane_hierarchy is not enabled, the broken flat hierarchy
  	 * behavior is retained where all throtl_grps are treated as if
  	 * they're all separate root groups right below throtl_data.
  	 * Limits of a group don't interact with limits of other groups
  	 * regardless of the position of the group in the hierarchy.
  	 */
  	parent_sq = &td->service_queue;
  
  	if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
  		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
  
  	throtl_service_queue_init(&tg->service_queue, parent_sq);
  	for (rw = READ; rw <= WRITE; rw++) {
  		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  	}
  	RB_CLEAR_NODE(&tg->rb_node);
  	tg->td = td;

  	tg->bps[READ] = -1;
  	tg->bps[WRITE] = -1;
  	tg->iops[READ] = -1;
  	tg->iops[WRITE] = -1;
  
  	/*
  	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
  	 * but percpu allocator can't be called from IO path.  Queue tg on
  	 * tg_stats_alloc_list and allocate from work item.
  	 */
  	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
  	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
  	schedule_delayed_work(&tg_stats_alloc_work, 0);
  	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
  }
  /*
   * Set has_rules[] if @tg or any of its parents have limits configured.
   * This doesn't require walking up to the top of the hierarchy as the
   * parent's has_rules[] is guaranteed to be correct.
   */
  static void tg_update_has_rules(struct throtl_grp *tg)
  {
  	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
  	int rw;
  
  	for (rw = READ; rw <= WRITE; rw++)
  		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
  				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
  }
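  /*
   * Example (illustrative): if a parent group has a read bps limit while
   * a newly created child has all limits at -1, the child still gets
   * has_rules[READ] set because the parent's has_rules[READ] is true, so
   * the child's reads cannot escape the ancestor's limit.
   */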
  
  static void throtl_pd_online(struct blkcg_gq *blkg)
  {
  	/*
  	 * We don't want new groups to escape the limits of its ancestors.
  	 * Update has_rules[] after a new group is brought online.
  	 */
  	tg_update_has_rules(blkg_to_tg(blkg));
  }
  static void throtl_pd_exit(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	unsigned long flags;

  	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
  	list_del_init(&tg->stats_alloc_node);
  	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
  
  	free_percpu(tg->stats_cpu);
  
  	throtl_service_queue_exit(&tg->service_queue);
  }
  static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	int cpu;
  
  	if (tg->stats_cpu == NULL)
  		return;
  
  	for_each_possible_cpu(cpu) {
  		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
  
  		blkg_rwstat_reset(&sc->service_bytes);
  		blkg_rwstat_reset(&sc->serviced);
  	}
  }
  static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
  					   struct blkcg *blkcg)
  {
  	/*
  	 * This is the common case when there are no blkcgs.  Avoid lookup
  	 * in this case
  	 */
  	if (blkcg == &blkcg_root)
  		return td_root_tg(td);

  	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
  }
  static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
  						  struct blkcg *blkcg)
  {
  	struct request_queue *q = td->queue;
  	struct throtl_grp *tg = NULL;

  	/*
  	 * This is the common case when there are no blkcgs.  Avoid lookup
  	 * in this case
  	 */
  	if (blkcg == &blkcg_root) {
  		tg = td_root_tg(td);
  	} else {
  		struct blkcg_gq *blkg;

  		blkg = blkg_lookup_create(blkcg, q);

  		/* if %NULL and @q is alive, fall back to root_tg */
  		if (!IS_ERR(blkg))
  			tg = blkg_to_tg(blkg);
  		else if (!blk_queue_dying(q))
  			tg = td_root_tg(td);
  	}
  	return tg;
  }
  static struct throtl_grp *
  throtl_rb_first(struct throtl_service_queue *parent_sq)
  {
  	/* Service tree is empty */
  	if (!parent_sq->nr_pending)
  		return NULL;
  	if (!parent_sq->first_pending)
  		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

  	if (parent_sq->first_pending)
  		return rb_entry_tg(parent_sq->first_pending);
  
  	return NULL;
  }
  
  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
  {
  	rb_erase(n, root);
  	RB_CLEAR_NODE(n);
  }
  static void throtl_rb_erase(struct rb_node *n,
  			    struct throtl_service_queue *parent_sq)
  {
  	if (parent_sq->first_pending == n)
  		parent_sq->first_pending = NULL;
  	rb_erase_init(n, &parent_sq->pending_tree);
  	--parent_sq->nr_pending;
  }
  static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
  {
  	struct throtl_grp *tg;
  	tg = throtl_rb_first(parent_sq);
  	if (!tg)
  		return;
  	parent_sq->first_pending_disptime = tg->disptime;
  }
  static void tg_service_queue_add(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
  	struct rb_node **node = &parent_sq->pending_tree.rb_node;
  	struct rb_node *parent = NULL;
  	struct throtl_grp *__tg;
  	unsigned long key = tg->disptime;
  	int left = 1;
  
  	while (*node != NULL) {
  		parent = *node;
  		__tg = rb_entry_tg(parent);
  
  		if (time_before(key, __tg->disptime))
  			node = &parent->rb_left;
  		else {
  			node = &parent->rb_right;
  			left = 0;
  		}
  	}
  
  	if (left)
  		parent_sq->first_pending = &tg->rb_node;
  
  	rb_link_node(&tg->rb_node, parent, node);
  	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
  }
  static void __throtl_enqueue_tg(struct throtl_grp *tg)
  {
  	tg_service_queue_add(tg);
  	tg->flags |= THROTL_TG_PENDING;
  	tg->service_queue.parent_sq->nr_pending++;
  }
  static void throtl_enqueue_tg(struct throtl_grp *tg)
  {
  	if (!(tg->flags & THROTL_TG_PENDING))
  		__throtl_enqueue_tg(tg);
  }
  static void __throtl_dequeue_tg(struct throtl_grp *tg)
  {
  	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
  	tg->flags &= ~THROTL_TG_PENDING;
  }
  static void throtl_dequeue_tg(struct throtl_grp *tg)
  {
  	if (tg->flags & THROTL_TG_PENDING)
  		__throtl_dequeue_tg(tg);
  }
  /* Call with queue lock held */
  static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  					  unsigned long expires)
  {
  	mod_timer(&sq->pending_timer, expires);
  	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  		   expires - jiffies, jiffies);
  }
  /**
   * throtl_schedule_next_dispatch - schedule the next dispatch cycle
   * @sq: the service_queue to schedule dispatch for
   * @force: force scheduling
   *
   * Arm @sq->pending_timer so that the next dispatch cycle starts on the
   * dispatch time of the first pending child.  Returns %true if either timer
   * is armed or there's no pending child left.  %false if the current
   * dispatch window is still open and the caller should continue
   * dispatching.
   *
   * If @force is %true, the dispatch timer is always scheduled and this
   * function is guaranteed to return %true.  This is to be used when the
   * caller can't dispatch itself and needs to invoke pending_timer
   * unconditionally.  Note that forced scheduling is likely to induce short
   * delay before dispatch starts even if @sq->first_pending_disptime is not
   * in the future and thus shouldn't be used in hot paths.
   */
  static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  					  bool force)
  {
  	/* any pending children left? */
  	if (!sq->nr_pending)
  		return true;

  	update_min_dispatch_time(sq);

  	/* is the next dispatch time in the future? */
  	if (force || time_after(sq->first_pending_disptime, jiffies)) {
  		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  		return true;
  	}
  	/* tell the caller to continue dispatching */
  	return false;
  }
  static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  		bool rw, unsigned long start)
  {
  	tg->bytes_disp[rw] = 0;
  	tg->io_disp[rw] = 0;
  
  	/*
  	 * Previous slice has expired. We must have trimmed it after last
  	 * bio dispatch. That means since start of last slice, we never used
  	 * that bandwidth. Do try to make use of that bandwidth while giving
  	 * credit.
  	 */
  	if (time_after_eq(start, tg->slice_start[rw]))
  		tg->slice_start[rw] = start;
  
  	tg->slice_end[rw] = jiffies + throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
  {
  	tg->bytes_disp[rw] = 0;
  	tg->io_disp[rw] = 0;
  	tg->slice_start[rw] = jiffies;
  	tg->slice_end[rw] = jiffies + throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  					unsigned long jiffy_end)
  {
  	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  }
  static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  				       unsigned long jiffy_end)
  {
  	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  	throtl_log(&tg->service_queue,
  		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  
  /* Determine if previously allocated or extended slice is complete or not */
  static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
  {
  	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
  		return 0;
  
  	return 1;
  }
  
  /* Trim the used slices and adjust slice start accordingly */
  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
  {
  	unsigned long nr_slices, time_elapsed, io_trim;
  	u64 bytes_trim, tmp;
  
  	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  
  	/*
  	 * If bps are unlimited (-1), then the time slice doesn't get
  	 * renewed. Don't try to trim the slice if the slice has already
  	 * expired. A new slice will start when appropriate.
  	 */
  	if (throtl_slice_used(tg, rw))
  		return;
  	/*
  	 * A bio has been dispatched. Also adjust slice_end. It might happen
  	 * that initially cgroup limit was very low resulting in high
  	 * slice_end, but later limit was bumped up and bio was dispatched
  	 * sooner, then we need to reduce slice_end. A high bogus slice_end
  	 * is bad because it does not allow new slice to start.
  	 */
  	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

  	time_elapsed = jiffies - tg->slice_start[rw];
  
  	nr_slices = time_elapsed / throtl_slice;
  
  	if (!nr_slices)
  		return;
  	tmp = tg->bps[rw] * throtl_slice * nr_slices;
  	do_div(tmp, HZ);
  	bytes_trim = tmp;

  	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

  	if (!bytes_trim && !io_trim)
  		return;
  
  	if (tg->bytes_disp[rw] >= bytes_trim)
  		tg->bytes_disp[rw] -= bytes_trim;
  	else
  		tg->bytes_disp[rw] = 0;
  	if (tg->io_disp[rw] >= io_trim)
  		tg->io_disp[rw] -= io_trim;
  	else
  		tg->io_disp[rw] = 0;
  	tg->slice_start[rw] += nr_slices * throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
  }
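  /*
   * Worked example (illustrative, assuming HZ == 1000 and
   * bps[rw] == 1048576): each expired 100ms slice trims
   * 1048576 * 100 / 1000 = ~104857 bytes from bytes_disp[rw] and advances
   * slice_start[rw] by one throtl_slice, so budget left unused in old
   * slices is not carried forward indefinitely.
   */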
  static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
  				  unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	unsigned int io_allowed;
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  	u64 tmp;

  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
  		jiffy_elapsed_rnd = throtl_slice;
  
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  	/*
  	 * jiffy_elapsed_rnd should not be a big value: the minimum iops can be
  	 * 1, so at most the elapsed time is equivalent to 1 second, as we
  	 * will allow dispatch after 1 second and after that the slice should
  	 * have been trimmed.
  	 */
  
  	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
  	do_div(tmp, HZ);
  
  	if (tmp > UINT_MAX)
  		io_allowed = UINT_MAX;
  	else
  		io_allowed = tmp;
  
  	if (tg->io_disp[rw] + 1 <= io_allowed) {
  		if (wait)
  			*wait = 0;
  		return 1;
  	}
  	/* Calc approx time to dispatch */
  	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
  
  	if (jiffy_wait > jiffy_elapsed)
  		jiffy_wait = jiffy_wait - jiffy_elapsed;
  	else
  		jiffy_wait = 1;
  
  	if (wait)
  		*wait = jiffy_wait;
  	return 0;
  }
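  /*
   * Worked example (illustrative, assuming HZ == 1000): with iops[rw] == 10
   * and 50 jiffies elapsed in the slice, the elapsed time rounds up to
   * 100 jiffies, so io_allowed = 10 * 100 / 1000 = 1.  A second bio in the
   * same window then waits roughly (2 * 1000) / 10 + 1 - 50 = 151 jiffies.
   */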
  static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
  				 unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	u64 bytes_allowed, extra_bytes, tmp;
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  
  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  
  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
  		jiffy_elapsed_rnd = throtl_slice;
  
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
  	do_div(tmp, HZ);
  	bytes_allowed = tmp;

  	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
  		if (wait)
  			*wait = 0;
  		return 1;
  	}
  
  	/* Calc approx time to dispatch */
  	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
  	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
  
  	if (!jiffy_wait)
  		jiffy_wait = 1;
  
  	/*
  	 * This wait time is without taking into consideration the rounding
  	 * up we did. Add that time also.
  	 */
  	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
  	if (wait)
  		*wait = jiffy_wait;
  	return 0;
  }
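  /*
   * Worked example (illustrative, assuming HZ == 1000 and
   * bps[rw] == 1048576): 100 jiffies into the slice, bytes_allowed is
   * 1048576 * 100 / 1000 = ~104857 bytes.  A queued 256KiB bio exceeds
   * that, so extra_bytes = ~157287 and the bio waits about
   * 157287 * 1000 / 1048576 = ~150 jiffies before it can be dispatched.
   */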
  
  /*
   * Returns whether one can dispatch a bio or not. Also returns approx number
   * of jiffies to wait before this bio is with-in IO rate and can be dispatched
   */
  static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  			    unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  
  	/*
  	 * Currently the whole state machine of the group depends on the first
  	 * bio queued in the group's bio list. So one should not be calling
  	 * this function with a different bio if there are other bios
  	 * queued.
  	 */
  	BUG_ON(tg->service_queue.nr_queued[rw] &&
  	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

  	/* If tg->bps = -1, then BW is unlimited */
  	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
  		if (wait)
  			*wait = 0;
  		return 1;
  	}
  
  	/*
  	 * If previous slice expired, start a new one otherwise renew/extend
  	 * existing slice to make sure it is at least throtl_slice interval
  	 * long since now.
  	 */
  	if (throtl_slice_used(tg, rw))
  		throtl_start_new_slice(tg, rw);
  	else {
  		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
  			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
  	}
  	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
  	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
  		if (wait)
  			*wait = 0;
  		return 1;
  	}
  
  	max_wait = max(bps_wait, iops_wait);
  
  	if (wait)
  		*wait = max_wait;
  
  	if (time_before(tg->slice_end[rw], jiffies + max_wait))
  		throtl_extend_slice(tg, rw, jiffies + max_wait);
  
  	return 0;
  }
  static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
  					 int rw)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	struct tg_stats_cpu *stats_cpu;
  	unsigned long flags;
  
  	/* If per cpu stats are not allocated yet, don't do any accounting. */
  	if (tg->stats_cpu == NULL)
  		return;
  
  	/*
  	 * Disabling interrupts to provide mutual exclusion between two
  	 * writes on same cpu. It probably is not needed for 64bit. Not
  	 * optimizing that case yet.
  	 */
  	local_irq_save(flags);
  	stats_cpu = this_cpu_ptr(tg->stats_cpu);

  	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
  	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
  
  	local_irq_restore(flags);
  }
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
  	bool rw = bio_data_dir(bio);
  
  	/* Charge the bio to the group */
  	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
  	tg->io_disp[rw]++;

  	/*
  	 * REQ_THROTTLED is used to prevent the same bio to be throttled
  	 * more than once as a throttled bio will go through blk-throtl the
  	 * second time when it eventually gets issued.  Set it when a bio
  	 * is being charged to a tg.
  	 *
  	 * Dispatch stats aren't recursive and each @bio should only be
  	 * accounted by the @tg it was originally associated with.  Let's
  	 * update the stats when setting REQ_THROTTLED for the first time
  	 * which is guaranteed to be for the @bio's original tg.
  	 */
  	if (!(bio->bi_rw & REQ_THROTTLED)) {
  		bio->bi_rw |= REQ_THROTTLED;
  		throtl_update_dispatch_stats(tg_to_blkg(tg),
  					     bio->bi_iter.bi_size, bio->bi_rw);
  	}
  }
  /**
   * throtl_add_bio_tg - add a bio to the specified throtl_grp
   * @bio: bio to add
   * @qn: qnode to use
   * @tg: the target throtl_grp
   *
   * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
   * tg->qnode_on_self[] is used.
   */
  static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  			      struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	bool rw = bio_data_dir(bio);
  	if (!qn)
  		qn = &tg->qnode_on_self[rw];
  	/*
  	 * If @tg doesn't currently have any bios queued in the same
  	 * direction, queueing @bio can change when @tg should be
  	 * dispatched.  Mark that @tg was empty.  This is automatically
  	 * cleared on the next tg_update_disptime().
  	 */
  	if (!sq->nr_queued[rw])
  		tg->flags |= THROTL_TG_WAS_EMPTY;
  	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
  	sq->nr_queued[rw]++;
  	throtl_enqueue_tg(tg);
  }
  static void tg_update_disptime(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  	struct bio *bio;
  	if ((bio = throtl_peek_queued(&sq->queued[READ])))
  		tg_may_dispatch(tg, bio, &read_wait);

  	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
  		tg_may_dispatch(tg, bio, &write_wait);
  
  	min_wait = min(read_wait, write_wait);
  	disptime = jiffies + min_wait;
  	/* Update dispatch time */
  	throtl_dequeue_tg(tg);
  	tg->disptime = disptime;
  	throtl_enqueue_tg(tg);
  
  	/* see throtl_add_bio_tg() */
  	tg->flags &= ~THROTL_TG_WAS_EMPTY;
  }
  static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  					struct throtl_grp *parent_tg, bool rw)
  {
  	if (throtl_slice_used(parent_tg, rw)) {
  		throtl_start_new_slice_with_credit(parent_tg, rw,
  				child_tg->slice_start[rw]);
  	}
  
  }
  static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct throtl_service_queue *parent_sq = sq->parent_sq;
  	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  	struct throtl_grp *tg_to_put = NULL;
  	struct bio *bio;
  	/*
  	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
  	 * from @tg may put its reference and @parent_sq might end up
  	 * getting released prematurely.  Remember the tg to put and put it
  	 * after @bio is transferred to @parent_sq.
  	 */
  	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
  	sq->nr_queued[rw]--;
  
  	throtl_charge_bio(tg, bio);
  
  	/*
  	 * If our parent is another tg, we just need to transfer @bio to
  	 * the parent using throtl_add_bio_tg().  If our parent is
  	 * @td->service_queue, @bio is ready to be issued.  Put it on its
  	 * bio_lists[] and decrease total number queued.  The caller is
  	 * responsible for issuing these bios.
  	 */
  	if (parent_tg) {
  		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
  		start_parent_slice_with_credit(tg, parent_tg, rw);
  	} else {
  		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
  				     &parent_sq->queued[rw]);
  		BUG_ON(tg->td->nr_queued[rw] <= 0);
  		tg->td->nr_queued[rw]--;
  	}

  	throtl_trim_slice(tg, rw);

  	if (tg_to_put)
  		blkg_put(tg_to_blkg(tg_to_put));
  }
  static int throtl_dispatch_tg(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	unsigned int nr_reads = 0, nr_writes = 0;
  	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
  	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
  	struct bio *bio;
  
  	/* Try to dispatch 75% READS and 25% WRITES */
  	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
  	       tg_may_dispatch(tg, bio, NULL)) {

  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
  		nr_reads++;
  
  		if (nr_reads >= max_nr_reads)
  			break;
  	}
  	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
  	       tg_may_dispatch(tg, bio, NULL)) {

  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
  		nr_writes++;
  
  		if (nr_writes >= max_nr_writes)
  			break;
  	}
  
  	return nr_reads + nr_writes;
  }
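  /*
   * Illustrative numbers: with throtl_grp_quantum == 8, max_nr_reads is
   * 8 * 3 / 4 = 6 and max_nr_writes is 8 - 6 = 2, so one call moves at
   * most 6 reads and 2 writes from this group to its parent, giving the
   * 75%/25% split noted above.
   */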
  static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
  {
  	unsigned int nr_disp = 0;
  
  	while (1) {
  		struct throtl_grp *tg = throtl_rb_first(parent_sq);
  		struct throtl_service_queue *sq = &tg->service_queue;
  
  		if (!tg)
  			break;
  
  		if (time_before(jiffies, tg->disptime))
  			break;
  		throtl_dequeue_tg(tg);

  		nr_disp += throtl_dispatch_tg(tg);

  		if (sq->nr_queued[0] || sq->nr_queued[1])
  			tg_update_disptime(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1089
1090
1091
1092
1093
1094
1095
  
  		if (nr_disp >= throtl_quantum)
  			break;
  	}
  
  	return nr_disp;
  }
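
/*
 * For example, with the default throtl_quantum of 32 and throtl_grp_quantum
 * of 8, one call to throtl_select_dispatch() services at most four fully
 * backlogged groups before ending the round and letting
 * throtl_schedule_next_dispatch() decide whether to run again.
 */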

/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @arg: the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bio's becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bio's from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bio's are issued.
 */
static void throtl_pending_timer_fn(unsigned long arg)
{
	struct throtl_service_queue *sq = (void *)arg;
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct request_queue *q = td->queue;
	struct throtl_service_queue *parent_sq;
	bool dispatched;
	int ret;

	spin_lock_irq(q->queue_lock);
again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(q->queue_lock);
		cpu_relax();
		spin_lock_irq(q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(q->queue_lock);
}
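
/*
 * To illustrate the propagation: each pending_timer expiration moves ready
 * bio's up one level, from the children throtl_grps into the owning
 * service_queue.  If that service_queue itself belongs to a throtl_grp,
 * the parent's timer (or the "again:" loop above, when the parent's window
 * is already open) repeats the step, until the bio's reach
 * td->service_queue and blk_throtl_dispatch_work_fn() finally issues them.
 */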

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bio's reach the queued[]
 * lists of throtl_data->service_queue.  Those bio's are ready and issued
 * by this function.
 */
void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	spin_lock_irq(q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
		       const char *buf, bool is_u64)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_service_queue *sq;
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	sq = &tg->service_queue;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	throtl_log(&tg->service_queue,
		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		   tg->bps[READ], tg->bps[WRITE],
		   tg->iops[READ], tg->iops[WRITE]);

	/*
	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
	 * considered to have rules if either the tg itself or any of its
	 * ancestors has rules.  This identifies groups without any
	 * restrictions in the whole hierarchy and allows them to bypass
	 * blk-throttle.
	 */
	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
		tg_update_has_rules(blkg_to_tg(blkg));

	/*
	 * We're already holding queue_lock and know @tg is valid.  Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITE.  It might happen
	 * that a group's limits are dropped suddenly and we don't want to
	 * account recently dispatched IO with the new low rate.
	 */
	throtl_start_new_slice(tg, READ);
	throtl_start_new_slice(tg, WRITE);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(sq->parent_sq, true);
	}

	blkg_conf_finish(&ctx);
	return 0;
}
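
/*
 * For example (device numbers and values purely illustrative): writing
 * "8:16 1048576" to blkio.throttle.read_bps_device reaches
 * tg_set_conf_u64() below with cft->private == offsetof(struct throtl_grp,
 * bps[READ]), so the parsed value 1048576 is stored straight into
 * tg->bps[READ] of the throtl_grp for device 8:16 in the writing cgroup;
 * a value of 0 is stored as -1, meaning "no limit".
 */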

static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			   char *buf)
{
	return tg_set_conf(css, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
			    char *buf)
{
	return tg_set_conf(css, cft, buf, false);
}
  
static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.seq_show = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.seq_show = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.seq_show = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.seq_show = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.seq_show = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.seq_show = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};
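
/*
 * Example userspace usage of these files (device numbers, values and the
 * cgroup mount path are illustrative only); the blkio controller prefixes
 * each name with "blkio.":
 *
 *	# cap reads on /dev/sdb (8:16) to 1 MB/s for cgroup "grp1"
 *	echo "8:16 1048576" > /sys/fs/cgroup/blkio/grp1/blkio.throttle.read_bps_device
 *
 *	# cap writes to 100 IOPS; writing 0 removes the limit
 *	echo "8:16 100" > /sys/fs/cgroup/blkio/grp1/blkio.throttle.write_iops_device
 */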

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_work_sync(&td->dispatch_work);
}

static struct blkcg_policy blkcg_policy_throtl = {
	.pd_size		= sizeof(struct throtl_grp),
	.cftypes		= throtl_files,

	.pd_init_fn		= throtl_pd_init,
	.pd_online_fn		= throtl_pd_online,
	.pd_exit_fn		= throtl_pd_exit,
	.pd_reset_stats_fn	= throtl_pd_reset_stats,
};

bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_qnode *qn = NULL;
	struct throtl_grp *tg;
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	struct blkcg *blkcg;
	bool throttled = false;

	/* see throtl_charge_bio() */
	if (bio->bi_rw & REQ_THROTTLED)
		goto out;

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (!tg->has_rules[rw]) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
					bio->bi_iter.bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	sq = &tg->service_queue;

	while (true) {
		/* throtl is FIFO - if bios are already queued, should queue */
		if (sq->nr_queued[rw])
			break;

		/* if above limits, break to queue */
		if (!tg_may_dispatch(tg, bio, NULL))
			break;

		/* within limits, let's charge and dispatch directly */
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at the
		 * new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(tg, rw);

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder.  If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg)
			goto out_unlock;
	}

	/* out-of-limit, queue to @tg */
	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
		   rw == READ ? 'R' : 'W',
		   tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
		   tg->io_disp[rw], tg->iops[rw],
		   sq->nr_queued[READ], sq->nr_queued[WRITE]);

	bio_associate_current(bio);
	tg->td->nr_queued[rw]++;
	throtl_add_bio_tg(bio, qn, tg);
	throttled = true;

	/*
	 * Update @tg's dispatch time and force schedule dispatch if @tg
	 * was empty before @bio.  The forced scheduling isn't likely to
	 * cause undue delay as @bio is likely to be dispatched directly if
	 * its @tg's disptime is not in the future.
	 */
	if (tg->flags & THROTL_TG_WAS_EMPTY) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	/*
	 * As multiple blk-throtls may stack in the same issue path, we
	 * don't want bios to leave with the flag set.  Clear the flag if
	 * being issued.
	 */
	if (!throttled)
		bio->bi_rw &= ~REQ_THROTTLED;

	return throttled;
}
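
/*
 * A short walk-through of the loop above (cgroup names illustrative): a
 * bio issued in /parent/child is first checked against the child's limits;
 * if it fits, it is charged there and the loop climbs to the parent's
 * service_queue.  The first level whose limit it exceeds (or that already
 * has bios queued) becomes the @tg it is queued on; if it clears every
 * level it reaches the top and is issued without being throttled.
 */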

/*
 * Dispatch all bios from all children tg's queued on @parent_sq.  On
 * return, @parent_sq is guaranteed to not have any active children tg's
 * and all bios from previously active tg's are on @parent_sq->queued[].
 */
static void tg_drain_bios(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	while ((tg = throtl_rb_first(parent_sq))) {
		struct throtl_service_queue *sq = &tg->service_queue;
		struct bio *bio;

		throtl_dequeue_tg(tg);

		while ((bio = throtl_peek_queued(&sq->queued[READ])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio));
		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio));
	}
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	struct bio *bio;
	int rw;

	queue_lockdep_assert_held(q);
	rcu_read_lock();

	/*
	 * Drain each tg while doing post-order walk on the blkg tree, so
	 * that all bios are propagated to td->service_queue.  It'd be
	 * better to walk service_queue tree directly but blkg walk is
	 * easier.
	 */
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);

	/* finally, transfer bios from top-level tg's into the td */
	tg_drain_bios(&td->service_queue);

	rcu_read_unlock();
	spin_unlock_irq(q->queue_lock);

	/* all bios now should be in td->service_queue, issue them */
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
						NULL)))
			generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue, NULL);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}
  
  void blk_throtl_exit(struct request_queue *q)
  {
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	kfree(q->td);
}
  
  static int __init throtl_init(void)
  {
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);