block/blk-throttle.c
  /*
   * Interface for controlling IO bandwidth on a request queue
   *
   * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   */
  
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/blktrace_api.h>
  #include <linux/blk-cgroup.h>
  #include "blk.h"
  
  /* Max dispatch from a group in 1 round */
  static int throtl_grp_quantum = 8;
  
  /* Total max dispatch from all groups in one round */
  static int throtl_quantum = 32;
  
  /* Throttling is performed over 100ms slice and after that slice is renewed */
  static unsigned long throtl_slice = HZ/10;	/* 100 ms */
  static struct blkcg_policy blkcg_policy_throtl;

  /* A workqueue to queue throttle related work */
  static struct workqueue_struct *kthrotld_workqueue;

  /*
   * To implement hierarchical throttling, throtl_grps form a tree and bios
   * are dispatched upwards level by level until they reach the top and get
   * issued.  When dispatching bios from the children and local group at each
   * level, if the bios are dispatched into a single bio_list, there's a risk
 * that a local or child group which can queue many bios at once fills up
 * the list, starving others.
   *
   * To avoid such starvation, dispatched bios are queued separately
   * according to where they came from.  When they are again dispatched to
   * the parent, they're popped in round-robin order so that no single source
   * hogs the dispatch window.
   *
   * throtl_qnode is used to keep the queued bios separated by their sources.
   * Bios are queued to throtl_qnode which in turn is queued to
   * throtl_service_queue and then dispatched in round-robin order.
   *
   * It's also used to track the reference counts on blkg's.  A qnode always
   * belongs to a throtl_grp and gets queued on itself or the parent, so
   * incrementing the reference of the associated throtl_grp when a qnode is
   * queued and decrementing when dequeued is enough to keep the whole blkg
   * tree pinned while bios are in flight.
   */
  struct throtl_qnode {
  	struct list_head	node;		/* service_queue->queued[] */
  	struct bio_list		bios;		/* queued bios */
  	struct throtl_grp	*tg;		/* tg this qnode belongs to */
  };
  struct throtl_service_queue {
  	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
  	/*
  	 * Bios queued directly to this service_queue or dispatched from
  	 * children throtl_grp's.
  	 */
  	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
  	unsigned int		nr_queued[2];	/* number of queued bios */
  
  	/*
  	 * RB tree of active children throtl_grp's, which are sorted by
  	 * their ->disptime.
  	 */
  	struct rb_root		pending_tree;	/* RB tree of active tgs */
  	struct rb_node		*first_pending;	/* first node in the tree */
  	unsigned int		nr_pending;	/* # queued in the tree */
  	unsigned long		first_pending_disptime;	/* disptime of the first tg */
  	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
  };
  enum tg_state_flags {
  	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
  	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
  };
  #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
  /* Per-cpu group stats */
  struct tg_stats_cpu {
  	/* total bytes transferred */
  	struct blkg_rwstat		service_bytes;
  	/* total IOs serviced, post merge */
  	struct blkg_rwstat		serviced;
  };
  struct throtl_grp {
  	/* must be the first member */
  	struct blkg_policy_data pd;
  	/* active throtl group service_queue member */
  	struct rb_node rb_node;
  	/* throtl_data this group belongs to */
  	struct throtl_data *td;
  	/* this group's service queue */
  	struct throtl_service_queue service_queue;
  	/*
  	 * qnode_on_self is used when bios are directly queued to this
  	 * throtl_grp so that local bios compete fairly with bios
  	 * dispatched from children.  qnode_on_parent is used when bios are
  	 * dispatched from this throtl_grp into its parent and will compete
  	 * with the sibling qnode_on_parents and the parent's
  	 * qnode_on_self.
  	 */
  	struct throtl_qnode qnode_on_self[2];
  	struct throtl_qnode qnode_on_parent[2];
  
  	/*
  	 * Dispatch time in jiffies. This is the estimated time when group
  	 * will unthrottle and is ready to dispatch more bio. It is used as
  	 * key to sort active groups in service tree.
  	 */
  	unsigned long disptime;
  	unsigned int flags;
  	/* are there any throtl rules between this group and td? */
  	bool has_rules[2];
  	/* bytes per second rate limits */
  	uint64_t bps[2];
  	/* IOPS limits */
  	unsigned int iops[2];
	/* Number of bytes dispatched in current slice */
  	uint64_t bytes_disp[2];
  	/* Number of bio's dispatched in current slice */
  	unsigned int io_disp[2];
  
  	/* When did we start a new slice */
  	unsigned long slice_start[2];
  	unsigned long slice_end[2];

  	/* Per cpu stats pointer */
  	struct tg_stats_cpu __percpu *stats_cpu;
  
  	/* List of tgs waiting for per cpu stats memory to be allocated */
  	struct list_head stats_alloc_node;
  };
  
  struct throtl_data
  {
  	/* service tree for active throtl groups */
  	struct throtl_service_queue service_queue;

  	struct request_queue *queue;
  
  	/* Total Number of queued bios on READ and WRITE lists */
  	unsigned int nr_queued[2];
  
  	/*
  	 * number of total undestroyed groups
  	 */
  	unsigned int nr_undestroyed_grps;
  
  	/* Work for dispatching throttled bios */
  	struct work_struct dispatch_work;
  };
  /* list and work item to allocate percpu group stats */
  static DEFINE_SPINLOCK(tg_stats_alloc_lock);
  static LIST_HEAD(tg_stats_alloc_list);
  
  static void tg_stats_alloc_fn(struct work_struct *);
  static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
  static void throtl_pending_timer_fn(unsigned long arg);
  static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
  {
  	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
  }
  static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
  {
  	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
  }
  static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
  {
  	return pd_to_blkg(&tg->pd);
  }
  static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
  {
  	return blkg_to_tg(td->queue->root_blkg);
  }
  /**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
   * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
   * embedded in throtl_data, %NULL is returned.
   */
  static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  {
  	if (sq && sq->parent_sq)
  		return container_of(sq, struct throtl_grp, service_queue);
  	else
  		return NULL;
  }
  
  /**
   * sq_to_td - return throtl_data the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
   * Determine the associated throtl_data accordingly and return it.
   */
  static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  {
  	struct throtl_grp *tg = sq_to_tg(sq);
  
  	if (tg)
  		return tg->td;
  	else
  		return container_of(sq, struct throtl_data, service_queue);
  }
  
  /**
   * throtl_log - log debug message via blktrace
   * @sq: the service_queue being reported
   * @fmt: printf format string
   * @args: printf args
   *
   * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
   * throtl_grp; otherwise, just "throtl".
   *
   * TODO: this should be made a function and name formatting should happen
   * after testing whether blktrace is enabled.
   */
  #define throtl_log(sq, fmt, args...)	do {				\
  	struct throtl_grp *__tg = sq_to_tg((sq));			\
  	struct throtl_data *__td = sq_to_td((sq));			\
  									\
  	(void)__td;							\
  	if ((__tg)) {							\
  		char __pbuf[128];					\
  									\
  		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
  		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
  	} else {							\
  		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
  	}								\
  } while (0)

  static void tg_stats_init(struct tg_stats_cpu *tg_stats)
  {
  	blkg_rwstat_init(&tg_stats->service_bytes);
  	blkg_rwstat_init(&tg_stats->serviced);
  }
  /*
   * Worker for allocating per cpu stat for tgs. This is scheduled on the
   * system_wq once there are some groups on the alloc_list waiting for
   * allocation.
   */
  static void tg_stats_alloc_fn(struct work_struct *work)
  {
  	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
  	struct delayed_work *dwork = to_delayed_work(work);
  	bool empty = false;
  
  alloc_stats:
  	if (!stats_cpu) {
  		int cpu;
  		stats_cpu = alloc_percpu(struct tg_stats_cpu);
  		if (!stats_cpu) {
  			/* allocation failed, try again after some time */
  			schedule_delayed_work(dwork, msecs_to_jiffies(10));
  			return;
  		}
  		for_each_possible_cpu(cpu)
  			tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
  	}
  
  	spin_lock_irq(&tg_stats_alloc_lock);
  
  	if (!list_empty(&tg_stats_alloc_list)) {
  		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
  							 struct throtl_grp,
  							 stats_alloc_node);
  		swap(tg->stats_cpu, stats_cpu);
  		list_del_init(&tg->stats_alloc_node);
  	}
  
  	empty = list_empty(&tg_stats_alloc_list);
  	spin_unlock_irq(&tg_stats_alloc_lock);
  	if (!empty)
  		goto alloc_stats;
  }
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
  	INIT_LIST_HEAD(&qn->node);
  	bio_list_init(&qn->bios);
  	qn->tg = tg;
  }
  
  /**
   * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
   * @bio: bio being added
   * @qn: qnode to add bio to
   * @queued: the service_queue->queued[] list @qn belongs to
   *
   * Add @bio to @qn and put @qn on @queued if it's not already on.
   * @qn->tg's reference count is bumped when @qn is activated.  See the
   * comment on top of throtl_qnode definition for details.
   */
  static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  				 struct list_head *queued)
  {
  	bio_list_add(&qn->bios, bio);
  	if (list_empty(&qn->node)) {
  		list_add_tail(&qn->node, queued);
  		blkg_get(tg_to_blkg(qn->tg));
  	}
  }
  
  /**
   * throtl_peek_queued - peek the first bio on a qnode list
   * @queued: the qnode list to peek
   */
  static struct bio *throtl_peek_queued(struct list_head *queued)
  {
  	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	bio = bio_list_peek(&qn->bios);
  	WARN_ON_ONCE(!bio);
  	return bio;
  }
  
  /**
 * throtl_pop_queued - pop the first bio from a qnode list
   * @queued: the qnode list to pop a bio from
   * @tg_to_put: optional out argument for throtl_grp to put
   *
   * Pop the first bio from the qnode list @queued.  After popping, the first
   * qnode is removed from @queued if empty or moved to the end of @queued so
   * that the popping order is round-robin.
   *
   * When the first qnode is removed, its associated throtl_grp should be put
   * too.  If @tg_to_put is NULL, this function automatically puts it;
   * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
   * responsible for putting it.
   */
  static struct bio *throtl_pop_queued(struct list_head *queued,
  				     struct throtl_grp **tg_to_put)
  {
  	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	bio = bio_list_pop(&qn->bios);
  	WARN_ON_ONCE(!bio);
  
  	if (bio_list_empty(&qn->bios)) {
  		list_del_init(&qn->node);
  		if (tg_to_put)
  			*tg_to_put = qn->tg;
  		else
  			blkg_put(tg_to_blkg(qn->tg));
  	} else {
  		list_move_tail(&qn->node, queued);
  	}
  
  	return bio;
  }
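
/*
 * Illustration (editor's note, not part of the original source): because a
 * still-non-empty qnode is rotated to the tail after each pop, two qnodes A
 * and B that each hold several bios are drained in round-robin order, e.g.
 * A1, B1, A2, B2, ..., so no single source hogs the parent's dispatch window.
 */
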
  /* init a service_queue, assumes the caller zeroed it */
  static void throtl_service_queue_init(struct throtl_service_queue *sq,
  				      struct throtl_service_queue *parent_sq)
  {
  	INIT_LIST_HEAD(&sq->queued[0]);
  	INIT_LIST_HEAD(&sq->queued[1]);
  	sq->pending_tree = RB_ROOT;
  	sq->parent_sq = parent_sq;
  	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
  		    (unsigned long)sq);
  }
  
  static void throtl_service_queue_exit(struct throtl_service_queue *sq)
  {
  	del_timer_sync(&sq->pending_timer);
  }
  static void throtl_pd_init(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	struct throtl_data *td = blkg->q->td;
  	struct throtl_service_queue *parent_sq;
  	unsigned long flags;
  	int rw;

  	/*
  	 * If on the default hierarchy, we switch to properly hierarchical
  	 * behavior where limits on a given throtl_grp are applied to the
  	 * whole subtree rather than just the group itself.  e.g. If 16M
  	 * read_bps limit is set on the root group, the whole system can't
  	 * exceed 16M for the device.
  	 *
  	 * If not on the default hierarchy, the broken flat hierarchy
  	 * behavior is retained where all throtl_grps are treated as if
  	 * they're all separate root groups right below throtl_data.
  	 * Limits of a group don't interact with limits of other groups
  	 * regardless of the position of the group in the hierarchy.
  	 */
  	parent_sq = &td->service_queue;
  	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
  		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
  
  	throtl_service_queue_init(&tg->service_queue, parent_sq);
  	for (rw = READ; rw <= WRITE; rw++) {
  		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  	}
  	RB_CLEAR_NODE(&tg->rb_node);
  	tg->td = td;

  	tg->bps[READ] = -1;
  	tg->bps[WRITE] = -1;
  	tg->iops[READ] = -1;
  	tg->iops[WRITE] = -1;
  
  	/*
  	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
  	 * but percpu allocator can't be called from IO path.  Queue tg on
  	 * tg_stats_alloc_list and allocate from work item.
  	 */
  	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
  	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
  	schedule_delayed_work(&tg_stats_alloc_work, 0);
  	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
  }
  /*
   * Set has_rules[] if @tg or any of its parents have limits configured.
   * This doesn't require walking up to the top of the hierarchy as the
   * parent's has_rules[] is guaranteed to be correct.
   */
  static void tg_update_has_rules(struct throtl_grp *tg)
  {
  	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
  	int rw;
  
  	for (rw = READ; rw <= WRITE; rw++)
  		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
  				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
  }
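
/*
 * Example (editor's note): if only the parent has a read bps limit
 * configured, a child with no limits of its own still ends up with
 * has_rules[READ] set, so its bios keep going through the throttling
 * machinery instead of bypassing it.
 */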
  
  static void throtl_pd_online(struct blkcg_gq *blkg)
  {
  	/*
	 * We don't want new groups to escape the limits of their ancestors.
  	 * Update has_rules[] after a new group is brought online.
  	 */
  	tg_update_has_rules(blkg_to_tg(blkg));
  }
  static void throtl_pd_exit(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	unsigned long flags;

  	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
  	list_del_init(&tg->stats_alloc_node);
  	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
  
  	free_percpu(tg->stats_cpu);
  
  	throtl_service_queue_exit(&tg->service_queue);
  }
  static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	int cpu;
  
  	if (tg->stats_cpu == NULL)
  		return;
  
  	for_each_possible_cpu(cpu) {
  		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
  
  		blkg_rwstat_reset(&sc->service_bytes);
  		blkg_rwstat_reset(&sc->serviced);
  	}
  }
  static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
  					   struct blkcg *blkcg)
  {
  	/*
  	 * This is the common case when there are no blkcgs.  Avoid lookup
  	 * in this case
  	 */
  	if (blkcg == &blkcg_root)
  		return td_root_tg(td);

  	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
  }
  static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
  						  struct blkcg *blkcg)
  {
  	struct request_queue *q = td->queue;
  	struct throtl_grp *tg = NULL;

  	/*
  	 * This is the common case when there are no blkcgs.  Avoid lookup
  	 * in this case
  	 */
  	if (blkcg == &blkcg_root) {
  		tg = td_root_tg(td);
  	} else {
  		struct blkcg_gq *blkg;

  		blkg = blkg_lookup_create(blkcg, q);

  		/* if %NULL and @q is alive, fall back to root_tg */
  		if (!IS_ERR(blkg))
  			tg = blkg_to_tg(blkg);
  		else if (!blk_queue_dying(q))
  			tg = td_root_tg(td);
  	}
  	return tg;
  }
  static struct throtl_grp *
  throtl_rb_first(struct throtl_service_queue *parent_sq)
  {
  	/* Service tree is empty */
  	if (!parent_sq->nr_pending)
  		return NULL;
  	if (!parent_sq->first_pending)
  		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

  	if (parent_sq->first_pending)
  		return rb_entry_tg(parent_sq->first_pending);
  
  	return NULL;
  }
  
  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
  {
  	rb_erase(n, root);
  	RB_CLEAR_NODE(n);
  }
  static void throtl_rb_erase(struct rb_node *n,
  			    struct throtl_service_queue *parent_sq)
  {
  	if (parent_sq->first_pending == n)
  		parent_sq->first_pending = NULL;
  	rb_erase_init(n, &parent_sq->pending_tree);
  	--parent_sq->nr_pending;
  }
  static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
  {
  	struct throtl_grp *tg;
  	tg = throtl_rb_first(parent_sq);
  	if (!tg)
  		return;
  	parent_sq->first_pending_disptime = tg->disptime;
  }
  static void tg_service_queue_add(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
  	struct rb_node **node = &parent_sq->pending_tree.rb_node;
  	struct rb_node *parent = NULL;
  	struct throtl_grp *__tg;
  	unsigned long key = tg->disptime;
  	int left = 1;
  
  	while (*node != NULL) {
  		parent = *node;
  		__tg = rb_entry_tg(parent);
  
  		if (time_before(key, __tg->disptime))
  			node = &parent->rb_left;
  		else {
  			node = &parent->rb_right;
  			left = 0;
  		}
  	}
  
  	if (left)
  		parent_sq->first_pending = &tg->rb_node;
  
  	rb_link_node(&tg->rb_node, parent, node);
  	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
  }
  static void __throtl_enqueue_tg(struct throtl_grp *tg)
  {
  	tg_service_queue_add(tg);
  	tg->flags |= THROTL_TG_PENDING;
  	tg->service_queue.parent_sq->nr_pending++;
  }
  static void throtl_enqueue_tg(struct throtl_grp *tg)
  {
  	if (!(tg->flags & THROTL_TG_PENDING))
  		__throtl_enqueue_tg(tg);
  }
  static void __throtl_dequeue_tg(struct throtl_grp *tg)
  {
  	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
  	tg->flags &= ~THROTL_TG_PENDING;
  }
  static void throtl_dequeue_tg(struct throtl_grp *tg)
  {
  	if (tg->flags & THROTL_TG_PENDING)
  		__throtl_dequeue_tg(tg);
  }
  /* Call with queue lock held */
  static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  					  unsigned long expires)
  {
  	mod_timer(&sq->pending_timer, expires);
  	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  		   expires - jiffies, jiffies);
  }
  /**
   * throtl_schedule_next_dispatch - schedule the next dispatch cycle
   * @sq: the service_queue to schedule dispatch for
   * @force: force scheduling
   *
   * Arm @sq->pending_timer so that the next dispatch cycle starts on the
   * dispatch time of the first pending child.  Returns %true if either timer
   * is armed or there's no pending child left.  %false if the current
   * dispatch window is still open and the caller should continue
   * dispatching.
   *
   * If @force is %true, the dispatch timer is always scheduled and this
   * function is guaranteed to return %true.  This is to be used when the
   * caller can't dispatch itself and needs to invoke pending_timer
   * unconditionally.  Note that forced scheduling is likely to induce short
   * delay before dispatch starts even if @sq->first_pending_disptime is not
   * in the future and thus shouldn't be used in hot paths.
   */
  static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  					  bool force)
  {
  	/* any pending children left? */
  	if (!sq->nr_pending)
  		return true;

  	update_min_dispatch_time(sq);

  	/* is the next dispatch time in the future? */
  	if (force || time_after(sq->first_pending_disptime, jiffies)) {
  		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  		return true;
  	}
  	/* tell the caller to continue dispatching */
  	return false;
  }
  static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  		bool rw, unsigned long start)
  {
  	tg->bytes_disp[rw] = 0;
  	tg->io_disp[rw] = 0;
  
  	/*
  	 * Previous slice has expired. We must have trimmed it after last
  	 * bio dispatch. That means since start of last slice, we never used
  	 * that bandwidth. Do try to make use of that bandwidth while giving
  	 * credit.
  	 */
  	if (time_after_eq(start, tg->slice_start[rw]))
  		tg->slice_start[rw] = start;
  
  	tg->slice_end[rw] = jiffies + throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
  {
  	tg->bytes_disp[rw] = 0;
  	tg->io_disp[rw] = 0;
  	tg->slice_start[rw] = jiffies;
  	tg->slice_end[rw] = jiffies + throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  					unsigned long jiffy_end)
  {
  	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  }
  static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  				       unsigned long jiffy_end)
  {
  	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  	throtl_log(&tg->service_queue,
  		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  
  /* Determine if previously allocated or extended slice is complete or not */
  static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
  {
  	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
  		return false;
  
	return true;
  }
  
  /* Trim the used slices and adjust slice start accordingly */
  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
  {
  	unsigned long nr_slices, time_elapsed, io_trim;
  	u64 bytes_trim, tmp;
  
  	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  
  	/*
	 * If bps is unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if it has already been used
	 * up. A new slice will start when appropriate.
  	 */
  	if (throtl_slice_used(tg, rw))
  		return;
  	/*
  	 * A bio has been dispatched. Also adjust slice_end. It might happen
  	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
  	 * sooner, then we need to reduce slice_end. A high bogus slice_end
  	 * is bad because it does not allow new slice to start.
  	 */
  	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

  	time_elapsed = jiffies - tg->slice_start[rw];
  
  	nr_slices = time_elapsed / throtl_slice;
  
  	if (!nr_slices)
  		return;
  	tmp = tg->bps[rw] * throtl_slice * nr_slices;
  	do_div(tmp, HZ);
  	bytes_trim = tmp;

  	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

  	if (!bytes_trim && !io_trim)
  		return;
  
  	if (tg->bytes_disp[rw] >= bytes_trim)
  		tg->bytes_disp[rw] -= bytes_trim;
  	else
  		tg->bytes_disp[rw] = 0;
  	if (tg->io_disp[rw] >= io_trim)
  		tg->io_disp[rw] -= io_trim;
  	else
  		tg->io_disp[rw] = 0;
  	tg->slice_start[rw] += nr_slices * throtl_slice;
  	throtl_log(&tg->service_queue,
  		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
  }
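
/*
 * Worked example (editor's addition, assuming HZ=1000 so throtl_slice is
 * 100 jiffies): with bps[rw] = 1048576 (1 MiB/s) and 250 jiffies elapsed
 * since slice_start, nr_slices = 2 and bytes_trim = 1048576 * 100 * 2 / 1000
 * = 209715, i.e. the budget of two full slices is subtracted from
 * bytes_disp[rw] and slice_start[rw] advances by 200 jiffies.
 */
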
  static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
  				  unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	unsigned int io_allowed;
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  	u64 tmp;

  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
  		jiffy_elapsed_rnd = throtl_slice;
  
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  	/*
	 * jiffy_elapsed_rnd should not be a big value: since the minimum iops
	 * can be 1, the elapsed jiffies are at most the equivalent of 1 second,
	 * as we allow dispatch after 1 second and by then the slice should
	 * have been trimmed.
  	 */
  
  	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
  	do_div(tmp, HZ);
  
  	if (tmp > UINT_MAX)
  		io_allowed = UINT_MAX;
  	else
  		io_allowed = tmp;
  
  	if (tg->io_disp[rw] + 1 <= io_allowed) {
  		if (wait)
  			*wait = 0;
  		return true;
  	}
  	/* Calc approx time to dispatch */
  	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
  
  	if (jiffy_wait > jiffy_elapsed)
  		jiffy_wait = jiffy_wait - jiffy_elapsed;
  	else
  		jiffy_wait = 1;
  
  	if (wait)
  		*wait = jiffy_wait;
	return false;
  }
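
/*
 * Worked example (editor's addition, assuming HZ=1000): with iops[rw] = 10
 * and jiffy_elapsed_rnd rounded up to 100 jiffies, io_allowed = 10 * 100 /
 * 1000 = 1, so the first bio of the slice passes and the next one is asked
 * to wait roughly ((io_disp[rw] + 1) * HZ) / iops[rw] + 1 - jiffy_elapsed
 * jiffies.
 */
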
  static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
  				 unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	u64 bytes_allowed, extra_bytes, tmp;
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  
  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  
  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
  		jiffy_elapsed_rnd = throtl_slice;
  
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
  	do_div(tmp, HZ);
  	bytes_allowed = tmp;

  	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
  		if (wait)
  			*wait = 0;
  		return true;
  	}
  
  	/* Calc approx time to dispatch */
  	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
  	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
  
  	if (!jiffy_wait)
  		jiffy_wait = 1;
  
  	/*
  	 * This wait time is without taking into consideration the rounding
  	 * up we did. Add that time also.
  	 */
  	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
  	if (wait)
  		*wait = jiffy_wait;
	return false;
  }
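
/*
 * Worked example (editor's addition, assuming HZ=1000): with bps[rw] =
 * 1048576 and jiffy_elapsed_rnd = 100, bytes_allowed = 1048576 * 100 / 1000
 * = 104857 bytes, so a 1 MiB bio arriving at the start of a fresh slice
 * waits about (1048576 - 104857) * 1000 / 1048576 = 900 jiffies plus the
 * rounding slack (jiffy_elapsed_rnd - jiffy_elapsed).
 */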
  
  /*
   * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within the IO rate and can be dispatched
   */
  static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  			    unsigned long *wait)
  {
  	bool rw = bio_data_dir(bio);
  	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  
  	/*
	 * Currently the whole state machine of the group depends on the first
	 * bio queued in the group's bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
  	 */
  	BUG_ON(tg->service_queue.nr_queued[rw] &&
  	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

  	/* If tg->bps = -1, then BW is unlimited */
  	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
  		if (wait)
  			*wait = 0;
  		return true;
  	}
  
  	/*
  	 * If previous slice expired, start a new one otherwise renew/extend
  	 * existing slice to make sure it is at least throtl_slice interval
  	 * long since now.
  	 */
  	if (throtl_slice_used(tg, rw))
  		throtl_start_new_slice(tg, rw);
  	else {
  		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
  			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
  	}
  	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
  	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
  		if (wait)
  			*wait = 0;
		return true;
  	}
  
  	max_wait = max(bps_wait, iops_wait);
  
  	if (wait)
  		*wait = max_wait;
  
  	if (time_before(tg->slice_end[rw], jiffies + max_wait))
  		throtl_extend_slice(tg, rw, jiffies + max_wait);
  
	return false;
  }
  static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
  					 int rw)
  {
  	struct throtl_grp *tg = blkg_to_tg(blkg);
  	struct tg_stats_cpu *stats_cpu;
  	unsigned long flags;
  
  	/* If per cpu stats are not allocated yet, don't do any accounting. */
  	if (tg->stats_cpu == NULL)
  		return;
  
  	/*
  	 * Disabling interrupts to provide mutual exclusion between two
  	 * writes on same cpu. It probably is not needed for 64bit. Not
  	 * optimizing that case yet.
  	 */
  	local_irq_save(flags);
  	stats_cpu = this_cpu_ptr(tg->stats_cpu);

  	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
  	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
  
  	local_irq_restore(flags);
  }
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
  	bool rw = bio_data_dir(bio);
  
  	/* Charge the bio to the group */
  	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
  	tg->io_disp[rw]++;

  	/*
	 * REQ_THROTTLED is used to prevent the same bio from being throttled
  	 * more than once as a throttled bio will go through blk-throtl the
  	 * second time when it eventually gets issued.  Set it when a bio
  	 * is being charged to a tg.
  	 *
  	 * Dispatch stats aren't recursive and each @bio should only be
  	 * accounted by the @tg it was originally associated with.  Let's
  	 * update the stats when setting REQ_THROTTLED for the first time
  	 * which is guaranteed to be for the @bio's original tg.
  	 */
  	if (!(bio->bi_rw & REQ_THROTTLED)) {
  		bio->bi_rw |= REQ_THROTTLED;
  		throtl_update_dispatch_stats(tg_to_blkg(tg),
  					     bio->bi_iter.bi_size, bio->bi_rw);
  	}
  }
  /**
   * throtl_add_bio_tg - add a bio to the specified throtl_grp
   * @bio: bio to add
   * @qn: qnode to use
   * @tg: the target throtl_grp
   *
   * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
   * tg->qnode_on_self[] is used.
   */
  static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  			      struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	bool rw = bio_data_dir(bio);
  	if (!qn)
  		qn = &tg->qnode_on_self[rw];
  	/*
  	 * If @tg doesn't currently have any bios queued in the same
  	 * direction, queueing @bio can change when @tg should be
  	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
  	 */
  	if (!sq->nr_queued[rw])
  		tg->flags |= THROTL_TG_WAS_EMPTY;
  	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
  	sq->nr_queued[rw]++;
  	throtl_enqueue_tg(tg);
  }
  static void tg_update_disptime(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  	struct bio *bio;
  	if ((bio = throtl_peek_queued(&sq->queued[READ])))
  		tg_may_dispatch(tg, bio, &read_wait);

  	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
  		tg_may_dispatch(tg, bio, &write_wait);
  
  	min_wait = min(read_wait, write_wait);
  	disptime = jiffies + min_wait;
  	/* Update dispatch time */
  	throtl_dequeue_tg(tg);
  	tg->disptime = disptime;
  	throtl_enqueue_tg(tg);
  
  	/* see throtl_add_bio_tg() */
  	tg->flags &= ~THROTL_TG_WAS_EMPTY;
  }
  static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  					struct throtl_grp *parent_tg, bool rw)
  {
  	if (throtl_slice_used(parent_tg, rw)) {
  		throtl_start_new_slice_with_credit(parent_tg, rw,
  				child_tg->slice_start[rw]);
  	}
  
  }
  static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct throtl_service_queue *parent_sq = sq->parent_sq;
  	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  	struct throtl_grp *tg_to_put = NULL;
  	struct bio *bio;
  	/*
  	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
  	 * from @tg may put its reference and @parent_sq might end up
  	 * getting released prematurely.  Remember the tg to put and put it
  	 * after @bio is transferred to @parent_sq.
  	 */
  	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
  	sq->nr_queued[rw]--;
  
  	throtl_charge_bio(tg, bio);
  
  	/*
  	 * If our parent is another tg, we just need to transfer @bio to
  	 * the parent using throtl_add_bio_tg().  If our parent is
  	 * @td->service_queue, @bio is ready to be issued.  Put it on its
  	 * bio_lists[] and decrease total number queued.  The caller is
  	 * responsible for issuing these bios.
  	 */
  	if (parent_tg) {
  		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
  		start_parent_slice_with_credit(tg, parent_tg, rw);
  	} else {
  		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
  				     &parent_sq->queued[rw]);
  		BUG_ON(tg->td->nr_queued[rw] <= 0);
  		tg->td->nr_queued[rw]--;
  	}

  	throtl_trim_slice(tg, rw);

  	if (tg_to_put)
  		blkg_put(tg_to_blkg(tg_to_put));
  }
  static int throtl_dispatch_tg(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	unsigned int nr_reads = 0, nr_writes = 0;
  	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
  	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
  	struct bio *bio;
  
  	/* Try to dispatch 75% READS and 25% WRITES */
  	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
  	       tg_may_dispatch(tg, bio, NULL)) {

  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
  		nr_reads++;
  
  		if (nr_reads >= max_nr_reads)
  			break;
  	}
  	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
  	       tg_may_dispatch(tg, bio, NULL)) {

  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
  		nr_writes++;
  
  		if (nr_writes >= max_nr_writes)
  			break;
  	}
  
  	return nr_reads + nr_writes;
  }
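
/*
 * Editor's note: with the default throtl_grp_quantum of 8, max_nr_reads
 * works out to 6 and max_nr_writes to 2, so each group dispatches at most
 * 8 bios per round, split roughly 75%/25% between reads and writes, while
 * throtl_select_dispatch() below stops once throtl_quantum (32) bios have
 * been moved in total.
 */
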
  static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
  {
  	unsigned int nr_disp = 0;
  
  	while (1) {
  		struct throtl_grp *tg = throtl_rb_first(parent_sq);
  		struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1076
1077
1078
1079
1080
1081
  
  		if (!tg)
  			break;
  
  		if (time_before(jiffies, tg->disptime))
  			break;
77216b048   Tejun Heo   blk-throttle: add...
1082
  		throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1083

77216b048   Tejun Heo   blk-throttle: add...
1084
  		nr_disp += throtl_dispatch_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1085

73f0d49a9   Tejun Heo   blk-throttle: mov...
1086
  		if (sq->nr_queued[0] || sq->nr_queued[1])
77216b048   Tejun Heo   blk-throttle: add...
1087
  			tg_update_disptime(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1088
1089
1090
1091
1092
1093
1094
  
  		if (nr_disp >= throtl_quantum)
  			break;
  	}
  
  	return nr_disp;
  }
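  /*
   * throtl_select_dispatch() caps one round at throtl_quantum (32 by
   * default) bios across all groups; anything left over is handled when
   * the caller loops again or when the next pending_timer fires.
   */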
6e1a5704c   Tejun Heo   blk-throttle: dis...
1095
1096
1097
1098
1099
1100
1101
  /**
   * throtl_pending_timer_fn - timer function for service_queue->pending_timer
   * @arg: the throtl_service_queue being serviced
   *
   * This timer is armed when a child throtl_grp with active bio's becomes
   * pending and queued on the service_queue's pending_tree and expires when
   * the first child throtl_grp should be dispatched.  This function
2e48a530a   Tejun Heo   blk-throttle: mak...
1102
1103
1104
1105
1106
1107
1108
   * dispatches bio's from the children throtl_grps to the parent
   * service_queue.
   *
   * If the parent's parent is another throtl_grp, dispatching is propagated
   * by either arming its pending_timer or repeating dispatch directly.  If
   * the top-level service_queue is reached, throtl_data->dispatch_work is
   * kicked so that the ready bio's are issued.
6e1a5704c   Tejun Heo   blk-throttle: dis...
1109
   */
69df0ab03   Tejun Heo   blk-throttle: sep...
1110
1111
1112
  static void throtl_pending_timer_fn(unsigned long arg)
  {
  	struct throtl_service_queue *sq = (void *)arg;
2e48a530a   Tejun Heo   blk-throttle: mak...
1113
  	struct throtl_grp *tg = sq_to_tg(sq);
69df0ab03   Tejun Heo   blk-throttle: sep...
1114
  	struct throtl_data *td = sq_to_td(sq);
cb76199c3   Tejun Heo   blk-throttle: col...
1115
  	struct request_queue *q = td->queue;
2e48a530a   Tejun Heo   blk-throttle: mak...
1116
1117
  	struct throtl_service_queue *parent_sq;
  	bool dispatched;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1118
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
1119
1120
  
  	spin_lock_irq(q->queue_lock);
2e48a530a   Tejun Heo   blk-throttle: mak...
1121
1122
1123
  again:
  	parent_sq = sq->parent_sq;
  	dispatched = false;
e43473b7f   Vivek Goyal   blkio: Core imple...
1124

7f52f98c2   Tejun Heo   blk-throttle: imp...
1125
1126
  	while (true) {
  		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
2e48a530a   Tejun Heo   blk-throttle: mak...
1127
1128
  			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
  			   sq->nr_queued[READ], sq->nr_queued[WRITE]);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1129
1130
1131
  
  		ret = throtl_select_dispatch(sq);
  		if (ret) {
7f52f98c2   Tejun Heo   blk-throttle: imp...
1132
1133
1134
  			throtl_log(sq, "bios disp=%u", ret);
  			dispatched = true;
  		}
e43473b7f   Vivek Goyal   blkio: Core imple...
1135

7f52f98c2   Tejun Heo   blk-throttle: imp...
1136
1137
  		if (throtl_schedule_next_dispatch(sq, false))
  			break;
e43473b7f   Vivek Goyal   blkio: Core imple...
1138

7f52f98c2   Tejun Heo   blk-throttle: imp...
1139
1140
1141
1142
  		/* this dispatch window is still open, relax and repeat */
  		spin_unlock_irq(q->queue_lock);
  		cpu_relax();
  		spin_lock_irq(q->queue_lock);
651930bc1   Tejun Heo   blk-throttle: dis...
1143
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1144

2e48a530a   Tejun Heo   blk-throttle: mak...
1145
1146
  	if (!dispatched)
  		goto out_unlock;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1147

2e48a530a   Tejun Heo   blk-throttle: mak...
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
  	if (parent_sq) {
  		/* @parent_sq is another throtl_grp, propagate dispatch */
  		if (tg->flags & THROTL_TG_WAS_EMPTY) {
  			tg_update_disptime(tg);
  			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
  				/* window is already open, repeat dispatching */
  				sq = parent_sq;
  				tg = sq_to_tg(sq);
  				goto again;
  			}
  		}
  	} else {
  		/* reached the top-level, queue issuing */
  		queue_work(kthrotld_workqueue, &td->dispatch_work);
  	}
  out_unlock:
e43473b7f   Vivek Goyal   blkio: Core imple...
1164
  	spin_unlock_irq(q->queue_lock);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1165
  }
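  /*
   * Example flow for a two-level hierarchy: this function, running for a
   * child tg's pending_timer, moves the child's bios onto the parent's
   * service_queue.  If the parent's dispatch window is already open, the
   * "goto again" path repeats the loop for the parent directly; otherwise
   * the parent's own pending_timer is armed.  Once bios reach the
   * top-level td->service_queue, dispatch_work is queued to issue them.
   */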
e43473b7f   Vivek Goyal   blkio: Core imple...
1166

6e1a5704c   Tejun Heo   blk-throttle: dis...
1167
1168
1169
1170
1171
1172
1173
1174
  /**
   * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
   * @work: work item being executed
   *
   * This function is queued for execution when bio's reach the queued[]
   * lists of throtl_data->service_queue.  Those bio's are ready and are
   * issued by this function.
   */
8876e140e   Fabian Frederick   block/blk-throttl...
1175
  static void blk_throtl_dispatch_work_fn(struct work_struct *work)
6e1a5704c   Tejun Heo   blk-throttle: dis...
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
  {
  	struct throtl_data *td = container_of(work, struct throtl_data,
  					      dispatch_work);
  	struct throtl_service_queue *td_sq = &td->service_queue;
  	struct request_queue *q = td->queue;
  	struct bio_list bio_list_on_stack;
  	struct bio *bio;
  	struct blk_plug plug;
  	int rw;
  
  	bio_list_init(&bio_list_on_stack);
  
  	spin_lock_irq(q->queue_lock);
c5cc2070b   Tejun Heo   blk-throttle: add...
1189
1190
1191
  	for (rw = READ; rw <= WRITE; rw++)
  		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
  			bio_list_add(&bio_list_on_stack, bio);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1192
1193
1194
  	spin_unlock_irq(q->queue_lock);
  
  	if (!bio_list_empty(&bio_list_on_stack)) {
69d60eb96   Vivek Goyal   blk-throttle: Use...
1195
  		blk_start_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1196
1197
  		while((bio = bio_list_pop(&bio_list_on_stack)))
  			generic_make_request(bio);
69d60eb96   Vivek Goyal   blk-throttle: Use...
1198
  		blk_finish_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1199
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1200
  }
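  /*
   * The blk_plug around the generic_make_request() loop above lets the
   * block layer batch the previously throttled bios when they are finally
   * issued, instead of submitting them one at a time.
   */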
f95a04afa   Tejun Heo   blkcg: embed stru...
1201
1202
  static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
  				struct blkg_policy_data *pd, int off)
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1203
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1204
  	struct throtl_grp *tg = pd_to_tg(pd);
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1205
1206
  	struct blkg_rwstat rwstat = { }, tmp;
  	int i, cpu;
045c47ca3   Thadeu Lima de Souza Cascardo   blk-throttle: che...
1207
1208
  	if (tg->stats_cpu == NULL)
  		return 0;
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1209
  	for_each_possible_cpu(cpu) {
8a3d26151   Tejun Heo   blkcg: move blkio...
1210
  		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1211
1212
1213
1214
1215
  
  		tmp = blkg_rwstat_read((void *)sc + off);
  		for (i = 0; i < BLKG_RWSTAT_NR; i++)
  			rwstat.cnt[i] += tmp.cnt[i];
  	}
f95a04afa   Tejun Heo   blkcg: embed stru...
1216
  	return __blkg_prfill_rwstat(sf, pd, &rwstat);
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1217
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1218
  static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1219
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1220
1221
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
41b38b6d5   Tejun Heo   blkcg: cfq doesn'...
1222
1223
  	return 0;
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1224
1225
  static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  			      int off)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1226
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1227
1228
  	struct throtl_grp *tg = pd_to_tg(pd);
  	u64 v = *(u64 *)((void *)tg + off);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1229

af133ceb2   Tejun Heo   blkcg: move blkio...
1230
  	if (v == -1)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1231
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1232
  	return __blkg_prfill_u64(sf, pd, v);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1233
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1234
1235
  static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
  			       int off)
e43473b7f   Vivek Goyal   blkio: Core imple...
1236
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1237
1238
  	struct throtl_grp *tg = pd_to_tg(pd);
  	unsigned int v = *(unsigned int *)((void *)tg + off);
fe0714377   Vivek Goyal   blkio: Recalculat...
1239

af133ceb2   Tejun Heo   blkcg: move blkio...
1240
1241
  	if (v == -1)
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1242
  	return __blkg_prfill_u64(sf, pd, v);
e43473b7f   Vivek Goyal   blkio: Core imple...
1243
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1244
  static int tg_print_conf_u64(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1245
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1246
1247
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1248
  	return 0;
8e89d13f4   Vivek Goyal   blkio: Implementa...
1249
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1250
  static int tg_print_conf_uint(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1251
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1252
1253
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1254
  	return 0;
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1255
  }
451af504d   Tejun Heo   cgroup: replace c...
1256
1257
  static ssize_t tg_set_conf(struct kernfs_open_file *of,
  			   char *buf, size_t nbytes, loff_t off, bool is_u64)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1258
  {
451af504d   Tejun Heo   cgroup: replace c...
1259
  	struct blkcg *blkcg = css_to_blkcg(of_css(of));
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1260
  	struct blkg_conf_ctx ctx;
af133ceb2   Tejun Heo   blkcg: move blkio...
1261
  	struct throtl_grp *tg;
69df0ab03   Tejun Heo   blk-throttle: sep...
1262
  	struct throtl_service_queue *sq;
693e751e7   Tejun Heo   blk-throttle: imp...
1263
  	struct blkcg_gq *blkg;
492eb21b9   Tejun Heo   cgroup: make hier...
1264
  	struct cgroup_subsys_state *pos_css;
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1265
  	int ret;
3c798398e   Tejun Heo   blkcg: mass renam...
1266
  	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1267
1268
  	if (ret)
  		return ret;
af133ceb2   Tejun Heo   blkcg: move blkio...
1269
  	tg = blkg_to_tg(ctx.blkg);
69df0ab03   Tejun Heo   blk-throttle: sep...
1270
  	sq = &tg->service_queue;
af133ceb2   Tejun Heo   blkcg: move blkio...
1271

a2b1693ba   Tejun Heo   blkcg: implement ...
1272
1273
  	if (!ctx.v)
  		ctx.v = -1;
af133ceb2   Tejun Heo   blkcg: move blkio...
1274

a2b1693ba   Tejun Heo   blkcg: implement ...
1275
  	if (is_u64)
451af504d   Tejun Heo   cgroup: replace c...
1276
  		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
a2b1693ba   Tejun Heo   blkcg: implement ...
1277
  	else
451af504d   Tejun Heo   cgroup: replace c...
1278
  		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
af133ceb2   Tejun Heo   blkcg: move blkio...
1279

fda6f272c   Tejun Heo   blk-throttle: imp...
1280
1281
1282
1283
  	throtl_log(&tg->service_queue,
  		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
  		   tg->bps[READ], tg->bps[WRITE],
  		   tg->iops[READ], tg->iops[WRITE]);
632b44935   Tejun Heo   blk-throttle: rem...
1284
1285
  
  	/*
693e751e7   Tejun Heo   blk-throttle: imp...
1286
1287
1288
1289
1290
1291
  	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
  	 * considered to have rules if either the tg itself or any of its
  	 * ancestors has rules.  This identifies groups without any
  	 * restrictions in the whole hierarchy and allows them to bypass
  	 * blk-throttle.
  	 */
492eb21b9   Tejun Heo   cgroup: make hier...
1292
  	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
693e751e7   Tejun Heo   blk-throttle: imp...
1293
1294
1295
  		tg_update_has_rules(blkg_to_tg(blkg));
  
  	/*
632b44935   Tejun Heo   blk-throttle: rem...
1296
1297
1298
1299
1300
1301
1302
  	 * We're already holding queue_lock and know @tg is valid.  Let's
  	 * apply the new config directly.
  	 *
  	 * Restart the slices for both READ and WRITE. It might happen
  	 * that a group's limits are dropped suddenly and we don't want to
  	 * account recently dispatched IO at the new low rate.
  	 */
0f3457f60   Tejun Heo   blk-throttle: add...
1303
1304
  	throtl_start_new_slice(tg, 0);
  	throtl_start_new_slice(tg, 1);
632b44935   Tejun Heo   blk-throttle: rem...
1305

5b2c16aae   Tejun Heo   blk-throttle: sim...
1306
  	if (tg->flags & THROTL_TG_PENDING) {
77216b048   Tejun Heo   blk-throttle: add...
1307
  		tg_update_disptime(tg);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1308
  		throtl_schedule_next_dispatch(sq->parent_sq, true);
632b44935   Tejun Heo   blk-throttle: rem...
1309
  	}
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1310
1311
  
  	blkg_conf_finish(&ctx);
451af504d   Tejun Heo   cgroup: replace c...
1312
  	return nbytes;
8e89d13f4   Vivek Goyal   blkio: Implementa...
1313
  }
451af504d   Tejun Heo   cgroup: replace c...
1314
1315
  static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
  			       char *buf, size_t nbytes, loff_t off)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1316
  {
451af504d   Tejun Heo   cgroup: replace c...
1317
  	return tg_set_conf(of, buf, nbytes, off, true);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1318
  }
451af504d   Tejun Heo   cgroup: replace c...
1319
1320
  static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1321
  {
451af504d   Tejun Heo   cgroup: replace c...
1322
  	return tg_set_conf(of, buf, nbytes, off, false);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1323
1324
1325
1326
1327
  }
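  /*
   * bps limits live in struct throtl_grp as u64 and iops limits as
   * unsigned int, hence the two thin wrappers above selecting the matching
   * is_u64 mode of tg_set_conf().
   */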
  
  static struct cftype throtl_files[] = {
  	{
  		.name = "throttle.read_bps_device",
af133ceb2   Tejun Heo   blkcg: move blkio...
1328
  		.private = offsetof(struct throtl_grp, bps[READ]),
2da8ca822   Tejun Heo   cgroup: replace c...
1329
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1330
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1331
1332
1333
  	},
  	{
  		.name = "throttle.write_bps_device",
af133ceb2   Tejun Heo   blkcg: move blkio...
1334
  		.private = offsetof(struct throtl_grp, bps[WRITE]),
2da8ca822   Tejun Heo   cgroup: replace c...
1335
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1336
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1337
1338
1339
  	},
  	{
  		.name = "throttle.read_iops_device",
af133ceb2   Tejun Heo   blkcg: move blkio...
1340
  		.private = offsetof(struct throtl_grp, iops[READ]),
2da8ca822   Tejun Heo   cgroup: replace c...
1341
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1342
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1343
1344
1345
  	},
  	{
  		.name = "throttle.write_iops_device",
af133ceb2   Tejun Heo   blkcg: move blkio...
1346
  		.private = offsetof(struct throtl_grp, iops[WRITE]),
2da8ca822   Tejun Heo   cgroup: replace c...
1347
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1348
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1349
1350
1351
  	},
  	{
  		.name = "throttle.io_service_bytes",
5bc4afb1e   Tejun Heo   blkcg: drop BLKCG...
1352
  		.private = offsetof(struct tg_stats_cpu, service_bytes),
2da8ca822   Tejun Heo   cgroup: replace c...
1353
  		.seq_show = tg_print_cpu_rwstat,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1354
1355
1356
  	},
  	{
  		.name = "throttle.io_serviced",
5bc4afb1e   Tejun Heo   blkcg: drop BLKCG...
1357
  		.private = offsetof(struct tg_stats_cpu, serviced),
2da8ca822   Tejun Heo   cgroup: replace c...
1358
  		.seq_show = tg_print_cpu_rwstat,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1359
1360
1361
  	},
  	{ }	/* terminate */
  };
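  /*
   * These files are exposed through the blkio cgroup hierarchy (as
   * blkio.throttle.read_bps_device and friends).  A limit is set by
   * writing "<major>:<minor> <value>" for a device, e.g. (illustrative):
   *
   *	echo "8:16 1048576" > blkio.throttle.read_bps_device
   *
   * would cap reads on device 8:16 at 1 MB/s.  The offsetof() .private
   * values let tg_set_conf() and the prfill helpers address the matching
   * bps[]/iops[] member of struct throtl_grp directly.
   */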
da5277700   Vivek Goyal   block: Move blk_t...
1362
  static void throtl_shutdown_wq(struct request_queue *q)
e43473b7f   Vivek Goyal   blkio: Core imple...
1363
1364
  {
  	struct throtl_data *td = q->td;
69df0ab03   Tejun Heo   blk-throttle: sep...
1365
  	cancel_work_sync(&td->dispatch_work);
e43473b7f   Vivek Goyal   blkio: Core imple...
1366
  }
3c798398e   Tejun Heo   blkcg: mass renam...
1367
  static struct blkcg_policy blkcg_policy_throtl = {
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1368
1369
1370
1371
  	.pd_size		= sizeof(struct throtl_grp),
  	.cftypes		= throtl_files,
  
  	.pd_init_fn		= throtl_pd_init,
693e751e7   Tejun Heo   blk-throttle: imp...
1372
  	.pd_online_fn		= throtl_pd_online,
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1373
1374
  	.pd_exit_fn		= throtl_pd_exit,
  	.pd_reset_stats_fn	= throtl_pd_reset_stats,
e43473b7f   Vivek Goyal   blkio: Core imple...
1375
  };
bc16a4f93   Tejun Heo   block: reorganize...
1376
  bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
e43473b7f   Vivek Goyal   blkio: Core imple...
1377
1378
  {
  	struct throtl_data *td = q->td;
c5cc2070b   Tejun Heo   blk-throttle: add...
1379
  	struct throtl_qnode *qn = NULL;
e43473b7f   Vivek Goyal   blkio: Core imple...
1380
  	struct throtl_grp *tg;
73f0d49a9   Tejun Heo   blk-throttle: mov...
1381
  	struct throtl_service_queue *sq;
0e9f4164b   Tejun Heo   blk-throttle: gen...
1382
  	bool rw = bio_data_dir(bio);
3c798398e   Tejun Heo   blkcg: mass renam...
1383
  	struct blkcg *blkcg;
bc16a4f93   Tejun Heo   block: reorganize...
1384
  	bool throttled = false;
e43473b7f   Vivek Goyal   blkio: Core imple...
1385

2a0f61e6e   Tejun Heo   blk-throttle: set...
1386
1387
  	/* see throtl_charge_bio() */
  	if (bio->bi_rw & REQ_THROTTLED)
bc16a4f93   Tejun Heo   block: reorganize...
1388
  		goto out;
e43473b7f   Vivek Goyal   blkio: Core imple...
1389

af75cd3c6   Vivek Goyal   blk-throttle: Mak...
1390
1391
1392
1393
1394
  	/*
  	 * A throtl_grp pointer retrieved under rcu can be used to access
  	 * basic fields like stats and io rates. If a group has no rules,
  	 * just update the dispatch stats in a lockless manner and return.
  	 */
af75cd3c6   Vivek Goyal   blk-throttle: Mak...
1395
  	rcu_read_lock();
3c798398e   Tejun Heo   blkcg: mass renam...
1396
  	blkcg = bio_blkcg(bio);
cd1604fab   Tejun Heo   blkcg: factor out...
1397
  	tg = throtl_lookup_tg(td, blkcg);
af75cd3c6   Vivek Goyal   blk-throttle: Mak...
1398
  	if (tg) {
693e751e7   Tejun Heo   blk-throttle: imp...
1399
  		if (!tg->has_rules[rw]) {
629ed0b10   Tejun Heo   blkcg: move stati...
1400
  			throtl_update_dispatch_stats(tg_to_blkg(tg),
4f024f379   Kent Overstreet   block: Abstract o...
1401
  					bio->bi_iter.bi_size, bio->bi_rw);
2a7f12441   Tejun Heo   blkcg: move rcu_r...
1402
  			goto out_unlock_rcu;
af75cd3c6   Vivek Goyal   blk-throttle: Mak...
1403
1404
  		}
  	}
af75cd3c6   Vivek Goyal   blk-throttle: Mak...
1405
1406
1407
1408
1409
  
  	/*
  	 * Either the group has not been allocated yet or it is not an
  	 * unlimited IO group
  	 */
e43473b7f   Vivek Goyal   blkio: Core imple...
1410
  	spin_lock_irq(q->queue_lock);
cd1604fab   Tejun Heo   blkcg: factor out...
1411
  	tg = throtl_lookup_create_tg(td, blkcg);
bc16a4f93   Tejun Heo   block: reorganize...
1412
1413
  	if (unlikely(!tg))
  		goto out_unlock;
f469a7b4d   Vivek Goyal   blk-cgroup: Allow...
1414

73f0d49a9   Tejun Heo   blk-throttle: mov...
1415
  	sq = &tg->service_queue;
9e660acff   Tejun Heo   blk-throttle: mak...
1416
1417
1418
1419
  	while (true) {
  		/* throtl is FIFO - if bios are already queued, should queue */
  		if (sq->nr_queued[rw])
  			break;
de701c74a   Vivek Goyal   blk-throttle: Som...
1420

9e660acff   Tejun Heo   blk-throttle: mak...
1421
1422
1423
1424
1425
  		/* if above limits, break to queue */
  		if (!tg_may_dispatch(tg, bio, NULL))
  			break;
  
  		/* within limits, let's charge and dispatch directly */
e43473b7f   Vivek Goyal   blkio: Core imple...
1426
  		throtl_charge_bio(tg, bio);
04521db04   Vivek Goyal   blk-throttle: Res...
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
  
  		/*
  		 * We need to trim the slice even when bios are not being
  		 * queued, otherwise it might happen that a bio is not queued
  		 * for a long time and the slice keeps on extending and trim
  		 * is not called for a long time. Now if limits are reduced
  		 * suddenly we take into account all the IO dispatched so far
  		 * at the new low rate and newly queued IO gets a really long
  		 * dispatch time.
  		 *
  		 * So keep on trimming slice even if bio is not queued.
  		 */
0f3457f60   Tejun Heo   blk-throttle: add...
1439
  		throtl_trim_slice(tg, rw);
9e660acff   Tejun Heo   blk-throttle: mak...
1440
1441
1442
1443
1444
1445
  
  		/*
  		 * @bio passed through this layer without being throttled.
  		 * Climb up the ladder.  If we're already at the top, it
  		 * can be executed directly.
  		 */
c5cc2070b   Tejun Heo   blk-throttle: add...
1446
  		qn = &tg->qnode_on_parent[rw];
9e660acff   Tejun Heo   blk-throttle: mak...
1447
1448
1449
1450
  		sq = sq->parent_sq;
  		tg = sq_to_tg(sq);
  		if (!tg)
  			goto out_unlock;
e43473b7f   Vivek Goyal   blkio: Core imple...
1451
  	}
9e660acff   Tejun Heo   blk-throttle: mak...
1452
  	/* out-of-limit, queue to @tg */
fda6f272c   Tejun Heo   blk-throttle: imp...
1453
1454
  	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
  		   rw == READ ? 'R' : 'W',
4f024f379   Kent Overstreet   block: Abstract o...
1455
  		   tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
fda6f272c   Tejun Heo   blk-throttle: imp...
1456
1457
  		   tg->io_disp[rw], tg->iops[rw],
  		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
e43473b7f   Vivek Goyal   blkio: Core imple...
1458

671058fb2   Tejun Heo   block: make blk-t...
1459
  	bio_associate_current(bio);
6bc9c2b46   Tejun Heo   blk-throttle: mak...
1460
  	tg->td->nr_queued[rw]++;
c5cc2070b   Tejun Heo   blk-throttle: add...
1461
  	throtl_add_bio_tg(bio, qn, tg);
bc16a4f93   Tejun Heo   block: reorganize...
1462
  	throttled = true;
e43473b7f   Vivek Goyal   blkio: Core imple...
1463

7f52f98c2   Tejun Heo   blk-throttle: imp...
1464
1465
1466
1467
1468
1469
  	/*
  	 * Update @tg's dispatch time and force schedule dispatch if @tg
  	 * was empty before @bio.  The forced scheduling isn't likely to
  	 * cause undue delay as @bio is likely to be dispatched directly if
  	 * its @tg's disptime is not in the future.
  	 */
0e9f4164b   Tejun Heo   blk-throttle: gen...
1470
  	if (tg->flags & THROTL_TG_WAS_EMPTY) {
77216b048   Tejun Heo   blk-throttle: add...
1471
  		tg_update_disptime(tg);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1472
  		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
e43473b7f   Vivek Goyal   blkio: Core imple...
1473
  	}
bc16a4f93   Tejun Heo   block: reorganize...
1474
  out_unlock:
e43473b7f   Vivek Goyal   blkio: Core imple...
1475
  	spin_unlock_irq(q->queue_lock);
2a7f12441   Tejun Heo   blkcg: move rcu_r...
1476
1477
  out_unlock_rcu:
  	rcu_read_unlock();
bc16a4f93   Tejun Heo   block: reorganize...
1478
  out:
2a0f61e6e   Tejun Heo   blk-throttle: set...
1479
1480
1481
1482
1483
1484
1485
  	/*
  	 * As multiple blk-throtls may stack in the same issue path, we
  	 * don't want bios to leave with the flag set.  Clear the flag if
  	 * the bio is being issued.
  	 */
  	if (!throttled)
  		bio->bi_rw &= ~REQ_THROTTLED;
bc16a4f93   Tejun Heo   block: reorganize...
1486
  	return throttled;
e43473b7f   Vivek Goyal   blkio: Core imple...
1487
  }
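  /*
   * A true return tells the submission path in blk-core.c that the bio has
   * been queued by blk-throttle and must not be issued now; it will be
   * resubmitted later by blk_throtl_dispatch_work_fn() or blk_throtl_drain().
   */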
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
  /*
   * Dispatch all bios from all children tg's queued on @parent_sq.  On
   * return, @parent_sq is guaranteed to not have any active children tg's
   * and all bios from previously active tg's are on @parent_sq->queued[].
   */
  static void tg_drain_bios(struct throtl_service_queue *parent_sq)
  {
  	struct throtl_grp *tg;
  
  	while ((tg = throtl_rb_first(parent_sq))) {
  		struct throtl_service_queue *sq = &tg->service_queue;
  		struct bio *bio;
  
  		throtl_dequeue_tg(tg);
c5cc2070b   Tejun Heo   blk-throttle: add...
1502
  		while ((bio = throtl_peek_queued(&sq->queued[READ])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1503
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
c5cc2070b   Tejun Heo   blk-throttle: add...
1504
  		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1505
1506
1507
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
  	}
  }
c9a929dde   Tejun Heo   block: fix reques...
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
  /**
   * blk_throtl_drain - drain throttled bios
   * @q: request_queue to drain throttled bios for
   *
   * Dispatch all currently throttled bios on @q through ->make_request_fn().
   */
  void blk_throtl_drain(struct request_queue *q)
  	__releases(q->queue_lock) __acquires(q->queue_lock)
  {
  	struct throtl_data *td = q->td;
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1518
  	struct blkcg_gq *blkg;
492eb21b9   Tejun Heo   cgroup: make hier...
1519
  	struct cgroup_subsys_state *pos_css;
c9a929dde   Tejun Heo   block: fix reques...
1520
  	struct bio *bio;
651930bc1   Tejun Heo   blk-throttle: dis...
1521
  	int rw;
c9a929dde   Tejun Heo   block: fix reques...
1522

8bcb6c7d4   Andi Kleen   block: use lockde...
1523
  	queue_lockdep_assert_held(q);
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1524
  	rcu_read_lock();
c9a929dde   Tejun Heo   block: fix reques...
1525

2a12f0dcd   Tejun Heo   blk-throttle: mak...
1526
1527
1528
1529
1530
1531
  	/*
  	 * Drain each tg while doing a post-order walk on the blkg tree, so
  	 * that all bios are propagated to td->service_queue.  It'd be
  	 * better to walk the service_queue tree directly but the blkg walk
  	 * is easier.
  	 */
492eb21b9   Tejun Heo   cgroup: make hier...
1532
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1533
  		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
73f0d49a9   Tejun Heo   blk-throttle: mov...
1534

2a12f0dcd   Tejun Heo   blk-throttle: mak...
1535
1536
1537
1538
  	/* finally, transfer bios from top-level tg's into the td */
  	tg_drain_bios(&td->service_queue);
  
  	rcu_read_unlock();
c9a929dde   Tejun Heo   block: fix reques...
1539
  	spin_unlock_irq(q->queue_lock);
2a12f0dcd   Tejun Heo   blk-throttle: mak...
1540
  	/* all bios now should be in td->service_queue, issue them */
651930bc1   Tejun Heo   blk-throttle: dis...
1541
  	for (rw = READ; rw <= WRITE; rw++)
c5cc2070b   Tejun Heo   blk-throttle: add...
1542
1543
  		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
  						NULL)))
651930bc1   Tejun Heo   blk-throttle: dis...
1544
  			generic_make_request(bio);
c9a929dde   Tejun Heo   block: fix reques...
1545
1546
1547
  
  	spin_lock_irq(q->queue_lock);
  }
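  /*
   * blk_throtl_drain() is used on the queue drain/teardown path in
   * blk-core.c so that throttled bios do not linger on a queue that is
   * going away.
   */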
e43473b7f   Vivek Goyal   blkio: Core imple...
1548
1549
1550
  int blk_throtl_init(struct request_queue *q)
  {
  	struct throtl_data *td;
a2b1693ba   Tejun Heo   blkcg: implement ...
1551
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
1552
1553
1554
1555
  
  	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
  	if (!td)
  		return -ENOMEM;
69df0ab03   Tejun Heo   blk-throttle: sep...
1556
  	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
77216b048   Tejun Heo   blk-throttle: add...
1557
  	throtl_service_queue_init(&td->service_queue, NULL);
e43473b7f   Vivek Goyal   blkio: Core imple...
1558

cd1604fab   Tejun Heo   blkcg: factor out...
1559
  	q->td = td;
29b125892   Vivek Goyal   blk-throttle: Dyn...
1560
  	td->queue = q;
02977e4af   Vivek Goyal   blkio: Add root g...
1561

a2b1693ba   Tejun Heo   blkcg: implement ...
1562
  	/* activate policy */
3c798398e   Tejun Heo   blkcg: mass renam...
1563
  	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
a2b1693ba   Tejun Heo   blkcg: implement ...
1564
  	if (ret)
f51b802c1   Tejun Heo   blkcg: use the us...
1565
  		kfree(td);
a2b1693ba   Tejun Heo   blkcg: implement ...
1566
  	return ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
1567
1568
1569
1570
  }
  
  void blk_throtl_exit(struct request_queue *q)
  {
c875f4d02   Tejun Heo   blkcg: drop unnec...
1571
  	BUG_ON(!q->td);
da5277700   Vivek Goyal   block: Move blk_t...
1572
  	throtl_shutdown_wq(q);
3c798398e   Tejun Heo   blkcg: mass renam...
1573
  	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
c9a929dde   Tejun Heo   block: fix reques...
1574
  	kfree(q->td);
e43473b7f   Vivek Goyal   blkio: Core imple...
1575
1576
1577
1578
  }
  
  static int __init throtl_init(void)
  {
450adcbe5   Vivek Goyal   blk-throttle: Do ...
1579
1580
1581
1582
  	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
  	if (!kthrotld_workqueue)
  		panic("Failed to create kthrotld
  ");
3c798398e   Tejun Heo   blkcg: mass renam...
1583
  	return blkcg_policy_register(&blkcg_policy_throtl);
e43473b7f   Vivek Goyal   blkio: Core imple...
1584
1585
1586
  }
  
  module_init(throtl_init);