Blame view

block/blk-throttle.c 67.8 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
e43473b7f   Vivek Goyal   blkio: Core imple...
2
3
4
5
6
7
8
9
10
11
12
  /*
   * Interface for controlling IO bandwidth on a request queue
   *
   * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   */
  
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/blktrace_api.h>
eea8f41cc   Tejun Heo   blkcg: move block...
13
  #include <linux/blk-cgroup.h>
bc9fcbf9c   Tejun Heo   block: move blk_t...
14
  #include "blk.h"
e43473b7f   Vivek Goyal   blkio: Core imple...
15
16
17
18
19
20
  
  /* Max dispatch from a group in 1 round */
  static int throtl_grp_quantum = 8;
  
  /* Total max dispatch from all groups in one round */
  static int throtl_quantum = 32;
d61fcfa4b   Shaohua Li   blk-throttle: cho...
21
22
23
  /* Throttling is performed over a slice and after that slice is renewed */
  #define DFL_THROTL_SLICE_HD (HZ / 10)
  #define DFL_THROTL_SLICE_SSD (HZ / 50)
297e3d854   Shaohua Li   blk-throttle: mak...
24
  #define MAX_THROTL_SLICE (HZ)
9e234eeaf   Shaohua Li   blk-throttle: add...
25
  #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
9bb67aeb9   Shaohua Li   blk-throttle: res...
26
27
  #define MIN_THROTL_BPS (320 * 1024)
  #define MIN_THROTL_IOPS (10)
b4f428ef2   Shaohua Li   blk-throttle: for...
28
29
  #define DFL_LATENCY_TARGET (-1L)
  #define DFL_IDLE_THRESHOLD (0)
6679a90c4   Shaohua Li   blk-throttle: set...
30
31
32
33
34
35
36
  #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
  #define LATENCY_FILTERED_SSD (0)
  /*
   * For HD, very small latency comes from sequential IO. Such IO does not
   * help determine whether the IO is impacted by others, hence we ignore it
   */
  #define LATENCY_FILTERED_HD (1000L) /* 1ms */
e43473b7f   Vivek Goyal   blkio: Core imple...
37

b9147dd1b   Shaohua Li   blk-throttle: add...
38
  #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
3c798398e   Tejun Heo   blkcg: mass renam...
39
  static struct blkcg_policy blkcg_policy_throtl;
0381411e4   Tejun Heo   blkcg: let blkcg ...
40

450adcbe5   Vivek Goyal   blk-throttle: Do ...
41
42
  /* A workqueue to queue throttle related work */
  static struct workqueue_struct *kthrotld_workqueue;
450adcbe5   Vivek Goyal   blk-throttle: Do ...
43

c5cc2070b   Tejun Heo   blk-throttle: add...
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  /*
   * To implement hierarchical throttling, throtl_grps form a tree and bios
   * are dispatched upwards level by level until they reach the top and get
   * issued.  When dispatching bios from the children and local group at each
   * level, if the bios are dispatched into a single bio_list, there's a risk
   * of a local or child group which can queue many bios at once filling up
   * the list starving others.
   *
   * To avoid such starvation, dispatched bios are queued separately
   * according to where they came from.  When they are again dispatched to
   * the parent, they're popped in round-robin order so that no single source
   * hogs the dispatch window.
   *
   * throtl_qnode is used to keep the queued bios separated by their sources.
   * Bios are queued to throtl_qnode which in turn is queued to
   * throtl_service_queue and then dispatched in round-robin order.
   *
   * It's also used to track the reference counts on blkg's.  A qnode always
   * belongs to a throtl_grp and gets queued on itself or the parent, so
   * incrementing the reference of the associated throtl_grp when a qnode is
   * queued and decrementing when dequeued is enough to keep the whole blkg
   * tree pinned while bios are in flight.
   */
  struct throtl_qnode {
  	/*
  	 * Link on a service_queue->queued[] list.  Left as an empty list
  	 * head while the qnode is not queued, which is what list_empty()
  	 * checks on it rely on.
  	 */
  	struct list_head	node;		/* service_queue->queued[] */
  	struct bio_list		bios;		/* queued bios */
  	struct throtl_grp	*tg;		/* tg this qnode belongs to */
  };
c9e0332e8   Tejun Heo   blk-throttle: ren...
72
  struct throtl_service_queue {
77216b048   Tejun Heo   blk-throttle: add...
73
  	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
73f0d49a9   Tejun Heo   blk-throttle: mov...
74
75
76
77
  	/*
  	 * Bios queued directly to this service_queue or dispatched from
  	 * children throtl_grp's.
  	 */
c5cc2070b   Tejun Heo   blk-throttle: add...
78
  	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
73f0d49a9   Tejun Heo   blk-throttle: mov...
79
80
81
82
83
84
  	unsigned int		nr_queued[2];	/* number of queued bios */
  
  	/*
  	 * RB tree of active children throtl_grp's, which are sorted by
  	 * their ->disptime.
  	 */
c9e0332e8   Tejun Heo   blk-throttle: ren...
85
86
87
88
  	struct rb_root		pending_tree;	/* RB tree of active tgs */
  	struct rb_node		*first_pending;	/* first node in the tree */
  	unsigned int		nr_pending;	/* # queued in the tree */
  	unsigned long		first_pending_disptime;	/* disptime of the first tg */
69df0ab03   Tejun Heo   blk-throttle: sep...
89
  	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
e43473b7f   Vivek Goyal   blkio: Core imple...
90
  };
5b2c16aae   Tejun Heo   blk-throttle: sim...
91
92
  enum tg_state_flags {
  	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
0e9f4164b   Tejun Heo   blk-throttle: gen...
93
  	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
5b2c16aae   Tejun Heo   blk-throttle: sim...
94
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
95
  #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
9f626e372   Shaohua Li   blk-throttle: pre...
96
  enum {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
97
  	LIMIT_LOW,
9f626e372   Shaohua Li   blk-throttle: pre...
98
99
100
  	LIMIT_MAX,
  	LIMIT_CNT,
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
101
  struct throtl_grp {
f95a04afa   Tejun Heo   blkcg: embed stru...
102
103
  	/* must be the first member */
  	struct blkg_policy_data pd;
c9e0332e8   Tejun Heo   blk-throttle: ren...
104
  	/* active throtl group service_queue member */
e43473b7f   Vivek Goyal   blkio: Core imple...
105
  	struct rb_node rb_node;
0f3457f60   Tejun Heo   blk-throttle: add...
106
107
  	/* throtl_data this group belongs to */
  	struct throtl_data *td;
49a2f1e3f   Tejun Heo   blk-throttle: add...
108
109
  	/* this group's service queue */
  	struct throtl_service_queue service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
110
  	/*
c5cc2070b   Tejun Heo   blk-throttle: add...
111
112
113
114
115
116
117
118
119
120
121
  	 * qnode_on_self is used when bios are directly queued to this
  	 * throtl_grp so that local bios compete fairly with bios
  	 * dispatched from children.  qnode_on_parent is used when bios are
  	 * dispatched from this throtl_grp into its parent and will compete
  	 * with the sibling qnode_on_parents and the parent's
  	 * qnode_on_self.
  	 */
  	struct throtl_qnode qnode_on_self[2];
  	struct throtl_qnode qnode_on_parent[2];
  
  	/*
e43473b7f   Vivek Goyal   blkio: Core imple...
122
123
124
125
126
  	 * Dispatch time in jiffies. This is the estimated time when group
  	 * will unthrottle and is ready to dispatch more bio. It is used as
  	 * key to sort active groups in service tree.
  	 */
  	unsigned long disptime;
e43473b7f   Vivek Goyal   blkio: Core imple...
127
  	unsigned int flags;
693e751e7   Tejun Heo   blk-throttle: imp...
128
129
  	/* are there any throtl rules between this group and td? */
  	bool has_rules[2];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
130
  	/* internally used bytes per second rate limits */
9f626e372   Shaohua Li   blk-throttle: pre...
131
  	uint64_t bps[2][LIMIT_CNT];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
132
133
  	/* user configured bps limits */
  	uint64_t bps_conf[2][LIMIT_CNT];
e43473b7f   Vivek Goyal   blkio: Core imple...
134

cd5ab1b0f   Shaohua Li   blk-throttle: add...
135
  	/* internally used IOPS limits */
9f626e372   Shaohua Li   blk-throttle: pre...
136
  	unsigned int iops[2][LIMIT_CNT];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
137
138
  	/* user configured IOPS limits */
  	unsigned int iops_conf[2][LIMIT_CNT];
8e89d13f4   Vivek Goyal   blkio: Implementa...
139

e43473b7f   Vivek Goyal   blkio: Core imple...
140
141
  	/* Number of bytes dispatched in current slice */
  	uint64_t bytes_disp[2];
8e89d13f4   Vivek Goyal   blkio: Implementa...
142
143
  	/* Number of bio's dispatched in current slice */
  	unsigned int io_disp[2];
e43473b7f   Vivek Goyal   blkio: Core imple...
144

3f0abd806   Shaohua Li   blk-throttle: add...
145
146
147
148
149
150
  	unsigned long last_low_overflow_time[2];
  
  	uint64_t last_bytes_disp[2];
  	unsigned int last_io_disp[2];
  
  	unsigned long last_check_time;
ec80991d6   Shaohua Li   blk-throttle: add...
151
  	unsigned long latency_target; /* us */
5b81fc3cc   Shaohua Li   blk-throttle: add...
152
  	unsigned long latency_target_conf; /* us */
e43473b7f   Vivek Goyal   blkio: Core imple...
153
154
155
  	/* When did we start a new slice */
  	unsigned long slice_start[2];
  	unsigned long slice_end[2];
9e234eeaf   Shaohua Li   blk-throttle: add...
156
157
158
159
160
  
  	unsigned long last_finish_time; /* ns / 1024 */
  	unsigned long checked_last_finish_time; /* ns / 1024 */
  	unsigned long avg_idletime; /* ns / 1024 */
  	unsigned long idletime_threshold; /* us */
5b81fc3cc   Shaohua Li   blk-throttle: add...
161
  	unsigned long idletime_threshold_conf; /* us */
53696b8d2   Shaohua Li   blk-throttle: add...
162
163
164
165
  
  	unsigned int bio_cnt; /* total bios */
  	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
  	unsigned long bio_cnt_reset_time;
e43473b7f   Vivek Goyal   blkio: Core imple...
166
  };
b9147dd1b   Shaohua Li   blk-throttle: add...
167
168
169
170
171
172
173
174
175
176
177
178
  /* We measure latency for request size from <= 4k to >= 1M */
  #define LATENCY_BUCKET_SIZE 9
  
  /*
   * Latency accumulator for one bucket.  throtl_data holds
   * LATENCY_BUCKET_SIZE of these in tmp_buckets[] plus a per-cpu set in
   * latency_buckets.
   */
  struct latency_bucket {
  	unsigned long total_latency; /* ns / 1024 */
  	/* presumably the number of latencies summed above -- confirm at the update site */
  	int samples;
  };
  
  /*
   * Per-bucket latency value; throtl_data holds LATENCY_BUCKET_SIZE of
   * these in avg_buckets[].
   */
  struct avg_latency_bucket {
  	unsigned long latency; /* ns / 1024 */
  	/* presumably set once @latency holds a usable value -- confirm at the writer */
  	bool valid;
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
179
180
  struct throtl_data
  {
e43473b7f   Vivek Goyal   blkio: Core imple...
181
  	/* service tree for active throtl groups */
c9e0332e8   Tejun Heo   blk-throttle: ren...
182
  	struct throtl_service_queue service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
183

e43473b7f   Vivek Goyal   blkio: Core imple...
184
185
186
187
  	struct request_queue *queue;
  
  	/* Total Number of queued bios on READ and WRITE lists */
  	unsigned int nr_queued[2];
297e3d854   Shaohua Li   blk-throttle: mak...
188
  	unsigned int throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
189
  	/* Work for dispatching throttled bios */
69df0ab03   Tejun Heo   blk-throttle: sep...
190
  	struct work_struct dispatch_work;
9f626e372   Shaohua Li   blk-throttle: pre...
191
192
  	unsigned int limit_index;
  	bool limit_valid[LIMIT_CNT];
3f0abd806   Shaohua Li   blk-throttle: add...
193
194
195
  
  	unsigned long low_upgrade_time;
  	unsigned long low_downgrade_time;
7394e31fa   Shaohua Li   blk-throttle: mak...
196
197
  
  	unsigned int scale;
b9147dd1b   Shaohua Li   blk-throttle: add...
198
199
200
201
202
  
  	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
  	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
  	struct latency_bucket __percpu *latency_buckets;
  	unsigned long last_calculate_time;
6679a90c4   Shaohua Li   blk-throttle: set...
203
  	unsigned long filtered_latency;
b9147dd1b   Shaohua Li   blk-throttle: add...
204
205
  
  	bool track_bio_latency;
e43473b7f   Vivek Goyal   blkio: Core imple...
206
  };
69df0ab03   Tejun Heo   blk-throttle: sep...
207
  static void throtl_pending_timer_fn(unsigned long arg);
f95a04afa   Tejun Heo   blkcg: embed stru...
208
209
210
211
  /* Map a blkg_policy_data to the throtl_grp embedding it; NULL maps to NULL. */
  static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
  {
  	if (!pd)
  		return NULL;

  	return container_of(pd, struct throtl_grp, pd);
  }
3c798398e   Tejun Heo   blkcg: mass renam...
212
  static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
0381411e4   Tejun Heo   blkcg: let blkcg ...
213
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
214
  	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
0381411e4   Tejun Heo   blkcg: let blkcg ...
215
  }
3c798398e   Tejun Heo   blkcg: mass renam...
216
  static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
0381411e4   Tejun Heo   blkcg: let blkcg ...
217
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
218
  	return pd_to_blkg(&tg->pd);
0381411e4   Tejun Heo   blkcg: let blkcg ...
219
  }
fda6f272c   Tejun Heo   blk-throttle: imp...
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
  /**
   * sq_to_tg - return the throtl_grp the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
   * A service queue with a parent is the one embedded in a throtl_grp, so
   * that group is returned.  %NULL is returned when @sq is %NULL or has no
   * parent, i.e. when it is the top-level queue embedded in throtl_data.
   */
  static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  {
  	if (!sq || !sq->parent_sq)
  		return NULL;

  	return container_of(sq, struct throtl_grp, service_queue);
  }
  
  /**
   * sq_to_td - return throtl_data the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
b43daedc0   Masahiro Yamada   scripts/spelling....
239
   * A service_queue can be embedded in either a throtl_grp or throtl_data.
fda6f272c   Tejun Heo   blk-throttle: imp...
240
241
242
243
244
245
246
247
248
249
250
   * Determine the associated throtl_data accordingly and return it.
   */
  static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  {
  	struct throtl_grp *owner = sq_to_tg(sq);

  	/* no owning group: @sq is the queue embedded in throtl_data itself */
  	return owner ? owner->td :
  		       container_of(sq, struct throtl_data, service_queue);
  }
7394e31fa   Shaohua Li   blk-throttle: mak...
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
  /*
   * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
   * make the IO dispatch more smooth.
   * Scale up: linearly scale up according to lapsed time since upgrade. For
   *           every throtl_slice, the limit scales up 1/2 .low limit till the
   *           limit hits .max limit
   * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
   *
   * @low: the configured .low limit to scale from
   * @td:  throtl_data whose ->scale is refreshed from the time elapsed since
   *       ->low_upgrade_time before being applied
   *
   * Returns low + (low / 2) * td->scale.  The result saturates at the
   * largest 64-bit value instead of wrapping around: callers take the
   * minimum of this value and the .max limit, so a wrapped (tiny) result
   * would silently clamp the group far below what was configured.
   */
  static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
  {
  	const uint64_t saturated = ~(uint64_t)0;	/* same value as U64_MAX */
  	uint64_t step = low >> 1;
  	uint64_t hi, lo, growth;

  	/* arbitrary value to avoid too big scale */
  	if (td->scale < 4096 && time_after_eq(jiffies,
  	    td->low_upgrade_time + td->scale * td->throtl_slice))
  		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;

  	/*
  	 * growth = step * scale, computed as two partial products so that
  	 * overflow can be detected without a 64-bit division.  step is below
  	 * 2^63 and scale fits in 32 bits, so neither partial product can
  	 * overflow by itself.
  	 */
  	hi = (step >> 32) * td->scale;
  	lo = (step & 0xffffffffULL) * td->scale;
  	if (hi >> 32)
  		return saturated;

  	growth = (hi << 32) + lo;
  	if (growth < lo)
  		return saturated;

  	if (growth > saturated - low)
  		return saturated;

  	return low + growth;
  }
9f626e372   Shaohua Li   blk-throttle: pre...
268
269
  static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
  {
b22c417c8   Shaohua Li   blk-throttle: con...
270
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31fa   Shaohua Li   blk-throttle: mak...
271
  	struct throtl_data *td;
b22c417c8   Shaohua Li   blk-throttle: con...
272
273
274
275
  	uint64_t ret;
  
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  		return U64_MAX;
7394e31fa   Shaohua Li   blk-throttle: mak...
276
277
278
  
  	td = tg->td;
  	ret = tg->bps[rw][td->limit_index];
9bb67aeb9   Shaohua Li   blk-throttle: res...
279
280
281
282
283
284
285
286
  	if (ret == 0 && td->limit_index == LIMIT_LOW) {
  		/* intermediate node or iops isn't 0 */
  		if (!list_empty(&blkg->blkcg->css.children) ||
  		    tg->iops[rw][td->limit_index])
  			return U64_MAX;
  		else
  			return MIN_THROTL_BPS;
  	}
7394e31fa   Shaohua Li   blk-throttle: mak...
287
288
289
290
291
292
293
294
  
  	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
  	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
  		uint64_t adjusted;
  
  		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
  		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
  	}
b22c417c8   Shaohua Li   blk-throttle: con...
295
  	return ret;
9f626e372   Shaohua Li   blk-throttle: pre...
296
297
298
299
  }
  
  static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
  {
b22c417c8   Shaohua Li   blk-throttle: con...
300
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31fa   Shaohua Li   blk-throttle: mak...
301
  	struct throtl_data *td;
b22c417c8   Shaohua Li   blk-throttle: con...
302
303
304
305
  	unsigned int ret;
  
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  		return UINT_MAX;
9bb67aeb9   Shaohua Li   blk-throttle: res...
306

7394e31fa   Shaohua Li   blk-throttle: mak...
307
308
  	td = tg->td;
  	ret = tg->iops[rw][td->limit_index];
9bb67aeb9   Shaohua Li   blk-throttle: res...
309
310
311
312
313
314
315
316
  	if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
  		/* intermediate node or bps isn't 0 */
  		if (!list_empty(&blkg->blkcg->css.children) ||
  		    tg->bps[rw][td->limit_index])
  			return UINT_MAX;
  		else
  			return MIN_THROTL_IOPS;
  	}
7394e31fa   Shaohua Li   blk-throttle: mak...
317
318
319
320
321
322
323
324
325
326
  
  	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
  	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
  		uint64_t adjusted;
  
  		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
  		if (adjusted > UINT_MAX)
  			adjusted = UINT_MAX;
  		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
  	}
b22c417c8   Shaohua Li   blk-throttle: con...
327
  	return ret;
9f626e372   Shaohua Li   blk-throttle: pre...
328
  }
b9147dd1b   Shaohua Li   blk-throttle: add...
329
330
  /*
   * Map a request size in sectors to a latency bucket index in
   * [0, LATENCY_BUCKET_SIZE - 1].  The index is order_base_2(sectors) - 3,
   * clamped at both ends, so everything up to 8 sectors lands in bucket 0
   * and the largest sizes share the last bucket.
   */
  #define request_bucket_index(sectors) \
  	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
fda6f272c   Tejun Heo   blk-throttle: imp...
331
332
333
334
335
336
337
338
  /**
   * throtl_log - log debug message via blktrace
   * @sq: the service_queue being reported
   * @fmt: printf format string
   * @args: printf args
   *
   * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
   * throtl_grp; otherwise, just "throtl".
fda6f272c   Tejun Heo   blk-throttle: imp...
339
340
341
342
343
344
   */
  #define throtl_log(sq, fmt, args...)	do {				\
  	struct throtl_grp *__tg = sq_to_tg((sq));			\
  	struct throtl_data *__td = sq_to_td((sq));			\
  									\
  	(void)__td;							\
59fa0224c   Shaohua Li   blk-throttle: don...
345
346
  	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
  		break;							\
fda6f272c   Tejun Heo   blk-throttle: imp...
347
  	if ((__tg)) {							\
35fe6d763   Shaohua Li   block: use standa...
348
349
  		blk_add_cgroup_trace_msg(__td->queue,			\
  			tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
fda6f272c   Tejun Heo   blk-throttle: imp...
350
351
352
  	} else {							\
  		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
  	}								\
54e7ed12b   Tejun Heo   blkcg: remove blk...
353
  } while (0)
e43473b7f   Vivek Goyal   blkio: Core imple...
354

ea0ea2bc6   Shaohua Li   blk-throttle: cap...
355
356
357
358
359
360
361
  /*
   * Return the data size of @bio in bytes, treating a discard as a single
   * 512-byte sector regardless of its bi_size.
   */
  static inline unsigned int throtl_bio_data_size(struct bio *bio)
  {
  	unsigned int bytes = bio->bi_iter.bi_size;

  	/* assume it's one sector */
  	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
  		bytes = 512;

  	return bytes;
  }
c5cc2070b   Tejun Heo   blk-throttle: add...
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
  /* Set up @qn as an empty, not-yet-queued qnode that belongs to @tg. */
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
  	qn->tg = tg;
  	bio_list_init(&qn->bios);
  	INIT_LIST_HEAD(&qn->node);
  }
  
  /**
   * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
   * @bio: bio being added
   * @qn: qnode to add bio to
   * @queued: the service_queue->queued[] list @qn belongs to
   *
   * Append @bio to @qn's bio list.  If @qn is not yet linked on any list it
   * is added to the tail of @queued and a reference on the blkg of @qn->tg
   * is taken; a qnode that is already linked is left where it is and no
   * further reference is taken.
   */
  static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  				 struct list_head *queued)
  {
  	bio_list_add(&qn->bios, bio);

  	/* already active: nothing more to link or pin */
  	if (!list_empty(&qn->node))
  		return;

  	list_add_tail(&qn->node, queued);
  	blkg_get(tg_to_blkg(qn->tg));
  }
  
  /**
   * throtl_peek_queued - peek the first bio on a qnode list
   * @queued: the qnode list to peek
   *
   * Return the head bio of the first qnode on @queued without removing it,
   * or %NULL if @queued is empty.  A queued qnode with no bios triggers a
   * one-time warning.
   */
  static struct bio *throtl_peek_queued(struct list_head *queued)
  {
  	struct throtl_qnode *first;
  	struct bio *head;

  	if (list_empty(queued))
  		return NULL;

  	first = list_first_entry(queued, struct throtl_qnode, node);
  	head = bio_list_peek(&first->bios);
  	WARN_ON_ONCE(!head);

  	return head;
  }
  
  /**
   * throtl_pop_queued - pop the first bio from a qnode list
   * @queued: the qnode list to pop a bio from
   * @tg_to_put: optional out argument for throtl_grp to put
   *
   * Pop the head bio of the first qnode on @queued and return it, or return
   * %NULL if @queued is empty.  If that qnode still holds bios afterwards it
   * is moved to the tail of @queued so that successive pops go round-robin
   * over the qnodes; otherwise it is unlinked from @queued.
   *
   * Unlinking a qnode means its throtl_grp has to be put.  With a %NULL
   * @tg_to_put the put is done here; otherwise the throtl_grp is stored in
   * *@tg_to_put and putting it becomes the caller's job.  *@tg_to_put is
   * not written when no qnode is unlinked.
   */
  static struct bio *throtl_pop_queued(struct list_head *queued,
  				     struct throtl_grp **tg_to_put)
  {
  	struct throtl_qnode *first;
  	struct bio *popped;

  	if (list_empty(queued))
  		return NULL;

  	first = list_first_entry(queued, struct throtl_qnode, node);
  	popped = bio_list_pop(&first->bios);
  	WARN_ON_ONCE(!popped);

  	if (!bio_list_empty(&first->bios)) {
  		/* more bios pending on this qnode: rotate it to the back */
  		list_move_tail(&first->node, queued);
  		return popped;
  	}

  	/* qnode drained: unlink it and drop (or hand over) the tg */
  	list_del_init(&first->node);
  	if (tg_to_put)
  		*tg_to_put = first->tg;
  	else
  		blkg_put(tg_to_blkg(first->tg));

  	return popped;
  }
49a2f1e3f   Tejun Heo   blk-throttle: add...
444
  /* init a service_queue, assumes the caller zeroed it */
b2ce2643c   Tejun Heo   blk-throttle: cle...
445
  static void throtl_service_queue_init(struct throtl_service_queue *sq)
49a2f1e3f   Tejun Heo   blk-throttle: add...
446
  {
c5cc2070b   Tejun Heo   blk-throttle: add...
447
448
  	INIT_LIST_HEAD(&sq->queued[0]);
  	INIT_LIST_HEAD(&sq->queued[1]);
49a2f1e3f   Tejun Heo   blk-throttle: add...
449
  	sq->pending_tree = RB_ROOT;
69df0ab03   Tejun Heo   blk-throttle: sep...
450
451
452
  	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
  		    (unsigned long)sq);
  }
001bea73e   Tejun Heo   blkcg: replace bl...
453
454
  static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
  {
4fb72036f   Tejun Heo   blk-throttle: rem...
455
  	struct throtl_grp *tg;
24bdb8ef0   Tejun Heo   blkcg: make blkcg...
456
  	int rw;
4fb72036f   Tejun Heo   blk-throttle: rem...
457
458
459
  
  	tg = kzalloc_node(sizeof(*tg), gfp, node);
  	if (!tg)
77ea73388   Tejun Heo   blkcg: move io_se...
460
  		return NULL;
4fb72036f   Tejun Heo   blk-throttle: rem...
461

b2ce2643c   Tejun Heo   blk-throttle: cle...
462
463
464
465
466
467
468
469
  	throtl_service_queue_init(&tg->service_queue);
  
  	for (rw = READ; rw <= WRITE; rw++) {
  		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  	}
  
  	RB_CLEAR_NODE(&tg->rb_node);
9f626e372   Shaohua Li   blk-throttle: pre...
470
471
472
473
  	tg->bps[READ][LIMIT_MAX] = U64_MAX;
  	tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
  	tg->iops[READ][LIMIT_MAX] = UINT_MAX;
  	tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
cd5ab1b0f   Shaohua Li   blk-throttle: add...
474
475
476
477
478
  	tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
  	tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
  	tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
  	tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
  	/* LIMIT_LOW will have default value 0 */
b2ce2643c   Tejun Heo   blk-throttle: cle...
479

ec80991d6   Shaohua Li   blk-throttle: add...
480
  	tg->latency_target = DFL_LATENCY_TARGET;
5b81fc3cc   Shaohua Li   blk-throttle: add...
481
  	tg->latency_target_conf = DFL_LATENCY_TARGET;
b4f428ef2   Shaohua Li   blk-throttle: for...
482
483
  	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
  	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
ec80991d6   Shaohua Li   blk-throttle: add...
484

4fb72036f   Tejun Heo   blk-throttle: rem...
485
  	return &tg->pd;
001bea73e   Tejun Heo   blkcg: replace bl...
486
  }
a9520cd6f   Tejun Heo   blkcg: make blkcg...
487
  static void throtl_pd_init(struct blkg_policy_data *pd)
a29a171e7   Vivek Goyal   blk-throttle: Do ...
488
  {
a9520cd6f   Tejun Heo   blkcg: make blkcg...
489
490
  	struct throtl_grp *tg = pd_to_tg(pd);
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
77216b048   Tejun Heo   blk-throttle: add...
491
  	struct throtl_data *td = blkg->q->td;
b2ce2643c   Tejun Heo   blk-throttle: cle...
492
  	struct throtl_service_queue *sq = &tg->service_queue;
cd1604fab   Tejun Heo   blkcg: factor out...
493

9138125be   Tejun Heo   blk-throttle: imp...
494
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
495
  	 * If on the default hierarchy, we switch to properly hierarchical
9138125be   Tejun Heo   blk-throttle: imp...
496
497
498
499
500
  	 * behavior where limits on a given throtl_grp are applied to the
  	 * whole subtree rather than just the group itself.  e.g. If 16M
  	 * read_bps limit is set on the root group, the whole system can't
  	 * exceed 16M for the device.
  	 *
aa6ec29be   Tejun Heo   cgroup: remove sa...
501
  	 * If not on the default hierarchy, the broken flat hierarchy
9138125be   Tejun Heo   blk-throttle: imp...
502
503
504
505
506
  	 * behavior is retained where all throtl_grps are treated as if
  	 * they're all separate root groups right below throtl_data.
  	 * Limits of a group don't interact with limits of other groups
  	 * regardless of the position of the group in the hierarchy.
  	 */
b2ce2643c   Tejun Heo   blk-throttle: cle...
507
  	sq->parent_sq = &td->service_queue;
9e10a130d   Tejun Heo   cgroup: replace c...
508
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
b2ce2643c   Tejun Heo   blk-throttle: cle...
509
  		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
77216b048   Tejun Heo   blk-throttle: add...
510
  	tg->td = td;
8a3d26151   Tejun Heo   blkcg: move blkio...
511
  }
693e751e7   Tejun Heo   blk-throttle: imp...
512
513
514
515
516
517
518
519
  /*
   * Set has_rules[] if @tg or any of its parents have limits configured.
   * This doesn't require walking up to the top of the hierarchy as the
   * parent's has_rules[] is guaranteed to be correct.
   */
  static void tg_update_has_rules(struct throtl_grp *tg)
  {
  	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
9f626e372   Shaohua Li   blk-throttle: pre...
520
  	struct throtl_data *td = tg->td;
693e751e7   Tejun Heo   blk-throttle: imp...
521
522
523
524
  	int rw;
  
  	for (rw = READ; rw <= WRITE; rw++)
  		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
9f626e372   Shaohua Li   blk-throttle: pre...
525
526
527
  			(td->limit_valid[td->limit_index] &&
  			 (tg_bps_limit(tg, rw) != U64_MAX ||
  			  tg_iops_limit(tg, rw) != UINT_MAX));
693e751e7   Tejun Heo   blk-throttle: imp...
528
  }
a9520cd6f   Tejun Heo   blkcg: make blkcg...
529
  static void throtl_pd_online(struct blkg_policy_data *pd)
693e751e7   Tejun Heo   blk-throttle: imp...
530
  {
aec242468   Shaohua Li   blk-throttle: det...
531
  	struct throtl_grp *tg = pd_to_tg(pd);
693e751e7   Tejun Heo   blk-throttle: imp...
532
533
534
535
  	/*
  	 * We don't want new groups to escape the limits of its ancestors.
  	 * Update has_rules[] after a new group is brought online.
  	 */
aec242468   Shaohua Li   blk-throttle: det...
536
  	tg_update_has_rules(tg);
693e751e7   Tejun Heo   blk-throttle: imp...
537
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
  /*
   * Recompute td->limit_valid[LIMIT_LOW].
   *
   * The low limit is valid as soon as any throtl_grp below the queue's root
   * blkg has a non-zero read/write bps or iops value in its LIMIT_LOW slot.
   * The descendants are walked under rcu_read_lock(), and the walk stops at
   * the first group that qualifies since later groups cannot change the
   * answer.
   */
  static void blk_throtl_update_limit_valid(struct throtl_data *td)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
  	bool low_valid = false;

  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);

  		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
  		    tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
  			low_valid = true;
  			/* one hit decides it; don't keep walking under RCU */
  			break;
  		}
  	}
  	rcu_read_unlock();

  	td->limit_valid[LIMIT_LOW] = low_valid;
  }
c79892c55   Shaohua Li   blk-throttle: add...
556
  static void throtl_upgrade_state(struct throtl_data *td);
cd5ab1b0f   Shaohua Li   blk-throttle: add...
557
558
559
560
561
562
563
564
565
566
  static void throtl_pd_offline(struct blkg_policy_data *pd)
  {
  	struct throtl_grp *tg = pd_to_tg(pd);
  
  	tg->bps[READ][LIMIT_LOW] = 0;
  	tg->bps[WRITE][LIMIT_LOW] = 0;
  	tg->iops[READ][LIMIT_LOW] = 0;
  	tg->iops[WRITE][LIMIT_LOW] = 0;
  
  	blk_throtl_update_limit_valid(tg->td);
c79892c55   Shaohua Li   blk-throttle: add...
567
568
  	if (!tg->td->limit_valid[tg->td->limit_index])
  		throtl_upgrade_state(tg->td);
cd5ab1b0f   Shaohua Li   blk-throttle: add...
569
  }
001bea73e   Tejun Heo   blkcg: replace bl...
570
571
  static void throtl_pd_free(struct blkg_policy_data *pd)
  {
4fb72036f   Tejun Heo   blk-throttle: rem...
572
  	struct throtl_grp *tg = pd_to_tg(pd);
b2ce2643c   Tejun Heo   blk-throttle: cle...
573
  	del_timer_sync(&tg->service_queue.pending_timer);
4fb72036f   Tejun Heo   blk-throttle: rem...
574
  	kfree(tg);
001bea73e   Tejun Heo   blkcg: replace bl...
575
  }
0049af73b   Tejun Heo   blk-throttle: reo...
576
577
  static struct throtl_grp *
  throtl_rb_first(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
578
579
  {
  	/* Service tree is empty */
0049af73b   Tejun Heo   blk-throttle: reo...
580
  	if (!parent_sq->nr_pending)
e43473b7f   Vivek Goyal   blkio: Core imple...
581
  		return NULL;
0049af73b   Tejun Heo   blk-throttle: reo...
582
583
  	if (!parent_sq->first_pending)
  		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
e43473b7f   Vivek Goyal   blkio: Core imple...
584

0049af73b   Tejun Heo   blk-throttle: reo...
585
586
  	if (parent_sq->first_pending)
  		return rb_entry_tg(parent_sq->first_pending);
e43473b7f   Vivek Goyal   blkio: Core imple...
587
588
589
590
591
592
593
594
595
  
  	return NULL;
  }
  
  /*
   * Erase @n from the tree rooted at @root, then mark it cleared with
   * RB_CLEAR_NODE() -- the same state throtl_pd_alloc() gives a freshly
   * allocated tg->rb_node.
   */
  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
  {
  	rb_erase(n, root);
  	/* clear only after the erase, once the node is out of the tree */
  	RB_CLEAR_NODE(n);
  }
0049af73b   Tejun Heo   blk-throttle: reo...
596
597
  static void throtl_rb_erase(struct rb_node *n,
  			    struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
598
  {
0049af73b   Tejun Heo   blk-throttle: reo...
599
600
601
602
  	if (parent_sq->first_pending == n)
  		parent_sq->first_pending = NULL;
  	rb_erase_init(n, &parent_sq->pending_tree);
  	--parent_sq->nr_pending;
e43473b7f   Vivek Goyal   blkio: Core imple...
603
  }
0049af73b   Tejun Heo   blk-throttle: reo...
604
  static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
605
606
  {
  	struct throtl_grp *tg;
0049af73b   Tejun Heo   blk-throttle: reo...
607
  	tg = throtl_rb_first(parent_sq);
e43473b7f   Vivek Goyal   blkio: Core imple...
608
609
  	if (!tg)
  		return;
0049af73b   Tejun Heo   blk-throttle: reo...
610
  	parent_sq->first_pending_disptime = tg->disptime;
e43473b7f   Vivek Goyal   blkio: Core imple...
611
  }
77216b048   Tejun Heo   blk-throttle: add...
612
  static void tg_service_queue_add(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
613
  {
77216b048   Tejun Heo   blk-throttle: add...
614
  	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
0049af73b   Tejun Heo   blk-throttle: reo...
615
  	struct rb_node **node = &parent_sq->pending_tree.rb_node;
e43473b7f   Vivek Goyal   blkio: Core imple...
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
  	struct rb_node *parent = NULL;
  	struct throtl_grp *__tg;
  	unsigned long key = tg->disptime;
  	int left = 1;
  
  	while (*node != NULL) {
  		parent = *node;
  		__tg = rb_entry_tg(parent);
  
  		if (time_before(key, __tg->disptime))
  			node = &parent->rb_left;
  		else {
  			node = &parent->rb_right;
  			left = 0;
  		}
  	}
  
  	if (left)
0049af73b   Tejun Heo   blk-throttle: reo...
634
  		parent_sq->first_pending = &tg->rb_node;
e43473b7f   Vivek Goyal   blkio: Core imple...
635
636
  
  	rb_link_node(&tg->rb_node, parent, node);
0049af73b   Tejun Heo   blk-throttle: reo...
637
  	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
e43473b7f   Vivek Goyal   blkio: Core imple...
638
  }
77216b048   Tejun Heo   blk-throttle: add...
639
  static void __throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
640
  {
77216b048   Tejun Heo   blk-throttle: add...
641
  	tg_service_queue_add(tg);
5b2c16aae   Tejun Heo   blk-throttle: sim...
642
  	tg->flags |= THROTL_TG_PENDING;
77216b048   Tejun Heo   blk-throttle: add...
643
  	tg->service_queue.parent_sq->nr_pending++;
e43473b7f   Vivek Goyal   blkio: Core imple...
644
  }
77216b048   Tejun Heo   blk-throttle: add...
645
  static void throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
646
  {
5b2c16aae   Tejun Heo   blk-throttle: sim...
647
  	if (!(tg->flags & THROTL_TG_PENDING))
77216b048   Tejun Heo   blk-throttle: add...
648
  		__throtl_enqueue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
649
  }
77216b048   Tejun Heo   blk-throttle: add...
650
  static void __throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
651
  {
77216b048   Tejun Heo   blk-throttle: add...
652
  	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
5b2c16aae   Tejun Heo   blk-throttle: sim...
653
  	tg->flags &= ~THROTL_TG_PENDING;
e43473b7f   Vivek Goyal   blkio: Core imple...
654
  }
77216b048   Tejun Heo   blk-throttle: add...
655
  static void throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
656
  {
5b2c16aae   Tejun Heo   blk-throttle: sim...
657
  	if (tg->flags & THROTL_TG_PENDING)
77216b048   Tejun Heo   blk-throttle: add...
658
  		__throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
659
  }
a9131a27e   Tejun Heo   blk-throttle: rel...
660
  /* Call with queue lock held */
69df0ab03   Tejun Heo   blk-throttle: sep...
661
662
  static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  					  unsigned long expires)
a9131a27e   Tejun Heo   blk-throttle: rel...
663
  {
a41b816c1   Joseph Qi   blk-throttle: fix...
664
  	unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
06cceedcc   Shaohua Li   blk-throttle: mak...
665
666
667
668
669
670
671
672
673
674
  
  	/*
  	 * Since we are adjusting the throttle limit dynamically, the sleep
  	 * time calculated according to previous limit might be invalid. It's
  	 * possible the cgroup sleep time is very long and no other cgroups
  	 * have IO running so notify the limit changes. Make sure the cgroup
  	 * doesn't sleep too long to avoid the missed notification.
  	 */
  	if (time_after(expires, max_expire))
  		expires = max_expire;
69df0ab03   Tejun Heo   blk-throttle: sep...
675
676
677
  	mod_timer(&sq->pending_timer, expires);
  	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  		   expires - jiffies, jiffies);
a9131a27e   Tejun Heo   blk-throttle: rel...
678
  }
7f52f98c2   Tejun Heo   blk-throttle: imp...
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
  /**
   * throtl_schedule_next_dispatch - schedule the next dispatch cycle
   * @sq: the service_queue to schedule dispatch for
   * @force: force scheduling
   *
   * Arm @sq->pending_timer so that the next dispatch cycle begins at the
   * dispatch time of the first pending child.
   *
   * Returns %true when the timer has been armed or when @sq has no pending
   * child left.  Returns %false when the current dispatch window is still
   * open, in which case the caller is expected to keep dispatching.
   *
   * With @force set, the timer is always scheduled as long as there is a
   * pending child, and the function is guaranteed to return %true.  This is
   * meant for callers which can't dispatch by themselves and must have
   * pending_timer invoked unconditionally.  Forced scheduling is likely to
   * add a short delay before dispatch starts even when
   * @sq->first_pending_disptime is not in the future, so it shouldn't be
   * used in hot paths.
   */
  static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  					  bool force)
  {
  	/* nothing pending, nothing to schedule */
  	if (!sq->nr_pending)
  		return true;

  	update_min_dispatch_time(sq);

  	/*
  	 * The first pending child is already due and the caller didn't
  	 * insist on a timer: tell it to continue dispatching.
  	 */
  	if (!force && !time_after(sq->first_pending_disptime, jiffies))
  		return false;

  	throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  	return true;
  }
32ee5bc47   Vivek Goyal   blk-throttle: Acc...
714
715
716
717
718
719
720
721
722
723
724
725
726
727
  /*
   * Begin a new slice for direction @rw of @tg, allowing the slice start to
   * be moved to @start.  The dispatch counters are reset, slice_start[] is
   * set to @start unless @start lies before the current slice start, and
   * the slice end is set one throtl_slice from now.
   */
  static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  		bool rw, unsigned long start)
  {
  	/* a fresh slice begins with nothing dispatched */
  	tg->io_disp[rw] = 0;
  	tg->bytes_disp[rw] = 0;

  	/*
  	 * The previous slice has expired and must have been trimmed after
  	 * the last bio dispatch, which means the bandwidth since the start
  	 * of that slice was never used.  Try to make use of it by giving
  	 * credit back to @start; the slice start is never moved backwards.
  	 */
  	if (!time_before(start, tg->slice_start[rw]))
  		tg->slice_start[rw] = start;

  	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;

  	throtl_log(&tg->service_queue,
  		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
0f3457f60   Tejun Heo   blk-throttle: add...
734
  /*
   * Begin a new slice for direction @rw of @tg starting right now: the
   * dispatch counters are reset and the slice spans one throtl_slice.
   */
  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
  {
  	/* a fresh slice begins with nothing dispatched */
  	tg->io_disp[rw] = 0;
  	tg->bytes_disp[rw] = 0;

  	tg->slice_start[rw] = jiffies;
  	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;

  	throtl_log(&tg->service_queue,
  		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
0f3457f60   Tejun Heo   blk-throttle: add...
745
746
  /*
   * Set the end of @tg's @rw slice to @jiffy_end rounded up to a multiple of
   * throtl_slice.
   */
  static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  					unsigned long jiffy_end)
  {
  	unsigned long rounded_end = roundup(jiffy_end, tg->td->throtl_slice);

  	tg->slice_end[rw] = rounded_end;
  }
0f3457f60   Tejun Heo   blk-throttle: add...
750
751
  /*
   * Move the end of @tg's @rw slice to @jiffy_end (rounded up to a multiple
   * of throtl_slice) and log the resulting slice boundaries.
   */
  static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  				       unsigned long jiffy_end)
  {
  	/* same rounding rule as any other slice end update */
  	throtl_set_slice_end(tg, rw, jiffy_end);

  	throtl_log(&tg->service_queue,
  		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  
  /* Determine if previously allocated or extended slice is complete or not */
0f3457f60   Tejun Heo   blk-throttle: add...
761
  static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
e43473b7f   Vivek Goyal   blkio: Core imple...
762
763
  {
  	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
5cf8c2277   Fabian Frederick   block/blk-throttl...
764
  		return false;
e43473b7f   Vivek Goyal   blkio: Core imple...
765
766
767
768
769
  
  	return 1;
  }
  
  /* Trim the used slices and adjust slice start accordingly */
0f3457f60   Tejun Heo   blk-throttle: add...
770
  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
e43473b7f   Vivek Goyal   blkio: Core imple...
771
  {
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
772
773
  	unsigned long nr_slices, time_elapsed, io_trim;
  	u64 bytes_trim, tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
774
775
776
777
778
779
780
781
  
  	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  
  	/*
  	 * If bps are unlimited (-1), then time slice don't get
  	 * renewed. Don't try to trim the slice if slice is used. A new
  	 * slice will start when appropriate.
  	 */
0f3457f60   Tejun Heo   blk-throttle: add...
782
  	if (throtl_slice_used(tg, rw))
e43473b7f   Vivek Goyal   blkio: Core imple...
783
  		return;
d1ae8ffdf   Vivek Goyal   blk-throttle: Tri...
784
785
786
787
788
789
790
  	/*
  	 * A bio has been dispatched. Also adjust slice_end. It might happen
  	 * that initially cgroup limit was very low resulting in high
  	 * slice_end, but later limit was bumped up and bio was dispached
  	 * sooner, then we need to reduce slice_end. A high bogus slice_end
  	 * is bad because it does not allow new slice to start.
  	 */
297e3d854   Shaohua Li   blk-throttle: mak...
791
  	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
d1ae8ffdf   Vivek Goyal   blk-throttle: Tri...
792

e43473b7f   Vivek Goyal   blkio: Core imple...
793
  	time_elapsed = jiffies - tg->slice_start[rw];
297e3d854   Shaohua Li   blk-throttle: mak...
794
  	nr_slices = time_elapsed / tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
795
796
797
  
  	if (!nr_slices)
  		return;
297e3d854   Shaohua Li   blk-throttle: mak...
798
  	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
799
800
  	do_div(tmp, HZ);
  	bytes_trim = tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
801

297e3d854   Shaohua Li   blk-throttle: mak...
802
803
  	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
  		HZ;
e43473b7f   Vivek Goyal   blkio: Core imple...
804

8e89d13f4   Vivek Goyal   blkio: Implementa...
805
  	if (!bytes_trim && !io_trim)
e43473b7f   Vivek Goyal   blkio: Core imple...
806
807
808
809
810
811
  		return;
  
  	if (tg->bytes_disp[rw] >= bytes_trim)
  		tg->bytes_disp[rw] -= bytes_trim;
  	else
  		tg->bytes_disp[rw] = 0;
8e89d13f4   Vivek Goyal   blkio: Implementa...
812
813
814
815
  	if (tg->io_disp[rw] >= io_trim)
  		tg->io_disp[rw] -= io_trim;
  	else
  		tg->io_disp[rw] = 0;
297e3d854   Shaohua Li   blk-throttle: mak...
816
  	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
817

fda6f272c   Tejun Heo   blk-throttle: imp...
818
819
820
821
  	throtl_log(&tg->service_queue,
  		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
e43473b7f   Vivek Goyal   blkio: Core imple...
822
  }
0f3457f60   Tejun Heo   blk-throttle: add...
823
824
  static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
  				  unsigned long *wait)
e43473b7f   Vivek Goyal   blkio: Core imple...
825
826
  {
  	bool rw = bio_data_dir(bio);
8e89d13f4   Vivek Goyal   blkio: Implementa...
827
  	unsigned int io_allowed;
e43473b7f   Vivek Goyal   blkio: Core imple...
828
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
c49c06e49   Vivek Goyal   blkio-throttle: F...
829
  	u64 tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
830

8e89d13f4   Vivek Goyal   blkio: Implementa...
831
  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
e43473b7f   Vivek Goyal   blkio: Core imple...
832

8e89d13f4   Vivek Goyal   blkio: Implementa...
833
834
  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
297e3d854   Shaohua Li   blk-throttle: mak...
835
  		jiffy_elapsed_rnd = tg->td->throtl_slice;
8e89d13f4   Vivek Goyal   blkio: Implementa...
836

297e3d854   Shaohua Li   blk-throttle: mak...
837
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
8e89d13f4   Vivek Goyal   blkio: Implementa...
838

c49c06e49   Vivek Goyal   blkio-throttle: F...
839
840
841
842
843
844
  	/*
  	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
  	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
  	 * will allow dispatch after 1 second and after that slice should
  	 * have been trimmed.
  	 */
9f626e372   Shaohua Li   blk-throttle: pre...
845
  	tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
c49c06e49   Vivek Goyal   blkio-throttle: F...
846
847
848
849
850
851
  	do_div(tmp, HZ);
  
  	if (tmp > UINT_MAX)
  		io_allowed = UINT_MAX;
  	else
  		io_allowed = tmp;
8e89d13f4   Vivek Goyal   blkio: Implementa...
852
853
  
  	if (tg->io_disp[rw] + 1 <= io_allowed) {
e43473b7f   Vivek Goyal   blkio: Core imple...
854
855
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
856
  		return true;
e43473b7f   Vivek Goyal   blkio: Core imple...
857
  	}
8e89d13f4   Vivek Goyal   blkio: Implementa...
858
  	/* Calc approx time to dispatch */
9f626e372   Shaohua Li   blk-throttle: pre...
859
  	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
8e89d13f4   Vivek Goyal   blkio: Implementa...
860
861
862
863
864
865
866
867
868
869
  
  	if (jiffy_wait > jiffy_elapsed)
  		jiffy_wait = jiffy_wait - jiffy_elapsed;
  	else
  		jiffy_wait = 1;
  
  	if (wait)
  		*wait = jiffy_wait;
  	return 0;
  }
0f3457f60   Tejun Heo   blk-throttle: add...
870
871
  static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
  				 unsigned long *wait)
8e89d13f4   Vivek Goyal   blkio: Implementa...
872
873
  {
  	bool rw = bio_data_dir(bio);
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
874
  	u64 bytes_allowed, extra_bytes, tmp;
8e89d13f4   Vivek Goyal   blkio: Implementa...
875
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
876
  	unsigned int bio_size = throtl_bio_data_size(bio);
e43473b7f   Vivek Goyal   blkio: Core imple...
877
878
879
880
881
  
  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  
  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
297e3d854   Shaohua Li   blk-throttle: mak...
882
  		jiffy_elapsed_rnd = tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
883

297e3d854   Shaohua Li   blk-throttle: mak...
884
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
e43473b7f   Vivek Goyal   blkio: Core imple...
885

9f626e372   Shaohua Li   blk-throttle: pre...
886
  	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
5e901a2b9   Vivek Goyal   blkio-throttle: T...
887
  	do_div(tmp, HZ);
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
888
  	bytes_allowed = tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
889

ea0ea2bc6   Shaohua Li   blk-throttle: cap...
890
  	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
e43473b7f   Vivek Goyal   blkio: Core imple...
891
892
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
893
  		return true;
e43473b7f   Vivek Goyal   blkio: Core imple...
894
895
896
  	}
  
  	/* Calc approx time to dispatch */
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
897
  	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
9f626e372   Shaohua Li   blk-throttle: pre...
898
  	jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
e43473b7f   Vivek Goyal   blkio: Core imple...
899
900
901
902
903
904
905
906
907
  
  	if (!jiffy_wait)
  		jiffy_wait = 1;
  
  	/*
  	 * This wait time is without taking into consideration the rounding
  	 * up we did. Add that time also.
  	 */
  	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
e43473b7f   Vivek Goyal   blkio: Core imple...
908
909
  	if (wait)
  		*wait = jiffy_wait;
8e89d13f4   Vivek Goyal   blkio: Implementa...
910
911
912
913
914
915
916
  	return 0;
  }
  
  /*
   * Returns whether one can dispatch a bio or not. Also returns approx number
   * of jiffies to wait before this bio is with-in IO rate and can be dispatched
   */
0f3457f60   Tejun Heo   blk-throttle: add...
917
918
  static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  			    unsigned long *wait)
8e89d13f4   Vivek Goyal   blkio: Implementa...
919
920
921
922
923
924
925
926
927
928
  {
  	bool rw = bio_data_dir(bio);
  	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  
  	/*
   	 * Currently whole state machine of group depends on first bio
  	 * queued in the group bio list. So one should not be calling
  	 * this function with a different bio if there are other bios
  	 * queued.
  	 */
73f0d49a9   Tejun Heo   blk-throttle: mov...
929
  	BUG_ON(tg->service_queue.nr_queued[rw] &&
c5cc2070b   Tejun Heo   blk-throttle: add...
930
  	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
e43473b7f   Vivek Goyal   blkio: Core imple...
931

8e89d13f4   Vivek Goyal   blkio: Implementa...
932
  	/* If tg->bps = -1, then BW is unlimited */
9f626e372   Shaohua Li   blk-throttle: pre...
933
934
  	if (tg_bps_limit(tg, rw) == U64_MAX &&
  	    tg_iops_limit(tg, rw) == UINT_MAX) {
8e89d13f4   Vivek Goyal   blkio: Implementa...
935
936
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
937
  		return true;
8e89d13f4   Vivek Goyal   blkio: Implementa...
938
939
940
941
942
  	}
  
  	/*
  	 * If previous slice expired, start a new one otherwise renew/extend
  	 * existing slice to make sure it is at least throtl_slice interval
164c80ed8   Vivek Goyal   blk-throttle: Ext...
943
944
945
  	 * long since now. New slice is started only for empty throttle group.
  	 * If there is queued bio, that means there should be an active
  	 * slice and it should be extended instead.
8e89d13f4   Vivek Goyal   blkio: Implementa...
946
  	 */
164c80ed8   Vivek Goyal   blk-throttle: Ext...
947
  	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
0f3457f60   Tejun Heo   blk-throttle: add...
948
  		throtl_start_new_slice(tg, rw);
8e89d13f4   Vivek Goyal   blkio: Implementa...
949
  	else {
297e3d854   Shaohua Li   blk-throttle: mak...
950
951
952
953
  		if (time_before(tg->slice_end[rw],
  		    jiffies + tg->td->throtl_slice))
  			throtl_extend_slice(tg, rw,
  				jiffies + tg->td->throtl_slice);
8e89d13f4   Vivek Goyal   blkio: Implementa...
954
  	}
0f3457f60   Tejun Heo   blk-throttle: add...
955
956
  	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
  	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
8e89d13f4   Vivek Goyal   blkio: Implementa...
957
958
959
960
961
962
963
964
965
966
967
  		if (wait)
  			*wait = 0;
  		return 1;
  	}
  
  	max_wait = max(bps_wait, iops_wait);
  
  	if (wait)
  		*wait = max_wait;
  
  	if (time_before(tg->slice_end[rw], jiffies + max_wait))
0f3457f60   Tejun Heo   blk-throttle: add...
968
  		throtl_extend_slice(tg, rw, jiffies + max_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
969
970
971
972
973
974
975
  
  	return 0;
  }
  
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
  	bool rw = bio_data_dir(bio);
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
976
  	unsigned int bio_size = throtl_bio_data_size(bio);
e43473b7f   Vivek Goyal   blkio: Core imple...
977
978
  
  	/* Charge the bio to the group */
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
979
  	tg->bytes_disp[rw] += bio_size;
8e89d13f4   Vivek Goyal   blkio: Implementa...
980
  	tg->io_disp[rw]++;
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
981
  	tg->last_bytes_disp[rw] += bio_size;
3f0abd806   Shaohua Li   blk-throttle: add...
982
  	tg->last_io_disp[rw]++;
e43473b7f   Vivek Goyal   blkio: Core imple...
983

2a0f61e6e   Tejun Heo   blk-throttle: set...
984
  	/*
8d2bbd4c8   Christoph Hellwig   block: replace RE...
985
  	 * BIO_THROTTLED is used to prevent the same bio to be throttled
2a0f61e6e   Tejun Heo   blk-throttle: set...
986
987
988
  	 * more than once as a throttled bio will go through blk-throtl the
  	 * second time when it eventually gets issued.  Set it when a bio
  	 * is being charged to a tg.
2a0f61e6e   Tejun Heo   blk-throttle: set...
989
  	 */
8d2bbd4c8   Christoph Hellwig   block: replace RE...
990
991
  	if (!bio_flagged(bio, BIO_THROTTLED))
  		bio_set_flag(bio, BIO_THROTTLED);
e43473b7f   Vivek Goyal   blkio: Core imple...
992
  }
c5cc2070b   Tejun Heo   blk-throttle: add...
993
994
995
996
997
998
999
1000
1001
1002
1003
  /**
   * throtl_add_bio_tg - add a bio to the specified throtl_grp
   * @bio: bio to add
   * @qn: qnode to use
   * @tg: the target throtl_grp
   *
   * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
   * tg->qnode_on_self[] is used.
   */
  static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  			      struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
1004
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1005
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1006
  	bool rw = bio_data_dir(bio);
c5cc2070b   Tejun Heo   blk-throttle: add...
1007
1008
  	if (!qn)
  		qn = &tg->qnode_on_self[rw];
0e9f4164b   Tejun Heo   blk-throttle: gen...
1009
1010
1011
1012
1013
1014
1015
1016
  	/*
  	 * If @tg doesn't currently have any bios queued in the same
  	 * direction, queueing @bio can change when @tg should be
  	 * dispatched.  Mark that @tg was empty.  This is automatically
  	 * cleaered on the next tg_update_disptime().
  	 */
  	if (!sq->nr_queued[rw])
  		tg->flags |= THROTL_TG_WAS_EMPTY;
c5cc2070b   Tejun Heo   blk-throttle: add...
1017
  	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
73f0d49a9   Tejun Heo   blk-throttle: mov...
1018
  	sq->nr_queued[rw]++;
77216b048   Tejun Heo   blk-throttle: add...
1019
  	throtl_enqueue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1020
  }
77216b048   Tejun Heo   blk-throttle: add...
1021
  static void tg_update_disptime(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
1022
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1023
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1024
1025
  	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  	struct bio *bio;
d609af3a1   Markus Elfring   blk-throttle: Adj...
1026
1027
  	bio = throtl_peek_queued(&sq->queued[READ]);
  	if (bio)
0f3457f60   Tejun Heo   blk-throttle: add...
1028
  		tg_may_dispatch(tg, bio, &read_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
1029

d609af3a1   Markus Elfring   blk-throttle: Adj...
1030
1031
  	bio = throtl_peek_queued(&sq->queued[WRITE]);
  	if (bio)
0f3457f60   Tejun Heo   blk-throttle: add...
1032
  		tg_may_dispatch(tg, bio, &write_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
1033
1034
1035
  
  	min_wait = min(read_wait, write_wait);
  	disptime = jiffies + min_wait;
e43473b7f   Vivek Goyal   blkio: Core imple...
1036
  	/* Update dispatch time */
77216b048   Tejun Heo   blk-throttle: add...
1037
  	throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1038
  	tg->disptime = disptime;
77216b048   Tejun Heo   blk-throttle: add...
1039
  	throtl_enqueue_tg(tg);
0e9f4164b   Tejun Heo   blk-throttle: gen...
1040
1041
1042
  
  	/* see throtl_add_bio_tg() */
  	tg->flags &= ~THROTL_TG_WAS_EMPTY;
e43473b7f   Vivek Goyal   blkio: Core imple...
1043
  }
32ee5bc47   Vivek Goyal   blk-throttle: Acc...
1044
1045
1046
1047
1048
1049
1050
1051
1052
  /*
   * If @parent_tg's @rw slice has been used up, start a new one for it,
   * offering @child_tg's slice start as the new slice start.
   */
  static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  					struct throtl_grp *parent_tg, bool rw)
  {
  	if (!throtl_slice_used(parent_tg, rw))
  		return;

  	throtl_start_new_slice_with_credit(parent_tg, rw,
  					   child_tg->slice_start[rw]);
  }
77216b048   Tejun Heo   blk-throttle: add...
1053
  /*
   * Move the first queued @rw bio of @tg one level up: charge it to @tg and
   * hand it either to the parent throtl_grp or, when there is no parent
   * group, to the parent service queue's queued[] list.
   */
  static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct throtl_service_queue *parent_sq = sq->parent_sq;
  	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  	struct throtl_qnode *qn = &tg->qnode_on_parent[rw];
  	struct throtl_grp *put_tg = NULL;
  	struct bio *bio;

  	/*
  	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
  	 * from @tg may put its reference and @parent_sq might end up
  	 * getting released prematurely.  Remember the tg to put and put it
  	 * after @bio is transferred to @parent_sq.
  	 */
  	bio = throtl_pop_queued(&sq->queued[rw], &put_tg);
  	sq->nr_queued[rw]--;

  	throtl_charge_bio(tg, bio);

  	/*
  	 * If our parent is @td->service_queue, @bio is ready to be issued.
  	 * Put it on its bio_lists[] and decrease the total number queued;
  	 * the caller is responsible for issuing these bios.  If our parent
  	 * is another tg, we just need to transfer @bio to the parent using
  	 * throtl_add_bio_tg().
  	 */
  	if (!parent_tg) {
  		throtl_qnode_add_bio(bio, qn, &parent_sq->queued[rw]);
  		BUG_ON(tg->td->nr_queued[rw] <= 0);
  		tg->td->nr_queued[rw]--;
  	} else {
  		throtl_add_bio_tg(bio, qn, parent_tg);
  		start_parent_slice_with_credit(tg, parent_tg, rw);
  	}

  	throtl_trim_slice(tg, rw);

  	/* the transfer is complete, the deferred reference can go now */
  	if (put_tg)
  		blkg_put(tg_to_blkg(put_tg));
  }
77216b048   Tejun Heo   blk-throttle: add...
1093
  static int throtl_dispatch_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
1094
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1095
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1096
1097
  	unsigned int nr_reads = 0, nr_writes = 0;
  	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
c2f6805d4   Vivek Goyal   blk-throttle: Fix...
1098
  	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
e43473b7f   Vivek Goyal   blkio: Core imple...
1099
1100
1101
  	struct bio *bio;
  
  	/* Try to dispatch 75% READS and 25% WRITES */
c5cc2070b   Tejun Heo   blk-throttle: add...
1102
  	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
0f3457f60   Tejun Heo   blk-throttle: add...
1103
  	       tg_may_dispatch(tg, bio, NULL)) {
e43473b7f   Vivek Goyal   blkio: Core imple...
1104

77216b048   Tejun Heo   blk-throttle: add...
1105
  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7f   Vivek Goyal   blkio: Core imple...
1106
1107
1108
1109
1110
  		nr_reads++;
  
  		if (nr_reads >= max_nr_reads)
  			break;
  	}
c5cc2070b   Tejun Heo   blk-throttle: add...
1111
  	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
0f3457f60   Tejun Heo   blk-throttle: add...
1112
  	       tg_may_dispatch(tg, bio, NULL)) {
e43473b7f   Vivek Goyal   blkio: Core imple...
1113

77216b048   Tejun Heo   blk-throttle: add...
1114
  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7f   Vivek Goyal   blkio: Core imple...
1115
1116
1117
1118
1119
1120
1121
1122
  		nr_writes++;
  
  		if (nr_writes >= max_nr_writes)
  			break;
  	}
  
  	return nr_reads + nr_writes;
  }
651930bc1   Tejun Heo   blk-throttle: dis...
1123
  static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
1124
1125
  {
  	unsigned int nr_disp = 0;
e43473b7f   Vivek Goyal   blkio: Core imple...
1126
1127
  
  	while (1) {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1128
1129
  		struct throtl_grp *tg = throtl_rb_first(parent_sq);
  		struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1130
1131
1132
1133
1134
1135
  
  		if (!tg)
  			break;
  
  		if (time_before(jiffies, tg->disptime))
  			break;
77216b048   Tejun Heo   blk-throttle: add...
1136
  		throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1137

77216b048   Tejun Heo   blk-throttle: add...
1138
  		nr_disp += throtl_dispatch_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1139

73f0d49a9   Tejun Heo   blk-throttle: mov...
1140
  		if (sq->nr_queued[0] || sq->nr_queued[1])
77216b048   Tejun Heo   blk-throttle: add...
1141
  			tg_update_disptime(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1142
1143
1144
1145
1146
1147
1148
  
  		if (nr_disp >= throtl_quantum)
  			break;
  	}
  
  	return nr_disp;
  }
c79892c55   Shaohua Li   blk-throttle: add...
1149
1150
  /*
   * Forward declaration: throtl_pending_timer_fn() below calls this before
   * the definition appears in the file.
   */
  static bool throtl_can_upgrade(struct throtl_data *td,
  	struct throtl_grp *this_tg);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1151
1152
1153
1154
1155
1156
1157
  /**
   * throtl_pending_timer_fn - timer function for service_queue->pending_timer
   * @arg: the throtl_service_queue being serviced
   *
   * This timer is armed when a child throtl_grp with active bio's become
   * pending and queued on the service_queue's pending_tree and expires when
   * the first child throtl_grp should be dispatched.  This function
2e48a530a   Tejun Heo   blk-throttle: mak...
1158
1159
1160
1161
1162
1163
1164
   * dispatches bio's from the children throtl_grps to the parent
   * service_queue.
   *
   * If the parent's parent is another throtl_grp, dispatching is propagated
   * by either arming its pending_timer or repeating dispatch directly.  If
   * the top-level service_tree is reached, throtl_data->dispatch_work is
   * kicked so that the ready bio's are issued.
6e1a5704c   Tejun Heo   blk-throttle: dis...
1165
   */
69df0ab03   Tejun Heo   blk-throttle: sep...
1166
1167
1168
  static void throtl_pending_timer_fn(unsigned long arg)
  {
  	struct throtl_service_queue *sq = (void *)arg;
2e48a530a   Tejun Heo   blk-throttle: mak...
1169
  	struct throtl_grp *tg = sq_to_tg(sq);
69df0ab03   Tejun Heo   blk-throttle: sep...
1170
  	struct throtl_data *td = sq_to_td(sq);
cb76199c3   Tejun Heo   blk-throttle: col...
1171
  	struct request_queue *q = td->queue;
2e48a530a   Tejun Heo   blk-throttle: mak...
1172
1173
  	struct throtl_service_queue *parent_sq;
  	bool dispatched;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1174
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
1175
1176
  
  	spin_lock_irq(q->queue_lock);
c79892c55   Shaohua Li   blk-throttle: add...
1177
1178
  	if (throtl_can_upgrade(td, NULL))
  		throtl_upgrade_state(td);
2e48a530a   Tejun Heo   blk-throttle: mak...
1179
1180
1181
  again:
  	parent_sq = sq->parent_sq;
  	dispatched = false;
e43473b7f   Vivek Goyal   blkio: Core imple...
1182

7f52f98c2   Tejun Heo   blk-throttle: imp...
1183
1184
  	while (true) {
  		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
2e48a530a   Tejun Heo   blk-throttle: mak...
1185
1186
  			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
  			   sq->nr_queued[READ], sq->nr_queued[WRITE]);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1187
1188
1189
  
  		ret = throtl_select_dispatch(sq);
  		if (ret) {
7f52f98c2   Tejun Heo   blk-throttle: imp...
1190
1191
1192
  			throtl_log(sq, "bios disp=%u", ret);
  			dispatched = true;
  		}
e43473b7f   Vivek Goyal   blkio: Core imple...
1193

7f52f98c2   Tejun Heo   blk-throttle: imp...
1194
1195
  		if (throtl_schedule_next_dispatch(sq, false))
  			break;
e43473b7f   Vivek Goyal   blkio: Core imple...
1196

7f52f98c2   Tejun Heo   blk-throttle: imp...
1197
1198
1199
1200
  		/* this dispatch windows is still open, relax and repeat */
  		spin_unlock_irq(q->queue_lock);
  		cpu_relax();
  		spin_lock_irq(q->queue_lock);
651930bc1   Tejun Heo   blk-throttle: dis...
1201
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1202

2e48a530a   Tejun Heo   blk-throttle: mak...
1203
1204
  	if (!dispatched)
  		goto out_unlock;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1205

2e48a530a   Tejun Heo   blk-throttle: mak...
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
  	if (parent_sq) {
  		/* @parent_sq is another throl_grp, propagate dispatch */
  		if (tg->flags & THROTL_TG_WAS_EMPTY) {
  			tg_update_disptime(tg);
  			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
  				/* window is already open, repeat dispatching */
  				sq = parent_sq;
  				tg = sq_to_tg(sq);
  				goto again;
  			}
  		}
  	} else {
  		/* reached the top-level, queue issueing */
  		queue_work(kthrotld_workqueue, &td->dispatch_work);
  	}
  out_unlock:
e43473b7f   Vivek Goyal   blkio: Core imple...
1222
  	spin_unlock_irq(q->queue_lock);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1223
  }
e43473b7f   Vivek Goyal   blkio: Core imple...
1224

6e1a5704c   Tejun Heo   blk-throttle: dis...
1225
1226
1227
1228
1229
1230
1231
1232
  /**
   * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
   * @work: work item being executed
   *
   * This function is queued for execution when bio's reach the bio_lists[]
   * of throtl_data->service_queue.  Those bio's are ready and issued by this
   * function.
   */
8876e140e   Fabian Frederick   block/blk-throttl...
1233
  static void blk_throtl_dispatch_work_fn(struct work_struct *work)
6e1a5704c   Tejun Heo   blk-throttle: dis...
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
  {
  	struct throtl_data *td = container_of(work, struct throtl_data,
  					      dispatch_work);
  	struct throtl_service_queue *td_sq = &td->service_queue;
  	struct request_queue *q = td->queue;
  	struct bio_list bio_list_on_stack;
  	struct bio *bio;
  	struct blk_plug plug;
  	int rw;
  
  	bio_list_init(&bio_list_on_stack);
  
  	spin_lock_irq(q->queue_lock);
c5cc2070b   Tejun Heo   blk-throttle: add...
1247
1248
1249
  	for (rw = READ; rw <= WRITE; rw++)
  		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
  			bio_list_add(&bio_list_on_stack, bio);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1250
1251
1252
  	spin_unlock_irq(q->queue_lock);
  
  	if (!bio_list_empty(&bio_list_on_stack)) {
69d60eb96   Vivek Goyal   blk-throttle: Use...
1253
  		blk_start_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1254
1255
  		while((bio = bio_list_pop(&bio_list_on_stack)))
  			generic_make_request(bio);
69d60eb96   Vivek Goyal   blk-throttle: Use...
1256
  		blk_finish_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1257
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1258
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1259
1260
  static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  			      int off)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1261
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1262
1263
  	struct throtl_grp *tg = pd_to_tg(pd);
  	u64 v = *(u64 *)((void *)tg + off);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1264

2ab5492de   Shaohua Li   blk-throttle: use...
1265
  	if (v == U64_MAX)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1266
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1267
  	return __blkg_prfill_u64(sf, pd, v);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1268
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1269
1270
  static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
  			       int off)
e43473b7f   Vivek Goyal   blkio: Core imple...
1271
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1272
1273
  	struct throtl_grp *tg = pd_to_tg(pd);
  	unsigned int v = *(unsigned int *)((void *)tg + off);
fe0714377   Vivek Goyal   blkio: Recalculat...
1274

2ab5492de   Shaohua Li   blk-throttle: use...
1275
  	if (v == UINT_MAX)
af133ceb2   Tejun Heo   blkcg: move blkio...
1276
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1277
  	return __blkg_prfill_u64(sf, pd, v);
e43473b7f   Vivek Goyal   blkio: Core imple...
1278
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1279
  static int tg_print_conf_u64(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1280
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1281
1282
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1283
  	return 0;
8e89d13f4   Vivek Goyal   blkio: Implementa...
1284
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1285
  static int tg_print_conf_uint(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1286
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1287
1288
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1289
  	return 0;
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1290
  }
9bb67aeb9   Shaohua Li   blk-throttle: res...
1291
  /*
   * tg_conf_updated - apply a changed throttle configuration
   * @tg: group whose limits were just modified
   * @global: walk the whole hierarchy from the queue's root blkg instead of
   *          only @tg's subtree
   *
   * Logs the new limits, refreshes has_rules[] (and, on the default
   * hierarchy, the inherited idle/latency settings) for the affected
   * subtree, restarts @tg's slices and reschedules dispatch when @tg is
   * pending.
   */
  static void tg_conf_updated(struct throtl_grp *tg, bool global)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *walk_root;
  	struct blkcg_gq *blkg;

  	throtl_log(&tg->service_queue,
  		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
  		   tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
  		   tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));

  	if (global)
  		walk_root = tg->td->queue->root_blkg;
  	else
  		walk_root = tg_to_blkg(tg);

  	/*
  	 * Refresh has_rules[] across the walked subtree.  A tg counts as
  	 * having rules when it or any ancestor has them; groups without any
  	 * restriction anywhere in the hierarchy may then bypass
  	 * blk-throttle.
  	 */
  	blkg_for_each_descendant_pre(blkg, pos_css, walk_root) {
  		struct throtl_grp *this_tg = blkg_to_tg(blkg);
  		struct throtl_grp *parent_tg;

  		tg_update_has_rules(this_tg);

  		/* root and second-level groups do not inherit anything */
  		if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
  			continue;
  		if (!blkg->parent || !blkg->parent->parent)
  			continue;

  		/*
  		 * A child never gets a higher idle time threshold nor a
  		 * lower latency target than its parent.
  		 */
  		parent_tg = blkg_to_tg(blkg->parent);
  		this_tg->idletime_threshold = min(this_tg->idletime_threshold,
  				parent_tg->idletime_threshold);
  		this_tg->latency_target = max(this_tg->latency_target,
  				parent_tg->latency_target);
  	}

  	/*
  	 * queue_lock is already held and @tg is known to be valid, so the
  	 * new config is applied directly.
  	 *
  	 * Both the READ and WRITE slices are restarted: if a limit was
  	 * lowered abruptly, IO dispatched just before must not be charged
  	 * against the new, lower rate.
  	 */
  	throtl_start_new_slice(tg, 0);
  	throtl_start_new_slice(tg, 1);

  	if (!(tg->flags & THROTL_TG_PENDING))
  		return;

  	tg_update_disptime(tg);
  	throtl_schedule_next_dispatch(sq->parent_sq, true);
  }
  
  /*
   * tg_set_conf - common write handler for the legacy per-device limit files
   * @of: open file; its cftype's ->private holds the byte offset of the
   *      target field inside struct throtl_grp
   * @buf: user input, parsed by blkg_conf_prep() to find the target blkg
   * @nbytes: length of @buf, returned on success
   * @off: file offset (unused)
   * @is_u64: true when the target field is a u64, false for unsigned int
   *
   * An input value of 0 is stored as U64_MAX (UINT_MAX for unsigned int
   * fields).  Values too large for an unsigned int field are clamped to
   * UINT_MAX rather than being truncated to their low 32 bits, matching how
   * tg_set_limit() treats riops/wiops.
   *
   * Returns @nbytes on success, the blkg_conf_prep() error, or -EINVAL when
   * no number can be parsed.
   */
  static ssize_t tg_set_conf(struct kernfs_open_file *of,
  			   char *buf, size_t nbytes, loff_t off, bool is_u64)
  {
  	struct blkcg *blkcg = css_to_blkcg(of_css(of));
  	struct blkg_conf_ctx ctx;
  	struct throtl_grp *tg;
  	int ret;
  	u64 v;

  	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
  	if (ret)
  		return ret;

  	ret = -EINVAL;
  	if (sscanf(ctx.body, "%llu", &v) != 1)
  		goto out_finish;
  	if (!v)
  		v = U64_MAX;

  	tg = blkg_to_tg(ctx.blkg);

  	if (is_u64)
  		*(u64 *)((void *)tg + of_cft(of)->private) = v;
  	else
  		/* clamp: a plain assignment would keep only the low 32 bits */
  		*(unsigned int *)((void *)tg + of_cft(of)->private) =
  			min_t(u64, v, UINT_MAX);

  	tg_conf_updated(tg, false);
  	ret = 0;
  out_finish:
  	blkg_conf_finish(&ctx);
  	return ret ?: nbytes;
  }
451af504d   Tejun Heo   cgroup: replace c...
1379
1380
  /* Write handler for u64 limit files; forwards to tg_set_conf(). */
  static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
  			       char *buf, size_t nbytes, loff_t off)
  {
  	const bool is_u64 = true;

  	return tg_set_conf(of, buf, nbytes, off, is_u64);
  }
451af504d   Tejun Heo   cgroup: replace c...
1384
1385
  /* Write handler for unsigned int limit files; forwards to tg_set_conf(). */
  static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	const bool is_u64 = false;

  	return tg_set_conf(of, buf, nbytes, off, is_u64);
  }
880f50e22   Tejun Heo   blkcg: mark exist...
1389
  static struct cftype throtl_legacy_files[] = {
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1390
1391
  	{
  		.name = "throttle.read_bps_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1392
  		.private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1393
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1394
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1395
1396
1397
  	},
  	{
  		.name = "throttle.write_bps_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1398
  		.private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1399
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1400
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1401
1402
1403
  	},
  	{
  		.name = "throttle.read_iops_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1404
  		.private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1405
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1406
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1407
1408
1409
  	},
  	{
  		.name = "throttle.write_iops_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1410
  		.private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1411
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1412
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1413
1414
1415
  	},
  	{
  		.name = "throttle.io_service_bytes",
77ea73388   Tejun Heo   blkcg: move io_se...
1416
1417
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_bytes,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1418
1419
  	},
  	{
1b9137ac5   weiping zhang   blk-throttle: exp...
1420
1421
1422
1423
1424
  		.name = "throttle.io_service_bytes_recursive",
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_bytes_recursive,
  	},
  	{
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1425
  		.name = "throttle.io_serviced",
77ea73388   Tejun Heo   blkcg: move io_se...
1426
1427
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_ios,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1428
  	},
1b9137ac5   weiping zhang   blk-throttle: exp...
1429
1430
1431
1432
1433
  	{
  		.name = "throttle.io_serviced_recursive",
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_ios_recursive,
  	},
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1434
1435
  	{ }	/* terminate */
  };
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1436
  static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
2ee867dcf   Tejun Heo   blkcg: implement ...
1437
1438
1439
1440
1441
  			 int off)
  {
  	struct throtl_grp *tg = pd_to_tg(pd);
  	const char *dname = blkg_dev_name(pd->blkg);
  	char bufs[4][21] = { "max", "max", "max", "max" };
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1442
1443
  	u64 bps_dft;
  	unsigned int iops_dft;
ada75b6e5   Shaohua Li   blk-throttle: add...
1444
  	char idle_time[26] = "";
ec80991d6   Shaohua Li   blk-throttle: add...
1445
  	char latency_time[26] = "";
2ee867dcf   Tejun Heo   blkcg: implement ...
1446
1447
1448
  
  	if (!dname)
  		return 0;
9f626e372   Shaohua Li   blk-throttle: pre...
1449

cd5ab1b0f   Shaohua Li   blk-throttle: add...
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
  	if (off == LIMIT_LOW) {
  		bps_dft = 0;
  		iops_dft = 0;
  	} else {
  		bps_dft = U64_MAX;
  		iops_dft = UINT_MAX;
  	}
  
  	if (tg->bps_conf[READ][off] == bps_dft &&
  	    tg->bps_conf[WRITE][off] == bps_dft &&
  	    tg->iops_conf[READ][off] == iops_dft &&
ada75b6e5   Shaohua Li   blk-throttle: add...
1461
  	    tg->iops_conf[WRITE][off] == iops_dft &&
ec80991d6   Shaohua Li   blk-throttle: add...
1462
  	    (off != LIMIT_LOW ||
b4f428ef2   Shaohua Li   blk-throttle: for...
1463
  	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
5b81fc3cc   Shaohua Li   blk-throttle: add...
1464
  	      tg->latency_target_conf == DFL_LATENCY_TARGET)))
2ee867dcf   Tejun Heo   blkcg: implement ...
1465
  		return 0;
9bb67aeb9   Shaohua Li   blk-throttle: res...
1466
  	if (tg->bps_conf[READ][off] != U64_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1467
  		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1468
  			tg->bps_conf[READ][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1469
  	if (tg->bps_conf[WRITE][off] != U64_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1470
  		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1471
  			tg->bps_conf[WRITE][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1472
  	if (tg->iops_conf[READ][off] != UINT_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1473
  		snprintf(bufs[2], sizeof(bufs[2]), "%u",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1474
  			tg->iops_conf[READ][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1475
  	if (tg->iops_conf[WRITE][off] != UINT_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1476
  		snprintf(bufs[3], sizeof(bufs[3]), "%u",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1477
  			tg->iops_conf[WRITE][off]);
ada75b6e5   Shaohua Li   blk-throttle: add...
1478
  	if (off == LIMIT_LOW) {
5b81fc3cc   Shaohua Li   blk-throttle: add...
1479
  		if (tg->idletime_threshold_conf == ULONG_MAX)
ada75b6e5   Shaohua Li   blk-throttle: add...
1480
1481
1482
  			strcpy(idle_time, " idle=max");
  		else
  			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
5b81fc3cc   Shaohua Li   blk-throttle: add...
1483
  				tg->idletime_threshold_conf);
ec80991d6   Shaohua Li   blk-throttle: add...
1484

5b81fc3cc   Shaohua Li   blk-throttle: add...
1485
  		if (tg->latency_target_conf == ULONG_MAX)
ec80991d6   Shaohua Li   blk-throttle: add...
1486
1487
1488
  			strcpy(latency_time, " latency=max");
  		else
  			snprintf(latency_time, sizeof(latency_time),
5b81fc3cc   Shaohua Li   blk-throttle: add...
1489
  				" latency=%lu", tg->latency_target_conf);
ada75b6e5   Shaohua Li   blk-throttle: add...
1490
  	}
2ee867dcf   Tejun Heo   blkcg: implement ...
1491

ec80991d6   Shaohua Li   blk-throttle: add...
1492
1493
1494
1495
  	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s
  ",
  		   dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
  		   latency_time);
2ee867dcf   Tejun Heo   blkcg: implement ...
1496
1497
  	return 0;
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1498
  static int tg_print_limit(struct seq_file *sf, void *v)
2ee867dcf   Tejun Heo   blkcg: implement ...
1499
  {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1500
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
2ee867dcf   Tejun Heo   blkcg: implement ...
1501
1502
1503
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
  	return 0;
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1504
  /*
   * tg_set_limit - write handler for the "low" and "max" files
   * @of: open file; its cftype's ->private is the limit index (LIMIT_LOW or
   *      LIMIT_MAX)
   * @buf: user input, parsed by blkg_conf_prep(); the remaining body is a
   *       whitespace-separated list of "key=value" tokens
   * @nbytes: length of @buf, returned on success
   * @off: file write offset (unused)
   *
   * Accepted keys are rbps, wbps, riops and wiops, plus idle and latency
   * when writing the LIMIT_LOW file.  A value is a decimal number or "max";
   * riops/wiops are clamped to UINT_MAX.  Keys that are not mentioned keep
   * their current value.
   *
   * The idle/latency keys are gated on the limit index taken from the
   * cftype, not on @off: @off is the position of the write within the file
   * and says nothing about which limit is being configured.
   *
   * Returns @nbytes on success, the blkg_conf_prep() error, -EINVAL for a
   * malformed token or unknown key, or -ERANGE for a value of 0.
   */
  static ssize_t tg_set_limit(struct kernfs_open_file *of,
  			  char *buf, size_t nbytes, loff_t off)
  {
  	struct blkcg *blkcg = css_to_blkcg(of_css(of));
  	struct blkg_conf_ctx ctx;
  	struct throtl_grp *tg;
  	u64 v[4];
  	unsigned long idle_time;
  	unsigned long latency_time;
  	int ret;
  	int index = of_cft(of)->private;

  	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
  	if (ret)
  		return ret;

  	tg = blkg_to_tg(ctx.blkg);

  	/* start from the current configuration so unspecified keys persist */
  	v[0] = tg->bps_conf[READ][index];
  	v[1] = tg->bps_conf[WRITE][index];
  	v[2] = tg->iops_conf[READ][index];
  	v[3] = tg->iops_conf[WRITE][index];

  	idle_time = tg->idletime_threshold_conf;
  	latency_time = tg->latency_target_conf;
  	while (true) {
  		char tok[27];	/* wiops=18446744073709551616 */
  		char *p;
  		u64 val = U64_MAX;
  		int len;

  		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
  			break;
  		if (tok[0] == '\0')
  			break;
  		ctx.body += len;

  		/* split "key=value": tok keeps the key, p points at the value */
  		ret = -EINVAL;
  		p = tok;
  		strsep(&p, "=");
  		/* "max" leaves val at its U64_MAX initializer */
  		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
  			goto out_finish;

  		ret = -ERANGE;
  		if (!val)
  			goto out_finish;

  		ret = -EINVAL;
  		if (!strcmp(tok, "rbps"))
  			v[0] = val;
  		else if (!strcmp(tok, "wbps"))
  			v[1] = val;
  		else if (!strcmp(tok, "riops"))
  			v[2] = min_t(u64, val, UINT_MAX);
  		else if (!strcmp(tok, "wiops"))
  			v[3] = min_t(u64, val, UINT_MAX);
  		else if (index == LIMIT_LOW && !strcmp(tok, "idle"))
  			idle_time = val;
  		else if (index == LIMIT_LOW && !strcmp(tok, "latency"))
  			latency_time = val;
  		else
  			goto out_finish;
  	}

  	tg->bps_conf[READ][index] = v[0];
  	tg->bps_conf[WRITE][index] = v[1];
  	tg->iops_conf[READ][index] = v[2];
  	tg->iops_conf[WRITE][index] = v[3];

  	if (index == LIMIT_MAX) {
  		tg->bps[READ][index] = v[0];
  		tg->bps[WRITE][index] = v[1];
  		tg->iops[READ][index] = v[2];
  		tg->iops[WRITE][index] = v[3];
  	}
  	/* the effective low limit never exceeds the configured max limit */
  	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
  		tg->bps_conf[READ][LIMIT_MAX]);
  	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
  		tg->bps_conf[WRITE][LIMIT_MAX]);
  	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
  		tg->iops_conf[READ][LIMIT_MAX]);
  	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
  		tg->iops_conf[WRITE][LIMIT_MAX]);
  	tg->idletime_threshold_conf = idle_time;
  	tg->latency_target_conf = latency_time;

  	/* force user to configure all settings for low limit  */
  	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
  	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
  	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
  	    tg->latency_target_conf == DFL_LATENCY_TARGET) {
  		tg->bps[READ][LIMIT_LOW] = 0;
  		tg->bps[WRITE][LIMIT_LOW] = 0;
  		tg->iops[READ][LIMIT_LOW] = 0;
  		tg->iops[WRITE][LIMIT_LOW] = 0;
  		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
  		tg->latency_target = DFL_LATENCY_TARGET;
  	} else if (index == LIMIT_LOW) {
  		tg->idletime_threshold = tg->idletime_threshold_conf;
  		tg->latency_target = tg->latency_target_conf;
  	}

  	blk_throtl_update_limit_valid(tg->td);
  	if (tg->td->limit_valid[LIMIT_LOW]) {
  		if (index == LIMIT_LOW)
  			tg->td->limit_index = LIMIT_LOW;
  	} else
  		tg->td->limit_index = LIMIT_MAX;
  	tg_conf_updated(tg, index == LIMIT_LOW &&
  		tg->td->limit_valid[LIMIT_LOW]);
  	ret = 0;
  out_finish:
  	blkg_conf_finish(&ctx);
  	return ret ?: nbytes;
  }
  
  static struct cftype throtl_files[] = {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1619
1620
1621
1622
1623
1624
1625
1626
1627
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  	{
  		.name = "low",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = tg_print_limit,
  		.write = tg_set_limit,
  		.private = LIMIT_LOW,
  	},
  #endif
2ee867dcf   Tejun Heo   blkcg: implement ...
1628
1629
1630
  	{
  		.name = "max",
  		.flags = CFTYPE_NOT_ON_ROOT,
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1631
1632
1633
  		.seq_show = tg_print_limit,
  		.write = tg_set_limit,
  		.private = LIMIT_MAX,
2ee867dcf   Tejun Heo   blkcg: implement ...
1634
1635
1636
  	},
  	{ }	/* terminate */
  };
da5277700   Vivek Goyal   block: Move blk_t...
1637
  static void throtl_shutdown_wq(struct request_queue *q)
e43473b7f   Vivek Goyal   blkio: Core imple...
1638
1639
  {
  	struct throtl_data *td = q->td;
69df0ab03   Tejun Heo   blk-throttle: sep...
1640
  	cancel_work_sync(&td->dispatch_work);
e43473b7f   Vivek Goyal   blkio: Core imple...
1641
  }
3c798398e   Tejun Heo   blkcg: mass renam...
1642
  static struct blkcg_policy blkcg_policy_throtl = {
2ee867dcf   Tejun Heo   blkcg: implement ...
1643
  	.dfl_cftypes		= throtl_files,
880f50e22   Tejun Heo   blkcg: mark exist...
1644
  	.legacy_cftypes		= throtl_legacy_files,
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1645

001bea73e   Tejun Heo   blkcg: replace bl...
1646
  	.pd_alloc_fn		= throtl_pd_alloc,
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1647
  	.pd_init_fn		= throtl_pd_init,
693e751e7   Tejun Heo   blk-throttle: imp...
1648
  	.pd_online_fn		= throtl_pd_online,
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1649
  	.pd_offline_fn		= throtl_pd_offline,
001bea73e   Tejun Heo   blkcg: replace bl...
1650
  	.pd_free_fn		= throtl_pd_free,
e43473b7f   Vivek Goyal   blkio: Core imple...
1651
  };
3f0abd806   Shaohua Li   blk-throttle: add...
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
  static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
  {
  	unsigned long rtime = jiffies, wtime = jiffies;
  
  	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
  		rtime = tg->last_low_overflow_time[READ];
  	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
  		wtime = tg->last_low_overflow_time[WRITE];
  	return min(rtime, wtime);
  }
  
  /*
   * tg_last_low_overflow_time - latest low-limit overflow time along a path
   * @tg: starting group; should not be an intermediate node
   *
   * Starts from @tg's own __tg_last_low_overflow_time() and walks up through
   * the parent service queues, keeping the latest (per time_after()) value
   * seen.  Ancestors without any low limit are skipped.
   *
   * Each ancestor's time is evaluated once: the helper may read jiffies, so
   * calling it separately for the comparison and for the assignment could
   * store a different value than the one that was compared.
   */
  static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *parent_sq;
  	struct throtl_grp *parent = tg;
  	unsigned long ret = __tg_last_low_overflow_time(tg);

  	while (true) {
  		unsigned long parent_time;

  		parent_sq = parent->service_queue.parent_sq;
  		parent = sq_to_tg(parent_sq);
  		if (!parent)
  			break;

  		/*
  		 * The parent doesn't have low limit, it always reaches low
  		 * limit. Its overflow time is useless for children
  		 */
  		if (!parent->bps[READ][LIMIT_LOW] &&
  		    !parent->iops[READ][LIMIT_LOW] &&
  		    !parent->bps[WRITE][LIMIT_LOW] &&
  		    !parent->iops[WRITE][LIMIT_LOW])
  			continue;

  		parent_time = __tg_last_low_overflow_time(parent);
  		if (time_after(parent_time, ret))
  			ret = parent_time;
  	}
  	return ret;
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
1690
1691
1692
1693
1694
  static bool throtl_tg_is_idle(struct throtl_grp *tg)
  {
  	/*
  	 * cgroup is idle if:
  	 * - single idle is too long, longer than a fixed value (in case user
b4f428ef2   Shaohua Li   blk-throttle: for...
1695
  	 *   configure a too big threshold) or 4 times of idletime threshold
9e234eeaf   Shaohua Li   blk-throttle: add...
1696
  	 * - average think time is more than threshold
53696b8d2   Shaohua Li   blk-throttle: add...
1697
  	 * - IO latency is largely below threshold
9e234eeaf   Shaohua Li   blk-throttle: add...
1698
  	 */
b4f428ef2   Shaohua Li   blk-throttle: for...
1699
  	unsigned long time;
4cff729f6   Shaohua Li   blk-throttle: out...
1700
  	bool ret;
9e234eeaf   Shaohua Li   blk-throttle: add...
1701

b4f428ef2   Shaohua Li   blk-throttle: for...
1702
1703
1704
1705
1706
1707
  	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
  	ret = tg->latency_target == DFL_LATENCY_TARGET ||
  	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
  	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
  	      tg->avg_idletime > tg->idletime_threshold ||
  	      (tg->latency_target && tg->bio_cnt &&
53696b8d2   Shaohua Li   blk-throttle: add...
1708
  		tg->bad_bio_cnt * 5 < tg->bio_cnt);
4cff729f6   Shaohua Li   blk-throttle: out...
1709
1710
1711
1712
1713
  	throtl_log(&tg->service_queue,
  		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
  		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
  		tg->bio_cnt, ret, tg->td->scale);
  	return ret;
9e234eeaf   Shaohua Li   blk-throttle: add...
1714
  }
c79892c55   Shaohua Li   blk-throttle: add...
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
  static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	bool read_limit, write_limit;
  
  	/*
  	 * if cgroup reaches low limit (if low limit is 0, the cgroup always
  	 * reaches), it's ok to upgrade to next limit
  	 */
  	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
  	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
  	if (!read_limit && !write_limit)
  		return true;
  	if (read_limit && sq->nr_queued[READ] &&
  	    (!write_limit || sq->nr_queued[WRITE]))
  		return true;
  	if (write_limit && sq->nr_queued[WRITE] &&
  	    (!read_limit || sq->nr_queued[READ]))
  		return true;
aec242468   Shaohua Li   blk-throttle: det...
1734
1735
  
  	if (time_after_eq(jiffies,
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1736
1737
  		tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
  	    throtl_tg_is_idle(tg))
aec242468   Shaohua Li   blk-throttle: det...
1738
  		return true;
c79892c55   Shaohua Li   blk-throttle: add...
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
  	return false;
  }
  
  /*
   * Walk from @tg up through its ancestors and return true as soon as one of
   * them passes throtl_tg_can_upgrade().  The walk stops, returning false,
   * when there is no parent tg or the next tg's blkg has no parent.
   */
  static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
  {
  	do {
  		if (throtl_tg_can_upgrade(tg))
  			return true;
  		tg = sq_to_tg(tg->service_queue.parent_sq);
  	} while (tg && tg_to_blkg(tg)->parent);

  	return false;
  }
  
  static bool throtl_can_upgrade(struct throtl_data *td,
  	struct throtl_grp *this_tg)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
  
  	if (td->limit_index != LIMIT_LOW)
  		return false;
297e3d854   Shaohua Li   blk-throttle: mak...
1762
  	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
3f0abd806   Shaohua Li   blk-throttle: add...
1763
  		return false;
c79892c55   Shaohua Li   blk-throttle: add...
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);
  
  		if (tg == this_tg)
  			continue;
  		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
  			continue;
  		if (!throtl_hierarchy_can_upgrade(tg)) {
  			rcu_read_unlock();
  			return false;
  		}
  	}
  	rcu_read_unlock();
  	return true;
  }
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
  static void throtl_upgrade_check(struct throtl_grp *tg)
  {
  	unsigned long now = jiffies;
  
  	if (tg->td->limit_index != LIMIT_LOW)
  		return;
  
  	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
  		return;
  
  	tg->last_check_time = now;
  
  	if (!time_after_eq(now,
  	     __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
  		return;
  
  	if (throtl_can_upgrade(tg->td, NULL))
  		throtl_upgrade_state(tg->td);
  }
c79892c55   Shaohua Li   blk-throttle: add...
1799
1800
1801
1802
  static void throtl_upgrade_state(struct throtl_data *td)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
4cff729f6   Shaohua Li   blk-throttle: out...
1803
  	throtl_log(&td->service_queue, "upgrade to max");
c79892c55   Shaohua Li   blk-throttle: add...
1804
  	td->limit_index = LIMIT_MAX;
3f0abd806   Shaohua Li   blk-throttle: add...
1805
  	td->low_upgrade_time = jiffies;
7394e31fa   Shaohua Li   blk-throttle: mak...
1806
  	td->scale = 0;
c79892c55   Shaohua Li   blk-throttle: add...
1807
1808
1809
1810
1811
1812
1813
  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);
  		struct throtl_service_queue *sq = &tg->service_queue;
  
  		tg->disptime = jiffies - 1;
  		throtl_select_dispatch(sq);
4f02fb761   Joseph Qi   blk-throttle: fix...
1814
  		throtl_schedule_next_dispatch(sq, true);
c79892c55   Shaohua Li   blk-throttle: add...
1815
1816
1817
  	}
  	rcu_read_unlock();
  	throtl_select_dispatch(&td->service_queue);
4f02fb761   Joseph Qi   blk-throttle: fix...
1818
  	throtl_schedule_next_dispatch(&td->service_queue, true);
c79892c55   Shaohua Li   blk-throttle: add...
1819
1820
  	queue_work(kthrotld_workqueue, &td->dispatch_work);
  }
3f0abd806   Shaohua Li   blk-throttle: add...
1821
1822
  /*
   * Step @td towards limit index @new.  The scale is halved first; while it
   * is still non-zero only low_upgrade_time is moved back by
   * scale * throtl_slice and the limit index is left alone.  Once the scale
   * reaches zero the index is switched to @new and the downgrade time is
   * recorded.
   */
  static void throtl_downgrade_state(struct throtl_data *td, int new)
  {
  	td->scale /= 2;
  	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);

  	if (!td->scale) {
  		td->limit_index = new;
  		td->low_downgrade_time = jiffies;
  	} else {
  		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
  	}
  }
  
  static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
  {
  	struct throtl_data *td = tg->td;
  	unsigned long now = jiffies;
  
  	/*
  	 * If cgroup is below low limit, consider downgrade and throttle other
  	 * cgroups
  	 */
297e3d854   Shaohua Li   blk-throttle: mak...
1842
1843
  	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
  	    time_after_eq(now, tg_last_low_overflow_time(tg) +
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1844
1845
1846
  					td->throtl_slice) &&
  	    (!throtl_tg_is_idle(tg) ||
  	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
3f0abd806   Shaohua Li   blk-throttle: add...
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
  		return true;
  	return false;
  }
  
  /*
   * Walk from @tg towards the root; every group visited must pass
   * throtl_tg_can_downgrade() for the result to be true.  The walk stops once
   * sq_to_tg() yields NULL or the next group's blkg has no parent.
   */
  static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
  {
  	do {
  		if (!throtl_tg_can_downgrade(tg))
  			return false;
  		tg = sq_to_tg(tg->service_queue.parent_sq);
  	} while (tg && tg_to_blkg(tg)->parent);

  	return true;
  }
  
  static void throtl_downgrade_check(struct throtl_grp *tg)
  {
  	uint64_t bps;
  	unsigned int iops;
  	unsigned long elapsed_time;
  	unsigned long now = jiffies;
  
  	if (tg->td->limit_index != LIMIT_MAX ||
  	    !tg->td->limit_valid[LIMIT_LOW])
  		return;
  	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
  		return;
297e3d854   Shaohua Li   blk-throttle: mak...
1875
  	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
3f0abd806   Shaohua Li   blk-throttle: add...
1876
1877
1878
1879
  		return;
  
  	elapsed_time = now - tg->last_check_time;
  	tg->last_check_time = now;
297e3d854   Shaohua Li   blk-throttle: mak...
1880
1881
  	if (time_before(now, tg_last_low_overflow_time(tg) +
  			tg->td->throtl_slice))
3f0abd806   Shaohua Li   blk-throttle: add...
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
  		return;
  
  	if (tg->bps[READ][LIMIT_LOW]) {
  		bps = tg->last_bytes_disp[READ] * HZ;
  		do_div(bps, elapsed_time);
  		if (bps >= tg->bps[READ][LIMIT_LOW])
  			tg->last_low_overflow_time[READ] = now;
  	}
  
  	if (tg->bps[WRITE][LIMIT_LOW]) {
  		bps = tg->last_bytes_disp[WRITE] * HZ;
  		do_div(bps, elapsed_time);
  		if (bps >= tg->bps[WRITE][LIMIT_LOW])
  			tg->last_low_overflow_time[WRITE] = now;
  	}
  
  	if (tg->iops[READ][LIMIT_LOW]) {
  		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
  		if (iops >= tg->iops[READ][LIMIT_LOW])
  			tg->last_low_overflow_time[READ] = now;
  	}
  
  	if (tg->iops[WRITE][LIMIT_LOW]) {
  		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
  		if (iops >= tg->iops[WRITE][LIMIT_LOW])
  			tg->last_low_overflow_time[WRITE] = now;
  	}
  
  	/*
  	 * If cgroup is below low limit, consider downgrade and throttle other
  	 * cgroups
  	 */
  	if (throtl_hierarchy_can_downgrade(tg))
  		throtl_downgrade_state(tg->td, LIMIT_LOW);
  
  	tg->last_bytes_disp[READ] = 0;
  	tg->last_bytes_disp[WRITE] = 0;
  	tg->last_io_disp[READ] = 0;
  	tg->last_io_disp[WRITE] = 0;
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
  /*
   * Fold the gap between @tg's last recorded finish time and now into
   * tg->avg_idletime as a 7/8-weighted running average.  Times are
   * ktime_get_ns() values shifted right by 10.  Nothing is updated when no
   * finish time is recorded (0), when it is not in the past, or when this
   * finish time has already been accounted.
   */
  static void blk_throtl_update_idletime(struct throtl_grp *tg)
  {
  	unsigned long now = ktime_get_ns() >> 10;
  	unsigned long last_finish_time = tg->last_finish_time;

  	if (now <= last_finish_time)
  		return;
  	if (last_finish_time == 0)
  		return;
  	/* each finish time contributes to the average only once */
  	if (last_finish_time == tg->checked_last_finish_time)
  		return;

  	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
  	tg->checked_last_finish_time = last_finish_time;
  }
b9147dd1b   Shaohua Li   blk-throttle: add...
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  /*
   * Refresh td->avg_buckets[] from the per-cpu latency samples.
   *
   * Runs only for non-rotational queues and at most once per HZ jiffies.
   * Pass 1 drains every cpu's counters into td->tmp_buckets[] and, for
   * buckets that have collected at least 32 samples, computes a mean latency.
   * Pass 2 blends each new mean into the stored average (7/8 old, 1/8 new;
   * the first valid sample is taken as is) and keeps the stored latencies
   * non-decreasing across bucket indices.  The result is logged.
   */
  static void throtl_update_latency_buckets(struct throtl_data *td)
  {
  	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
  	unsigned long last_latency = 0;
  	unsigned long latency;
  	int i, cpu;

  	if (!blk_queue_nonrot(td->queue))
  		return;
  	if (time_before(jiffies, td->last_calculate_time + HZ))
  		return;
  	td->last_calculate_time = jiffies;

  	memset(avg_latency, 0, sizeof(avg_latency));

  	/* pass 1: collect per-cpu samples, average buckets with enough data */
  	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
  		struct latency_bucket *tmp = &td->tmp_buckets[i];
  		int samples;

  		for_each_possible_cpu(cpu) {
  			struct latency_bucket *bucket;

  			/* this isn't race free, but ok in practice */
  			bucket = per_cpu_ptr(td->latency_buckets, cpu);
  			tmp->total_latency += bucket[i].total_latency;
  			tmp->samples += bucket[i].samples;
  			bucket[i].total_latency = 0;
  			bucket[i].samples = 0;
  		}

  		/* too few samples: keep accumulating in tmp_buckets */
  		if (tmp->samples < 32)
  			continue;

  		samples = tmp->samples;
  		latency = tmp->total_latency;
  		tmp->total_latency = 0;
  		tmp->samples = 0;

  		latency /= samples;
  		if (latency)
  			avg_latency[i].latency = latency;
  	}

  	/* pass 2: merge into the running averages, enforcing monotonicity */
  	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
  		if (avg_latency[i].latency) {
  			if (td->avg_buckets[i].valid)
  				latency = (td->avg_buckets[i].latency * 7 +
  					avg_latency[i].latency) >> 3;
  			else
  				latency = avg_latency[i].latency;

  			td->avg_buckets[i].latency = max(latency, last_latency);
  			td->avg_buckets[i].valid = true;
  			last_latency = td->avg_buckets[i].latency;
  		} else if (td->avg_buckets[i].latency < last_latency) {
  			td->avg_buckets[i].latency = last_latency;
  		}
  	}

  	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
  		throtl_log(&td->service_queue,
  			"Latency bucket %d: latency=%ld, valid=%d", i,
  			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
  }
  #else
  /* Without CONFIG_BLK_DEV_THROTTLING_LOW this is a no-op. */
  static inline void throtl_update_latency_buckets(struct throtl_data *td)
  {
  }
  #endif
2bc19cd5f   Jens Axboe   blk-throttle: fix...
2005
2006
2007
  /*
   * With CONFIG_BLK_DEV_THROTTLING_LOW: if @bio carries a css, record @tg in
   * bio->bi_cg_private, and in all cases stamp the bio's issue stat with its
   * size in sectors.  Compiles to nothing otherwise.
   */
  static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
  {
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  	/* only bios that carry a css get the group recorded */
  	if (bio->bi_css)
  		bio->bi_cg_private = tg;

  	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
  #endif
  }
ae1188963   Tejun Heo   blkcg: consolidat...
2013
2014
  bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
  		    struct bio *bio)
e43473b7f   Vivek Goyal   blkio: Core imple...
2015
  {
c5cc2070b   Tejun Heo   blk-throttle: add...
2016
  	struct throtl_qnode *qn = NULL;
ae1188963   Tejun Heo   blkcg: consolidat...
2017
  	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
73f0d49a9   Tejun Heo   blk-throttle: mov...
2018
  	struct throtl_service_queue *sq;
0e9f4164b   Tejun Heo   blk-throttle: gen...
2019
  	bool rw = bio_data_dir(bio);
bc16a4f93   Tejun Heo   block: reorganize...
2020
  	bool throttled = false;
b9147dd1b   Shaohua Li   blk-throttle: add...
2021
  	struct throtl_data *td = tg->td;
e43473b7f   Vivek Goyal   blkio: Core imple...
2022

ae1188963   Tejun Heo   blkcg: consolidat...
2023
  	WARN_ON_ONCE(!rcu_read_lock_held());
2a0f61e6e   Tejun Heo   blk-throttle: set...
2024
  	/* see throtl_charge_bio() */
8d2bbd4c8   Christoph Hellwig   block: replace RE...
2025
  	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
bc16a4f93   Tejun Heo   block: reorganize...
2026
  		goto out;
e43473b7f   Vivek Goyal   blkio: Core imple...
2027
2028
  
  	spin_lock_irq(q->queue_lock);
c9589f03e   Tejun Heo   blk-throttle: imp...
2029

b9147dd1b   Shaohua Li   blk-throttle: add...
2030
  	throtl_update_latency_buckets(td);
c9589f03e   Tejun Heo   blk-throttle: imp...
2031
  	if (unlikely(blk_queue_bypass(q)))
bc16a4f93   Tejun Heo   block: reorganize...
2032
  		goto out_unlock;
f469a7b4d   Vivek Goyal   blk-cgroup: Allow...
2033

2bc19cd5f   Jens Axboe   blk-throttle: fix...
2034
  	blk_throtl_assoc_bio(tg, bio);
9e234eeaf   Shaohua Li   blk-throttle: add...
2035
  	blk_throtl_update_idletime(tg);
73f0d49a9   Tejun Heo   blk-throttle: mov...
2036
  	sq = &tg->service_queue;
c79892c55   Shaohua Li   blk-throttle: add...
2037
  again:
9e660acff   Tejun Heo   blk-throttle: mak...
2038
  	while (true) {
3f0abd806   Shaohua Li   blk-throttle: add...
2039
2040
2041
  		if (tg->last_low_overflow_time[rw] == 0)
  			tg->last_low_overflow_time[rw] = jiffies;
  		throtl_downgrade_check(tg);
fa6fb5aab   Shaohua Li   blk-throttle: ign...
2042
  		throtl_upgrade_check(tg);
9e660acff   Tejun Heo   blk-throttle: mak...
2043
2044
2045
  		/* throtl is FIFO - if bios are already queued, should queue */
  		if (sq->nr_queued[rw])
  			break;
de701c74a   Vivek Goyal   blk-throttle: Som...
2046

9e660acff   Tejun Heo   blk-throttle: mak...
2047
  		/* if above limits, break to queue */
c79892c55   Shaohua Li   blk-throttle: add...
2048
  		if (!tg_may_dispatch(tg, bio, NULL)) {
3f0abd806   Shaohua Li   blk-throttle: add...
2049
  			tg->last_low_overflow_time[rw] = jiffies;
b9147dd1b   Shaohua Li   blk-throttle: add...
2050
2051
  			if (throtl_can_upgrade(td, tg)) {
  				throtl_upgrade_state(td);
c79892c55   Shaohua Li   blk-throttle: add...
2052
2053
  				goto again;
  			}
9e660acff   Tejun Heo   blk-throttle: mak...
2054
  			break;
c79892c55   Shaohua Li   blk-throttle: add...
2055
  		}
9e660acff   Tejun Heo   blk-throttle: mak...
2056
2057
  
  		/* within limits, let's charge and dispatch directly */
e43473b7f   Vivek Goyal   blkio: Core imple...
2058
  		throtl_charge_bio(tg, bio);
04521db04   Vivek Goyal   blk-throttle: Res...
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
  
  		/*
  		 * We need to trim slice even when bios are not being queued
  		 * otherwise it might happen that a bio is not queued for
  		 * a long time and slice keeps on extending and trim is not
  		 * called for a long time. Now if limits are reduced suddenly
  		 * we take into account all the IO dispatched so far at new
  		 * low rate and * newly queued IO gets a really long dispatch
  		 * time.
  		 *
  		 * So keep on trimming slice even if bio is not queued.
  		 */
0f3457f60   Tejun Heo   blk-throttle: add...
2071
  		throtl_trim_slice(tg, rw);
9e660acff   Tejun Heo   blk-throttle: mak...
2072
2073
2074
2075
2076
2077
  
  		/*
  		 * @bio passed through this layer without being throttled.
  		 * Climb up the ladder.  If we''re already at the top, it
  		 * can be executed directly.
  		 */
c5cc2070b   Tejun Heo   blk-throttle: add...
2078
  		qn = &tg->qnode_on_parent[rw];
9e660acff   Tejun Heo   blk-throttle: mak...
2079
2080
2081
2082
  		sq = sq->parent_sq;
  		tg = sq_to_tg(sq);
  		if (!tg)
  			goto out_unlock;
e43473b7f   Vivek Goyal   blkio: Core imple...
2083
  	}
9e660acff   Tejun Heo   blk-throttle: mak...
2084
  	/* out-of-limit, queue to @tg */
fda6f272c   Tejun Heo   blk-throttle: imp...
2085
2086
  	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
  		   rw == READ ? 'R' : 'W',
9f626e372   Shaohua Li   blk-throttle: pre...
2087
2088
2089
  		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
  		   tg_bps_limit(tg, rw),
  		   tg->io_disp[rw], tg_iops_limit(tg, rw),
fda6f272c   Tejun Heo   blk-throttle: imp...
2090
  		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
e43473b7f   Vivek Goyal   blkio: Core imple...
2091

3f0abd806   Shaohua Li   blk-throttle: add...
2092
  	tg->last_low_overflow_time[rw] = jiffies;
b9147dd1b   Shaohua Li   blk-throttle: add...
2093
  	td->nr_queued[rw]++;
c5cc2070b   Tejun Heo   blk-throttle: add...
2094
  	throtl_add_bio_tg(bio, qn, tg);
bc16a4f93   Tejun Heo   block: reorganize...
2095
  	throttled = true;
e43473b7f   Vivek Goyal   blkio: Core imple...
2096

7f52f98c2   Tejun Heo   blk-throttle: imp...
2097
2098
2099
2100
2101
2102
  	/*
  	 * Update @tg's dispatch time and force schedule dispatch if @tg
  	 * was empty before @bio.  The forced scheduling isn't likely to
  	 * cause undue delay as @bio is likely to be dispatched directly if
  	 * its @tg's disptime is not in the future.
  	 */
0e9f4164b   Tejun Heo   blk-throttle: gen...
2103
  	if (tg->flags & THROTL_TG_WAS_EMPTY) {
77216b048   Tejun Heo   blk-throttle: add...
2104
  		tg_update_disptime(tg);
7f52f98c2   Tejun Heo   blk-throttle: imp...
2105
  		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
e43473b7f   Vivek Goyal   blkio: Core imple...
2106
  	}
bc16a4f93   Tejun Heo   block: reorganize...
2107
  out_unlock:
e43473b7f   Vivek Goyal   blkio: Core imple...
2108
  	spin_unlock_irq(q->queue_lock);
bc16a4f93   Tejun Heo   block: reorganize...
2109
  out:
3ef1c33f9   Shaohua Li   block-throttle: a...
2110
  	bio_set_flag(bio, BIO_THROTTLED);
b9147dd1b   Shaohua Li   blk-throttle: add...
2111
2112
2113
2114
2115
  
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  	if (throttled || !td->track_bio_latency)
  		bio->bi_issue_stat.stat |= SKIP_LATENCY;
  #endif
bc16a4f93   Tejun Heo   block: reorganize...
2116
  	return throttled;
e43473b7f   Vivek Goyal   blkio: Core imple...
2117
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
2118
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
b9147dd1b   Shaohua Li   blk-throttle: add...
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
  /*
   * Add one latency sample (@time) for a request of @size to this cpu's
   * bucket, chosen by request_bucket_index().  Samples are recorded only when
   * @td is non-NULL and at LIMIT_LOW, @op is REQ_OP_READ and the queue is
   * non-rotational.
   */
  static void throtl_track_latency(struct throtl_data *td, sector_t size,
  	int op, unsigned long time)
  {
  	struct latency_bucket *buckets;
  	int idx;

  	if (!td || td->limit_index != LIMIT_LOW)
  		return;
  	if (op != REQ_OP_READ || !blk_queue_nonrot(td->queue))
  		return;

  	idx = request_bucket_index(size);

  	buckets = get_cpu_ptr(td->latency_buckets);
  	buckets[idx].total_latency += time;
  	buckets[idx].samples++;
  	put_cpu_ptr(td->latency_buckets);
  }
  
  /*
   * Feed the latency of @rq into the throttle latency buckets of its queue.
   * @time_ns is shifted right by 10 before being recorded.
   */
  void blk_throtl_stat_add(struct request *rq, u64 time_ns)
  {
  	throtl_track_latency(rq->q->td, blk_stat_size(&rq->issue_stat),
  			     req_op(rq), time_ns >> 10);
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
2145
2146
2147
  void blk_throtl_bio_endio(struct bio *bio)
  {
  	struct throtl_grp *tg;
b9147dd1b   Shaohua Li   blk-throttle: add...
2148
2149
2150
2151
  	u64 finish_time_ns;
  	unsigned long finish_time;
  	unsigned long start_time;
  	unsigned long lat;
9e234eeaf   Shaohua Li   blk-throttle: add...
2152
2153
2154
2155
2156
  
  	tg = bio->bi_cg_private;
  	if (!tg)
  		return;
  	bio->bi_cg_private = NULL;
b9147dd1b   Shaohua Li   blk-throttle: add...
2157
2158
2159
2160
2161
  	finish_time_ns = ktime_get_ns();
  	tg->last_finish_time = finish_time_ns >> 10;
  
  	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
  	finish_time = __blk_stat_time(finish_time_ns) >> 10;
53696b8d2   Shaohua Li   blk-throttle: add...
2162
2163
2164
2165
  	if (!start_time || finish_time <= start_time)
  		return;
  
  	lat = finish_time - start_time;
b9147dd1b   Shaohua Li   blk-throttle: add...
2166
  	/* this is only for bio based driver */
53696b8d2   Shaohua Li   blk-throttle: add...
2167
  	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
b9147dd1b   Shaohua Li   blk-throttle: add...
2168
2169
  		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
  			bio_op(bio), lat);
53696b8d2   Shaohua Li   blk-throttle: add...
2170

6679a90c4   Shaohua Li   blk-throttle: set...
2171
  	if (tg->latency_target && lat >= tg->td->filtered_latency) {
53696b8d2   Shaohua Li   blk-throttle: add...
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
  		int bucket;
  		unsigned int threshold;
  
  		bucket = request_bucket_index(
  			blk_stat_size(&bio->bi_issue_stat));
  		threshold = tg->td->avg_buckets[bucket].latency +
  			tg->latency_target;
  		if (lat > threshold)
  			tg->bad_bio_cnt++;
  		/*
  		 * Not race free, could get wrong count, which means cgroups
  		 * will be throttled
  		 */
  		tg->bio_cnt++;
  	}
  
  	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
  		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
  		tg->bio_cnt /= 2;
  		tg->bad_bio_cnt /= 2;
b9147dd1b   Shaohua Li   blk-throttle: add...
2192
  	}
9e234eeaf   Shaohua Li   blk-throttle: add...
2193
2194
  }
  #endif
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
  /*
   * Dispatch all bios from all children tg's queued on @parent_sq.  On
   * return, @parent_sq is guaranteed to not have any active children tg's
   * and all bios from previously active tg's are on @parent_sq->bio_lists[].
   */
  static void tg_drain_bios(struct throtl_service_queue *parent_sq)
  {
  	struct throtl_grp *tg;
  
  	while ((tg = throtl_rb_first(parent_sq))) {
  		struct throtl_service_queue *sq = &tg->service_queue;
  		struct bio *bio;
  
  		throtl_dequeue_tg(tg);
c5cc2070b   Tejun Heo   blk-throttle: add...
2209
  		while ((bio = throtl_peek_queued(&sq->queued[READ])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2210
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
c5cc2070b   Tejun Heo   blk-throttle: add...
2211
  		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2212
2213
2214
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
  	}
  }
c9a929dde   Tejun Heo   block: fix reques...
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
  /**
   * blk_throtl_drain - drain throttled bios
   * @q: request_queue to drain throttled bios for
   *
   * Dispatch all currently throttled bios on @q through ->make_request_fn().
   *
   * Called with @q->queue_lock held; the lock is dropped while the bios are
   * issued and re-acquired before returning.
   */
  void blk_throtl_drain(struct request_queue *q)
  	__releases(q->queue_lock) __acquires(q->queue_lock)
  {
  	struct throtl_data *td = q->td;
  	struct throtl_service_queue *top_sq = &td->service_queue;
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
  	struct bio *bio;
  	int rw;

  	queue_lockdep_assert_held(q);
  	rcu_read_lock();

  	/*
  	 * Drain each tg while doing post-order walk on the blkg tree, so
  	 * that all bios are propagated to td->service_queue.  It'd be
  	 * better to walk service_queue tree directly but blkg walk is
  	 * easier.
  	 */
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
  		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);

  	/* finally, transfer bios from top-level tg's into the td */
  	tg_drain_bios(top_sq);

  	rcu_read_unlock();
  	spin_unlock_irq(q->queue_lock);

  	/* all bios now should be in td->service_queue, issue them */
  	for (rw = READ; rw <= WRITE; rw++) {
  		while ((bio = throtl_pop_queued(&top_sq->queued[rw], NULL)))
  			generic_make_request(bio);
  	}

  	spin_lock_irq(q->queue_lock);
  }
e43473b7f   Vivek Goyal   blkio: Core imple...
2255
2256
2257
  int blk_throtl_init(struct request_queue *q)
  {
  	struct throtl_data *td;
a2b1693ba   Tejun Heo   blkcg: implement ...
2258
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
2259
2260
2261
2262
  
  	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
  	if (!td)
  		return -ENOMEM;
b9147dd1b   Shaohua Li   blk-throttle: add...
2263
2264
2265
2266
2267
2268
  	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
  		LATENCY_BUCKET_SIZE, __alignof__(u64));
  	if (!td->latency_buckets) {
  		kfree(td);
  		return -ENOMEM;
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
2269

69df0ab03   Tejun Heo   blk-throttle: sep...
2270
  	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
b2ce2643c   Tejun Heo   blk-throttle: cle...
2271
  	throtl_service_queue_init(&td->service_queue);
e43473b7f   Vivek Goyal   blkio: Core imple...
2272

cd1604fab   Tejun Heo   blkcg: factor out...
2273
  	q->td = td;
29b125892   Vivek Goyal   blk-throttle: Dyn...
2274
  	td->queue = q;
02977e4af   Vivek Goyal   blkio: Add root g...
2275

9f626e372   Shaohua Li   blk-throttle: pre...
2276
  	td->limit_valid[LIMIT_MAX] = true;
cd5ab1b0f   Shaohua Li   blk-throttle: add...
2277
  	td->limit_index = LIMIT_MAX;
3f0abd806   Shaohua Li   blk-throttle: add...
2278
2279
  	td->low_upgrade_time = jiffies;
  	td->low_downgrade_time = jiffies;
9e234eeaf   Shaohua Li   blk-throttle: add...
2280

a2b1693ba   Tejun Heo   blkcg: implement ...
2281
  	/* activate policy */
3c798398e   Tejun Heo   blkcg: mass renam...
2282
  	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
b9147dd1b   Shaohua Li   blk-throttle: add...
2283
2284
  	if (ret) {
  		free_percpu(td->latency_buckets);
f51b802c1   Tejun Heo   blkcg: use the us...
2285
  		kfree(td);
b9147dd1b   Shaohua Li   blk-throttle: add...
2286
  	}
a2b1693ba   Tejun Heo   blkcg: implement ...
2287
  	return ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
2288
2289
2290
2291
  }
  
  void blk_throtl_exit(struct request_queue *q)
  {
c875f4d02   Tejun Heo   blkcg: drop unnec...
2292
  	BUG_ON(!q->td);
da5277700   Vivek Goyal   block: Move blk_t...
2293
  	throtl_shutdown_wq(q);
3c798398e   Tejun Heo   blkcg: mass renam...
2294
  	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
b9147dd1b   Shaohua Li   blk-throttle: add...
2295
  	free_percpu(q->td->latency_buckets);
c9a929dde   Tejun Heo   block: fix reques...
2296
  	kfree(q->td);
e43473b7f   Vivek Goyal   blkio: Core imple...
2297
  }
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2298
2299
2300
  void blk_throtl_register_queue(struct request_queue *q)
  {
  	struct throtl_data *td;
6679a90c4   Shaohua Li   blk-throttle: set...
2301
  	int i;
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2302
2303
2304
  
  	td = q->td;
  	BUG_ON(!td);
6679a90c4   Shaohua Li   blk-throttle: set...
2305
  	if (blk_queue_nonrot(q)) {
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2306
  		td->throtl_slice = DFL_THROTL_SLICE_SSD;
6679a90c4   Shaohua Li   blk-throttle: set...
2307
2308
  		td->filtered_latency = LATENCY_FILTERED_SSD;
  	} else {
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2309
  		td->throtl_slice = DFL_THROTL_SLICE_HD;
6679a90c4   Shaohua Li   blk-throttle: set...
2310
2311
2312
2313
  		td->filtered_latency = LATENCY_FILTERED_HD;
  		for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
  			td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
  	}
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2314
2315
2316
2317
  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
  	/* if no low limit, use previous default */
  	td->throtl_slice = DFL_THROTL_SLICE_HD;
  #endif
9e234eeaf   Shaohua Li   blk-throttle: add...
2318

b9147dd1b   Shaohua Li   blk-throttle: add...
2319
2320
2321
  	td->track_bio_latency = !q->mq_ops && !q->request_fn;
  	if (!td->track_bio_latency)
  		blk_stat_enable_accounting(q);
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2322
  }
297e3d854   Shaohua Li   blk-throttle: mak...
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  /*
   * Show handler: print @q's throttle slice, converted to milliseconds and
   * followed by a newline, into @page.  Returns the number of characters
   * written, or -EINVAL if @q has no throttling state.
   */
  ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
  {
  	if (!q->td)
  		return -EINVAL;
  	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
  }
  
  /*
   * Store handler: parse a decimal millisecond value from @page and use it
   * as @q's throttle slice.  Returns @count on success.  Returns -EINVAL if
   * @q has no throttling state, the input does not parse, or the value
   * converts to zero jiffies or to more than MAX_THROTL_SLICE.
   */
  ssize_t blk_throtl_sample_time_store(struct request_queue *q,
  	const char *page, size_t count)
  {
  	unsigned long msecs;
  	unsigned long slice;

  	if (!q->td)
  		return -EINVAL;
  	if (kstrtoul(page, 10, &msecs))
  		return -EINVAL;

  	slice = msecs_to_jiffies(msecs);
  	if (!slice || slice > MAX_THROTL_SLICE)
  		return -EINVAL;

  	q->td->throtl_slice = slice;
  	return count;
  }
  #endif
e43473b7f   Vivek Goyal   blkio: Core imple...
2349
2350
  /*
   * Module init: create the kthrotld workqueue (panics if that fails) and
   * register the throttle blkcg policy.  Returns the result of
   * blkcg_policy_register().
   */
  static int __init throtl_init(void)
  {
  	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
  	if (!kthrotld_workqueue)
  		panic("Failed to create kthrotld\n");

  	return blkcg_policy_register(&blkcg_policy_throtl);
  }

  module_init(throtl_init);