Blame view

block/blk-throttle.c 67.9 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
e43473b7f   Vivek Goyal   blkio: Core imple...
2
3
4
5
6
7
8
9
10
11
12
  /*
   * Interface for controlling IO bandwidth on a request queue
   *
   * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   */
  
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/blktrace_api.h>
eea8f41cc   Tejun Heo   blkcg: move block...
13
  #include <linux/blk-cgroup.h>
bc9fcbf9c   Tejun Heo   block: move blk_t...
14
  #include "blk.h"
e43473b7f   Vivek Goyal   blkio: Core imple...
15
16
17
18
19
20
  
  /* Max dispatch from a group in 1 round */
  static int throtl_grp_quantum = 8;
  
  /* Total max dispatch from all groups in one round */
  static int throtl_quantum = 32;
d61fcfa4b   Shaohua Li   blk-throttle: cho...
21
22
23
  /* Throttling is performed over a slice and after that slice is renewed */
  #define DFL_THROTL_SLICE_HD (HZ / 10)
  #define DFL_THROTL_SLICE_SSD (HZ / 50)
297e3d854   Shaohua Li   blk-throttle: mak...
24
  #define MAX_THROTL_SLICE (HZ)
9e234eeaf   Shaohua Li   blk-throttle: add...
25
  #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
9bb67aeb9   Shaohua Li   blk-throttle: res...
26
27
  #define MIN_THROTL_BPS (320 * 1024)
  #define MIN_THROTL_IOPS (10)
b4f428ef2   Shaohua Li   blk-throttle: for...
28
29
  #define DFL_LATENCY_TARGET (-1L)
  #define DFL_IDLE_THRESHOLD (0)
6679a90c4   Shaohua Li   blk-throttle: set...
30
31
32
33
34
35
36
  #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
  #define LATENCY_FILTERED_SSD (0)
  /*
   * For HD, very small latency comes from sequential IO. Such IO does not
   * help determine whether a cgroup's IO is impacted by others, hence we
   * ignore it.
   */
  #define LATENCY_FILTERED_HD (1000L) /* 1ms */
e43473b7f   Vivek Goyal   blkio: Core imple...
37

3c798398e   Tejun Heo   blkcg: mass renam...
38
  static struct blkcg_policy blkcg_policy_throtl;
0381411e4   Tejun Heo   blkcg: let blkcg ...
39

450adcbe5   Vivek Goyal   blk-throttle: Do ...
40
41
  /* A workqueue to queue throttle related work */
  static struct workqueue_struct *kthrotld_workqueue;
450adcbe5   Vivek Goyal   blk-throttle: Do ...
42

c5cc2070b   Tejun Heo   blk-throttle: add...
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  /*
   * To implement hierarchical throttling, throtl_grps form a tree and bios
   * are dispatched upwards level by level until they reach the top and get
   * issued.  When dispatching bios from the children and local group at each
   * level, if the bios are dispatched into a single bio_list, there's a risk
   * of a local or child group which can queue many bios at once filling up
   * the list starving others.
   *
   * To avoid such starvation, dispatched bios are queued separately
   * according to where they came from.  When they are again dispatched to
   * the parent, they're popped in round-robin order so that no single source
   * hogs the dispatch window.
   *
   * throtl_qnode is used to keep the queued bios separated by their sources.
   * Bios are queued to throtl_qnode which in turn is queued to
   * throtl_service_queue and then dispatched in round-robin order.
   *
   * It's also used to track the reference counts on blkg's.  A qnode always
   * belongs to a throtl_grp and gets queued on itself or the parent, so
   * incrementing the reference of the associated throtl_grp when a qnode is
   * queued and decrementing when dequeued is enough to keep the whole blkg
   * tree pinned while bios are in flight.
   */
  struct throtl_qnode {
  	/*
  	 * Link on a service_queue->queued[] list.  Self-linked (list_empty())
  	 * while the qnode holds no bios; see throtl_qnode_add_bio() and
  	 * throtl_pop_queued().
  	 */
  	struct list_head	node;		/* service_queue->queued[] */
  	struct bio_list		bios;		/* queued bios */
  	/* set once by throtl_qnode_init() */
  	struct throtl_grp	*tg;		/* tg this qnode belongs to */
  };
c9e0332e8   Tejun Heo   blk-throttle: ren...
71
  /*
   * A service queue is embedded both in every throtl_grp and in throtl_data.
   * sq_to_tg() tells the two apart by ->parent_sq: a queue without a parent
   * is treated as the top-level one embedded in throtl_data.
   */
  struct throtl_service_queue {
77216b048   Tejun Heo   blk-throttle: add...
72
  	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
73f0d49a9   Tejun Heo   blk-throttle: mov...
73
74
75
76
  	/*
  	 * Bios queued directly to this service_queue or dispatched from
  	 * children throtl_grp's.
  	 */
c5cc2070b   Tejun Heo   blk-throttle: add...
77
  	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
73f0d49a9   Tejun Heo   blk-throttle: mov...
78
79
80
81
82
83
  	unsigned int		nr_queued[2];	/* number of queued bios */
  
  	/*
  	 * RB tree of active children throtl_grp's, which are sorted by
  	 * their ->disptime.
  	 */
9ff01255a   Liu Bo   Blk-throttle: upd...
84
  	struct rb_root_cached	pending_tree;	/* RB tree of active tgs */
c9e0332e8   Tejun Heo   blk-throttle: ren...
85
86
  	unsigned int		nr_pending;	/* # queued in the tree */
  	unsigned long		first_pending_disptime;	/* disptime of the first tg */
69df0ab03   Tejun Heo   blk-throttle: sep...
87
  	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
e43473b7f   Vivek Goyal   blkio: Core imple...
88
  };
5b2c16aae   Tejun Heo   blk-throttle: sim...
89
90
  /*
   * Bit flags describing a throtl_grp's state.  THROTL_TG_PENDING is set in
   * throtl_grp->flags by __throtl_enqueue_tg().
   */
  enum tg_state_flags {
  	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
0e9f4164b   Tejun Heo   blk-throttle: gen...
91
  	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
5b2c16aae   Tejun Heo   blk-throttle: sim...
92
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
93
  /* Convert a pointer to a throtl_grp's ->rb_node back to the throtl_grp. */
  #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
9f626e372   Shaohua Li   blk-throttle: pre...
94
  /*
   * Limit levels.  The values index the second dimension of the per-group
   * bps[][]/iops[][] (and *_conf) arrays and throtl_data->limit_valid[];
   * LIMIT_CNT is the number of levels and sizes those arrays.
   */
  enum {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
95
  	LIMIT_LOW,
9f626e372   Shaohua Li   blk-throttle: pre...
96
97
98
  	LIMIT_MAX,
  	LIMIT_CNT,
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
99
  struct throtl_grp {
f95a04afa   Tejun Heo   blkcg: embed stru...
100
101
  	/* must be the first member */
  	struct blkg_policy_data pd;
c9e0332e8   Tejun Heo   blk-throttle: ren...
102
  	/* active throtl group service_queue member */
e43473b7f   Vivek Goyal   blkio: Core imple...
103
  	struct rb_node rb_node;
0f3457f60   Tejun Heo   blk-throttle: add...
104
105
  	/* throtl_data this group belongs to */
  	struct throtl_data *td;
49a2f1e3f   Tejun Heo   blk-throttle: add...
106
107
  	/* this group's service queue */
  	struct throtl_service_queue service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
108
  	/*
c5cc2070b   Tejun Heo   blk-throttle: add...
109
110
111
112
113
114
115
116
117
118
119
  	 * qnode_on_self is used when bios are directly queued to this
  	 * throtl_grp so that local bios compete fairly with bios
  	 * dispatched from children.  qnode_on_parent is used when bios are
  	 * dispatched from this throtl_grp into its parent and will compete
  	 * with the sibling qnode_on_parents and the parent's
  	 * qnode_on_self.
  	 */
  	struct throtl_qnode qnode_on_self[2];
  	struct throtl_qnode qnode_on_parent[2];
  
  	/*
e43473b7f   Vivek Goyal   blkio: Core imple...
120
121
122
123
124
  	 * Dispatch time in jiffies. This is the estimated time when group
  	 * will unthrottle and is ready to dispatch more bio. It is used as
  	 * key to sort active groups in service tree.
  	 */
  	unsigned long disptime;
e43473b7f   Vivek Goyal   blkio: Core imple...
125
  	unsigned int flags;
693e751e7   Tejun Heo   blk-throttle: imp...
126
127
  	/* are there any throtl rules between this group and td? */
  	bool has_rules[2];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
128
  	/* internally used bytes per second rate limits */
9f626e372   Shaohua Li   blk-throttle: pre...
129
  	uint64_t bps[2][LIMIT_CNT];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
130
131
  	/* user configured bps limits */
  	uint64_t bps_conf[2][LIMIT_CNT];
e43473b7f   Vivek Goyal   blkio: Core imple...
132

cd5ab1b0f   Shaohua Li   blk-throttle: add...
133
  	/* internally used IOPS limits */
9f626e372   Shaohua Li   blk-throttle: pre...
134
  	unsigned int iops[2][LIMIT_CNT];
cd5ab1b0f   Shaohua Li   blk-throttle: add...
135
136
  	/* user configured IOPS limits */
  	unsigned int iops_conf[2][LIMIT_CNT];
8e89d13f4   Vivek Goyal   blkio: Implementa...
137

e43473b7f   Vivek Goyal   blkio: Core imple...
138
139
  	/* Number of bytes dispatched in current slice */
  	uint64_t bytes_disp[2];
8e89d13f4   Vivek Goyal   blkio: Implementa...
140
141
  	/* Number of bio's dispatched in current slice */
  	unsigned int io_disp[2];
e43473b7f   Vivek Goyal   blkio: Core imple...
142

3f0abd806   Shaohua Li   blk-throttle: add...
143
144
145
146
147
148
  	unsigned long last_low_overflow_time[2];
  
  	uint64_t last_bytes_disp[2];
  	unsigned int last_io_disp[2];
  
  	unsigned long last_check_time;
ec80991d6   Shaohua Li   blk-throttle: add...
149
  	unsigned long latency_target; /* us */
5b81fc3cc   Shaohua Li   blk-throttle: add...
150
  	unsigned long latency_target_conf; /* us */
e43473b7f   Vivek Goyal   blkio: Core imple...
151
152
153
  	/* When did we start a new slice */
  	unsigned long slice_start[2];
  	unsigned long slice_end[2];
9e234eeaf   Shaohua Li   blk-throttle: add...
154
155
156
157
158
  
  	unsigned long last_finish_time; /* ns / 1024 */
  	unsigned long checked_last_finish_time; /* ns / 1024 */
  	unsigned long avg_idletime; /* ns / 1024 */
  	unsigned long idletime_threshold; /* us */
5b81fc3cc   Shaohua Li   blk-throttle: add...
159
  	unsigned long idletime_threshold_conf; /* us */
53696b8d2   Shaohua Li   blk-throttle: add...
160
161
162
163
  
  	unsigned int bio_cnt; /* total bios */
  	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
  	unsigned long bio_cnt_reset_time;
e43473b7f   Vivek Goyal   blkio: Core imple...
164
  };
b9147dd1b   Shaohua Li   blk-throttle: add...
165
166
167
168
169
170
171
172
173
174
175
176
  /* We measure latency for request size from <= 4k to >= 1M */
  #define LATENCY_BUCKET_SIZE 9
  
  /*
   * One latency accumulator; throtl_data keeps LATENCY_BUCKET_SIZE of these
   * per direction (tmp_buckets[][] and the per-cpu latency_buckets[]).
   * NOTE(review): presumably total_latency is the sum over 'samples'
   * completed requests - confirm against the code that fills the buckets.
   */
  struct latency_bucket {
  	unsigned long total_latency; /* ns / 1024 */
  	int samples;
  };
  
  /*
   * Per-bucket average latency kept in throtl_data->avg_buckets[][].
   * NOTE(review): 'valid' presumably marks buckets that have received
   * enough samples to be trusted - confirm against the code that sets it.
   */
  struct avg_latency_bucket {
  	unsigned long latency; /* ns / 1024 */
  	bool valid;
  };
e43473b7f   Vivek Goyal   blkio: Core imple...
177
178
  struct throtl_data
  {
e43473b7f   Vivek Goyal   blkio: Core imple...
179
  	/* service tree for active throtl groups */
c9e0332e8   Tejun Heo   blk-throttle: ren...
180
  	struct throtl_service_queue service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
181

e43473b7f   Vivek Goyal   blkio: Core imple...
182
183
184
185
  	struct request_queue *queue;
  
  	/* Total Number of queued bios on READ and WRITE lists */
  	unsigned int nr_queued[2];
297e3d854   Shaohua Li   blk-throttle: mak...
186
  	unsigned int throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
187
  	/* Work for dispatching throttled bios */
69df0ab03   Tejun Heo   blk-throttle: sep...
188
  	struct work_struct dispatch_work;
9f626e372   Shaohua Li   blk-throttle: pre...
189
190
  	unsigned int limit_index;
  	bool limit_valid[LIMIT_CNT];
3f0abd806   Shaohua Li   blk-throttle: add...
191
192
193
  
  	unsigned long low_upgrade_time;
  	unsigned long low_downgrade_time;
7394e31fa   Shaohua Li   blk-throttle: mak...
194
195
  
  	unsigned int scale;
b9147dd1b   Shaohua Li   blk-throttle: add...
196

b889bf66d   Joseph Qi   blk-throttle: tra...
197
198
199
  	struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
  	struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
  	struct latency_bucket __percpu *latency_buckets[2];
b9147dd1b   Shaohua Li   blk-throttle: add...
200
  	unsigned long last_calculate_time;
6679a90c4   Shaohua Li   blk-throttle: set...
201
  	unsigned long filtered_latency;
b9147dd1b   Shaohua Li   blk-throttle: add...
202
203
  
  	bool track_bio_latency;
e43473b7f   Vivek Goyal   blkio: Core imple...
204
  };
e99e88a9d   Kees Cook   treewide: setup_t...
205
  static void throtl_pending_timer_fn(struct timer_list *t);
69df0ab03   Tejun Heo   blk-throttle: sep...
206

f95a04afa   Tejun Heo   blkcg: embed stru...
207
208
209
210
  /*
   * Map a blkg_policy_data pointer back to the throtl_grp that embeds it.
   * A %NULL @pd is passed through as %NULL.
   */
  static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
  {
  	if (!pd)
  		return NULL;
  
  	return container_of(pd, struct throtl_grp, pd);
  }
3c798398e   Tejun Heo   blkcg: mass renam...
211
  /*
   * Return the throtl_grp attached to @blkg for the throttle policy, or
   * %NULL when blkg_to_pd() yields no policy data.
   */
  static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
0381411e4   Tejun Heo   blkcg: let blkcg ...
212
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
213
  	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
0381411e4   Tejun Heo   blkcg: let blkcg ...
214
  }
3c798398e   Tejun Heo   blkcg: mass renam...
215
  /* Return the blkcg_gq that @tg's embedded policy data (tg->pd) belongs to. */
  static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
0381411e4   Tejun Heo   blkcg: let blkcg ...
216
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
217
  	return pd_to_blkg(&tg->pd);
0381411e4   Tejun Heo   blkcg: let blkcg ...
218
  }
fda6f272c   Tejun Heo   blk-throttle: imp...
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
  /**
   * sq_to_tg - return the throtl_grp the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
   * A service queue that has a parent is embedded in a throtl_grp, so the
   * owning group is recovered from it.  %NULL is returned when @sq is %NULL
   * or when @sq has no parent_sq, i.e. it is the top-level queue embedded in
   * throtl_data.
   */
  static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  {
  	if (!sq || !sq->parent_sq)
  		return NULL;
  
  	return container_of(sq, struct throtl_grp, service_queue);
  }
  
  /**
   * sq_to_td - return throtl_data the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
   *
b43daedc0   Masahiro Yamada   scripts/spelling....
238
   * A service_queue can be embedded in either a throtl_grp or throtl_data.
fda6f272c   Tejun Heo   blk-throttle: imp...
239
240
241
242
243
244
245
246
247
248
249
   * Determine the associated throtl_data accordingly and return it.
   */
  static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  {
  	struct throtl_grp *owner = sq_to_tg(sq);
  
  	/* no owning group: @sq is the queue embedded in throtl_data itself */
  	if (!owner)
  		return container_of(sq, struct throtl_data, service_queue);
  
  	return owner->td;
  }
7394e31fa   Shaohua Li   blk-throttle: mak...
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
  /*
   * When a low limit is set, the limit applied in the LIMIT_MAX state is
   * derived from the low limit and td->scale so that IO dispatch changes
   * smoothly instead of jumping straight to the .max limit.
   * Scale up: td->scale follows the number of throtl_slices elapsed since
   *           td->low_upgrade_time; every slice adds 1/2 of the .low limit.
   *           Callers cap the result at the .max limit.
   * Scale down: exponentially scale down if a cgroup doesn't hit its .low
   *             limit (not done by this helper, which only scales up).
   */
  static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
  {
  	unsigned long next_step = td->low_upgrade_time +
  				  td->scale * td->throtl_slice;
  
  	/* 4096 is an arbitrary cap that keeps the scale from growing too big */
  	if (td->scale < 4096 && time_after_eq(jiffies, next_step))
  		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
  
  	return low + td->scale * (low >> 1);
  }
9f626e372   Shaohua Li   blk-throttle: pre...
267
268
  /*
   * Return the bytes-per-second limit that currently applies to @tg for
   * direction @rw, based on the limit level selected by td->limit_index.
   * A group without a parent blkg on the default hierarchy is never limited
   * (U64_MAX).
   */
  static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
  {
b22c417c8   Shaohua Li   blk-throttle: con...
269
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31fa   Shaohua Li   blk-throttle: mak...
270
  	struct throtl_data *td;
b22c417c8   Shaohua Li   blk-throttle: con...
271
272
273
274
  	uint64_t ret;
  
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  		return U64_MAX;
7394e31fa   Shaohua Li   blk-throttle: mak...
275
276
277
  
  	td = tg->td;
  	ret = tg->bps[rw][td->limit_index];
9bb67aeb9   Shaohua Li   blk-throttle: res...
278
279
280
281
282
283
284
285
  	/*
  	 * An unset (0) low bps limit is not used as-is: it becomes "no limit"
  	 * or the MIN_THROTL_BPS floor, depending on the checks below.
  	 */
  	if (ret == 0 && td->limit_index == LIMIT_LOW) {
  		/* intermediate node or iops isn't 0 */
  		if (!list_empty(&blkg->blkcg->css.children) ||
  		    tg->iops[rw][td->limit_index])
  			return U64_MAX;
  		else
  			return MIN_THROTL_BPS;
  	}
7394e31fa   Shaohua Li   blk-throttle: mak...
286
287
288
289
290
291
292
293
  
  	/*
  	 * In the LIMIT_MAX state with a low limit that differs from the max
  	 * limit, use the scaled low limit, capped at the max limit.
  	 */
  	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
  	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
  		uint64_t adjusted;
  
  		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
  		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
  	}
b22c417c8   Shaohua Li   blk-throttle: con...
294
  	return ret;
9f626e372   Shaohua Li   blk-throttle: pre...
295
296
297
298
  }
  
  /*
   * Return the IO-per-second limit that currently applies to @tg for
   * direction @rw, based on the limit level selected by td->limit_index.
   * Mirrors tg_bps_limit(): a group without a parent blkg on the default
   * hierarchy is never limited (UINT_MAX).
   */
  static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
  {
b22c417c8   Shaohua Li   blk-throttle: con...
299
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31fa   Shaohua Li   blk-throttle: mak...
300
  	struct throtl_data *td;
b22c417c8   Shaohua Li   blk-throttle: con...
301
302
303
304
  	unsigned int ret;
  
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  		return UINT_MAX;
9bb67aeb9   Shaohua Li   blk-throttle: res...
305

7394e31fa   Shaohua Li   blk-throttle: mak...
306
307
  	td = tg->td;
  	ret = tg->iops[rw][td->limit_index];
9bb67aeb9   Shaohua Li   blk-throttle: res...
308
309
310
311
312
313
314
315
  	/*
  	 * An unset (0) low iops limit is not used as-is: it becomes "no
  	 * limit" or the MIN_THROTL_IOPS floor, depending on the checks below.
  	 * (tg->td is the same pointer as the local td assigned above.)
  	 */
  	if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
  		/* intermediate node or bps isn't 0 */
  		if (!list_empty(&blkg->blkcg->css.children) ||
  		    tg->bps[rw][td->limit_index])
  			return UINT_MAX;
  		else
  			return MIN_THROTL_IOPS;
  	}
7394e31fa   Shaohua Li   blk-throttle: mak...
316
317
318
319
320
321
322
323
324
325
  
  	/*
  	 * In the LIMIT_MAX state with a low limit that differs from the max
  	 * limit, use the scaled low limit, capped at the max limit.
  	 */
  	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
  	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
  		uint64_t adjusted;
  
  		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
  		/* clamp first so the narrowing in min_t() cannot truncate */
  		if (adjusted > UINT_MAX)
  			adjusted = UINT_MAX;
  		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
  	}
b22c417c8   Shaohua Li   blk-throttle: con...
326
  	return ret;
9f626e372   Shaohua Li   blk-throttle: pre...
327
  }
b9147dd1b   Shaohua Li   blk-throttle: add...
328
329
  /*
   * Map a request size in sectors to a latency bucket index:
   * order_base_2(sectors) - 3, clamped to [0, LATENCY_BUCKET_SIZE - 1].
   * An order of 3 (8 sectors) lands in bucket 0; anything that would fall
   * outside the range is folded into the first or last bucket.
   */
  #define request_bucket_index(sectors) \
  	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
fda6f272c   Tejun Heo   blk-throttle: imp...
330
331
332
333
334
335
336
337
  /**
   * throtl_log - log debug message via blktrace
   * @sq: the service_queue being reported
   * @fmt: printf format string
   * @args: printf args
   *
   * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
   * throtl_grp; otherwise, just "throtl".
fda6f272c   Tejun Heo   blk-throttle: imp...
338
339
340
341
342
343
   */
  #define throtl_log(sq, fmt, args...)	do {				\
  	struct throtl_grp *__tg = sq_to_tg((sq));			\
  	struct throtl_data *__td = sq_to_td((sq));			\
  									\
  	(void)__td;							\
59fa0224c   Shaohua Li   blk-throttle: don...
344
345
  	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
  		break;							\
fda6f272c   Tejun Heo   blk-throttle: imp...
346
  	if ((__tg)) {							\
35fe6d763   Shaohua Li   block: use standa...
347
348
  		blk_add_cgroup_trace_msg(__td->queue,			\
  			tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
fda6f272c   Tejun Heo   blk-throttle: imp...
349
350
351
  	} else {							\
  		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
  	}								\
54e7ed12b   Tejun Heo   blkcg: remove blk...
352
  } while (0)
e43473b7f   Vivek Goyal   blkio: Core imple...
353

ea0ea2bc6   Shaohua Li   blk-throttle: cap...
354
355
356
357
358
359
360
  /*
   * Return the number of bytes @bio is accounted as: bi_iter.bi_size for
   * everything except discards, which are assumed to be one 512-byte sector.
   */
  static inline unsigned int throtl_bio_data_size(struct bio *bio)
  {
  	return unlikely(bio_op(bio) == REQ_OP_DISCARD) ?
  		512 : bio->bi_iter.bi_size;
  }
c5cc2070b   Tejun Heo   blk-throttle: add...
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
  /*
   * Initialize @qn as an inactive qnode owned by @tg: no bios queued and not
   * linked on any service_queue->queued[] list.
   */
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
  	qn->tg = tg;
  	bio_list_init(&qn->bios);
  	INIT_LIST_HEAD(&qn->node);
  }
  
  /**
   * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
   * @bio: bio being added
   * @qn: qnode to add bio to
   * @queued: the service_queue->queued[] list @qn belongs to
   *
   * Append @bio to @qn.  If @qn is not yet linked on a list, it is added to
   * the tail of @queued and a reference is taken on @qn->tg's blkg.  See the
   * comment on top of throtl_qnode definition for details.
   */
  static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  				 struct list_head *queued)
  {
  	bio_list_add(&qn->bios, bio);
  
  	/* already linked: the reference was taken when it was activated */
  	if (!list_empty(&qn->node))
  		return;
  
  	list_add_tail(&qn->node, queued);
  	blkg_get(tg_to_blkg(qn->tg));
  }
  
  /**
   * throtl_peek_queued - peek the first bio on a qnode list
   * @queued: the qnode list to peek
   *
   * Return the bio at the head of the first qnode on @queued without
   * removing it, or %NULL if @queued is empty.  A qnode on the list is
   * expected to hold at least one bio; a warning is emitted otherwise.
   */
  static struct bio *throtl_peek_queued(struct list_head *queued)
  {
  	struct throtl_qnode *qn;
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	/*
  	 * Resolve the entry only once the list is known to be non-empty;
  	 * on an empty list list_first_entry() would turn the list head
  	 * itself into a bogus throtl_qnode pointer.
  	 */
  	qn = list_first_entry(queued, struct throtl_qnode, node);
  	bio = bio_list_peek(&qn->bios);
  	WARN_ON_ONCE(!bio);
  	return bio;
  }
  
  /**
   * throtl_pop_queued - pop the first bio from a qnode list
   * @queued: the qnode list to pop a bio from
   * @tg_to_put: optional out argument for throtl_grp to put
   *
   * Pop the first bio from the qnode list @queued.  After popping, the first
   * qnode is removed from @queued if empty or moved to the end of @queued so
   * that the popping order is round-robin.
   *
   * When the first qnode is removed, its associated throtl_grp should be put
   * too.  If @tg_to_put is NULL, this function automatically puts it;
   * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
   * responsible for putting it.  *@tg_to_put is left untouched when the
   * qnode stays on @queued.
   *
   * Return the popped bio, or %NULL if @queued is empty.
   */
  static struct bio *throtl_pop_queued(struct list_head *queued,
  				     struct throtl_grp **tg_to_put)
  {
  	struct throtl_qnode *qn;
  	struct bio *bio;
  
  	if (list_empty(queued))
  		return NULL;
  
  	/*
  	 * Resolve the entry only once the list is known to be non-empty;
  	 * on an empty list list_first_entry() would turn the list head
  	 * itself into a bogus throtl_qnode pointer.
  	 */
  	qn = list_first_entry(queued, struct throtl_qnode, node);
  	bio = bio_list_pop(&qn->bios);
  	WARN_ON_ONCE(!bio);
  
  	if (bio_list_empty(&qn->bios)) {
  		/* drained: deactivate the qnode and drop (or hand over) its ref */
  		list_del_init(&qn->node);
  		if (tg_to_put)
  			*tg_to_put = qn->tg;
  		else
  			blkg_put(tg_to_blkg(qn->tg));
  	} else {
  		/* more bios left: rotate to the tail for round-robin order */
  		list_move_tail(&qn->node, queued);
  	}
  
  	return bio;
  }
49a2f1e3f   Tejun Heo   blk-throttle: add...
443
  /*
   * Init a service_queue; assumes the caller zeroed it.  Sets up both
   * queued[] lists, an empty pending tree and the pending timer, whose
   * handler is throtl_pending_timer_fn().  Other fields (parent_sq, the
   * counters) are left at their zeroed values.
   */
b2ce2643c   Tejun Heo   blk-throttle: cle...
444
  static void throtl_service_queue_init(struct throtl_service_queue *sq)
49a2f1e3f   Tejun Heo   blk-throttle: add...
445
  {
c5cc2070b   Tejun Heo   blk-throttle: add...
446
447
  	INIT_LIST_HEAD(&sq->queued[0]);
  	INIT_LIST_HEAD(&sq->queued[1]);
9ff01255a   Liu Bo   Blk-throttle: upd...
448
  	sq->pending_tree = RB_ROOT_CACHED;
e99e88a9d   Kees Cook   treewide: setup_t...
449
  	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
69df0ab03   Tejun Heo   blk-throttle: sep...
450
  }
cf09a8ee1   Tejun Heo   blkcg: pass @q an...
451
452
453
  /*
   * Allocate and set up a throtl_grp for @q.
   *
   * The group is zero-allocated on @q's node with @gfp.  Its service queue
   * and qnodes are initialized, all LIMIT_MAX limits (effective and
   * user-configured) start out unlimited while LIMIT_LOW limits stay 0, and
   * the latency target and idle threshold get their defaults.  @blkcg is
   * not used here.
   *
   * Returns the policy data embedded in the new group, or %NULL if the
   * allocation fails.
   */
  static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
  						struct request_queue *q,
  						struct blkcg *blkcg)
001bea73e   Tejun Heo   blkcg: replace bl...
454
  {
4fb72036f   Tejun Heo   blk-throttle: rem...
455
  	struct throtl_grp *tg;
24bdb8ef0   Tejun Heo   blkcg: make blkcg...
456
  	int rw;
4fb72036f   Tejun Heo   blk-throttle: rem...
457

cf09a8ee1   Tejun Heo   blkcg: pass @q an...
458
  	tg = kzalloc_node(sizeof(*tg), gfp, q->node);
4fb72036f   Tejun Heo   blk-throttle: rem...
459
  	if (!tg)
77ea73388   Tejun Heo   blkcg: move io_se...
460
  		return NULL;
4fb72036f   Tejun Heo   blk-throttle: rem...
461

b2ce2643c   Tejun Heo   blk-throttle: cle...
462
463
464
465
466
467
468
469
  	throtl_service_queue_init(&tg->service_queue);
  
  	for (rw = READ; rw <= WRITE; rw++) {
  		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  	}
  
  	/* not on any pending tree yet */
  	RB_CLEAR_NODE(&tg->rb_node);
9f626e372   Shaohua Li   blk-throttle: pre...
470
471
472
473
  	tg->bps[READ][LIMIT_MAX] = U64_MAX;
  	tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
  	tg->iops[READ][LIMIT_MAX] = UINT_MAX;
  	tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
cd5ab1b0f   Shaohua Li   blk-throttle: add...
474
475
476
477
478
  	tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
  	tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
  	tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
  	tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
  	/* LIMIT_LOW will have default value 0 */
b2ce2643c   Tejun Heo   blk-throttle: cle...
479

ec80991d6   Shaohua Li   blk-throttle: add...
480
  	tg->latency_target = DFL_LATENCY_TARGET;
5b81fc3cc   Shaohua Li   blk-throttle: add...
481
  	tg->latency_target_conf = DFL_LATENCY_TARGET;
b4f428ef2   Shaohua Li   blk-throttle: for...
482
483
  	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
  	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
ec80991d6   Shaohua Li   blk-throttle: add...
484

4fb72036f   Tejun Heo   blk-throttle: rem...
485
  	return &tg->pd;
001bea73e   Tejun Heo   blkcg: replace bl...
486
  }
a9520cd6f   Tejun Heo   blkcg: make blkcg...
487
  /*
   * Attach the group behind @pd to its queue's throtl_data and choose the
   * parent of its service queue: the parent group's service queue when on
   * the default hierarchy and a parent blkg exists, otherwise the top-level
   * service queue in throtl_data.
   */
  static void throtl_pd_init(struct blkg_policy_data *pd)
a29a171e7   Vivek Goyal   blk-throttle: Do ...
488
  {
a9520cd6f   Tejun Heo   blkcg: make blkcg...
489
490
  	struct throtl_grp *tg = pd_to_tg(pd);
  	struct blkcg_gq *blkg = tg_to_blkg(tg);
77216b048   Tejun Heo   blk-throttle: add...
491
  	struct throtl_data *td = blkg->q->td;
b2ce2643c   Tejun Heo   blk-throttle: cle...
492
  	struct throtl_service_queue *sq = &tg->service_queue;
cd1604fab   Tejun Heo   blkcg: factor out...
493

9138125be   Tejun Heo   blk-throttle: imp...
494
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
495
  	 * If on the default hierarchy, we switch to properly hierarchical
9138125be   Tejun Heo   blk-throttle: imp...
496
497
498
499
500
  	 * behavior where limits on a given throtl_grp are applied to the
  	 * whole subtree rather than just the group itself.  e.g. If 16M
  	 * read_bps limit is set on the root group, the whole system can't
  	 * exceed 16M for the device.
  	 *
aa6ec29be   Tejun Heo   cgroup: remove sa...
501
  	 * If not on the default hierarchy, the broken flat hierarchy
9138125be   Tejun Heo   blk-throttle: imp...
502
503
504
505
506
  	 * behavior is retained where all throtl_grps are treated as if
  	 * they're all separate root groups right below throtl_data.
  	 * Limits of a group don't interact with limits of other groups
  	 * regardless of the position of the group in the hierarchy.
  	 */
b2ce2643c   Tejun Heo   blk-throttle: cle...
507
  	/* default to the flat layout; overridden just below if hierarchical */
  	sq->parent_sq = &td->service_queue;
9e10a130d   Tejun Heo   cgroup: replace c...
508
  	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
b2ce2643c   Tejun Heo   blk-throttle: cle...
509
  		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
77216b048   Tejun Heo   blk-throttle: add...
510
  	tg->td = td;
8a3d26151   Tejun Heo   blkcg: move blkio...
511
  }
693e751e7   Tejun Heo   blk-throttle: imp...
512
513
514
515
516
517
518
519
  /*
   * Set has_rules[] if @tg or any of its parents have limits configured.
   * This doesn't require walking up to the top of the hierarchy as the
   * parent's has_rules[] is guaranteed to be correct.
   *
   * For each direction, @tg itself counts as having a rule only when the
   * current limit level is valid (td->limit_valid[td->limit_index]) and its
   * effective bps or iops limit is not the "unlimited" value.
   */
  static void tg_update_has_rules(struct throtl_grp *tg)
  {
  	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
9f626e372   Shaohua Li   blk-throttle: pre...
520
  	struct throtl_data *td = tg->td;
693e751e7   Tejun Heo   blk-throttle: imp...
521
522
523
524
  	int rw;
  
  	for (rw = READ; rw <= WRITE; rw++)
  		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
9f626e372   Shaohua Li   blk-throttle: pre...
525
526
527
  			(td->limit_valid[td->limit_index] &&
  			 (tg_bps_limit(tg, rw) != U64_MAX ||
  			  tg_iops_limit(tg, rw) != UINT_MAX));
693e751e7   Tejun Heo   blk-throttle: imp...
528
  }
a9520cd6f   Tejun Heo   blkcg: make blkcg...
529
  /* Recompute has_rules[] for the group behind @pd once it is online. */
  static void throtl_pd_online(struct blkg_policy_data *pd)
693e751e7   Tejun Heo   blk-throttle: imp...
530
  {
aec242468   Shaohua Li   blk-throttle: det...
531
  	struct throtl_grp *tg = pd_to_tg(pd);
693e751e7   Tejun Heo   blk-throttle: imp...
532
533
534
535
  	/*
  	 * We don't want new groups to escape the limits of its ancestors.
  	 * Update has_rules[] after a new group is brought online.
  	 */
aec242468   Shaohua Li   blk-throttle: det...
536
  	tg_update_has_rules(tg);
693e751e7   Tejun Heo   blk-throttle: imp...
537
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
538
539
540
541
542
543
544
545
546
547
548
  /*
   * Recompute td->limit_valid[LIMIT_LOW].  The low limit level is valid iff
   * at least one group among the descendants walked from the queue's root
   * blkg has a non-zero low bps or iops limit in either direction.
   */
  static void blk_throtl_update_limit_valid(struct throtl_data *td)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
  	bool low_valid = false;
  
  	/* the blkg walk below is done under the RCU read lock */
  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);
  
  		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
43ada7878   Liu Bo   Block: blk-thrott...
549
  		    tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
550
  			low_valid = true;
43ada7878   Liu Bo   Block: blk-thrott...
551
552
  			/* one group with a low limit is enough */
  			break;
  		}
cd5ab1b0f   Shaohua Li   blk-throttle: add...
553
554
555
556
557
  	}
  	rcu_read_unlock();
  
  	td->limit_valid[LIMIT_LOW] = low_valid;
  }
c79892c55   Shaohua Li   blk-throttle: add...
558
  static void throtl_upgrade_state(struct throtl_data *td);
cd5ab1b0f   Shaohua Li   blk-throttle: add...
559
560
561
562
563
564
565
566
567
568
  /*
   * Drop the low limits of the group behind @pd, then refresh
   * td->limit_valid[].  If that leaves the current limit level invalid,
   * throtl_upgrade_state() is called.
   */
  static void throtl_pd_offline(struct blkg_policy_data *pd)
  {
  	struct throtl_grp *tg = pd_to_tg(pd);
  
  	tg->bps[READ][LIMIT_LOW] = 0;
  	tg->bps[WRITE][LIMIT_LOW] = 0;
  	tg->iops[READ][LIMIT_LOW] = 0;
  	tg->iops[WRITE][LIMIT_LOW] = 0;
  
  	blk_throtl_update_limit_valid(tg->td);
c79892c55   Shaohua Li   blk-throttle: add...
569
570
  	if (!tg->td->limit_valid[tg->td->limit_index])
  		throtl_upgrade_state(tg->td);
cd5ab1b0f   Shaohua Li   blk-throttle: add...
571
  }
001bea73e   Tejun Heo   blkcg: replace bl...
572
573
  /*
   * Free the group behind @pd.  Its service queue's pending timer is
   * deleted synchronously first so that it cannot run on freed memory.
   */
  static void throtl_pd_free(struct blkg_policy_data *pd)
  {
4fb72036f   Tejun Heo   blk-throttle: rem...
574
  	struct throtl_grp *tg = pd_to_tg(pd);
b2ce2643c   Tejun Heo   blk-throttle: cle...
575
  	del_timer_sync(&tg->service_queue.pending_timer);
4fb72036f   Tejun Heo   blk-throttle: rem...
576
  	kfree(tg);
001bea73e   Tejun Heo   blkcg: replace bl...
577
  }
0049af73b   Tejun Heo   blk-throttle: reo...
578
579
  /*
   * Return the throtl_grp at the first (leftmost) node of @parent_sq's
   * pending tree, or %NULL when nothing is pending.  Because the tree is
   * keyed by ->disptime (see tg_service_queue_add()), no pending group is
   * ordered before the one returned.
   */
  static struct throtl_grp *
  throtl_rb_first(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
580
  {
9ff01255a   Liu Bo   Blk-throttle: upd...
581
  	struct rb_node *n;
e43473b7f   Vivek Goyal   blkio: Core imple...
582
  	/* Service tree is empty */
0049af73b   Tejun Heo   blk-throttle: reo...
583
  	if (!parent_sq->nr_pending)
e43473b7f   Vivek Goyal   blkio: Core imple...
584
  		return NULL;
9ff01255a   Liu Bo   Blk-throttle: upd...
585
586
587
588
589
  	n = rb_first_cached(&parent_sq->pending_tree);
  	/* nr_pending says the tree is non-empty, so a missing node is a bug */
  	WARN_ON_ONCE(!n);
  	if (!n)
  		return NULL;
  	return rb_entry_tg(n);
e43473b7f   Vivek Goyal   blkio: Core imple...
590
  }
0049af73b   Tejun Heo   blk-throttle: reo...
591
592
  /*
   * Remove @n from @parent_sq's pending tree, mark the node as cleared and
   * decrement the pending count.
   */
  static void throtl_rb_erase(struct rb_node *n,
  			    struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
593
  {
9ff01255a   Liu Bo   Blk-throttle: upd...
594
595
  	rb_erase_cached(n, &parent_sq->pending_tree);
  	RB_CLEAR_NODE(n);
0049af73b   Tejun Heo   blk-throttle: reo...
596
  	--parent_sq->nr_pending;
e43473b7f   Vivek Goyal   blkio: Core imple...
597
  }
0049af73b   Tejun Heo   blk-throttle: reo...
598
  /*
   * Cache the ->disptime of the first pending group in
   * @parent_sq->first_pending_disptime.  The cached value is left untouched
   * when no group is pending.
   */
  static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
599
600
  {
  	struct throtl_grp *tg;
0049af73b   Tejun Heo   blk-throttle: reo...
601
  	tg = throtl_rb_first(parent_sq);
e43473b7f   Vivek Goyal   blkio: Core imple...
602
603
  	if (!tg)
  		return;
0049af73b   Tejun Heo   blk-throttle: reo...
604
  	parent_sq->first_pending_disptime = tg->disptime;
e43473b7f   Vivek Goyal   blkio: Core imple...
605
  }
77216b048   Tejun Heo   blk-throttle: add...
606
  static void tg_service_queue_add(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
607
  {
77216b048   Tejun Heo   blk-throttle: add...
608
  	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
9ff01255a   Liu Bo   Blk-throttle: upd...
609
  	struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
e43473b7f   Vivek Goyal   blkio: Core imple...
610
611
612
  	struct rb_node *parent = NULL;
  	struct throtl_grp *__tg;
  	unsigned long key = tg->disptime;
9ff01255a   Liu Bo   Blk-throttle: upd...
613
  	bool leftmost = true;
e43473b7f   Vivek Goyal   blkio: Core imple...
614
615
616
617
618
619
620
621
622
  
  	while (*node != NULL) {
  		parent = *node;
  		__tg = rb_entry_tg(parent);
  
  		if (time_before(key, __tg->disptime))
  			node = &parent->rb_left;
  		else {
  			node = &parent->rb_right;
9ff01255a   Liu Bo   Blk-throttle: upd...
623
  			leftmost = false;
e43473b7f   Vivek Goyal   blkio: Core imple...
624
625
  		}
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
626
  	rb_link_node(&tg->rb_node, parent, node);
9ff01255a   Liu Bo   Blk-throttle: upd...
627
628
  	rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
  			       leftmost);
e43473b7f   Vivek Goyal   blkio: Core imple...
629
  }
77216b048   Tejun Heo   blk-throttle: add...
630
  static void __throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
631
  {
77216b048   Tejun Heo   blk-throttle: add...
632
  	tg_service_queue_add(tg);
5b2c16aae   Tejun Heo   blk-throttle: sim...
633
  	tg->flags |= THROTL_TG_PENDING;
77216b048   Tejun Heo   blk-throttle: add...
634
  	tg->service_queue.parent_sq->nr_pending++;
e43473b7f   Vivek Goyal   blkio: Core imple...
635
  }
77216b048   Tejun Heo   blk-throttle: add...
636
  static void throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
637
  {
5b2c16aae   Tejun Heo   blk-throttle: sim...
638
  	if (!(tg->flags & THROTL_TG_PENDING))
77216b048   Tejun Heo   blk-throttle: add...
639
  		__throtl_enqueue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
640
  }
77216b048   Tejun Heo   blk-throttle: add...
641
  static void __throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
642
  {
77216b048   Tejun Heo   blk-throttle: add...
643
  	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
5b2c16aae   Tejun Heo   blk-throttle: sim...
644
  	tg->flags &= ~THROTL_TG_PENDING;
e43473b7f   Vivek Goyal   blkio: Core imple...
645
  }
77216b048   Tejun Heo   blk-throttle: add...
646
  static void throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
647
  {
5b2c16aae   Tejun Heo   blk-throttle: sim...
648
  	if (tg->flags & THROTL_TG_PENDING)
77216b048   Tejun Heo   blk-throttle: add...
649
  		__throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
650
  }
a9131a27e   Tejun Heo   blk-throttle: rel...
651
  /* Call with queue lock held */
69df0ab03   Tejun Heo   blk-throttle: sep...
652
653
  static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  					  unsigned long expires)
a9131a27e   Tejun Heo   blk-throttle: rel...
654
  {
a41b816c1   Joseph Qi   blk-throttle: fix...
655
  	unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
06cceedcc   Shaohua Li   blk-throttle: mak...
656
657
658
659
660
661
662
663
664
665
  
  	/*
  	 * Since we are adjusting the throttle limit dynamically, the sleep
  	 * time calculated according to previous limit might be invalid. It's
  	 * possible the cgroup sleep time is very long and no other cgroups
  	 * have IO running so notify the limit changes. Make sure the cgroup
  	 * doesn't sleep too long to avoid the missed notification.
  	 */
  	if (time_after(expires, max_expire))
  		expires = max_expire;
69df0ab03   Tejun Heo   blk-throttle: sep...
666
667
668
  	mod_timer(&sq->pending_timer, expires);
  	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  		   expires - jiffies, jiffies);
a9131a27e   Tejun Heo   blk-throttle: rel...
669
  }
7f52f98c2   Tejun Heo   blk-throttle: imp...
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
  /**
   * throtl_schedule_next_dispatch - schedule the next dispatch cycle
   * @sq: the service_queue to schedule dispatch for
   * @force: force scheduling
   *
   * Arm @sq->pending_timer so that the next dispatch cycle begins at the
   * dispatch time of the first pending child.
   *
   * Returns %true when the timer has been armed or when no pending child
   * remains.  Returns %false when the current dispatch window is still open,
   * in which case the caller is expected to keep dispatching.
   *
   * With @force set, the timer is scheduled regardless of whether the first
   * pending dispatch time lies in the future, and the return value is always
   * %true.  Use this when the caller cannot dispatch by itself and has to
   * rely on pending_timer.  Forced scheduling is likely to add a short delay
   * before dispatch starts even if @sq->first_pending_disptime is not in the
   * future, so it should stay out of hot paths.
   */
  static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  					  bool force)
  {
  	/* no pending children: nothing to schedule */
  	if (sq->nr_pending == 0)
  		return true;

  	update_min_dispatch_time(sq);

  	/* window still open and not forced: caller should keep dispatching */
  	if (!force && !time_after(sq->first_pending_disptime, jiffies))
  		return false;

  	throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  	return true;
  }
32ee5bc47   Vivek Goyal   blk-throttle: Acc...
705
706
707
708
709
710
711
712
713
714
715
716
717
718
  /*
   * Begin a new slice for direction @rw, resetting the dispatched byte and
   * io counters.  The slice start is moved to @start only when @start is not
   * earlier than the current slice start; the slice end is always one
   * throttle slice from now.
   */
  static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  		bool rw, unsigned long start)
  {
  	tg->io_disp[rw] = 0;
  	tg->bytes_disp[rw] = 0;

  	/*
  	 * The previous slice has expired and would have been trimmed after
  	 * the last bio dispatch, so no bandwidth has been consumed since
  	 * its start.  Give that unused bandwidth back as credit by letting
  	 * the new slice begin at @start.
  	 */
  	if (time_after_eq(start, tg->slice_start[rw]))
  		tg->slice_start[rw] = start;

  	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;

  	throtl_log(&tg->service_queue,
  		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
0f3457f60   Tejun Heo   blk-throttle: add...
725
  /*
   * Begin a fresh slice for direction @rw: zero the dispatched byte and io
   * counters, start the slice now and end it one throttle slice later.
   */
  static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
  {
  	tg->io_disp[rw] = 0;
  	tg->bytes_disp[rw] = 0;

  	tg->slice_start[rw] = jiffies;
  	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;

  	throtl_log(&tg->service_queue,
  		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
0f3457f60   Tejun Heo   blk-throttle: add...
736
737
  /*
   * Set the end of @tg's slice for direction @rw to @jiffy_end rounded up
   * to a multiple of the throttle slice length.
   */
  static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  					unsigned long jiffy_end)
  {
  	unsigned long rounded_end = roundup(jiffy_end, tg->td->throtl_slice);

  	tg->slice_end[rw] = rounded_end;
  }
0f3457f60   Tejun Heo   blk-throttle: add...
741
742
  /*
   * Move the end of @tg's slice for direction @rw to @jiffy_end (rounded up
   * to a multiple of the throttle slice length) and log the new bounds.
   */
  static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  				       unsigned long jiffy_end)
  {
  	throtl_set_slice_end(tg, rw, jiffy_end);

  	throtl_log(&tg->service_queue,
  		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
  		   tg->slice_end[rw], jiffies);
  }
  
  /* Determine if previously allocated or extended slice is complete or not */
0f3457f60   Tejun Heo   blk-throttle: add...
752
  static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
e43473b7f   Vivek Goyal   blkio: Core imple...
753
754
  {
  	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
5cf8c2277   Fabian Frederick   block/blk-throttl...
755
  		return false;
e43473b7f   Vivek Goyal   blkio: Core imple...
756

0b6bad7d6   Chengguang Xu   blk-throttle: ret...
757
  	return true;
e43473b7f   Vivek Goyal   blkio: Core imple...
758
759
760
  }
  
  /* Trim the used slices and adjust slice start accordingly */
0f3457f60   Tejun Heo   blk-throttle: add...
761
  static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
e43473b7f   Vivek Goyal   blkio: Core imple...
762
  {
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
763
764
  	unsigned long nr_slices, time_elapsed, io_trim;
  	u64 bytes_trim, tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
765
766
767
768
769
770
771
772
  
  	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  
  	/*
  	 * If bps are unlimited (-1), then time slice don't get
  	 * renewed. Don't try to trim the slice if slice is used. A new
  	 * slice will start when appropriate.
  	 */
0f3457f60   Tejun Heo   blk-throttle: add...
773
  	if (throtl_slice_used(tg, rw))
e43473b7f   Vivek Goyal   blkio: Core imple...
774
  		return;
d1ae8ffdf   Vivek Goyal   blk-throttle: Tri...
775
776
777
778
779
780
781
  	/*
  	 * A bio has been dispatched. Also adjust slice_end. It might happen
  	 * that initially cgroup limit was very low resulting in high
  	 * slice_end, but later limit was bumped up and bio was dispached
  	 * sooner, then we need to reduce slice_end. A high bogus slice_end
  	 * is bad because it does not allow new slice to start.
  	 */
297e3d854   Shaohua Li   blk-throttle: mak...
782
  	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
d1ae8ffdf   Vivek Goyal   blk-throttle: Tri...
783

e43473b7f   Vivek Goyal   blkio: Core imple...
784
  	time_elapsed = jiffies - tg->slice_start[rw];
297e3d854   Shaohua Li   blk-throttle: mak...
785
  	nr_slices = time_elapsed / tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
786
787
788
  
  	if (!nr_slices)
  		return;
297e3d854   Shaohua Li   blk-throttle: mak...
789
  	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
790
791
  	do_div(tmp, HZ);
  	bytes_trim = tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
792

297e3d854   Shaohua Li   blk-throttle: mak...
793
794
  	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
  		HZ;
e43473b7f   Vivek Goyal   blkio: Core imple...
795

8e89d13f4   Vivek Goyal   blkio: Implementa...
796
  	if (!bytes_trim && !io_trim)
e43473b7f   Vivek Goyal   blkio: Core imple...
797
798
799
800
801
802
  		return;
  
  	if (tg->bytes_disp[rw] >= bytes_trim)
  		tg->bytes_disp[rw] -= bytes_trim;
  	else
  		tg->bytes_disp[rw] = 0;
8e89d13f4   Vivek Goyal   blkio: Implementa...
803
804
805
806
  	if (tg->io_disp[rw] >= io_trim)
  		tg->io_disp[rw] -= io_trim;
  	else
  		tg->io_disp[rw] = 0;
297e3d854   Shaohua Li   blk-throttle: mak...
807
  	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
808

fda6f272c   Tejun Heo   blk-throttle: imp...
809
810
811
812
  	throtl_log(&tg->service_queue,
  		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
  		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
e43473b7f   Vivek Goyal   blkio: Core imple...
813
  }
0f3457f60   Tejun Heo   blk-throttle: add...
814
815
  static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
  				  unsigned long *wait)
e43473b7f   Vivek Goyal   blkio: Core imple...
816
817
  {
  	bool rw = bio_data_dir(bio);
8e89d13f4   Vivek Goyal   blkio: Implementa...
818
  	unsigned int io_allowed;
e43473b7f   Vivek Goyal   blkio: Core imple...
819
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
c49c06e49   Vivek Goyal   blkio-throttle: F...
820
  	u64 tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
821

3a10f999f   Konstantin Khlebnikov   blk-throttle: fix...
822
  	jiffy_elapsed = jiffies - tg->slice_start[rw];
e43473b7f   Vivek Goyal   blkio: Core imple...
823

3a10f999f   Konstantin Khlebnikov   blk-throttle: fix...
824
825
  	/* Round up to the next throttle slice, wait time must be nonzero */
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
8e89d13f4   Vivek Goyal   blkio: Implementa...
826

c49c06e49   Vivek Goyal   blkio-throttle: F...
827
828
829
830
831
832
  	/*
  	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
  	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
  	 * will allow dispatch after 1 second and after that slice should
  	 * have been trimmed.
  	 */
9f626e372   Shaohua Li   blk-throttle: pre...
833
  	tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
c49c06e49   Vivek Goyal   blkio-throttle: F...
834
835
836
837
838
839
  	do_div(tmp, HZ);
  
  	if (tmp > UINT_MAX)
  		io_allowed = UINT_MAX;
  	else
  		io_allowed = tmp;
8e89d13f4   Vivek Goyal   blkio: Implementa...
840
841
  
  	if (tg->io_disp[rw] + 1 <= io_allowed) {
e43473b7f   Vivek Goyal   blkio: Core imple...
842
843
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
844
  		return true;
e43473b7f   Vivek Goyal   blkio: Core imple...
845
  	}
8e89d13f4   Vivek Goyal   blkio: Implementa...
846
  	/* Calc approx time to dispatch */
991f61fe7   Liu Bo   Blk-throttle: red...
847
  	jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
8e89d13f4   Vivek Goyal   blkio: Implementa...
848
849
850
  
  	if (wait)
  		*wait = jiffy_wait;
0b6bad7d6   Chengguang Xu   blk-throttle: ret...
851
  	return false;
8e89d13f4   Vivek Goyal   blkio: Implementa...
852
  }
0f3457f60   Tejun Heo   blk-throttle: add...
853
854
  static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
  				 unsigned long *wait)
8e89d13f4   Vivek Goyal   blkio: Implementa...
855
856
  {
  	bool rw = bio_data_dir(bio);
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
857
  	u64 bytes_allowed, extra_bytes, tmp;
8e89d13f4   Vivek Goyal   blkio: Implementa...
858
  	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
859
  	unsigned int bio_size = throtl_bio_data_size(bio);
e43473b7f   Vivek Goyal   blkio: Core imple...
860
861
862
863
864
  
  	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  
  	/* Slice has just started. Consider one slice interval */
  	if (!jiffy_elapsed)
297e3d854   Shaohua Li   blk-throttle: mak...
865
  		jiffy_elapsed_rnd = tg->td->throtl_slice;
e43473b7f   Vivek Goyal   blkio: Core imple...
866

297e3d854   Shaohua Li   blk-throttle: mak...
867
  	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
e43473b7f   Vivek Goyal   blkio: Core imple...
868

9f626e372   Shaohua Li   blk-throttle: pre...
869
  	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
5e901a2b9   Vivek Goyal   blkio-throttle: T...
870
  	do_div(tmp, HZ);
3aad5d3ee   Vivek Goyal   blkio-throttle: F...
871
  	bytes_allowed = tmp;
e43473b7f   Vivek Goyal   blkio: Core imple...
872

ea0ea2bc6   Shaohua Li   blk-throttle: cap...
873
  	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
e43473b7f   Vivek Goyal   blkio: Core imple...
874
875
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
876
  		return true;
e43473b7f   Vivek Goyal   blkio: Core imple...
877
878
879
  	}
  
  	/* Calc approx time to dispatch */
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
880
  	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
9f626e372   Shaohua Li   blk-throttle: pre...
881
  	jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
e43473b7f   Vivek Goyal   blkio: Core imple...
882
883
884
885
886
887
888
889
890
  
  	if (!jiffy_wait)
  		jiffy_wait = 1;
  
  	/*
  	 * This wait time is without taking into consideration the rounding
  	 * up we did. Add that time also.
  	 */
  	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
e43473b7f   Vivek Goyal   blkio: Core imple...
891
892
  	if (wait)
  		*wait = jiffy_wait;
0b6bad7d6   Chengguang Xu   blk-throttle: ret...
893
  	return false;
8e89d13f4   Vivek Goyal   blkio: Implementa...
894
895
896
897
898
899
  }
  
  /*
   * Returns whether one can dispatch a bio or not. Also returns approx number
   * of jiffies to wait before this bio is with-in IO rate and can be dispatched
   */
0f3457f60   Tejun Heo   blk-throttle: add...
900
901
  static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  			    unsigned long *wait)
8e89d13f4   Vivek Goyal   blkio: Implementa...
902
903
904
905
906
907
908
909
910
911
  {
  	bool rw = bio_data_dir(bio);
  	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  
  	/*
   	 * Currently whole state machine of group depends on first bio
  	 * queued in the group bio list. So one should not be calling
  	 * this function with a different bio if there are other bios
  	 * queued.
  	 */
73f0d49a9   Tejun Heo   blk-throttle: mov...
912
  	BUG_ON(tg->service_queue.nr_queued[rw] &&
c5cc2070b   Tejun Heo   blk-throttle: add...
913
  	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
e43473b7f   Vivek Goyal   blkio: Core imple...
914

8e89d13f4   Vivek Goyal   blkio: Implementa...
915
  	/* If tg->bps = -1, then BW is unlimited */
9f626e372   Shaohua Li   blk-throttle: pre...
916
917
  	if (tg_bps_limit(tg, rw) == U64_MAX &&
  	    tg_iops_limit(tg, rw) == UINT_MAX) {
8e89d13f4   Vivek Goyal   blkio: Implementa...
918
919
  		if (wait)
  			*wait = 0;
5cf8c2277   Fabian Frederick   block/blk-throttl...
920
  		return true;
8e89d13f4   Vivek Goyal   blkio: Implementa...
921
922
923
924
925
  	}
  
  	/*
  	 * If previous slice expired, start a new one otherwise renew/extend
  	 * existing slice to make sure it is at least throtl_slice interval
164c80ed8   Vivek Goyal   blk-throttle: Ext...
926
927
928
  	 * long since now. New slice is started only for empty throttle group.
  	 * If there is queued bio, that means there should be an active
  	 * slice and it should be extended instead.
8e89d13f4   Vivek Goyal   blkio: Implementa...
929
  	 */
164c80ed8   Vivek Goyal   blk-throttle: Ext...
930
  	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
0f3457f60   Tejun Heo   blk-throttle: add...
931
  		throtl_start_new_slice(tg, rw);
8e89d13f4   Vivek Goyal   blkio: Implementa...
932
  	else {
297e3d854   Shaohua Li   blk-throttle: mak...
933
934
935
936
  		if (time_before(tg->slice_end[rw],
  		    jiffies + tg->td->throtl_slice))
  			throtl_extend_slice(tg, rw,
  				jiffies + tg->td->throtl_slice);
8e89d13f4   Vivek Goyal   blkio: Implementa...
937
  	}
0f3457f60   Tejun Heo   blk-throttle: add...
938
939
  	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
  	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
8e89d13f4   Vivek Goyal   blkio: Implementa...
940
941
  		if (wait)
  			*wait = 0;
0b6bad7d6   Chengguang Xu   blk-throttle: ret...
942
  		return true;
8e89d13f4   Vivek Goyal   blkio: Implementa...
943
944
945
946
947
948
949
950
  	}
  
  	max_wait = max(bps_wait, iops_wait);
  
  	if (wait)
  		*wait = max_wait;
  
  	if (time_before(tg->slice_end[rw], jiffies + max_wait))
0f3457f60   Tejun Heo   blk-throttle: add...
951
  		throtl_extend_slice(tg, rw, jiffies + max_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
952

0b6bad7d6   Chengguang Xu   blk-throttle: ret...
953
  	return false;
e43473b7f   Vivek Goyal   blkio: Core imple...
954
955
956
957
958
  }
  
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
  	bool rw = bio_data_dir(bio);
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
959
  	unsigned int bio_size = throtl_bio_data_size(bio);
e43473b7f   Vivek Goyal   blkio: Core imple...
960
961
  
  	/* Charge the bio to the group */
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
962
  	tg->bytes_disp[rw] += bio_size;
8e89d13f4   Vivek Goyal   blkio: Implementa...
963
  	tg->io_disp[rw]++;
ea0ea2bc6   Shaohua Li   blk-throttle: cap...
964
  	tg->last_bytes_disp[rw] += bio_size;
3f0abd806   Shaohua Li   blk-throttle: add...
965
  	tg->last_io_disp[rw]++;
e43473b7f   Vivek Goyal   blkio: Core imple...
966

2a0f61e6e   Tejun Heo   blk-throttle: set...
967
  	/*
8d2bbd4c8   Christoph Hellwig   block: replace RE...
968
  	 * BIO_THROTTLED is used to prevent the same bio to be throttled
2a0f61e6e   Tejun Heo   blk-throttle: set...
969
970
971
  	 * more than once as a throttled bio will go through blk-throtl the
  	 * second time when it eventually gets issued.  Set it when a bio
  	 * is being charged to a tg.
2a0f61e6e   Tejun Heo   blk-throttle: set...
972
  	 */
8d2bbd4c8   Christoph Hellwig   block: replace RE...
973
974
  	if (!bio_flagged(bio, BIO_THROTTLED))
  		bio_set_flag(bio, BIO_THROTTLED);
e43473b7f   Vivek Goyal   blkio: Core imple...
975
  }
c5cc2070b   Tejun Heo   blk-throttle: add...
976
977
978
979
980
981
982
983
984
985
986
  /**
   * throtl_add_bio_tg - add a bio to the specified throtl_grp
   * @bio: bio to add
   * @qn: qnode to use
   * @tg: the target throtl_grp
   *
   * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
   * tg->qnode_on_self[] is used.
   */
  static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  			      struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
987
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
988
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
989
  	bool rw = bio_data_dir(bio);
c5cc2070b   Tejun Heo   blk-throttle: add...
990
991
  	if (!qn)
  		qn = &tg->qnode_on_self[rw];
0e9f4164b   Tejun Heo   blk-throttle: gen...
992
993
994
995
996
997
998
999
  	/*
  	 * If @tg doesn't currently have any bios queued in the same
  	 * direction, queueing @bio can change when @tg should be
  	 * dispatched.  Mark that @tg was empty.  This is automatically
  	 * cleaered on the next tg_update_disptime().
  	 */
  	if (!sq->nr_queued[rw])
  		tg->flags |= THROTL_TG_WAS_EMPTY;
c5cc2070b   Tejun Heo   blk-throttle: add...
1000
  	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
73f0d49a9   Tejun Heo   blk-throttle: mov...
1001
  	sq->nr_queued[rw]++;
77216b048   Tejun Heo   blk-throttle: add...
1002
  	throtl_enqueue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1003
  }
77216b048   Tejun Heo   blk-throttle: add...
1004
  static void tg_update_disptime(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
1005
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1006
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1007
1008
  	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  	struct bio *bio;
d609af3a1   Markus Elfring   blk-throttle: Adj...
1009
1010
  	bio = throtl_peek_queued(&sq->queued[READ]);
  	if (bio)
0f3457f60   Tejun Heo   blk-throttle: add...
1011
  		tg_may_dispatch(tg, bio, &read_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
1012

d609af3a1   Markus Elfring   blk-throttle: Adj...
1013
1014
  	bio = throtl_peek_queued(&sq->queued[WRITE]);
  	if (bio)
0f3457f60   Tejun Heo   blk-throttle: add...
1015
  		tg_may_dispatch(tg, bio, &write_wait);
e43473b7f   Vivek Goyal   blkio: Core imple...
1016
1017
1018
  
  	min_wait = min(read_wait, write_wait);
  	disptime = jiffies + min_wait;
e43473b7f   Vivek Goyal   blkio: Core imple...
1019
  	/* Update dispatch time */
77216b048   Tejun Heo   blk-throttle: add...
1020
  	throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1021
  	tg->disptime = disptime;
77216b048   Tejun Heo   blk-throttle: add...
1022
  	throtl_enqueue_tg(tg);
0e9f4164b   Tejun Heo   blk-throttle: gen...
1023
1024
1025
  
  	/* see throtl_add_bio_tg() */
  	tg->flags &= ~THROTL_TG_WAS_EMPTY;
e43473b7f   Vivek Goyal   blkio: Core imple...
1026
  }
32ee5bc47   Vivek Goyal   blk-throttle: Acc...
1027
1028
1029
1030
1031
1032
1033
1034
1035
  /*
   * If @parent_tg's slice for direction @rw is used up, start a new one
   * with credit, passing @child_tg's slice start as the candidate start.
   */
  static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  					struct throtl_grp *parent_tg, bool rw)
  {
  	if (!throtl_slice_used(parent_tg, rw))
  		return;

  	throtl_start_new_slice_with_credit(parent_tg, rw,
  					   child_tg->slice_start[rw]);
  }
77216b048   Tejun Heo   blk-throttle: add...
1036
  /*
   * Pop the first queued bio of direction @rw from @tg, charge it to @tg and
   * hand it one level up: to the parent throtl_grp if there is one,
   * otherwise onto the parent service_queue's queued list.
   */
  static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct throtl_service_queue *parent_sq = sq->parent_sq;
  	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  	struct throtl_grp *deferred_put = NULL;
  	struct bio *bio;

  	/*
  	 * @bio moves from @tg to @parent_sq.  Popping it may drop a tg
  	 * reference, which could release @parent_sq too early.  Remember
  	 * the tg to put and only put it once @bio has been transferred.
  	 */
  	bio = throtl_pop_queued(&sq->queued[rw], &deferred_put);
  	sq->nr_queued[rw]--;

  	throtl_charge_bio(tg, bio);

  	/*
  	 * With no parent tg, the parent is @td->service_queue and @bio is
  	 * ready to be issued: place it on the parent's queued list and
  	 * decrease the total number queued; the caller issues these bios.
  	 * Otherwise just pass @bio on to the parent tg via
  	 * throtl_add_bio_tg().
  	 */
  	if (!parent_tg) {
  		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
  				     &parent_sq->queued[rw]);
  		BUG_ON(tg->td->nr_queued[rw] <= 0);
  		tg->td->nr_queued[rw]--;
  	} else {
  		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
  		start_parent_slice_with_credit(tg, parent_tg, rw);
  	}

  	throtl_trim_slice(tg, rw);

  	if (deferred_put)
  		blkg_put(tg_to_blkg(deferred_put));
  }
77216b048   Tejun Heo   blk-throttle: add...
1076
  static int throtl_dispatch_tg(struct throtl_grp *tg)
e43473b7f   Vivek Goyal   blkio: Core imple...
1077
  {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1078
  	struct throtl_service_queue *sq = &tg->service_queue;
e43473b7f   Vivek Goyal   blkio: Core imple...
1079
1080
  	unsigned int nr_reads = 0, nr_writes = 0;
  	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
c2f6805d4   Vivek Goyal   blk-throttle: Fix...
1081
  	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
e43473b7f   Vivek Goyal   blkio: Core imple...
1082
1083
1084
  	struct bio *bio;
  
  	/* Try to dispatch 75% READS and 25% WRITES */
c5cc2070b   Tejun Heo   blk-throttle: add...
1085
  	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
0f3457f60   Tejun Heo   blk-throttle: add...
1086
  	       tg_may_dispatch(tg, bio, NULL)) {
e43473b7f   Vivek Goyal   blkio: Core imple...
1087

77216b048   Tejun Heo   blk-throttle: add...
1088
  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7f   Vivek Goyal   blkio: Core imple...
1089
1090
1091
1092
1093
  		nr_reads++;
  
  		if (nr_reads >= max_nr_reads)
  			break;
  	}
c5cc2070b   Tejun Heo   blk-throttle: add...
1094
  	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
0f3457f60   Tejun Heo   blk-throttle: add...
1095
  	       tg_may_dispatch(tg, bio, NULL)) {
e43473b7f   Vivek Goyal   blkio: Core imple...
1096

77216b048   Tejun Heo   blk-throttle: add...
1097
  		tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7f   Vivek Goyal   blkio: Core imple...
1098
1099
1100
1101
1102
1103
1104
1105
  		nr_writes++;
  
  		if (nr_writes >= max_nr_writes)
  			break;
  	}
  
  	return nr_reads + nr_writes;
  }
651930bc1   Tejun Heo   blk-throttle: dis...
1106
  static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
e43473b7f   Vivek Goyal   blkio: Core imple...
1107
1108
  {
  	unsigned int nr_disp = 0;
e43473b7f   Vivek Goyal   blkio: Core imple...
1109
1110
  
  	while (1) {
73f0d49a9   Tejun Heo   blk-throttle: mov...
1111
  		struct throtl_grp *tg = throtl_rb_first(parent_sq);
2ab74cd29   Liu Bo   blk-throttle: fix...
1112
  		struct throtl_service_queue *sq;
e43473b7f   Vivek Goyal   blkio: Core imple...
1113
1114
1115
1116
1117
1118
  
  		if (!tg)
  			break;
  
  		if (time_before(jiffies, tg->disptime))
  			break;
77216b048   Tejun Heo   blk-throttle: add...
1119
  		throtl_dequeue_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1120

77216b048   Tejun Heo   blk-throttle: add...
1121
  		nr_disp += throtl_dispatch_tg(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1122

2ab74cd29   Liu Bo   blk-throttle: fix...
1123
  		sq = &tg->service_queue;
73f0d49a9   Tejun Heo   blk-throttle: mov...
1124
  		if (sq->nr_queued[0] || sq->nr_queued[1])
77216b048   Tejun Heo   blk-throttle: add...
1125
  			tg_update_disptime(tg);
e43473b7f   Vivek Goyal   blkio: Core imple...
1126
1127
1128
1129
1130
1131
1132
  
  		if (nr_disp >= throtl_quantum)
  			break;
  	}
  
  	return nr_disp;
  }
c79892c55   Shaohua Li   blk-throttle: add...
1133
1134
  /* Forward declaration: called by throtl_pending_timer_fn() below. */
  static bool throtl_can_upgrade(struct throtl_data *td,
  	struct throtl_grp *this_tg);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1135
1136
  /**
   * throtl_pending_timer_fn - timer function for service_queue->pending_timer
216382dcc   Bart Van Assche   block: Fix throtl...
1137
   * @t: the pending_timer member of the throtl_service_queue being serviced
6e1a5704c   Tejun Heo   blk-throttle: dis...
1138
1139
1140
1141
   *
   * This timer is armed when a child throtl_grp with active bio's become
   * pending and queued on the service_queue's pending_tree and expires when
   * the first child throtl_grp should be dispatched.  This function
2e48a530a   Tejun Heo   blk-throttle: mak...
1142
1143
1144
1145
1146
1147
1148
   * dispatches bio's from the children throtl_grps to the parent
   * service_queue.
   *
   * If the parent's parent is another throtl_grp, dispatching is propagated
   * by either arming its pending_timer or repeating dispatch directly.  If
   * the top-level service_tree is reached, throtl_data->dispatch_work is
   * kicked so that the ready bio's are issued.
6e1a5704c   Tejun Heo   blk-throttle: dis...
1149
   */
e99e88a9d   Kees Cook   treewide: setup_t...
1150
  static void throtl_pending_timer_fn(struct timer_list *t)
69df0ab03   Tejun Heo   blk-throttle: sep...
1151
  {
e99e88a9d   Kees Cook   treewide: setup_t...
1152
  	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
2e48a530a   Tejun Heo   blk-throttle: mak...
1153
  	struct throtl_grp *tg = sq_to_tg(sq);
69df0ab03   Tejun Heo   blk-throttle: sep...
1154
  	struct throtl_data *td = sq_to_td(sq);
cb76199c3   Tejun Heo   blk-throttle: col...
1155
  	struct request_queue *q = td->queue;
2e48a530a   Tejun Heo   blk-throttle: mak...
1156
1157
  	struct throtl_service_queue *parent_sq;
  	bool dispatched;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1158
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
1159

0d945c1f9   Christoph Hellwig   block: remove the...
1160
  	spin_lock_irq(&q->queue_lock);
c79892c55   Shaohua Li   blk-throttle: add...
1161
1162
  	if (throtl_can_upgrade(td, NULL))
  		throtl_upgrade_state(td);
2e48a530a   Tejun Heo   blk-throttle: mak...
1163
1164
1165
  again:
  	parent_sq = sq->parent_sq;
  	dispatched = false;
e43473b7f   Vivek Goyal   blkio: Core imple...
1166

7f52f98c2   Tejun Heo   blk-throttle: imp...
1167
1168
  	while (true) {
  		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
2e48a530a   Tejun Heo   blk-throttle: mak...
1169
1170
  			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
  			   sq->nr_queued[READ], sq->nr_queued[WRITE]);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1171
1172
1173
  
  		ret = throtl_select_dispatch(sq);
  		if (ret) {
7f52f98c2   Tejun Heo   blk-throttle: imp...
1174
1175
1176
  			throtl_log(sq, "bios disp=%u", ret);
  			dispatched = true;
  		}
e43473b7f   Vivek Goyal   blkio: Core imple...
1177

7f52f98c2   Tejun Heo   blk-throttle: imp...
1178
1179
  		if (throtl_schedule_next_dispatch(sq, false))
  			break;
e43473b7f   Vivek Goyal   blkio: Core imple...
1180

7f52f98c2   Tejun Heo   blk-throttle: imp...
1181
  		/* this dispatch windows is still open, relax and repeat */
0d945c1f9   Christoph Hellwig   block: remove the...
1182
  		spin_unlock_irq(&q->queue_lock);
7f52f98c2   Tejun Heo   blk-throttle: imp...
1183
  		cpu_relax();
0d945c1f9   Christoph Hellwig   block: remove the...
1184
  		spin_lock_irq(&q->queue_lock);
651930bc1   Tejun Heo   blk-throttle: dis...
1185
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1186

2e48a530a   Tejun Heo   blk-throttle: mak...
1187
1188
  	if (!dispatched)
  		goto out_unlock;
6e1a5704c   Tejun Heo   blk-throttle: dis...
1189

2e48a530a   Tejun Heo   blk-throttle: mak...
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
  	if (parent_sq) {
  		/* @parent_sq is another throl_grp, propagate dispatch */
  		if (tg->flags & THROTL_TG_WAS_EMPTY) {
  			tg_update_disptime(tg);
  			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
  				/* window is already open, repeat dispatching */
  				sq = parent_sq;
  				tg = sq_to_tg(sq);
  				goto again;
  			}
  		}
  	} else {
  		/* reached the top-level, queue issueing */
  		queue_work(kthrotld_workqueue, &td->dispatch_work);
  	}
  out_unlock:
0d945c1f9   Christoph Hellwig   block: remove the...
1206
  	spin_unlock_irq(&q->queue_lock);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1207
  }
e43473b7f   Vivek Goyal   blkio: Core imple...
1208

6e1a5704c   Tejun Heo   blk-throttle: dis...
1209
1210
1211
1212
1213
1214
1215
1216
  /**
   * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
   * @work: work item being executed
   *
   * This function is queued for execution when bio's reach the bio_lists[]
   * of throtl_data->service_queue.  Those bio's are ready and issued by this
   * function.
   */
8876e140e   Fabian Frederick   block/blk-throttl...
1217
  static void blk_throtl_dispatch_work_fn(struct work_struct *work)
6e1a5704c   Tejun Heo   blk-throttle: dis...
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
  {
  	struct throtl_data *td = container_of(work, struct throtl_data,
  					      dispatch_work);
  	struct throtl_service_queue *td_sq = &td->service_queue;
  	struct request_queue *q = td->queue;
  	struct bio_list bio_list_on_stack;
  	struct bio *bio;
  	struct blk_plug plug;
  	int rw;
  
  	bio_list_init(&bio_list_on_stack);
0d945c1f9   Christoph Hellwig   block: remove the...
1229
  	spin_lock_irq(&q->queue_lock);
c5cc2070b   Tejun Heo   blk-throttle: add...
1230
1231
1232
  	for (rw = READ; rw <= WRITE; rw++)
  		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
  			bio_list_add(&bio_list_on_stack, bio);
0d945c1f9   Christoph Hellwig   block: remove the...
1233
  	spin_unlock_irq(&q->queue_lock);
6e1a5704c   Tejun Heo   blk-throttle: dis...
1234
1235
  
  	if (!bio_list_empty(&bio_list_on_stack)) {
69d60eb96   Vivek Goyal   blk-throttle: Use...
1236
  		blk_start_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1237
1238
  		while((bio = bio_list_pop(&bio_list_on_stack)))
  			generic_make_request(bio);
69d60eb96   Vivek Goyal   blk-throttle: Use...
1239
  		blk_finish_plug(&plug);
e43473b7f   Vivek Goyal   blkio: Core imple...
1240
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
1241
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1242
1243
  static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  			      int off)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1244
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1245
1246
  	struct throtl_grp *tg = pd_to_tg(pd);
  	u64 v = *(u64 *)((void *)tg + off);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1247

2ab5492de   Shaohua Li   blk-throttle: use...
1248
  	if (v == U64_MAX)
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1249
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1250
  	return __blkg_prfill_u64(sf, pd, v);
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1251
  }
f95a04afa   Tejun Heo   blkcg: embed stru...
1252
1253
  static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
  			       int off)
e43473b7f   Vivek Goyal   blkio: Core imple...
1254
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
1255
1256
  	struct throtl_grp *tg = pd_to_tg(pd);
  	unsigned int v = *(unsigned int *)((void *)tg + off);
fe0714377   Vivek Goyal   blkio: Recalculat...
1257

2ab5492de   Shaohua Li   blk-throttle: use...
1258
  	if (v == UINT_MAX)
af133ceb2   Tejun Heo   blkcg: move blkio...
1259
  		return 0;
f95a04afa   Tejun Heo   blkcg: embed stru...
1260
  	return __blkg_prfill_u64(sf, pd, v);
e43473b7f   Vivek Goyal   blkio: Core imple...
1261
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1262
  static int tg_print_conf_u64(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1263
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1264
1265
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1266
  	return 0;
8e89d13f4   Vivek Goyal   blkio: Implementa...
1267
  }
2da8ca822   Tejun Heo   cgroup: replace c...
1268
  static int tg_print_conf_uint(struct seq_file *sf, void *v)
8e89d13f4   Vivek Goyal   blkio: Implementa...
1269
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1270
1271
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb2   Tejun Heo   blkcg: move blkio...
1272
  	return 0;
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1273
  }
9bb67aeb9   Shaohua Li   blk-throttle: res...
1274
  /*
   * Apply a configuration change made to @tg.  When @global is true the
   * subtree walk starts at the queue's root blkg instead of @tg's own blkg.
   */
  static void tg_conf_updated(struct throtl_grp *tg, bool global)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	struct blkcg_gq *top = global ? tg->td->queue->root_blkg :
  					tg_to_blkg(tg);
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;

  	throtl_log(&tg->service_queue,
  		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
  		   tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
  		   tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));

  	/*
  	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
  	 * considered to have rules if either the tg itself or any of its
  	 * ancestors has rules.  This identifies groups without any
  	 * restrictions in the whole hierarchy and allows them to bypass
  	 * blk-throttle.
  	 */
  	blkg_for_each_descendant_pre(blkg, pos_css, top) {
  		struct throtl_grp *this_tg = blkg_to_tg(blkg);

  		tg_update_has_rules(this_tg);

  		/* root and second-level groups are left untouched below */
  		if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent &&
  		    blkg->parent->parent) {
  			struct throtl_grp *parent_tg = blkg_to_tg(blkg->parent);

  			/*
  			 * make sure all children have a lower idle time
  			 * threshold and a higher latency target
  			 */
  			this_tg->idletime_threshold =
  				min(this_tg->idletime_threshold,
  				    parent_tg->idletime_threshold);
  			this_tg->latency_target =
  				max(this_tg->latency_target,
  				    parent_tg->latency_target);
  		}
  	}

  	/*
  	 * We're already holding queue_lock and know @tg is valid.  Let's
  	 * apply the new config directly.
  	 *
  	 * Restart the slices for both READ and WRITES. It might happen
  	 * that a group's limits are dropped suddenly and we don't want to
  	 * account recently dispatched IO with the new low rate.
  	 */
  	throtl_start_new_slice(tg, 0);
  	throtl_start_new_slice(tg, 1);

  	if (!(tg->flags & THROTL_TG_PENDING))
  		return;

  	tg_update_disptime(tg);
  	throtl_schedule_next_dispatch(sq->parent_sq, true);
  }
  
  /*
   * Common write path for the legacy per-device limit files.  Parses one
   * decimal value from the body left by blkg_conf_prep() and stores it in
   * the throtl_grp field at offset of_cft(of)->private, as a u64 when
   * @is_u64 is true and as an unsigned int otherwise.  A written 0 is stored
   * as U64_MAX.
   *
   * Returns @nbytes on success, -EINVAL if no number could be parsed, or the
   * error returned by blkg_conf_prep().
   */
  static ssize_t tg_set_conf(struct kernfs_open_file *of,
  			   char *buf, size_t nbytes, loff_t off, bool is_u64)
  {
  	struct blkcg *blkcg = css_to_blkcg(of_css(of));
  	struct blkg_conf_ctx ctx;
  	struct throtl_grp *tg;
  	void *field;
  	u64 limit;
  	int ret;

  	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
  	if (ret)
  		return ret;

  	if (sscanf(ctx.body, "%llu", &limit) != 1) {
  		ret = -EINVAL;
  		goto out_finish;
  	}
  	if (!limit)
  		limit = U64_MAX;

  	tg = blkg_to_tg(ctx.blkg);
  	field = (void *)tg + of_cft(of)->private;

  	if (is_u64)
  		*(u64 *)field = limit;
  	else
  		*(unsigned int *)field = limit;

  	tg_conf_updated(tg, false);
  	ret = 0;
  out_finish:
  	blkg_conf_finish(&ctx);
  	return ret ?: nbytes;
  }
451af504d   Tejun Heo   cgroup: replace c...
1362
1363
  /* Write handler for the u64 legacy limit files; see tg_set_conf(). */
  static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
  			       char *buf, size_t nbytes, loff_t off)
  {
  	const bool is_u64 = true;

  	return tg_set_conf(of, buf, nbytes, off, is_u64);
  }
451af504d   Tejun Heo   cgroup: replace c...
1367
1368
  /* Write handler for the unsigned int legacy limit files; see tg_set_conf(). */
  static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	const bool is_u64 = false;

  	return tg_set_conf(of, buf, nbytes, off, is_u64);
  }
880f50e22   Tejun Heo   blkcg: mark exist...
1372
  static struct cftype throtl_legacy_files[] = {
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1373
1374
  	{
  		.name = "throttle.read_bps_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1375
  		.private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1376
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1377
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1378
1379
1380
  	},
  	{
  		.name = "throttle.write_bps_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1381
  		.private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1382
  		.seq_show = tg_print_conf_u64,
451af504d   Tejun Heo   cgroup: replace c...
1383
  		.write = tg_set_conf_u64,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1384
1385
1386
  	},
  	{
  		.name = "throttle.read_iops_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1387
  		.private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1388
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1389
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1390
1391
1392
  	},
  	{
  		.name = "throttle.write_iops_device",
9f626e372   Shaohua Li   blk-throttle: pre...
1393
  		.private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
2da8ca822   Tejun Heo   cgroup: replace c...
1394
  		.seq_show = tg_print_conf_uint,
451af504d   Tejun Heo   cgroup: replace c...
1395
  		.write = tg_set_conf_uint,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1396
1397
1398
  	},
  	{
  		.name = "throttle.io_service_bytes",
77ea73388   Tejun Heo   blkcg: move io_se...
1399
1400
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_bytes,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1401
1402
  	},
  	{
17534c6f2   weiping zhang   blk-throttle: exp...
1403
1404
1405
1406
1407
  		.name = "throttle.io_service_bytes_recursive",
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_bytes_recursive,
  	},
  	{
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1408
  		.name = "throttle.io_serviced",
77ea73388   Tejun Heo   blkcg: move io_se...
1409
1410
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_ios,
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1411
  	},
17534c6f2   weiping zhang   blk-throttle: exp...
1412
1413
1414
1415
1416
  	{
  		.name = "throttle.io_serviced_recursive",
  		.private = (unsigned long)&blkcg_policy_throtl,
  		.seq_show = blkg_print_stat_ios_recursive,
  	},
60c2bc2d5   Tejun Heo   blkcg: move conf/...
1417
1418
  	{ }	/* terminate */
  };
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1419
  static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
2ee867dcf   Tejun Heo   blkcg: implement ...
1420
1421
1422
1423
1424
  			 int off)
  {
  	struct throtl_grp *tg = pd_to_tg(pd);
  	const char *dname = blkg_dev_name(pd->blkg);
  	char bufs[4][21] = { "max", "max", "max", "max" };
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1425
1426
  	u64 bps_dft;
  	unsigned int iops_dft;
ada75b6e5   Shaohua Li   blk-throttle: add...
1427
  	char idle_time[26] = "";
ec80991d6   Shaohua Li   blk-throttle: add...
1428
  	char latency_time[26] = "";
2ee867dcf   Tejun Heo   blkcg: implement ...
1429
1430
1431
  
  	if (!dname)
  		return 0;
9f626e372   Shaohua Li   blk-throttle: pre...
1432

cd5ab1b0f   Shaohua Li   blk-throttle: add...
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
  	if (off == LIMIT_LOW) {
  		bps_dft = 0;
  		iops_dft = 0;
  	} else {
  		bps_dft = U64_MAX;
  		iops_dft = UINT_MAX;
  	}
  
  	if (tg->bps_conf[READ][off] == bps_dft &&
  	    tg->bps_conf[WRITE][off] == bps_dft &&
  	    tg->iops_conf[READ][off] == iops_dft &&
ada75b6e5   Shaohua Li   blk-throttle: add...
1444
  	    tg->iops_conf[WRITE][off] == iops_dft &&
ec80991d6   Shaohua Li   blk-throttle: add...
1445
  	    (off != LIMIT_LOW ||
b4f428ef2   Shaohua Li   blk-throttle: for...
1446
  	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
5b81fc3cc   Shaohua Li   blk-throttle: add...
1447
  	      tg->latency_target_conf == DFL_LATENCY_TARGET)))
2ee867dcf   Tejun Heo   blkcg: implement ...
1448
  		return 0;
9bb67aeb9   Shaohua Li   blk-throttle: res...
1449
  	if (tg->bps_conf[READ][off] != U64_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1450
  		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1451
  			tg->bps_conf[READ][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1452
  	if (tg->bps_conf[WRITE][off] != U64_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1453
  		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1454
  			tg->bps_conf[WRITE][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1455
  	if (tg->iops_conf[READ][off] != UINT_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1456
  		snprintf(bufs[2], sizeof(bufs[2]), "%u",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1457
  			tg->iops_conf[READ][off]);
9bb67aeb9   Shaohua Li   blk-throttle: res...
1458
  	if (tg->iops_conf[WRITE][off] != UINT_MAX)
9f626e372   Shaohua Li   blk-throttle: pre...
1459
  		snprintf(bufs[3], sizeof(bufs[3]), "%u",
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1460
  			tg->iops_conf[WRITE][off]);
ada75b6e5   Shaohua Li   blk-throttle: add...
1461
  	if (off == LIMIT_LOW) {
5b81fc3cc   Shaohua Li   blk-throttle: add...
1462
  		if (tg->idletime_threshold_conf == ULONG_MAX)
ada75b6e5   Shaohua Li   blk-throttle: add...
1463
1464
1465
  			strcpy(idle_time, " idle=max");
  		else
  			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
5b81fc3cc   Shaohua Li   blk-throttle: add...
1466
  				tg->idletime_threshold_conf);
ec80991d6   Shaohua Li   blk-throttle: add...
1467

5b81fc3cc   Shaohua Li   blk-throttle: add...
1468
  		if (tg->latency_target_conf == ULONG_MAX)
ec80991d6   Shaohua Li   blk-throttle: add...
1469
1470
1471
  			strcpy(latency_time, " latency=max");
  		else
  			snprintf(latency_time, sizeof(latency_time),
5b81fc3cc   Shaohua Li   blk-throttle: add...
1472
  				" latency=%lu", tg->latency_target_conf);
ada75b6e5   Shaohua Li   blk-throttle: add...
1473
  	}
2ee867dcf   Tejun Heo   blkcg: implement ...
1474

ec80991d6   Shaohua Li   blk-throttle: add...
1475
1476
1477
1478
  	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s
  ",
  		   dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
  		   latency_time);
2ee867dcf   Tejun Heo   blkcg: implement ...
1479
1480
  	return 0;
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1481
  static int tg_print_limit(struct seq_file *sf, void *v)
2ee867dcf   Tejun Heo   blkcg: implement ...
1482
  {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1483
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
2ee867dcf   Tejun Heo   blkcg: implement ...
1484
1485
1486
  			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
  	return 0;
  }
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1487
  /*
   * tg_set_limit - write handler shared by the "low" and "max" files
   *
   * The limit set being written (LIMIT_LOW or LIMIT_MAX) comes from the
   * cftype's ->private and is held in @index.  The body left by
   * blkg_conf_prep() is parsed as whitespace-separated key=value tokens:
   * rbps, wbps, riops, wiops and, for the low set only, idle and latency.
   * Each value is either a non-zero decimal number or "max".
   *
   * @off is the offset argument of the write callback.  It is not a limit
   * index and must not be used to tell the two files apart; use @index.
   *
   * Returns @nbytes on success, -EINVAL for a malformed or unknown token,
   * -ERANGE for a zero value, or the error returned by blkg_conf_prep().
   */
  static ssize_t tg_set_limit(struct kernfs_open_file *of,
  			  char *buf, size_t nbytes, loff_t off)
  {
  	struct blkcg *blkcg = css_to_blkcg(of_css(of));
  	struct blkg_conf_ctx ctx;
  	struct throtl_grp *tg;
  	u64 v[4];
  	unsigned long idle_time;
  	unsigned long latency_time;
  	int ret;
  	int index = of_cft(of)->private;

  	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
  	if (ret)
  		return ret;

  	tg = blkg_to_tg(ctx.blkg);

  	/* start from the current config so unspecified keys keep their value */
  	v[0] = tg->bps_conf[READ][index];
  	v[1] = tg->bps_conf[WRITE][index];
  	v[2] = tg->iops_conf[READ][index];
  	v[3] = tg->iops_conf[WRITE][index];

  	idle_time = tg->idletime_threshold_conf;
  	latency_time = tg->latency_target_conf;
  	while (true) {
  		char tok[27];	/* wiops=18446744073709551616 */
  		char *p;
  		u64 val = U64_MAX;
  		int len;

  		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
  			break;
  		if (tok[0] == '\0')
  			break;
  		ctx.body += len;

  		/* split "key=value" in place; @tok becomes the key, @p the value */
  		ret = -EINVAL;
  		p = tok;
  		strsep(&p, "=");
  		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
  			goto out_finish;

  		ret = -ERANGE;
  		if (!val)
  			goto out_finish;

  		ret = -EINVAL;
  		if (!strcmp(tok, "rbps"))
  			v[0] = val;
  		else if (!strcmp(tok, "wbps"))
  			v[1] = val;
  		else if (!strcmp(tok, "riops"))
  			v[2] = min_t(u64, val, UINT_MAX);
  		else if (!strcmp(tok, "wiops"))
  			v[3] = min_t(u64, val, UINT_MAX);
  		/*
  		 * idle/latency belong to the low set only.  This must test
  		 * @index (the file being written), not @off.
  		 */
  		else if (index == LIMIT_LOW && !strcmp(tok, "idle"))
  			idle_time = val;
  		else if (index == LIMIT_LOW && !strcmp(tok, "latency"))
  			latency_time = val;
  		else
  			goto out_finish;
  	}

  	tg->bps_conf[READ][index] = v[0];
  	tg->bps_conf[WRITE][index] = v[1];
  	tg->iops_conf[READ][index] = v[2];
  	tg->iops_conf[WRITE][index] = v[3];

  	if (index == LIMIT_MAX) {
  		tg->bps[READ][index] = v[0];
  		tg->bps[WRITE][index] = v[1];
  		tg->iops[READ][index] = v[2];
  		tg->iops[WRITE][index] = v[3];
  	}
  	/* the effective low limit never exceeds the configured max limit */
  	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
  		tg->bps_conf[READ][LIMIT_MAX]);
  	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
  		tg->bps_conf[WRITE][LIMIT_MAX]);
  	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
  		tg->iops_conf[READ][LIMIT_MAX]);
  	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
  		tg->iops_conf[WRITE][LIMIT_MAX]);
  	tg->idletime_threshold_conf = idle_time;
  	tg->latency_target_conf = latency_time;

  	/* force user to configure all settings for low limit  */
  	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
  	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
  	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
  	    tg->latency_target_conf == DFL_LATENCY_TARGET) {
  		tg->bps[READ][LIMIT_LOW] = 0;
  		tg->bps[WRITE][LIMIT_LOW] = 0;
  		tg->iops[READ][LIMIT_LOW] = 0;
  		tg->iops[WRITE][LIMIT_LOW] = 0;
  		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
  		tg->latency_target = DFL_LATENCY_TARGET;
  	} else if (index == LIMIT_LOW) {
  		tg->idletime_threshold = tg->idletime_threshold_conf;
  		tg->latency_target = tg->latency_target_conf;
  	}

  	blk_throtl_update_limit_valid(tg->td);
  	if (tg->td->limit_valid[LIMIT_LOW]) {
  		if (index == LIMIT_LOW)
  			tg->td->limit_index = LIMIT_LOW;
  	} else
  		tg->td->limit_index = LIMIT_MAX;
  	tg_conf_updated(tg, index == LIMIT_LOW &&
  		tg->td->limit_valid[LIMIT_LOW]);
  	ret = 0;
  out_finish:
  	blkg_conf_finish(&ctx);
  	return ret ?: nbytes;
  }
  
  static struct cftype throtl_files[] = {
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1602
1603
1604
1605
1606
1607
1608
1609
1610
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  	{
  		.name = "low",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = tg_print_limit,
  		.write = tg_set_limit,
  		.private = LIMIT_LOW,
  	},
  #endif
2ee867dcf   Tejun Heo   blkcg: implement ...
1611
1612
1613
  	{
  		.name = "max",
  		.flags = CFTYPE_NOT_ON_ROOT,
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1614
1615
1616
  		.seq_show = tg_print_limit,
  		.write = tg_set_limit,
  		.private = LIMIT_MAX,
2ee867dcf   Tejun Heo   blkcg: implement ...
1617
1618
1619
  	},
  	{ }	/* terminate */
  };
da5277700   Vivek Goyal   block: Move blk_t...
1620
  static void throtl_shutdown_wq(struct request_queue *q)
e43473b7f   Vivek Goyal   blkio: Core imple...
1621
1622
  {
  	struct throtl_data *td = q->td;
69df0ab03   Tejun Heo   blk-throttle: sep...
1623
  	cancel_work_sync(&td->dispatch_work);
e43473b7f   Vivek Goyal   blkio: Core imple...
1624
  }
3c798398e   Tejun Heo   blkcg: mass renam...
1625
  static struct blkcg_policy blkcg_policy_throtl = {
2ee867dcf   Tejun Heo   blkcg: implement ...
1626
  	.dfl_cftypes		= throtl_files,
880f50e22   Tejun Heo   blkcg: mark exist...
1627
  	.legacy_cftypes		= throtl_legacy_files,
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1628

001bea73e   Tejun Heo   blkcg: replace bl...
1629
  	.pd_alloc_fn		= throtl_pd_alloc,
f9fcc2d39   Tejun Heo   blkcg: collapse b...
1630
  	.pd_init_fn		= throtl_pd_init,
693e751e7   Tejun Heo   blk-throttle: imp...
1631
  	.pd_online_fn		= throtl_pd_online,
cd5ab1b0f   Shaohua Li   blk-throttle: add...
1632
  	.pd_offline_fn		= throtl_pd_offline,
001bea73e   Tejun Heo   blkcg: replace bl...
1633
  	.pd_free_fn		= throtl_pd_free,
e43473b7f   Vivek Goyal   blkio: Core imple...
1634
  };
3f0abd806   Shaohua Li   blk-throttle: add...
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
  static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
  {
  	unsigned long rtime = jiffies, wtime = jiffies;
  
  	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
  		rtime = tg->last_low_overflow_time[READ];
  	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
  		wtime = tg->last_low_overflow_time[WRITE];
  	return min(rtime, wtime);
  }
  
  /* tg should not be an intermediate node */
  static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *parent_sq;
  	struct throtl_grp *parent = tg;
  	unsigned long ret = __tg_last_low_overflow_time(tg);
  
  	while (true) {
  		parent_sq = parent->service_queue.parent_sq;
  		parent = sq_to_tg(parent_sq);
  		if (!parent)
  			break;
  
  		/*
  		 * The parent doesn't have low limit, it always reaches low
  		 * limit. Its overflow time is useless for children
  		 */
  		if (!parent->bps[READ][LIMIT_LOW] &&
  		    !parent->iops[READ][LIMIT_LOW] &&
  		    !parent->bps[WRITE][LIMIT_LOW] &&
  		    !parent->iops[WRITE][LIMIT_LOW])
  			continue;
  		if (time_after(__tg_last_low_overflow_time(parent), ret))
  			ret = __tg_last_low_overflow_time(parent);
  	}
  	return ret;
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
1673
1674
1675
1676
1677
  static bool throtl_tg_is_idle(struct throtl_grp *tg)
  {
  	/*
  	 * cgroup is idle if:
  	 * - single idle is too long, longer than a fixed value (in case user
b4f428ef2   Shaohua Li   blk-throttle: for...
1678
  	 *   configure a too big threshold) or 4 times of idletime threshold
9e234eeaf   Shaohua Li   blk-throttle: add...
1679
  	 * - average think time is more than threshold
53696b8d2   Shaohua Li   blk-throttle: add...
1680
  	 * - IO latency is largely below threshold
9e234eeaf   Shaohua Li   blk-throttle: add...
1681
  	 */
b4f428ef2   Shaohua Li   blk-throttle: for...
1682
  	unsigned long time;
4cff729f6   Shaohua Li   blk-throttle: out...
1683
  	bool ret;
9e234eeaf   Shaohua Li   blk-throttle: add...
1684

b4f428ef2   Shaohua Li   blk-throttle: for...
1685
1686
1687
1688
1689
1690
  	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
  	ret = tg->latency_target == DFL_LATENCY_TARGET ||
  	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
  	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
  	      tg->avg_idletime > tg->idletime_threshold ||
  	      (tg->latency_target && tg->bio_cnt &&
53696b8d2   Shaohua Li   blk-throttle: add...
1691
  		tg->bad_bio_cnt * 5 < tg->bio_cnt);
4cff729f6   Shaohua Li   blk-throttle: out...
1692
1693
1694
1695
1696
  	throtl_log(&tg->service_queue,
  		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
  		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
  		tg->bio_cnt, ret, tg->td->scale);
  	return ret;
9e234eeaf   Shaohua Li   blk-throttle: add...
1697
  }
c79892c55   Shaohua Li   blk-throttle: add...
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
  static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
  {
  	struct throtl_service_queue *sq = &tg->service_queue;
  	bool read_limit, write_limit;
  
  	/*
  	 * if cgroup reaches low limit (if low limit is 0, the cgroup always
  	 * reaches), it's ok to upgrade to next limit
  	 */
  	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
  	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
  	if (!read_limit && !write_limit)
  		return true;
  	if (read_limit && sq->nr_queued[READ] &&
  	    (!write_limit || sq->nr_queued[WRITE]))
  		return true;
  	if (write_limit && sq->nr_queued[WRITE] &&
  	    (!read_limit || sq->nr_queued[READ]))
  		return true;
aec242468   Shaohua Li   blk-throttle: det...
1717
1718
  
  	if (time_after_eq(jiffies,
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1719
1720
  		tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
  	    throtl_tg_is_idle(tg))
aec242468   Shaohua Li   blk-throttle: det...
1721
  		return true;
c79892c55   Shaohua Li   blk-throttle: add...
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
  	return false;
  }
  
  /*
   * Walk from @tg towards the root through the parent service queues and
   * return true as soon as one group on the path passes
   * throtl_tg_can_upgrade().  The walk ends, returning false, once there is
   * no parent group or the parent's blkg has no parent of its own.
   */
  static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
  {
  	do {
  		if (throtl_tg_can_upgrade(tg))
  			return true;
  		tg = sq_to_tg(tg->service_queue.parent_sq);
  	} while (tg && tg_to_blkg(tg)->parent);

  	return false;
  }
  
  static bool throtl_can_upgrade(struct throtl_data *td,
  	struct throtl_grp *this_tg)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
  
  	if (td->limit_index != LIMIT_LOW)
  		return false;
297e3d854   Shaohua Li   blk-throttle: mak...
1745
  	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
3f0abd806   Shaohua Li   blk-throttle: add...
1746
  		return false;
c79892c55   Shaohua Li   blk-throttle: add...
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);
  
  		if (tg == this_tg)
  			continue;
  		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
  			continue;
  		if (!throtl_hierarchy_can_upgrade(tg)) {
  			rcu_read_unlock();
  			return false;
  		}
  	}
  	rcu_read_unlock();
  	return true;
  }
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
  static void throtl_upgrade_check(struct throtl_grp *tg)
  {
  	unsigned long now = jiffies;
  
  	if (tg->td->limit_index != LIMIT_LOW)
  		return;
  
  	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
  		return;
  
  	tg->last_check_time = now;
  
  	if (!time_after_eq(now,
  	     __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
  		return;
  
  	if (throtl_can_upgrade(tg->td, NULL))
  		throtl_upgrade_state(tg->td);
  }
c79892c55   Shaohua Li   blk-throttle: add...
1782
1783
1784
1785
  static void throtl_upgrade_state(struct throtl_data *td)
  {
  	struct cgroup_subsys_state *pos_css;
  	struct blkcg_gq *blkg;
4cff729f6   Shaohua Li   blk-throttle: out...
1786
  	throtl_log(&td->service_queue, "upgrade to max");
c79892c55   Shaohua Li   blk-throttle: add...
1787
  	td->limit_index = LIMIT_MAX;
3f0abd806   Shaohua Li   blk-throttle: add...
1788
  	td->low_upgrade_time = jiffies;
7394e31fa   Shaohua Li   blk-throttle: mak...
1789
  	td->scale = 0;
c79892c55   Shaohua Li   blk-throttle: add...
1790
1791
1792
1793
1794
1795
1796
  	rcu_read_lock();
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  		struct throtl_grp *tg = blkg_to_tg(blkg);
  		struct throtl_service_queue *sq = &tg->service_queue;
  
  		tg->disptime = jiffies - 1;
  		throtl_select_dispatch(sq);
4f02fb761   Joseph Qi   blk-throttle: fix...
1797
  		throtl_schedule_next_dispatch(sq, true);
c79892c55   Shaohua Li   blk-throttle: add...
1798
1799
1800
  	}
  	rcu_read_unlock();
  	throtl_select_dispatch(&td->service_queue);
4f02fb761   Joseph Qi   blk-throttle: fix...
1801
  	throtl_schedule_next_dispatch(&td->service_queue, true);
c79892c55   Shaohua Li   blk-throttle: add...
1802
1803
  	queue_work(kthrotld_workqueue, &td->dispatch_work);
  }
3f0abd806   Shaohua Li   blk-throttle: add...
1804
1805
  /*
   * Step @td down towards limit index @new.
   *
   * The scale is halved first.  While the halved scale is still non-zero
   * only low_upgrade_time is moved back by scale * throtl_slice and the
   * limit index is left alone; once the scale reaches zero the limit index
   * becomes @new and the downgrade time is recorded.
   */
  static void throtl_downgrade_state(struct throtl_data *td, int new)
  {
  	td->scale /= 2;
  	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);

  	if (!td->scale) {
  		td->limit_index = new;
  		td->low_downgrade_time = jiffies;
  	} else {
  		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
  	}
  }
  
  static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
  {
  	struct throtl_data *td = tg->td;
  	unsigned long now = jiffies;
  
  	/*
  	 * If cgroup is below low limit, consider downgrade and throttle other
  	 * cgroups
  	 */
297e3d854   Shaohua Li   blk-throttle: mak...
1825
1826
  	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
  	    time_after_eq(now, tg_last_low_overflow_time(tg) +
fa6fb5aab   Shaohua Li   blk-throttle: ign...
1827
1828
1829
  					td->throtl_slice) &&
  	    (!throtl_tg_is_idle(tg) ||
  	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
3f0abd806   Shaohua Li   blk-throttle: add...
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
  		return true;
  	return false;
  }
  
  /*
   * Walk from @tg towards the root through the parent service queues.
   * Every group visited must pass throtl_tg_can_downgrade(); the first
   * failure returns false.  The walk ends, returning true, once there is no
   * parent group or the parent's blkg has no parent of its own.
   */
  static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
  {
  	do {
  		if (!throtl_tg_can_downgrade(tg))
  			return false;
  		tg = sq_to_tg(tg->service_queue.parent_sq);
  	} while (tg && tg_to_blkg(tg)->parent);

  	return true;
  }
  
  static void throtl_downgrade_check(struct throtl_grp *tg)
  {
  	uint64_t bps;
  	unsigned int iops;
  	unsigned long elapsed_time;
  	unsigned long now = jiffies;
  
  	if (tg->td->limit_index != LIMIT_MAX ||
  	    !tg->td->limit_valid[LIMIT_LOW])
  		return;
  	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
  		return;
297e3d854   Shaohua Li   blk-throttle: mak...
1858
  	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
3f0abd806   Shaohua Li   blk-throttle: add...
1859
1860
1861
1862
  		return;
  
  	elapsed_time = now - tg->last_check_time;
  	tg->last_check_time = now;
297e3d854   Shaohua Li   blk-throttle: mak...
1863
1864
  	if (time_before(now, tg_last_low_overflow_time(tg) +
  			tg->td->throtl_slice))
3f0abd806   Shaohua Li   blk-throttle: add...
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
  		return;
  
  	if (tg->bps[READ][LIMIT_LOW]) {
  		bps = tg->last_bytes_disp[READ] * HZ;
  		do_div(bps, elapsed_time);
  		if (bps >= tg->bps[READ][LIMIT_LOW])
  			tg->last_low_overflow_time[READ] = now;
  	}
  
  	if (tg->bps[WRITE][LIMIT_LOW]) {
  		bps = tg->last_bytes_disp[WRITE] * HZ;
  		do_div(bps, elapsed_time);
  		if (bps >= tg->bps[WRITE][LIMIT_LOW])
  			tg->last_low_overflow_time[WRITE] = now;
  	}
  
  	if (tg->iops[READ][LIMIT_LOW]) {
  		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
  		if (iops >= tg->iops[READ][LIMIT_LOW])
  			tg->last_low_overflow_time[READ] = now;
  	}
  
  	if (tg->iops[WRITE][LIMIT_LOW]) {
  		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
  		if (iops >= tg->iops[WRITE][LIMIT_LOW])
  			tg->last_low_overflow_time[WRITE] = now;
  	}
  
  	/*
  	 * If cgroup is below low limit, consider downgrade and throttle other
  	 * cgroups
  	 */
  	if (throtl_hierarchy_can_downgrade(tg))
  		throtl_downgrade_state(tg->td, LIMIT_LOW);
  
  	tg->last_bytes_disp[READ] = 0;
  	tg->last_bytes_disp[WRITE] = 0;
  	tg->last_io_disp[READ] = 0;
  	tg->last_io_disp[WRITE] = 0;
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
  /*
   * Fold the gap between @tg's last recorded completion and now into
   * @tg->avg_idletime, weighting the old average 7/8 and the new gap 1/8.
   * Timestamps are ktime nanoseconds shifted right by 10.
   *
   * Nothing is updated when no completion has been recorded yet, when that
   * completion was already accounted, or when it is not in the past.
   */
  static void blk_throtl_update_idletime(struct throtl_grp *tg)
  {
  	unsigned long now = ktime_get_ns() >> 10;
  	unsigned long last_finish_time = tg->last_finish_time;

  	if (last_finish_time == 0 || now <= last_finish_time)
  		return;
  	if (last_finish_time == tg->checked_last_finish_time)
  		return;

  	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
  	tg->checked_last_finish_time = last_finish_time;
  }
b9147dd1b   Shaohua Li   blk-throttle: add...
1917
1918
1919
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  /*
   * Refresh @td->avg_buckets from the per-cpu latency samples.
   *
   * Runs at most once per HZ jiffies and only for queues flagged
   * non-rotational.  Per-cpu samples are drained into @td->tmp_buckets; a
   * bucket that has collected at least 32 samples yields a fresh average.
   * Fresh averages are blended 7/8 old + 1/8 new into the stored value
   * (taken as-is the first time), and each stored latency is kept at least
   * as large as that of the preceding updated bucket of the same direction.
   */
  static void throtl_update_latency_buckets(struct throtl_data *td)
  {
  	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
  	unsigned long last_latency[2] = { 0 };
  	unsigned long latency[2];
  	int i, cpu, rw;

  	if (!blk_queue_nonrot(td->queue))
  		return;
  	if (time_before(jiffies, td->last_calculate_time + HZ))
  		return;
  	td->last_calculate_time = jiffies;

  	memset(avg_latency, 0, sizeof(avg_latency));

  	/* pass 1: drain per-cpu samples and compute new per-bucket averages */
  	for (rw = READ; rw <= WRITE; rw++) {
  		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
  			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
  			int samples;

  			for_each_possible_cpu(cpu) {
  				struct latency_bucket *bucket;

  				/* this isn't race free, but ok in practice */
  				bucket = per_cpu_ptr(td->latency_buckets[rw],
  					cpu);
  				tmp->total_latency += bucket[i].total_latency;
  				tmp->samples += bucket[i].samples;
  				bucket[i].total_latency = 0;
  				bucket[i].samples = 0;
  			}

  			/* keep accumulating until enough samples exist */
  			if (tmp->samples < 32)
  				continue;

  			samples = tmp->samples;
  			latency[rw] = tmp->total_latency;
  			tmp->total_latency = 0;
  			tmp->samples = 0;
  			latency[rw] /= samples;
  			if (latency[rw] != 0)
  				avg_latency[rw][i].latency = latency[rw];
  		}
  	}

  	/* pass 2: merge the new averages into the stored ones */
  	for (rw = READ; rw <= WRITE; rw++) {
  		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
  			if (avg_latency[rw][i].latency) {
  				if (td->avg_buckets[rw][i].valid)
  					latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
  						avg_latency[rw][i].latency) >> 3;
  				else
  					latency[rw] = avg_latency[rw][i].latency;

  				td->avg_buckets[rw][i].latency = max(latency[rw],
  					last_latency[rw]);
  				td->avg_buckets[rw][i].valid = true;
  				last_latency[rw] = td->avg_buckets[rw][i].latency;
  				continue;
  			}

  			/* no new data: only lift the stored value to the floor */
  			if (td->avg_buckets[rw][i].latency < last_latency[rw])
  				td->avg_buckets[rw][i].latency =
  					last_latency[rw];
  		}
  	}

  	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
  		throtl_log(&td->service_queue,
  			"Latency bucket %d: read latency=%ld, read valid=%d, "
  			"write latency=%ld, write valid=%d", i,
  			td->avg_buckets[READ][i].latency,
  			td->avg_buckets[READ][i].valid,
  			td->avg_buckets[WRITE][i].latency,
  			td->avg_buckets[WRITE][i].valid);
  }
  #else
  /* Without CONFIG_BLK_DEV_THROTTLING_LOW there are no latency buckets. */
  static inline void throtl_update_latency_buckets(struct throtl_data *td)
  {
  }
  #endif
ae1188963   Tejun Heo   blkcg: consolidat...
1998
1999
  bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
  		    struct bio *bio)
e43473b7f   Vivek Goyal   blkio: Core imple...
2000
  {
c5cc2070b   Tejun Heo   blk-throttle: add...
2001
  	struct throtl_qnode *qn = NULL;
b5f2954d3   Dennis Zhou   blkcg: revert blk...
2002
  	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
73f0d49a9   Tejun Heo   blk-throttle: mov...
2003
  	struct throtl_service_queue *sq;
0e9f4164b   Tejun Heo   blk-throttle: gen...
2004
  	bool rw = bio_data_dir(bio);
bc16a4f93   Tejun Heo   block: reorganize...
2005
  	bool throttled = false;
b9147dd1b   Shaohua Li   blk-throttle: add...
2006
  	struct throtl_data *td = tg->td;
e43473b7f   Vivek Goyal   blkio: Core imple...
2007

ae1188963   Tejun Heo   blkcg: consolidat...
2008
  	WARN_ON_ONCE(!rcu_read_lock_held());
2a0f61e6e   Tejun Heo   blk-throttle: set...
2009
  	/* see throtl_charge_bio() */
8d2bbd4c8   Christoph Hellwig   block: replace RE...
2010
  	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
bc16a4f93   Tejun Heo   block: reorganize...
2011
  		goto out;
e43473b7f   Vivek Goyal   blkio: Core imple...
2012

0d945c1f9   Christoph Hellwig   block: remove the...
2013
  	spin_lock_irq(&q->queue_lock);
c9589f03e   Tejun Heo   blk-throttle: imp...
2014

b9147dd1b   Shaohua Li   blk-throttle: add...
2015
  	throtl_update_latency_buckets(td);
9e234eeaf   Shaohua Li   blk-throttle: add...
2016
  	blk_throtl_update_idletime(tg);
73f0d49a9   Tejun Heo   blk-throttle: mov...
2017
  	sq = &tg->service_queue;
c79892c55   Shaohua Li   blk-throttle: add...
2018
  again:
9e660acff   Tejun Heo   blk-throttle: mak...
2019
  	while (true) {
3f0abd806   Shaohua Li   blk-throttle: add...
2020
2021
2022
  		if (tg->last_low_overflow_time[rw] == 0)
  			tg->last_low_overflow_time[rw] = jiffies;
  		throtl_downgrade_check(tg);
fa6fb5aab   Shaohua Li   blk-throttle: ign...
2023
  		throtl_upgrade_check(tg);
9e660acff   Tejun Heo   blk-throttle: mak...
2024
2025
2026
  		/* throtl is FIFO - if bios are already queued, should queue */
  		if (sq->nr_queued[rw])
  			break;
de701c74a   Vivek Goyal   blk-throttle: Som...
2027

9e660acff   Tejun Heo   blk-throttle: mak...
2028
  		/* if above limits, break to queue */
c79892c55   Shaohua Li   blk-throttle: add...
2029
  		if (!tg_may_dispatch(tg, bio, NULL)) {
3f0abd806   Shaohua Li   blk-throttle: add...
2030
  			tg->last_low_overflow_time[rw] = jiffies;
b9147dd1b   Shaohua Li   blk-throttle: add...
2031
2032
  			if (throtl_can_upgrade(td, tg)) {
  				throtl_upgrade_state(td);
c79892c55   Shaohua Li   blk-throttle: add...
2033
2034
  				goto again;
  			}
9e660acff   Tejun Heo   blk-throttle: mak...
2035
  			break;
c79892c55   Shaohua Li   blk-throttle: add...
2036
  		}
9e660acff   Tejun Heo   blk-throttle: mak...
2037
2038
  
  		/* within limits, let's charge and dispatch directly */
e43473b7f   Vivek Goyal   blkio: Core imple...
2039
  		throtl_charge_bio(tg, bio);
04521db04   Vivek Goyal   blk-throttle: Res...
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
  
  		/*
  		 * We need to trim slice even when bios are not being queued
  		 * otherwise it might happen that a bio is not queued for
  		 * a long time and slice keeps on extending and trim is not
  		 * called for a long time. Now if limits are reduced suddenly
  		 * we take into account all the IO dispatched so far at new
  		 * low rate and * newly queued IO gets a really long dispatch
  		 * time.
  		 *
  		 * So keep on trimming slice even if bio is not queued.
  		 */
0f3457f60   Tejun Heo   blk-throttle: add...
2052
  		throtl_trim_slice(tg, rw);
9e660acff   Tejun Heo   blk-throttle: mak...
2053
2054
2055
2056
2057
2058
  
  		/*
  		 * @bio passed through this layer without being throttled.
  		 * Climb up the ladder.  If we''re already at the top, it
  		 * can be executed directly.
  		 */
c5cc2070b   Tejun Heo   blk-throttle: add...
2059
  		qn = &tg->qnode_on_parent[rw];
9e660acff   Tejun Heo   blk-throttle: mak...
2060
2061
2062
2063
  		sq = sq->parent_sq;
  		tg = sq_to_tg(sq);
  		if (!tg)
  			goto out_unlock;
e43473b7f   Vivek Goyal   blkio: Core imple...
2064
  	}
9e660acff   Tejun Heo   blk-throttle: mak...
2065
  	/* out-of-limit, queue to @tg */
fda6f272c   Tejun Heo   blk-throttle: imp...
2066
2067
  	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
  		   rw == READ ? 'R' : 'W',
9f626e372   Shaohua Li   blk-throttle: pre...
2068
2069
2070
  		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
  		   tg_bps_limit(tg, rw),
  		   tg->io_disp[rw], tg_iops_limit(tg, rw),
fda6f272c   Tejun Heo   blk-throttle: imp...
2071
  		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
e43473b7f   Vivek Goyal   blkio: Core imple...
2072

3f0abd806   Shaohua Li   blk-throttle: add...
2073
  	tg->last_low_overflow_time[rw] = jiffies;
b9147dd1b   Shaohua Li   blk-throttle: add...
2074
  	td->nr_queued[rw]++;
c5cc2070b   Tejun Heo   blk-throttle: add...
2075
  	throtl_add_bio_tg(bio, qn, tg);
bc16a4f93   Tejun Heo   block: reorganize...
2076
  	throttled = true;
e43473b7f   Vivek Goyal   blkio: Core imple...
2077

7f52f98c2   Tejun Heo   blk-throttle: imp...
2078
2079
2080
2081
2082
2083
  	/*
  	 * Update @tg's dispatch time and force schedule dispatch if @tg
  	 * was empty before @bio.  The forced scheduling isn't likely to
  	 * cause undue delay as @bio is likely to be dispatched directly if
  	 * its @tg's disptime is not in the future.
  	 */
0e9f4164b   Tejun Heo   blk-throttle: gen...
2084
  	if (tg->flags & THROTL_TG_WAS_EMPTY) {
77216b048   Tejun Heo   blk-throttle: add...
2085
  		tg_update_disptime(tg);
7f52f98c2   Tejun Heo   blk-throttle: imp...
2086
  		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
e43473b7f   Vivek Goyal   blkio: Core imple...
2087
  	}
bc16a4f93   Tejun Heo   block: reorganize...
2088
  out_unlock:
0d945c1f9   Christoph Hellwig   block: remove the...
2089
  	spin_unlock_irq(&q->queue_lock);
bc16a4f93   Tejun Heo   block: reorganize...
2090
  out:
111be8839   Shaohua Li   block-throttle: a...
2091
  	bio_set_flag(bio, BIO_THROTTLED);
b9147dd1b   Shaohua Li   blk-throttle: add...
2092
2093
2094
  
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  	if (throttled || !td->track_bio_latency)
5238dcf41   Omar Sandoval   block: replace bi...
2095
  		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
b9147dd1b   Shaohua Li   blk-throttle: add...
2096
  #endif
bc16a4f93   Tejun Heo   block: reorganize...
2097
  	return throttled;
e43473b7f   Vivek Goyal   blkio: Core imple...
2098
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
2099
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
b9147dd1b   Shaohua Li   blk-throttle: add...
2100
2101
2102
2103
2104
  /*
   * Add one latency sample of @time for a request of @size sectors and
   * operation @op to this cpu's latency bucket of @td.
   *
   * Samples are only recorded while @td is at LIMIT_LOW, for plain
   * REQ_OP_READ / REQ_OP_WRITE operations, on queues flagged non-rotational.
   */
  static void throtl_track_latency(struct throtl_data *td, sector_t size,
  	int op, unsigned long time)
  {
  	struct latency_bucket *latency;
  	int index;

  	if (!td || td->limit_index != LIMIT_LOW)
  		return;
  	if (op != REQ_OP_READ && op != REQ_OP_WRITE)
  		return;
  	if (!blk_queue_nonrot(td->queue))
  		return;

  	index = request_bucket_index(size);

  	latency = get_cpu_ptr(td->latency_buckets[op]);
  	latency[index].total_latency += time;
  	latency[index].samples++;
  	put_cpu_ptr(td->latency_buckets[op]);
  }
  
  /*
   * Record the completion latency of request @rq.  @time_ns is shifted
   * right by 10 before being handed to throtl_track_latency() together with
   * the request's stats sector count and operation.
   */
  void blk_throtl_stat_add(struct request *rq, u64 time_ns)
  {
  	struct throtl_data *td = rq->q->td;

  	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
  			     time_ns >> 10);
  }
9e234eeaf   Shaohua Li   blk-throttle: add...
2124
2125
  void blk_throtl_bio_endio(struct bio *bio)
  {
08e18eab0   Josef Bacik   block: add bi_blk...
2126
  	struct blkcg_gq *blkg;
9e234eeaf   Shaohua Li   blk-throttle: add...
2127
  	struct throtl_grp *tg;
b9147dd1b   Shaohua Li   blk-throttle: add...
2128
2129
2130
2131
  	u64 finish_time_ns;
  	unsigned long finish_time;
  	unsigned long start_time;
  	unsigned long lat;
b889bf66d   Joseph Qi   blk-throttle: tra...
2132
  	int rw = bio_data_dir(bio);
9e234eeaf   Shaohua Li   blk-throttle: add...
2133

08e18eab0   Josef Bacik   block: add bi_blk...
2134
2135
  	blkg = bio->bi_blkg;
  	if (!blkg)
9e234eeaf   Shaohua Li   blk-throttle: add...
2136
  		return;
08e18eab0   Josef Bacik   block: add bi_blk...
2137
  	tg = blkg_to_tg(blkg);
9e234eeaf   Shaohua Li   blk-throttle: add...
2138

b9147dd1b   Shaohua Li   blk-throttle: add...
2139
2140
  	finish_time_ns = ktime_get_ns();
  	tg->last_finish_time = finish_time_ns >> 10;
5238dcf41   Omar Sandoval   block: replace bi...
2141
2142
  	start_time = bio_issue_time(&bio->bi_issue) >> 10;
  	finish_time = __bio_issue_time(finish_time_ns) >> 10;
08e18eab0   Josef Bacik   block: add bi_blk...
2143
  	if (!start_time || finish_time <= start_time)
53696b8d2   Shaohua Li   blk-throttle: add...
2144
2145
2146
  		return;
  
  	lat = finish_time - start_time;
b9147dd1b   Shaohua Li   blk-throttle: add...
2147
  	/* this is only for bio based driver */
5238dcf41   Omar Sandoval   block: replace bi...
2148
2149
2150
  	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
  		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
  				     bio_op(bio), lat);
53696b8d2   Shaohua Li   blk-throttle: add...
2151

6679a90c4   Shaohua Li   blk-throttle: set...
2152
  	if (tg->latency_target && lat >= tg->td->filtered_latency) {
53696b8d2   Shaohua Li   blk-throttle: add...
2153
2154
  		int bucket;
  		unsigned int threshold;
5238dcf41   Omar Sandoval   block: replace bi...
2155
  		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
b889bf66d   Joseph Qi   blk-throttle: tra...
2156
  		threshold = tg->td->avg_buckets[rw][bucket].latency +
53696b8d2   Shaohua Li   blk-throttle: add...
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
  			tg->latency_target;
  		if (lat > threshold)
  			tg->bad_bio_cnt++;
  		/*
  		 * Not race free, could get wrong count, which means cgroups
  		 * will be throttled
  		 */
  		tg->bio_cnt++;
  	}
  
  	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
  		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
  		tg->bio_cnt /= 2;
  		tg->bad_bio_cnt /= 2;
b9147dd1b   Shaohua Li   blk-throttle: add...
2171
  	}
9e234eeaf   Shaohua Li   blk-throttle: add...
2172
2173
  }
  #endif
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
  /*
   * Dispatch all bios from all children tg's queued on @parent_sq.  On
   * return, @parent_sq is guaranteed to not have any active children tg's
   * and all bios from previously active tg's are on @parent_sq->bio_lists[].
   */
  static void tg_drain_bios(struct throtl_service_queue *parent_sq)
  {
  	struct throtl_grp *tg;
  
  	while ((tg = throtl_rb_first(parent_sq))) {
  		struct throtl_service_queue *sq = &tg->service_queue;
  		struct bio *bio;
  
  		throtl_dequeue_tg(tg);
c5cc2070b   Tejun Heo   blk-throttle: add...
2188
  		while ((bio = throtl_peek_queued(&sq->queued[READ])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2189
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
c5cc2070b   Tejun Heo   blk-throttle: add...
2190
  		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2191
2192
2193
  			tg_dispatch_one_bio(tg, bio_data_dir(bio));
  	}
  }
c9a929dde   Tejun Heo   block: fix reques...
2194
2195
2196
2197
2198
2199
2200
  /**
   * blk_throtl_drain - drain throttled bios
   * @q: request_queue to drain throttled bios for
   *
   * Dispatch all currently throttled bios on @q through ->make_request_fn().
   */
  void blk_throtl_drain(struct request_queue *q)
0d945c1f9   Christoph Hellwig   block: remove the...
2201
  	__releases(&q->queue_lock) __acquires(&q->queue_lock)
c9a929dde   Tejun Heo   block: fix reques...
2202
2203
  {
  	struct throtl_data *td = q->td;
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2204
  	struct blkcg_gq *blkg;
492eb21b9   Tejun Heo   cgroup: make hier...
2205
  	struct cgroup_subsys_state *pos_css;
c9a929dde   Tejun Heo   block: fix reques...
2206
  	struct bio *bio;
651930bc1   Tejun Heo   blk-throttle: dis...
2207
  	int rw;
c9a929dde   Tejun Heo   block: fix reques...
2208

2a12f0dcd   Tejun Heo   blk-throttle: mak...
2209
  	rcu_read_lock();
c9a929dde   Tejun Heo   block: fix reques...
2210

2a12f0dcd   Tejun Heo   blk-throttle: mak...
2211
2212
2213
2214
2215
2216
  	/*
  	 * Drain each tg while doing post-order walk on the blkg tree, so
  	 * that all bios are propagated to td->service_queue.  It'd be
  	 * better to walk service_queue tree directly but blkg walk is
  	 * easier.
  	 */
492eb21b9   Tejun Heo   cgroup: make hier...
2217
  	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
2a12f0dcd   Tejun Heo   blk-throttle: mak...
2218
  		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
73f0d49a9   Tejun Heo   blk-throttle: mov...
2219

2a12f0dcd   Tejun Heo   blk-throttle: mak...
2220
2221
2222
2223
  	/* finally, transfer bios from top-level tg's into the td */
  	tg_drain_bios(&td->service_queue);
  
  	rcu_read_unlock();
0d945c1f9   Christoph Hellwig   block: remove the...
2224
  	spin_unlock_irq(&q->queue_lock);
c9a929dde   Tejun Heo   block: fix reques...
2225

2a12f0dcd   Tejun Heo   blk-throttle: mak...
2226
  	/* all bios now should be in td->service_queue, issue them */
651930bc1   Tejun Heo   blk-throttle: dis...
2227
  	for (rw = READ; rw <= WRITE; rw++)
c5cc2070b   Tejun Heo   blk-throttle: add...
2228
2229
  		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
  						NULL)))
651930bc1   Tejun Heo   blk-throttle: dis...
2230
  			generic_make_request(bio);
c9a929dde   Tejun Heo   block: fix reques...
2231

0d945c1f9   Christoph Hellwig   block: remove the...
2232
  	spin_lock_irq(&q->queue_lock);
c9a929dde   Tejun Heo   block: fix reques...
2233
  }
e43473b7f   Vivek Goyal   blkio: Core imple...
2234
2235
2236
  int blk_throtl_init(struct request_queue *q)
  {
  	struct throtl_data *td;
a2b1693ba   Tejun Heo   blkcg: implement ...
2237
  	int ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
2238
2239
2240
2241
  
  	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
  	if (!td)
  		return -ENOMEM;
b889bf66d   Joseph Qi   blk-throttle: tra...
2242
  	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
b9147dd1b   Shaohua Li   blk-throttle: add...
2243
  		LATENCY_BUCKET_SIZE, __alignof__(u64));
b889bf66d   Joseph Qi   blk-throttle: tra...
2244
2245
2246
2247
2248
  	if (!td->latency_buckets[READ]) {
  		kfree(td);
  		return -ENOMEM;
  	}
  	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
b9147dd1b   Shaohua Li   blk-throttle: add...
2249
  		LATENCY_BUCKET_SIZE, __alignof__(u64));
b889bf66d   Joseph Qi   blk-throttle: tra...
2250
2251
  	if (!td->latency_buckets[WRITE]) {
  		free_percpu(td->latency_buckets[READ]);
b9147dd1b   Shaohua Li   blk-throttle: add...
2252
2253
2254
  		kfree(td);
  		return -ENOMEM;
  	}
e43473b7f   Vivek Goyal   blkio: Core imple...
2255

69df0ab03   Tejun Heo   blk-throttle: sep...
2256
  	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
b2ce2643c   Tejun Heo   blk-throttle: cle...
2257
  	throtl_service_queue_init(&td->service_queue);
e43473b7f   Vivek Goyal   blkio: Core imple...
2258

cd1604fab   Tejun Heo   blkcg: factor out...
2259
  	q->td = td;
29b125892   Vivek Goyal   blk-throttle: Dyn...
2260
  	td->queue = q;
02977e4af   Vivek Goyal   blkio: Add root g...
2261

9f626e372   Shaohua Li   blk-throttle: pre...
2262
  	td->limit_valid[LIMIT_MAX] = true;
cd5ab1b0f   Shaohua Li   blk-throttle: add...
2263
  	td->limit_index = LIMIT_MAX;
3f0abd806   Shaohua Li   blk-throttle: add...
2264
2265
  	td->low_upgrade_time = jiffies;
  	td->low_downgrade_time = jiffies;
9e234eeaf   Shaohua Li   blk-throttle: add...
2266

a2b1693ba   Tejun Heo   blkcg: implement ...
2267
  	/* activate policy */
3c798398e   Tejun Heo   blkcg: mass renam...
2268
  	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
b9147dd1b   Shaohua Li   blk-throttle: add...
2269
  	if (ret) {
b889bf66d   Joseph Qi   blk-throttle: tra...
2270
2271
  		free_percpu(td->latency_buckets[READ]);
  		free_percpu(td->latency_buckets[WRITE]);
f51b802c1   Tejun Heo   blkcg: use the us...
2272
  		kfree(td);
b9147dd1b   Shaohua Li   blk-throttle: add...
2273
  	}
a2b1693ba   Tejun Heo   blkcg: implement ...
2274
  	return ret;
e43473b7f   Vivek Goyal   blkio: Core imple...
2275
2276
2277
2278
  }
  
  void blk_throtl_exit(struct request_queue *q)
  {
c875f4d02   Tejun Heo   blkcg: drop unnec...
2279
  	BUG_ON(!q->td);
da5277700   Vivek Goyal   block: Move blk_t...
2280
  	throtl_shutdown_wq(q);
3c798398e   Tejun Heo   blkcg: mass renam...
2281
  	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
b889bf66d   Joseph Qi   blk-throttle: tra...
2282
2283
  	free_percpu(q->td->latency_buckets[READ]);
  	free_percpu(q->td->latency_buckets[WRITE]);
c9a929dde   Tejun Heo   block: fix reques...
2284
  	kfree(q->td);
e43473b7f   Vivek Goyal   blkio: Core imple...
2285
  }
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2286
2287
2288
  void blk_throtl_register_queue(struct request_queue *q)
  {
  	struct throtl_data *td;
6679a90c4   Shaohua Li   blk-throttle: set...
2289
  	int i;
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2290
2291
2292
  
  	td = q->td;
  	BUG_ON(!td);
6679a90c4   Shaohua Li   blk-throttle: set...
2293
  	if (blk_queue_nonrot(q)) {
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2294
  		td->throtl_slice = DFL_THROTL_SLICE_SSD;
6679a90c4   Shaohua Li   blk-throttle: set...
2295
2296
  		td->filtered_latency = LATENCY_FILTERED_SSD;
  	} else {
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2297
  		td->throtl_slice = DFL_THROTL_SLICE_HD;
6679a90c4   Shaohua Li   blk-throttle: set...
2298
  		td->filtered_latency = LATENCY_FILTERED_HD;
b889bf66d   Joseph Qi   blk-throttle: tra...
2299
2300
2301
2302
  		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
  			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
  			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
  		}
6679a90c4   Shaohua Li   blk-throttle: set...
2303
  	}
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2304
2305
2306
2307
  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
  	/* if no low limit, use previous default */
  	td->throtl_slice = DFL_THROTL_SLICE_HD;
  #endif
9e234eeaf   Shaohua Li   blk-throttle: add...
2308

344e9ffcb   Jens Axboe   block: add queue_...
2309
  	td->track_bio_latency = !queue_is_mq(q);
b9147dd1b   Shaohua Li   blk-throttle: add...
2310
2311
  	if (!td->track_bio_latency)
  		blk_stat_enable_accounting(q);
d61fcfa4b   Shaohua Li   blk-throttle: cho...
2312
  }
297e3d854   Shaohua Li   blk-throttle: mak...
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  /*
   * Format @q's throttle slice, converted from jiffies to milliseconds, into
   * @page as a decimal number followed by a newline.
   *
   * Returns the number of characters written, or -EINVAL when @q has no
   * throttling state attached.
   */
  ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
  {
  	if (!q->td)
  		return -EINVAL;
  	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
  }
  
  /*
   * Parse a decimal millisecond value from @page and install it, converted
   * to jiffies, as @q's throttle slice.
   *
   * Returns @count on success.  Returns -EINVAL when @q has no throttling
   * state, when @page does not parse as an unsigned decimal, or when the
   * converted value is zero or larger than MAX_THROTL_SLICE.
   */
  ssize_t blk_throtl_sample_time_store(struct request_queue *q,
  	const char *page, size_t count)
  {
  	unsigned long msecs;
  	unsigned long slice;

  	if (!q->td || kstrtoul(page, 10, &msecs))
  		return -EINVAL;

  	slice = msecs_to_jiffies(msecs);
  	if (!slice || slice > MAX_THROTL_SLICE)
  		return -EINVAL;

  	q->td->throtl_slice = slice;
  	return count;
  }
  #endif
e43473b7f   Vivek Goyal   blkio: Core imple...
2339
2340
  /*
   * Module init: create the "kthrotld" workqueue (panicking if that fails)
   * and register the throttle blkcg policy.
   *
   * Returns the result of blkcg_policy_register().
   */
  static int __init throtl_init(void)
  {
  	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
  	if (!kthrotld_workqueue)
  		panic("Failed to create kthrotld\n");

  	return blkcg_policy_register(&blkcg_policy_throtl);
  }
  
  module_init(throtl_init);