block/blk-cgroup.c

/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include "blk.h"

#define MAX_KEY_LEN 100

  /*
   * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
   * blkcg_pol_register_mutex nests outside of it and synchronizes entire
   * policy [un]register operations including cgroup file additions /
   * removals.  Putting cgroup file registration outside blkcg_pol_mutex
   * allows grabbing it from cgroup callbacks.
   */
  static DEFINE_MUTEX(blkcg_pol_register_mutex);
bc0d6501a   Tejun Heo   blkcg: kill blkio...
42
  static DEFINE_MUTEX(blkcg_pol_mutex);
923adde1b   Tejun Heo   blkcg: clear all ...
43

e48453c38   Arianna Avanzini   block, cgroup: im...
44
  struct blkcg blkcg_root;
3c798398e   Tejun Heo   blkcg: mass renam...
45
  EXPORT_SYMBOL_GPL(blkcg_root);
9d6a986c0   Vivek Goyal   blkio: Export som...
46

496d5e756   Tejun Heo   blkcg: add blkcg_...
47
  struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
3c798398e   Tejun Heo   blkcg: mass renam...
48
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
035d10b2f   Tejun Heo   blkcg: add blkio_...
49

7876f930d   Tejun Heo   blkcg: implement ...
50
  static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
903d23f0a   Josef Bacik   blk-cgroup: allow...
51
  static bool blkcg_debug_stats = false;
a2b1693ba   Tejun Heo   blkcg: implement ...
52
  static bool blkcg_policy_enabled(struct request_queue *q,
3c798398e   Tejun Heo   blkcg: mass renam...
53
  				 const struct blkcg_policy *pol)
  {
  	return pol && test_bit(pol->plid, q->blkcg_pols);
  }
  /**
   * blkg_free - free a blkg
   * @blkg: blkg to free
   *
   * Free @blkg which may be partially allocated.
   */
3c798398e   Tejun Heo   blkcg: mass renam...
63
  static void blkg_free(struct blkcg_gq *blkg)
0381411e4   Tejun Heo   blkcg: let blkcg ...
64
  {
e8989fae3   Tejun Heo   blkcg: unify blkg...
65
  	int i;
  
  	if (!blkg)
  		return;
db6136703   Tejun Heo   blkcg: invoke blk...
69
  	for (i = 0; i < BLKCG_MAX_POLS; i++)
  		if (blkg->pd[i])
  			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
e8989fae3   Tejun Heo   blkcg: unify blkg...
72

994b78327   Tejun Heo   blkcg: use blkg_f...
73
  	if (blkg->blkcg != &blkcg_root)
b425e5049   Bart Van Assche   block: Avoid that...
74
  		blk_exit_rl(blkg->q, &blkg->rl);
  
  	blkg_rwstat_exit(&blkg->stat_ios);
  	blkg_rwstat_exit(&blkg->stat_bytes);
549d3aa87   Tejun Heo   blkcg: make blkg-...
78
  	kfree(blkg);
  }
  
  /**
   * blkg_alloc - allocate a blkg
   * @blkcg: block cgroup the new blkg is associated with
   * @q: request_queue the new blkg is associated with
159749937   Tejun Heo   blkcg: make root ...
85
   * @gfp_mask: allocation mask to use
0381411e4   Tejun Heo   blkcg: let blkcg ...
86
   *
e8989fae3   Tejun Heo   blkcg: unify blkg...
87
 * Allocate a new blkg associating @blkcg and @q.
0381411e4   Tejun Heo   blkcg: let blkcg ...
88
   */
  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
  				   gfp_t gfp_mask)
0381411e4   Tejun Heo   blkcg: let blkcg ...
91
  {
3c798398e   Tejun Heo   blkcg: mass renam...
92
  	struct blkcg_gq *blkg;
e8989fae3   Tejun Heo   blkcg: unify blkg...
93
  	int i;
  
  	/* alloc and init base part */
159749937   Tejun Heo   blkcg: make root ...
96
  	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
  	if (!blkg)
  		return NULL;
  	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
  	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
  		goto err_free;
c875f4d02   Tejun Heo   blkcg: drop unnec...
102
  	blkg->q = q;
e8989fae3   Tejun Heo   blkcg: unify blkg...
103
  	INIT_LIST_HEAD(&blkg->q_node);
0381411e4   Tejun Heo   blkcg: let blkcg ...
104
  	blkg->blkcg = blkcg;
a5049a8ae   Tejun Heo   blkcg: fix use-af...
105
  	atomic_set(&blkg->refcnt, 1);
0381411e4   Tejun Heo   blkcg: let blkcg ...
106

  	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
  	if (blkcg != &blkcg_root) {
  		if (blk_init_rl(&blkg->rl, q, gfp_mask))
  			goto err_free;
  		blkg->rl.blkg = blkg;
  	}
8bd435b30   Tejun Heo   blkcg: remove sta...
113
  	for (i = 0; i < BLKCG_MAX_POLS; i++) {
3c798398e   Tejun Heo   blkcg: mass renam...
114
  		struct blkcg_policy *pol = blkcg_policy[i];
e8989fae3   Tejun Heo   blkcg: unify blkg...
115
  		struct blkg_policy_data *pd;
0381411e4   Tejun Heo   blkcg: let blkcg ...
116

a2b1693ba   Tejun Heo   blkcg: implement ...
117
  		if (!blkcg_policy_enabled(q, pol))
  			continue;
  
  		/* alloc per-policy data and attach it to blkg */
001bea73e   Tejun Heo   blkcg: replace bl...
121
  		pd = pol->pd_alloc_fn(gfp_mask, q->node);
  		if (!pd)
  			goto err_free;
549d3aa87   Tejun Heo   blkcg: make blkg-...
124

  		blkg->pd[i] = pd;
  		pd->blkg = blkg;
b276a876a   Tejun Heo   blkcg: add blkg_p...
127
  		pd->plid = i;
e8989fae3   Tejun Heo   blkcg: unify blkg...
128
  	}
0381411e4   Tejun Heo   blkcg: let blkcg ...
129
  	return blkg;
  
  err_free:
  	blkg_free(blkg);
  	return NULL;
0381411e4   Tejun Heo   blkcg: let blkcg ...
134
  }
  struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
  				      struct request_queue *q, bool update_hint)
80fd99792   Tejun Heo   blkcg: make sure ...
137
  {
3c798398e   Tejun Heo   blkcg: mass renam...
138
  	struct blkcg_gq *blkg;
80fd99792   Tejun Heo   blkcg: make sure ...
139

a637120e4   Tejun Heo   blkcg: use radix ...
140
  	/*
  	 * Hint didn't match.  Look up from the radix tree.  Note that the
  	 * hint can only be updated under queue_lock as otherwise @blkg
  	 * could have already been removed from blkg_tree.  The caller is
  	 * responsible for grabbing queue_lock if @update_hint.
  	 */
  	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
  	if (blkg && blkg->q == q) {
  		if (update_hint) {
  			lockdep_assert_held(q->queue_lock);
  			rcu_assign_pointer(blkcg->blkg_hint, blkg);
  		}
a637120e4   Tejun Heo   blkcg: use radix ...
152
  		return blkg;
86cde6b62   Tejun Heo   blkcg: reorganize...
153
  	}
a637120e4   Tejun Heo   blkcg: use radix ...
154

  	return NULL;
  }
ae1188963   Tejun Heo   blkcg: consolidat...
157
  EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
80fd99792   Tejun Heo   blkcg: make sure ...
158

159749937   Tejun Heo   blkcg: make root ...
159
  /*
   * If @new_blkg is %NULL, this function tries to allocate a new one as
   * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
159749937   Tejun Heo   blkcg: make root ...
162
   */
86cde6b62   Tejun Heo   blkcg: reorganize...
163
  static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
  				    struct request_queue *q,
  				    struct blkcg_gq *new_blkg)
5624a4e44   Vivek Goyal   blk-throttle: Mak...
166
  {
d708f0d50   Jens Axboe   Revert "blkcg: al...
167
  	struct blkcg_gq *blkg;
ce7acfeaf   Tejun Heo   writeback, blkcg:...
168
  	struct bdi_writeback_congested *wb_congested;
f427d9096   Tejun Heo   blkcg: implement ...
169
  	int i, ret;
5624a4e44   Vivek Goyal   blk-throttle: Mak...
170

  	WARN_ON_ONCE(!rcu_read_lock_held());
  	lockdep_assert_held(q->queue_lock);
7ee9c5620   Tejun Heo   blkcg: let blkio_...
173
  	/* blkg holds a reference to blkcg */
ec903c0c8   Tejun Heo   cgroup: rename cs...
174
  	if (!css_tryget_online(&blkcg->css)) {
20386ce01   Tejun Heo   blkcg: refine err...
175
  		ret = -ENODEV;
93e6d5d8f   Tejun Heo   blkcg: cosmetic u...
176
  		goto err_free_blkg;
159749937   Tejun Heo   blkcg: make root ...
177
  	}
cd1604fab   Tejun Heo   blkcg: factor out...
178

dc3b17cc8   Jan Kara   block: Use pointe...
179
  	wb_congested = wb_congested_get_create(q->backing_dev_info,
  					       blkcg->css.id,
  					       GFP_NOWAIT | __GFP_NOWARN);
  	if (!wb_congested) {
ce7acfeaf   Tejun Heo   writeback, blkcg:...
183
  		ret = -ENOMEM;
d708f0d50   Jens Axboe   Revert "blkcg: al...
184
  		goto err_put_css;
ce7acfeaf   Tejun Heo   writeback, blkcg:...
185
  	}
  	/* allocate */
  	if (!new_blkg) {
  		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
  		if (unlikely(!new_blkg)) {
  			ret = -ENOMEM;
  			goto err_put_congested;
  		}
  	}
  	blkg = new_blkg;
  	blkg->wb_congested = wb_congested;
cd1604fab   Tejun Heo   blkcg: factor out...
196

db6136703   Tejun Heo   blkcg: invoke blk...
197
  	/* link parent */
  	if (blkcg_parent(blkcg)) {
  		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
  		if (WARN_ON_ONCE(!blkg->parent)) {
20386ce01   Tejun Heo   blkcg: refine err...
201
  			ret = -ENODEV;
d708f0d50   Jens Axboe   Revert "blkcg: al...
202
  			goto err_put_congested;
  		}
  		blkg_get(blkg->parent);
  	}
  	/* invoke per-policy init */
  	for (i = 0; i < BLKCG_MAX_POLS; i++) {
  		struct blkcg_policy *pol = blkcg_policy[i];
  
  		if (blkg->pd[i] && pol->pd_init_fn)
a9520cd6f   Tejun Heo   blkcg: make blkcg...
211
  			pol->pd_init_fn(blkg->pd[i]);
  	}
  
  	/* insert */
cd1604fab   Tejun Heo   blkcg: factor out...
215
  	spin_lock(&blkcg->lock);
  	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
  	if (likely(!ret)) {
  		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
  		list_add(&blkg->q_node, &q->blkg_list);
  
  		for (i = 0; i < BLKCG_MAX_POLS; i++) {
  			struct blkcg_policy *pol = blkcg_policy[i];
  
  			if (blkg->pd[i] && pol->pd_online_fn)
a9520cd6f   Tejun Heo   blkcg: make blkcg...
225
  				pol->pd_online_fn(blkg->pd[i]);
f427d9096   Tejun Heo   blkcg: implement ...
226
  		}
a637120e4   Tejun Heo   blkcg: use radix ...
227
  	}
f427d9096   Tejun Heo   blkcg: implement ...
228
  	blkg->online = true;
cd1604fab   Tejun Heo   blkcg: factor out...
229
  	spin_unlock(&blkcg->lock);
496fb7806   Tejun Heo   blkcg: fix blkcg-...
230

ec13b1d6f   Tejun Heo   blkcg: always cre...
231
  	if (!ret)
a637120e4   Tejun Heo   blkcg: use radix ...
232
  		return blkg;
159749937   Tejun Heo   blkcg: make root ...
233

	/* @blkg failed to be fully initialized, use the usual release path */
  	blkg_put(blkg);
  	return ERR_PTR(ret);
  err_put_congested:
  	wb_congested_put(wb_congested);
  err_put_css:
496fb7806   Tejun Heo   blkcg: fix blkcg-...
240
  	css_put(&blkcg->css);
93e6d5d8f   Tejun Heo   blkcg: cosmetic u...
241
  err_free_blkg:
d708f0d50   Jens Axboe   Revert "blkcg: al...
242
  	blkg_free(new_blkg);
93e6d5d8f   Tejun Heo   blkcg: cosmetic u...
243
  	return ERR_PTR(ret);
31e4c28d9   Vivek Goyal   blkio: Introduce ...
244
  }
3c96cb32d   Tejun Heo   blkcg: drop stuff...
245

86cde6b62   Tejun Heo   blkcg: reorganize...
246
  /**
d708f0d50   Jens Axboe   Revert "blkcg: al...
247
   * blkg_lookup_create - lookup blkg, try to create one if not there
   * @blkcg: blkcg of interest
   * @q: request_queue of interest
   *
   * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
   * create one.  blkg creation is performed recursively from blkcg_root such
   * that all non-root blkg's have access to the parent blkg.  This function
   * should be called under RCU read lock and @q->queue_lock.
   *
   * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 * value on error.  If @q is dead, returns ERR_PTR(-ENODEV).  If @q is not
   * dead and bypassing, returns ERR_PTR(-EBUSY).
   */
  struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
  				    struct request_queue *q)
3c96cb32d   Tejun Heo   blkcg: drop stuff...
262
  {
  	struct blkcg_gq *blkg;
  
  	WARN_ON_ONCE(!rcu_read_lock_held());
  	lockdep_assert_held(q->queue_lock);
  	/*
  	 * This could be the first entry point of blkcg implementation and
  	 * we shouldn't allow anything to go through for a bypassing queue.
  	 */
  	if (unlikely(blk_queue_bypass(q)))
  		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
  	blkg = __blkg_lookup(blkcg, q, true);
  	if (blkg)
  		return blkg;
  	/*
  	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
  	 * non-root blkgs have access to their parents.
  	 */
  	while (true) {
  		struct blkcg *pos = blkcg;
  		struct blkcg *parent = blkcg_parent(blkcg);
  
  		while (parent && !__blkg_lookup(parent, q, false)) {
  			pos = parent;
  			parent = blkcg_parent(parent);
  		}
d708f0d50   Jens Axboe   Revert "blkcg: al...
288
  		blkg = blkg_create(pos, q, NULL);
  		if (pos == blkcg || IS_ERR(blkg))
  			return blkg;
  	}
3c96cb32d   Tejun Heo   blkcg: drop stuff...
292
  }
31e4c28d9   Vivek Goyal   blkio: Introduce ...
293

3c798398e   Tejun Heo   blkcg: mass renam...
294
  static void blkg_destroy(struct blkcg_gq *blkg)
03aa264ac   Tejun Heo   blkcg: let blkcg ...
295
  {
3c798398e   Tejun Heo   blkcg: mass renam...
296
  	struct blkcg *blkcg = blkg->blkcg;
77ea73388   Tejun Heo   blkcg: move io_se...
297
  	struct blkcg_gq *parent = blkg->parent;
6b0654620   Dennis Zhou (Facebook)   Revert "blk-throt...
298
  	int i;
03aa264ac   Tejun Heo   blkcg: let blkcg ...
299

27e1f9d1c   Tejun Heo   blkcg: drop local...
300
  	lockdep_assert_held(blkg->q->queue_lock);
9f13ef678   Tejun Heo   blkcg: use double...
301
  	lockdep_assert_held(&blkcg->lock);
  
  	/* Something wrong if we are trying to remove same group twice */
e8989fae3   Tejun Heo   blkcg: unify blkg...
304
  	WARN_ON_ONCE(list_empty(&blkg->q_node));
9f13ef678   Tejun Heo   blkcg: use double...
305
  	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
a637120e4   Tejun Heo   blkcg: use radix ...
306

  	for (i = 0; i < BLKCG_MAX_POLS; i++) {
  		struct blkcg_policy *pol = blkcg_policy[i];
  
  		if (blkg->pd[i] && pol->pd_offline_fn)
  			pol->pd_offline_fn(blkg->pd[i]);
  	}
  	if (parent) {
  		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
  		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
  	}
f427d9096   Tejun Heo   blkcg: implement ...
317
  	blkg->online = false;
a637120e4   Tejun Heo   blkcg: use radix ...
318
  	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
e8989fae3   Tejun Heo   blkcg: unify blkg...
319
  	list_del_init(&blkg->q_node);
9f13ef678   Tejun Heo   blkcg: use double...
320
  	hlist_del_init_rcu(&blkg->blkcg_node);
03aa264ac   Tejun Heo   blkcg: let blkcg ...
321

03aa264ac   Tejun Heo   blkcg: let blkcg ...
322
  	/*
  	 * Both setting lookup hint to and clearing it from @blkg are done
  	 * under queue_lock.  If it's not pointing to @blkg now, it never
  	 * will.  Hint assignment itself can race safely.
  	 */
ec6c676a0   Paul E. McKenney   block: Substitute...
327
  	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
  		rcu_assign_pointer(blkcg->blkg_hint, NULL);
  
  	/*
  	 * Put the reference taken at the time of creation so that when all
  	 * queues are gone, group can be destroyed.
  	 */
  	blkg_put(blkg);
  }
  /**
   * blkg_destroy_all - destroy all blkgs associated with a request_queue
   * @q: request_queue of interest
9f13ef678   Tejun Heo   blkcg: use double...
339
   *
3c96cb32d   Tejun Heo   blkcg: drop stuff...
340
   * Destroy all blkgs associated with @q.
9f13ef678   Tejun Heo   blkcg: use double...
341
   */
3c96cb32d   Tejun Heo   blkcg: drop stuff...
342
  static void blkg_destroy_all(struct request_queue *q)
72e06c255   Tejun Heo   blkcg: shoot down...
343
  {
3c798398e   Tejun Heo   blkcg: mass renam...
344
  	struct blkcg_gq *blkg, *n;
72e06c255   Tejun Heo   blkcg: shoot down...
345

6d18b008d   Tejun Heo   blkcg: shoot down...
346
  	lockdep_assert_held(q->queue_lock);
72e06c255   Tejun Heo   blkcg: shoot down...
347

9f13ef678   Tejun Heo   blkcg: use double...
348
  	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
3c798398e   Tejun Heo   blkcg: mass renam...
349
  		struct blkcg *blkcg = blkg->blkcg;
72e06c255   Tejun Heo   blkcg: shoot down...
350

  		spin_lock(&blkcg->lock);
  		blkg_destroy(blkg);
  		spin_unlock(&blkcg->lock);
72e06c255   Tejun Heo   blkcg: shoot down...
354
  	}
  
  	q->root_blkg = NULL;
  	q->root_rl.blkg = NULL;
72e06c255   Tejun Heo   blkcg: shoot down...
358
  }
  /*
   * A group is RCU protected, but having an rcu lock does not mean that one
   * can access all the fields of blkg and assume these are valid.  For
   * example, don't try to follow throtl_data and request queue links.
   *
   * Having a reference to blkg under an rcu allows accesses to only values
   * local to groups like group stats and group rate limits.
   */
  void __blkg_release_rcu(struct rcu_head *rcu_head)
1adaf3dde   Tejun Heo   blkcg: move refcn...
368
  {
2a4fd070e   Tejun Heo   blkcg: move bulk ...
369
  	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
db6136703   Tejun Heo   blkcg: invoke blk...
370

3c5478659   Tejun Heo   blkcg: make blkcg...
371
  	/* release the blkcg and parent blkg refs this blkg has been holding */
1adaf3dde   Tejun Heo   blkcg: move refcn...
372
  	css_put(&blkg->blkcg->css);
a5049a8ae   Tejun Heo   blkcg: fix use-af...
373
  	if (blkg->parent)
3c5478659   Tejun Heo   blkcg: make blkcg...
374
  		blkg_put(blkg->parent);
1adaf3dde   Tejun Heo   blkcg: move refcn...
375

ce7acfeaf   Tejun Heo   writeback, blkcg:...
376
  	wb_congested_put(blkg->wb_congested);
2a4fd070e   Tejun Heo   blkcg: move bulk ...
377
  	blkg_free(blkg);
1adaf3dde   Tejun Heo   blkcg: move refcn...
378
  }
2a4fd070e   Tejun Heo   blkcg: move bulk ...
379
  EXPORT_SYMBOL_GPL(__blkg_release_rcu);
1adaf3dde   Tejun Heo   blkcg: move refcn...
380

  /*
   * The next function used by blk_queue_for_each_rl().  It's a bit tricky
   * because the root blkg uses @q->root_rl instead of its own rl.
   */
  struct request_list *__blk_queue_next_rl(struct request_list *rl,
  					 struct request_queue *q)
  {
  	struct list_head *ent;
  	struct blkcg_gq *blkg;
  
  	/*
  	 * Determine the current blkg list_head.  The first entry is
  	 * root_rl which is off @q->blkg_list and mapped to the head.
  	 */
  	if (rl == &q->root_rl) {
  		ent = &q->blkg_list;
  		/* There are no more block groups, hence no request lists */
  		if (list_empty(ent))
  			return NULL;
  	} else {
  		blkg = container_of(rl, struct blkcg_gq, rl);
  		ent = &blkg->q_node;
  	}
  
  	/* walk to the next list_head, skip root blkcg */
  	ent = ent->next;
  	if (ent == &q->root_blkg->q_node)
  		ent = ent->next;
  	if (ent == &q->blkg_list)
  		return NULL;
  
  	blkg = container_of(ent, struct blkcg_gq, q_node);
  	return &blkg->rl;
  }
  static int blkcg_reset_stats(struct cgroup_subsys_state *css,
  			     struct cftype *cftype, u64 val)
303a3acb2   Divyesh Shah   blkio: Add io con...
417
  {
182446d08   Tejun Heo   cgroup: pass arou...
418
  	struct blkcg *blkcg = css_to_blkcg(css);
3c798398e   Tejun Heo   blkcg: mass renam...
419
  	struct blkcg_gq *blkg;
bc0d6501a   Tejun Heo   blkcg: kill blkio...
420
  	int i;
303a3acb2   Divyesh Shah   blkio: Add io con...
421

838f13bf4   Tejun Heo   blkcg: allow blkc...
422
  	mutex_lock(&blkcg_pol_mutex);
303a3acb2   Divyesh Shah   blkio: Add io con...
423
  	spin_lock_irq(&blkcg->lock);
  
  	/*
  	 * Note that stat reset is racy - it doesn't synchronize against
  	 * stat updates.  This is a debug feature which shouldn't exist
  	 * anyway.  If you get hit by a race, retry.
  	 */
b67bfe0d4   Sasha Levin   hlist: drop the n...
430
  	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
  		blkg_rwstat_reset(&blkg->stat_bytes);
  		blkg_rwstat_reset(&blkg->stat_ios);
8bd435b30   Tejun Heo   blkcg: remove sta...
433
  		for (i = 0; i < BLKCG_MAX_POLS; i++) {
3c798398e   Tejun Heo   blkcg: mass renam...
434
  			struct blkcg_policy *pol = blkcg_policy[i];
549d3aa87   Tejun Heo   blkcg: make blkg-...
435

  			if (blkg->pd[i] && pol->pd_reset_stats_fn)
  				pol->pd_reset_stats_fn(blkg->pd[i]);
bc0d6501a   Tejun Heo   blkcg: kill blkio...
438
  		}
303a3acb2   Divyesh Shah   blkio: Add io con...
439
  	}
f0bdc8cdd   Vivek Goyal   blk-cgroup: Make ...
440

303a3acb2   Divyesh Shah   blkio: Add io con...
441
  	spin_unlock_irq(&blkcg->lock);
bc0d6501a   Tejun Heo   blkcg: kill blkio...
442
  	mutex_unlock(&blkcg_pol_mutex);
  	return 0;
  }
dd165eb3b   Tejun Heo   blkcg: misc prepa...
445
  const char *blkg_dev_name(struct blkcg_gq *blkg)
303a3acb2   Divyesh Shah   blkio: Add io con...
446
  {
d3d32e69f   Tejun Heo   blkcg: restructur...
447
  	/* some drivers (floppy) instantiate a queue w/o disk registered */
  	if (blkg->q->backing_dev_info->dev)
  		return dev_name(blkg->q->backing_dev_info->dev);
d3d32e69f   Tejun Heo   blkcg: restructur...
450
  	return NULL;
303a3acb2   Divyesh Shah   blkio: Add io con...
451
  }
dd165eb3b   Tejun Heo   blkcg: misc prepa...
452
  EXPORT_SYMBOL_GPL(blkg_dev_name);
303a3acb2   Divyesh Shah   blkio: Add io con...
453

  /**
   * blkcg_print_blkgs - helper for printing per-blkg data
   * @sf: seq_file to print to
   * @blkcg: blkcg of interest
   * @prfill: fill function to print out a blkg
   * @pol: policy in question
   * @data: data to be passed to @prfill
   * @show_total: to print out sum of prfill return values or not
   *
   * This function invokes @prfill on each blkg of @blkcg if pd for the
   * policy specified by @pol exists.  @prfill is invoked with @sf, the
   * policy data and @data and the matching queue lock held.  If @show_total
   * is %true, the sum of the return values from @prfill is printed with
   * "Total" label at the end.
   *
   * This is to be used to construct print functions for
 * cftype->seq_show method.
   */
3c798398e   Tejun Heo   blkcg: mass renam...
472
  void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
  		       u64 (*prfill)(struct seq_file *,
  				     struct blkg_policy_data *, int),
3c798398e   Tejun Heo   blkcg: mass renam...
475
  		       const struct blkcg_policy *pol, int data,
ec399347d   Tejun Heo   blkcg: use @pol i...
476
  		       bool show_total)
5624a4e44   Vivek Goyal   blk-throttle: Mak...
477
  {
3c798398e   Tejun Heo   blkcg: mass renam...
478
  	struct blkcg_gq *blkg;
d3d32e69f   Tejun Heo   blkcg: restructur...
479
  	u64 total = 0;
5624a4e44   Vivek Goyal   blk-throttle: Mak...
480

810ecfa76   Tejun Heo   blkcg: make blkcg...
481
  	rcu_read_lock();
ee89f8125   Linus Torvalds   Merge branch 'for...
482
  	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
810ecfa76   Tejun Heo   blkcg: make blkcg...
483
  		spin_lock_irq(blkg->q->queue_lock);
a2b1693ba   Tejun Heo   blkcg: implement ...
484
  		if (blkcg_policy_enabled(blkg->q, pol))
f95a04afa   Tejun Heo   blkcg: embed stru...
485
  			total += prfill(sf, blkg->pd[pol->plid], data);
  		spin_unlock_irq(blkg->q->queue_lock);
  	}
  	rcu_read_unlock();
  
  	if (show_total)
  		seq_printf(sf, "Total %llu
  ", (unsigned long long)total);
  }
829fdb500   Tejun Heo   blkcg: export con...
494
  EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
  
  /**
   * __blkg_prfill_u64 - prfill helper for a single u64 value
   * @sf: seq_file to print to
f95a04afa   Tejun Heo   blkcg: embed stru...
499
   * @pd: policy private data of interest
   * @v: value to print
   *
f95a04afa   Tejun Heo   blkcg: embed stru...
502
 * Print @v to @sf for the device associated with @pd.
d3d32e69f   Tejun Heo   blkcg: restructur...
503
   */
f95a04afa   Tejun Heo   blkcg: embed stru...
504
  u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
d3d32e69f   Tejun Heo   blkcg: restructur...
505
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
506
  	const char *dname = blkg_dev_name(pd->blkg);
  
  	if (!dname)
  		return 0;
  
  	seq_printf(sf, "%s %llu
  ", dname, (unsigned long long)v);
  	return v;
  }
829fdb500   Tejun Heo   blkcg: export con...
515
  EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
  
  /**
   * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
   * @sf: seq_file to print to
f95a04afa   Tejun Heo   blkcg: embed stru...
520
   * @pd: policy private data of interest
   * @rwstat: rwstat to print
   *
f95a04afa   Tejun Heo   blkcg: embed stru...
523
 * Print @rwstat to @sf for the device associated with @pd.
d3d32e69f   Tejun Heo   blkcg: restructur...
524
   */
f95a04afa   Tejun Heo   blkcg: embed stru...
525
  u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
829fdb500   Tejun Heo   blkcg: export con...
526
  			 const struct blkg_rwstat *rwstat)
  {
  	static const char *rwstr[] = {
  		[BLKG_RWSTAT_READ]	= "Read",
  		[BLKG_RWSTAT_WRITE]	= "Write",
  		[BLKG_RWSTAT_SYNC]	= "Sync",
  		[BLKG_RWSTAT_ASYNC]	= "Async",
636620b66   Tejun Heo   blkcg: Track DISC...
533
  		[BLKG_RWSTAT_DISCARD]	= "Discard",
d3d32e69f   Tejun Heo   blkcg: restructur...
534
  	};
f95a04afa   Tejun Heo   blkcg: embed stru...
535
  	const char *dname = blkg_dev_name(pd->blkg);
  	u64 v;
  	int i;
  
  	if (!dname)
  		return 0;
  
  	for (i = 0; i < BLKG_RWSTAT_NR; i++)
  		seq_printf(sf, "%s %s %llu
  ", dname, rwstr[i],
24bdb8ef0   Tejun Heo   blkcg: make blkcg...
545
  			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
d3d32e69f   Tejun Heo   blkcg: restructur...
546

24bdb8ef0   Tejun Heo   blkcg: make blkcg...
547
  	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
  		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
  		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
  	seq_printf(sf, "%s Total %llu
  ", dname, (unsigned long long)v);
  	return v;
  }
b50da39f5   Tejun Heo   blkcg: export __b...
554
  EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
d3d32e69f   Tejun Heo   blkcg: restructur...
555

  /**
   * blkg_prfill_stat - prfill callback for blkg_stat
   * @sf: seq_file to print to
   * @pd: policy private data of interest
   * @off: offset to the blkg_stat in @pd
   *
   * prfill callback for printing a blkg_stat.
   */
f95a04afa   Tejun Heo   blkcg: embed stru...
564
  u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
d3d32e69f   Tejun Heo   blkcg: restructur...
565
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
566
  	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
d3d32e69f   Tejun Heo   blkcg: restructur...
567
  }
5bc4afb1e   Tejun Heo   blkcg: drop BLKCG...
568
  EXPORT_SYMBOL_GPL(blkg_prfill_stat);
d3d32e69f   Tejun Heo   blkcg: restructur...
569

  /**
   * blkg_prfill_rwstat - prfill callback for blkg_rwstat
   * @sf: seq_file to print to
   * @pd: policy private data of interest
   * @off: offset to the blkg_rwstat in @pd
   *
   * prfill callback for printing a blkg_rwstat.
   */
  u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  		       int off)
d3d32e69f   Tejun Heo   blkcg: restructur...
580
  {
f95a04afa   Tejun Heo   blkcg: embed stru...
581
  	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
d3d32e69f   Tejun Heo   blkcg: restructur...
582

f95a04afa   Tejun Heo   blkcg: embed stru...
583
  	return __blkg_prfill_rwstat(sf, pd, &rwstat);
d3d32e69f   Tejun Heo   blkcg: restructur...
584
  }
5bc4afb1e   Tejun Heo   blkcg: drop BLKCG...
585
  EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
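/*
 * Illustrative sketch, not part of this file: a policy normally hands one of
 * the prfill helpers above to blkcg_print_blkgs() from its cftype->seq_show
 * callback.  "example_policy" and "struct example_pd" (a policy data struct
 * embedding its struct blkg_policy_data as the first member) are hypothetical
 * stand-ins assumed only for this sketch:
 *
 * static int example_print_rwstat(struct seq_file *sf, void *v)
 * {
 *	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
 *			  &example_policy,
 *			  offsetof(struct example_pd, serviced_rwstat), true);
 *	return 0;
 * }
 */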
d3d32e69f   Tejun Heo   blkcg: restructur...
586

  static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
  				    struct blkg_policy_data *pd, int off)
  {
  	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
  
  	return __blkg_prfill_rwstat(sf, pd, &rwstat);
  }
  
  /**
   * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
   * @sf: seq_file to print to
   * @v: unused
   *
   * To be used as cftype->seq_show to print blkg->stat_bytes.
   * cftype->private must be set to the blkcg_policy.
   */
  int blkg_print_stat_bytes(struct seq_file *sf, void *v)
  {
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
  			  offsetof(struct blkcg_gq, stat_bytes), true);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
  
  /**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
   * @sf: seq_file to print to
   * @v: unused
   *
   * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
   * must be set to the blkcg_policy.
   */
  int blkg_print_stat_ios(struct seq_file *sf, void *v)
  {
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
  			  offsetof(struct blkcg_gq, stat_ios), true);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
  
  static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
  					      struct blkg_policy_data *pd,
  					      int off)
  {
  	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
  							      NULL, off);
  	return __blkg_prfill_rwstat(sf, pd, &rwstat);
  }
  
  /**
   * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
   * @sf: seq_file to print to
   * @v: unused
   */
  int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
  {
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  			  blkg_prfill_rwstat_field_recursive,
  			  (void *)seq_cft(sf)->private,
  			  offsetof(struct blkcg_gq, stat_bytes), true);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
  
  /**
   * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
   * @sf: seq_file to print to
   * @v: unused
   */
  int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
  {
  	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  			  blkg_prfill_rwstat_field_recursive,
  			  (void *)seq_cft(sf)->private,
  			  offsetof(struct blkcg_gq, stat_ios), true);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
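/*
 * Illustrative sketch, not part of this file: wiring the seq_show helpers
 * above into a policy's cftype table.  The "example" names are hypothetical;
 * the pattern is simply to point .private at the owning struct blkcg_policy:
 *
 *	{
 *		.name = "example.io_service_bytes",
 *		.private = (unsigned long)&example_policy,
 *		.seq_show = blkg_print_stat_bytes,
 *	},
 *	{
 *		.name = "example.io_serviced",
 *		.private = (unsigned long)&example_policy,
 *		.seq_show = blkg_print_stat_ios,
 *	},
 */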
3a8b31d39   Tejun Heo   blkcg: restructur...
667
  /**
16b3de665   Tejun Heo   blkcg: implement ...
668
   * blkg_stat_recursive_sum - collect hierarchical blkg_stat
   * @blkg: blkg of interest
   * @pol: blkcg_policy which contains the blkg_stat
   * @off: offset to the blkg_stat in blkg_policy_data or @blkg
16b3de665   Tejun Heo   blkcg: implement ...
672
   *
   * Collect the blkg_stat specified by @blkg, @pol and @off and all its
   * online descendants and their aux counts.  The caller must be holding the
   * queue lock for online tests.
   *
   * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
   * at @off bytes into @blkg's blkg_policy_data of the policy.
16b3de665   Tejun Heo   blkcg: implement ...
679
   */
  u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
  			    struct blkcg_policy *pol, int off)
16b3de665   Tejun Heo   blkcg: implement ...
682
  {
16b3de665   Tejun Heo   blkcg: implement ...
683
  	struct blkcg_gq *pos_blkg;
492eb21b9   Tejun Heo   cgroup: make hier...
684
  	struct cgroup_subsys_state *pos_css;
bd8815a6d   Tejun Heo   cgroup: make css_...
685
  	u64 sum = 0;
16b3de665   Tejun Heo   blkcg: implement ...
686

f12c74cab   Tejun Heo   blkcg: make blkg_...
687
  	lockdep_assert_held(blkg->q->queue_lock);
16b3de665   Tejun Heo   blkcg: implement ...
688

16b3de665   Tejun Heo   blkcg: implement ...
689
  	rcu_read_lock();
  	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
  		struct blkg_stat *stat;
  
  		if (!pos_blkg->online)
  			continue;
16b3de665   Tejun Heo   blkcg: implement ...
695

  		if (pol)
  			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
  		else
  			stat = (void *)blkg + off;
  
  		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
  	}
  	rcu_read_unlock();
  
  	return sum;
  }
  EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  
  /**
   * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
   * @blkg: blkg of interest
   * @pol: blkcg_policy which contains the blkg_rwstat
   * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
16b3de665   Tejun Heo   blkcg: implement ...
714
   *
   * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
   * online descendants and their aux counts.  The caller must be holding the
   * queue lock for online tests.
   *
   * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
   * is at @off bytes into @blkg's blkg_policy_data of the policy.
16b3de665   Tejun Heo   blkcg: implement ...
721
   */
  struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
  					     struct blkcg_policy *pol, int off)
16b3de665   Tejun Heo   blkcg: implement ...
724
  {
16b3de665   Tejun Heo   blkcg: implement ...
725
  	struct blkcg_gq *pos_blkg;
492eb21b9   Tejun Heo   cgroup: make hier...
726
  	struct cgroup_subsys_state *pos_css;
bd8815a6d   Tejun Heo   cgroup: make css_...
727
  	struct blkg_rwstat sum = { };
16b3de665   Tejun Heo   blkcg: implement ...
728
  	int i;
f12c74cab   Tejun Heo   blkcg: make blkg_...
729
  	lockdep_assert_held(blkg->q->queue_lock);
16b3de665   Tejun Heo   blkcg: implement ...
730

16b3de665   Tejun Heo   blkcg: implement ...
731
  	rcu_read_lock();
f12c74cab   Tejun Heo   blkcg: make blkg_...
732
  	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
3a7faeada   Tejun Heo   blkcg: reduce sta...
733
  		struct blkg_rwstat *rwstat;
  
  		if (!pos_blkg->online)
  			continue;
  		if (pol)
  			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
  		else
  			rwstat = (void *)pos_blkg + off;
16b3de665   Tejun Heo   blkcg: implement ...
741
  		for (i = 0; i < BLKG_RWSTAT_NR; i++)
  			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
  				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
  				&sum.aux_cnt[i]);
  	}
  	rcu_read_unlock();
  
  	return sum;
  }
  EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
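/*
 * Illustrative sketch, not part of this file: a policy that embeds a
 * blkg_rwstat in its own blkg_policy_data can print hierarchical totals by
 * pairing the helper above with __blkg_prfill_rwstat().  "example_policy" is
 * a hypothetical struct blkcg_policy assumed only for this sketch:
 *
 * static u64 example_prfill_rwstat_recursive(struct seq_file *sf,
 *					      struct blkg_policy_data *pd,
 *					      int off)
 * {
 *	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd->blkg,
 *							   &example_policy, off);
 *
 *	return __blkg_prfill_rwstat(sf, pd, &sum);
 * }
 */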
  /* Performs queue bypass and policy enabled checks then looks up blkg. */
  static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
  					  const struct blkcg_policy *pol,
  					  struct request_queue *q)
  {
  	WARN_ON_ONCE(!rcu_read_lock_held());
  	lockdep_assert_held(q->queue_lock);
  
  	if (!blkcg_policy_enabled(q, pol))
  		return ERR_PTR(-EOPNOTSUPP);
  
  	/*
  	 * This could be the first entry point of blkcg implementation and
  	 * we shouldn't allow anything to go through for a bypassing queue.
  	 */
  	if (unlikely(blk_queue_bypass(q)))
  		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
  
  	return __blkg_lookup(blkcg, q, true /* update_hint */);
  }
16b3de665   Tejun Heo   blkcg: implement ...
771
  /**
   * blkg_conf_prep - parse and prepare for per-blkg config update
   * @blkcg: target block cgroup
da8b06626   Tejun Heo   blkcg: make blkg_...
774
   * @pol: target policy
   * @input: input string
   * @ctx: blkg_conf_ctx to be filled
   *
   * Parse per-blkg config update from @input and initialize @ctx with the
   * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
   * part of @input following MAJ:MIN.  This function returns with RCU read
   * lock and queue lock held and must be paired with blkg_conf_finish().
3a8b31d39   Tejun Heo   blkcg: restructur...
782
   */
3c798398e   Tejun Heo   blkcg: mass renam...
783
  int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
36aa9e5f5   Tejun Heo   blkcg: move body ...
784
  		   char *input, struct blkg_conf_ctx *ctx)
da8b06626   Tejun Heo   blkcg: make blkg_...
785
  	__acquires(rcu) __acquires(disk->queue->queue_lock)
34d0f179d   Gui Jianfeng   io-controller: Ad...
786
  {
3a8b31d39   Tejun Heo   blkcg: restructur...
787
  	struct gendisk *disk;
457e490f2   Tahsin Erdogan   blkcg: allocate s...
788
  	struct request_queue *q;
3c798398e   Tejun Heo   blkcg: mass renam...
789
  	struct blkcg_gq *blkg;
726fa6945   Tejun Heo   blkcg: simplify b...
790
  	unsigned int major, minor;
  	int key_len, part, ret;
  	char *body;
34d0f179d   Gui Jianfeng   io-controller: Ad...
793

36aa9e5f5   Tejun Heo   blkcg: move body ...
794
  	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
726fa6945   Tejun Heo   blkcg: simplify b...
795
  		return -EINVAL;
3a8b31d39   Tejun Heo   blkcg: restructur...
796

  	body = input + key_len;
  	if (!isspace(*body))
  		return -EINVAL;
  	body = skip_spaces(body);
726fa6945   Tejun Heo   blkcg: simplify b...
801
  	disk = get_gendisk(MKDEV(major, minor), &part);
5f6c2d2b7   Tejun Heo   blkcg: fix gendis...
802
  	if (!disk)
20386ce01   Tejun Heo   blkcg: refine err...
803
  		return -ENODEV;
5f6c2d2b7   Tejun Heo   blkcg: fix gendis...
804
  	if (part) {
  		ret = -ENODEV;
  		goto fail;
5f6c2d2b7   Tejun Heo   blkcg: fix gendis...
807
  	}
e56da7e28   Tejun Heo   blkcg: don't allo...
808

457e490f2   Tahsin Erdogan   blkcg: allocate s...
809
  	q = disk->queue;
da8b06626   Tejun Heo   blkcg: make blkg_...
810

  	rcu_read_lock();
  	spin_lock_irq(q->queue_lock);
e56da7e28   Tejun Heo   blkcg: don't allo...
813

457e490f2   Tahsin Erdogan   blkcg: allocate s...
814
  	blkg = blkg_lookup_check(blkcg, pol, q);
  	if (IS_ERR(blkg)) {
  		ret = PTR_ERR(blkg);
  		goto fail_unlock;
  	}
  
  	if (blkg)
  		goto success;
  
  	/*
  	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
  	 * non-root blkgs have access to their parents.
  	 */
  	while (true) {
  		struct blkcg *pos = blkcg;
  		struct blkcg *parent;
  		struct blkcg_gq *new_blkg;
  
  		parent = blkcg_parent(blkcg);
  		while (parent && !__blkg_lookup(parent, q, false)) {
  			pos = parent;
  			parent = blkcg_parent(parent);
  		}
  
  		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
  		spin_unlock_irq(q->queue_lock);
3a8b31d39   Tejun Heo   blkcg: restructur...
840
  		rcu_read_unlock();
  
  		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
  		if (unlikely(!new_blkg)) {
  			ret = -ENOMEM;
  			goto fail;
7702e8f45   Vivek Goyal   blk-cgroup: cgrou...
846
  		}
3a8b31d39   Tejun Heo   blkcg: restructur...
847

  		rcu_read_lock();
  		spin_lock_irq(q->queue_lock);
  
  		blkg = blkg_lookup_check(pos, pol, q);
  		if (IS_ERR(blkg)) {
  			ret = PTR_ERR(blkg);
  			goto fail_unlock;
  		}
  
  		if (blkg) {
  			blkg_free(new_blkg);
  		} else {
  			blkg = blkg_create(pos, q, new_blkg);
  			if (unlikely(IS_ERR(blkg))) {
  				ret = PTR_ERR(blkg);
  				goto fail_unlock;
  			}
  		}
  
  		if (pos == blkcg)
  			goto success;
  	}
  success:
  	ctx->disk = disk;
  	ctx->blkg = blkg;
36aa9e5f5   Tejun Heo   blkcg: move body ...
873
  	ctx->body = body;
726fa6945   Tejun Heo   blkcg: simplify b...
874
  	return 0;
  
  fail_unlock:
  	spin_unlock_irq(q->queue_lock);
  	rcu_read_unlock();
  fail:
9df6c2991   Jan Kara   genhd: Add helper...
880
  	put_disk_and_module(disk);
  	/*
  	 * If queue was bypassing, we should retry.  Do so after a
  	 * short msleep().  It isn't strictly necessary but queue
  	 * can be bypassing for some time and it's always nice to
  	 * avoid busy looping.
  	 */
  	if (ret == -EBUSY) {
  		msleep(10);
  		ret = restart_syscall();
  	}
  	return ret;
34d0f179d   Gui Jianfeng   io-controller: Ad...
892
  }
829fdb500   Tejun Heo   blkcg: export con...
893
  EXPORT_SYMBOL_GPL(blkg_conf_prep);
34d0f179d   Gui Jianfeng   io-controller: Ad...
894

  /**
   * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
   *
   * Finish up after per-blkg config update.  This function must be paired
   * with blkg_conf_prep().
   */
829fdb500   Tejun Heo   blkcg: export con...
902
  void blkg_conf_finish(struct blkg_conf_ctx *ctx)
da8b06626   Tejun Heo   blkcg: make blkg_...
903
  	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
34d0f179d   Gui Jianfeng   io-controller: Ad...
904
  {
da8b06626   Tejun Heo   blkcg: make blkg_...
905
  	spin_unlock_irq(ctx->disk->queue->queue_lock);
3a8b31d39   Tejun Heo   blkcg: restructur...
906
  	rcu_read_unlock();
9df6c2991   Jan Kara   genhd: Add helper...
907
  	put_disk_and_module(ctx->disk);
34d0f179d   Gui Jianfeng   io-controller: Ad...
908
  }
829fdb500   Tejun Heo   blkcg: export con...
909
  EXPORT_SYMBOL_GPL(blkg_conf_finish);
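/*
 * Illustrative sketch, not part of this file: the usual shape of a policy's
 * per-blkg config write handler built on blkg_conf_prep()/blkg_conf_finish().
 * "example_policy" and example_update_limit() are hypothetical; the point is
 * that ctx.body holds the text following MAJ:MIN and that both RCU and the
 * queue lock are held between prep and finish:
 *
 * static ssize_t example_set_limit(struct kernfs_open_file *of,
 *				    char *buf, size_t nbytes, loff_t off)
 * {
 *	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *	struct blkg_conf_ctx ctx;
 *	u64 limit;
 *	int ret;
 *
 *	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
 *	if (ret)
 *		return ret;
 *
 *	ret = -EINVAL;
 *	if (sscanf(ctx.body, "%llu", &limit) == 1) {
 *		example_update_limit(ctx.blkg, limit);
 *		ret = 0;
 *	}
 *
 *	blkg_conf_finish(&ctx);
 *	return ret ?: nbytes;
 * }
 */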
34d0f179d   Gui Jianfeng   io-controller: Ad...
910

  static int blkcg_print_stat(struct seq_file *sf, void *v)
  {
  	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  	struct blkcg_gq *blkg;
  
  	rcu_read_lock();
  
  	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
  		const char *dname;
903d23f0a   Josef Bacik   blk-cgroup: allow...
920
  		char *buf;
2ee867dcf   Tejun Heo   blkcg: implement ...
921
  		struct blkg_rwstat rwstat;
636620b66   Tejun Heo   blkcg: Track DISC...
922
  		u64 rbytes, wbytes, rios, wios, dbytes, dios;
  		size_t size = seq_get_buf(sf, &buf), off = 0;
  		int i;
  		bool has_stats = false;
  
  		dname = blkg_dev_name(blkg);
  		if (!dname)
  			continue;
  		/*
  		 * Hooray string manipulation, count is the size written NOT
  		 * INCLUDING THE \0, so size is now count+1 less than what we
  		 * had before, but we want to start writing the next bit from
  		 * the \0 so we only add count to buf.
  		 */
  		off += scnprintf(buf+off, size-off, "%s ", dname);
  		spin_lock_irq(blkg->q->queue_lock);
  
  		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
  					offsetof(struct blkcg_gq, stat_bytes));
  		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
  		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
636620b66   Tejun Heo   blkcg: Track DISC...
943
  		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
  
  		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
  					offsetof(struct blkcg_gq, stat_ios));
  		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
  		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
636620b66   Tejun Heo   blkcg: Track DISC...
949
  		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
  
  		spin_unlock_irq(blkg->q->queue_lock);
  		if (rbytes || wbytes || rios || wios) {
  			has_stats = true;
  			off += scnprintf(buf+off, size-off,
  					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
  					 rbytes, wbytes, rios, wios,
  					 dbytes, dios);
  		}
  
  		if (!blkcg_debug_stats)
  			goto next;
  		if (atomic_read(&blkg->use_delay)) {
  			has_stats = true;
  			off += scnprintf(buf+off, size-off,
  					 " use_delay=%d delay_nsec=%llu",
  					 atomic_read(&blkg->use_delay),
  					(unsigned long long)atomic64_read(&blkg->delay_nsec));
  		}
  		for (i = 0; i < BLKCG_MAX_POLS; i++) {
  			struct blkcg_policy *pol = blkcg_policy[i];
  			size_t written;
  
  			if (!blkg->pd[i] || !pol->pd_stat_fn)
  				continue;
  
  			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
  			if (written)
  				has_stats = true;
  			off += written;
  		}
  next:
  		if (has_stats) {
  			off += scnprintf(buf+off, size-off, "
  ");
  			seq_commit(sf, off);
  		}
  	}
  
  	rcu_read_unlock();
  	return 0;
  }
e1f3b9412   Bart Van Assche   block/blk-cgroup....
992
  static struct cftype blkcg_files[] = {
  	{
  		.name = "stat",
ca0752c5e   Tejun Heo   blkcg: don't crea...
995
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = blkcg_print_stat,
  	},
  	{ }	/* terminate */
  };
e1f3b9412   Bart Van Assche   block/blk-cgroup....
1000
  static struct cftype blkcg_legacy_files[] = {
31e4c28d9   Vivek Goyal   blkio: Introduce ...
1001
  	{
84c124da9   Divyesh Shah   blkio: Changes to...
1002
  		.name = "reset_stats",
3c798398e   Tejun Heo   blkcg: mass renam...
1003
  		.write_u64 = blkcg_reset_stats,
220841906   Vivek Goyal   blkio: Export dis...
1004
  	},
4baf6e332   Tejun Heo   cgroup: convert a...
1005
  	{ }	/* terminate */
31e4c28d9   Vivek Goyal   blkio: Introduce ...
1006
  };
  /*
   * blkcg destruction is a three-stage process.
   *
   * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
   *    which offlines writeback.  Here we tie the next stage of blkg destruction
   *    to the completion of writeback associated with the blkcg.  This lets us
   *    avoid punting potentially large amounts of outstanding writeback to root
   *    while maintaining any ongoing policies.  The next stage is triggered when
   *    the nr_cgwbs count goes to zero.
   *
   * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
   *    and handles the destruction of blkgs.  Here the css reference held by
   *    the blkg is put back eventually allowing blkcg_css_free() to be called.
   *    This work may occur in cgwb_release_workfn() on the cgwb_release
   *    workqueue.  Any submitted ios that fail to get the blkg ref will be
   *    punted to the root_blkg.
   *
   * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
   *    This finally frees the blkcg.
   */
9f13ef678   Tejun Heo   blkcg: use double...
1027
  /**
92fb97487   Tejun Heo   cgroup: rename ->...
1028
   * blkcg_css_offline - cgroup css_offline callback
eb95419b0   Tejun Heo   cgroup: pass arou...
1029
   * @css: css of interest
9f13ef678   Tejun Heo   blkcg: use double...
1030
   *
   * This function is called when @css is about to go away.  Here the cgwbs are
   * offlined first and only once writeback associated with the blkcg has
   * finished do we start step 2 (see above).
9f13ef678   Tejun Heo   blkcg: use double...
1034
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1035
  static void blkcg_css_offline(struct cgroup_subsys_state *css)
31e4c28d9   Vivek Goyal   blkio: Introduce ...
1036
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1037
  	struct blkcg *blkcg = css_to_blkcg(css);
b1c357696   Vivek Goyal   blkio: Take care ...
1038

  	/* this prevents anyone from attaching or migrating to this blkcg */
  	wb_blkcg_offline(blkcg);
  
  	/* put the base cgwb reference allowing step 2 to be triggered */
  	blkcg_cgwb_put(blkcg);
  }
  
  /**
   * blkcg_destroy_blkgs - responsible for shooting down blkgs
   * @blkcg: blkcg of interest
   *
   * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
   * is nested inside q lock, this function performs reverse double lock dancing.
   * Destroying the blkgs releases the reference held on the blkcg's css allowing
   * blkcg_css_free to eventually be called.
   *
   * This is the blkcg counterpart of ioc_release_fn().
   */
  void blkcg_destroy_blkgs(struct blkcg *blkcg)
  {
	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
	refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}
  /**
   * blkcg_init_queue - initialize blkcg part of request queue
   * @q: request_queue to initialize
   *
   * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
   * part of new request_queue @q.
   *
   * RETURNS:
   * 0 on success, -errno on failure.
   */
  int blkcg_init_queue(struct request_queue *q)
  {
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	q->root_rl.blkg = blkg;
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_iolatency_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
		return ret;
	}

	ret = blk_throtl_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
	}
	return ret;

err_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
  }
  
  /**
   * blkcg_drain_queue - drain blkcg part of request_queue
   * @q: request_queue to drain
   *
   * Called from blk_drain_queue().  Responsible for draining blkcg part.
   */
  void blkcg_drain_queue(struct request_queue *q)
  {
  	lockdep_assert_held(q->queue_lock);

	/*
	 * @q could be exiting and already have destroyed all blkgs as
	 * indicated by NULL root_blkg.  If so, don't confuse policies.
	 */
	if (!q->root_blkg)
		return;

  	blk_throtl_drain(q);
  }
  
  /**
   * blkcg_exit_queue - exit and release blkcg part of request_queue
   * @q: request_queue being released
   *
   * Called from blk_release_queue().  Responsible for exiting blkcg part.
   */
  void blkcg_exit_queue(struct request_queue *q)
  {
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}

  /*
 * We cannot support shared io contexts, as we have no means to support
   * two tasks with the same ioc in two different groups without major rework
   * of the main cic data structures.  For now we allow a task to change
   * its cgroup only if it's the only owner of its ioc.
   */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}

	return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
  {
  	int i;
  
  	mutex_lock(&blkcg_pol_mutex);
  
  	for (i = 0; i < BLKCG_MAX_POLS; i++) {
  		struct blkcg_policy *pol = blkcg_policy[i];
  		struct blkcg *blkcg;
  
  		if (!pol || !pol->cpd_bind_fn)
  			continue;
  
  		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
  			if (blkcg->cpd[pol->plid])
  				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
  	}
  	mutex_unlock(&blkcg_pol_mutex);
  }

static void blkcg_exit(struct task_struct *tsk)
  {
  	if (tsk->throttle_queue)
  		blk_put_queue(tsk->throttle_queue);
  	tsk->throttle_queue = NULL;
  }

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
   * blkcg_activate_policy - activate a blkcg policy on a request_queue
   * @q: request_queue of interest
   * @pol: blkcg policy to activate
   *
   * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
   * bypass mode to populate its blkgs with policy_data for @pol.
   *
   * Activation happens with @q bypassed, so nobody would be accessing blkgs
   * from IO path.  Update of each blkg is protected by both queue and blkcg
   * locks so that holding either lock and testing blkcg_policy_enabled() is
   * always enough for dereferencing policy data.
   *
   * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
   */
  int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);
pd_prealloc:
	if (!pd_prealloc) {
		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
		if (!pd_prealloc) {
			ret = -ENOMEM;
			goto out_bypass_end;
		}
	}

	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
		if (!pd)
			swap(pd, pd_prealloc);
		if (!pd) {
			spin_unlock_irq(q->queue_lock);
			goto pd_prealloc;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(q->queue_lock);
out_bypass_end:
	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
  	return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_activate_policy);
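
/*
 * Usage sketch (illustrative, not part of this file): a policy typically
 * enables itself on a queue from its per-queue init hook and tears itself
 * down from the matching exit hook, e.g. blk-throttle does roughly
 *
 *	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 *	...
 *	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
 *
 * from blk_throtl_init() / blk_throtl_exit().  The exact call sites live in
 * blk-throttle.c; this only sketches the expected activate/deactivate pairing.
 */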
  
  /**
   * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
   * @q: request_queue of interest
   * @pol: blkcg policy to deactivate
   *
   * Deactivate @pol on @q.  Follows the same synchronization rules as
   * blkcg_activate_policy().
   */
  void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);

	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(q->queue_lock);

	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
  }
  EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
  
  /**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
		(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
  EXPORT_SYMBOL_GPL(blkcg_policy_register);
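
/*
 * Usage sketch (illustrative): a policy registers itself once at boot or
 * module init and unregisters on module exit, along the lines of
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.dfl_cftypes	= foo_files,
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_foo);
 *	}
 *
 * "foo" above is a made-up example policy; blk-throttle and blk-iolatency
 * are in-tree users of this interface.
 */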

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

  /*
   * Scale the accumulated delay based on how long it has been since we updated
   * the delay.  We only call this when we are adding delay, in case it's been a
   * while since we added delay, and when we are checking to see if we need to
   * delay a task, to account for any delays that may have occurred.
   */
  static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  {
  	u64 old = atomic64_read(&blkg->delay_start);
  
  	/*
  	 * We only want to scale down every second.  The idea here is that we
  	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
  	 * time window.  We only want to throttle tasks for recent delay that
  	 * has occurred, in 1 second time windows since that's the maximum
  	 * things can be throttled.  We save the current delay window in
  	 * blkg->last_delay so we know what amount is still left to be charged
  	 * to the blkg from this point onward.  blkg->last_use keeps track of
  	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
  	 * are ok with whatever is happening now, and we can take away more of
  	 * the accumulated delay as we've already throttled enough that
  	 * everybody is happy with their IO latencies.
  	 */
  	if (time_before64(old + NSEC_PER_SEC, now) &&
  	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
  		u64 cur = atomic64_read(&blkg->delay_nsec);
  		u64 sub = min_t(u64, blkg->last_delay, now - old);
  		int cur_use = atomic_read(&blkg->use_delay);
  
  		/*
  		 * We've been unthrottled, subtract a larger chunk of our
  		 * accumulated delay.
  		 */
  		if (cur_use < blkg->last_use)
  			sub = max_t(u64, sub, blkg->last_delay >> 1);
  
  		/*
  		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
  		 * should only ever be growing except here where we subtract out
  		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
  		 * rather not end up with negative numbers.
  		 */
  		if (unlikely(cur < sub)) {
  			atomic64_set(&blkg->delay_nsec, 0);
  			blkg->last_delay = 0;
  		} else {
  			atomic64_sub(sub, &blkg->delay_nsec);
  			blkg->last_delay = cur - sub;
  		}
  		blkg->last_use = cur_use;
  	}
  }
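
/*
 * Worked example (illustrative numbers): assume delay_nsec has grown to 3s,
 * last_delay was capped at 1s, and more than a second has passed since
 * delay_start while use_delay dropped (we were unthrottled).  Then
 * sub = max(min(last_delay, elapsed), last_delay / 2) = 1s, so delay_nsec
 * becomes 2s and last_delay is reset to 2s; repeated calls keep decaying the
 * accumulated delay once throttling is relaxed.
 */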
  
  /*
   * This is called when we want to actually walk up the hierarchy and check to
   * see if we need to throttle, and then actually throttle if there is some
   * accumulated delay.  This should only be called upon return to user space so
   * we're not holding some lock that would induce a priority inversion.
   */
  static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
  {
  	u64 now = ktime_to_ns(ktime_get());
  	u64 exp;
  	u64 delay_nsec = 0;
  	int tok;
  
  	while (blkg->parent) {
  		if (atomic_read(&blkg->use_delay)) {
  			blkcg_scale_delay(blkg, now);
  			delay_nsec = max_t(u64, delay_nsec,
  					   atomic64_read(&blkg->delay_nsec));
  		}
  		blkg = blkg->parent;
  	}
  
  	if (!delay_nsec)
  		return;
  
  	/*
  	 * Let's not sleep for all eternity if we've amassed a huge delay.
  	 * Swapping or metadata IO can accumulate 10's of seconds worth of
  	 * delay, and we want userspace to be able to do _something_ so cap the
	 * delays at 0.25s (the clamp below).  If there's 10's of seconds worth
	 * of delay then the tasks will be delayed for 0.25 seconds for every
	 * syscall.
  	 */
  	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
  
  	/*
  	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
  	 * that hasn't landed upstream yet.  Once that stuff is in place we need
  	 * to do a psi_memstall_enter/leave if memdelay is set.
  	 */
  
  	exp = ktime_add_ns(now, delay_nsec);
  	tok = io_schedule_prepare();
  	do {
  		__set_current_state(TASK_KILLABLE);
  		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
  			break;
  	} while (!fatal_signal_pending(current));
  	io_schedule_finish(tok);
  }
  
  /**
   * blkcg_maybe_throttle_current - throttle the current task if it has been marked
   *
   * This is only called if we've been marked with set_notify_resume().  Obviously
   * we can be set_notify_resume() for reasons other than blkcg throttling, so we
   * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code; it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is set up for throttling.
   */
  void blkcg_maybe_throttle_current(void)
  {
  	struct request_queue *q = current->throttle_queue;
  	struct cgroup_subsys_state *css;
  	struct blkcg *blkcg;
  	struct blkcg_gq *blkg;
  	bool use_memdelay = current->use_memdelay;
  
  	if (!q)
  		return;
  
  	current->throttle_queue = NULL;
  	current->use_memdelay = false;
  
  	rcu_read_lock();
  	css = kthread_blkcg();
  	if (css)
  		blkcg = css_to_blkcg(css);
  	else
  		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
  
  	if (!blkcg)
  		goto out;
  	blkg = blkg_lookup(blkcg, q);
  	if (!blkg)
  		goto out;
  	blkg = blkg_try_get(blkg);
  	if (!blkg)
  		goto out;
  	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
  	return;
  out:
  	rcu_read_unlock();
  	blk_put_queue(q);
  }
  EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
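
/*
 * Call-path sketch (illustrative): this runs from the return-to-user path;
 * with CONFIG_BLK_CGROUP enabled, tracehook_notify_resume() invokes
 * blkcg_maybe_throttle_current() before the task re-enters user space, so the
 * delay charged via blkcg_schedule_throttle() is applied without holding any
 * locks.
 */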
  
  /**
   * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
   *
   * This is called by the IO controller when we know there's delay accumulated
   * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information; the swapping code, for
 * instance, will only have a request_queue at that point.  This sets the
   * notify_resume for the task to check and see if it requires throttling before
   * returning to user space.
   *
   * We will only schedule once per syscall.  You can call this over and over
   * again and it will only do the check once upon return to user space, and only
   * throttle once.  If the task needs to be throttled again it'll need to be
   * re-set at the next time we see the task.
   */
  void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
  {
  	if (unlikely(current->flags & PF_KTHREAD))
  		return;
  
  	if (!blk_get_queue(q))
  		return;
  
  	if (current->throttle_queue)
  		blk_put_queue(current->throttle_queue);
  	current->throttle_queue = q;
  	if (use_memdelay)
  		current->use_memdelay = use_memdelay;
  	set_notify_resume(current);
  }
  EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
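
/*
 * Usage sketch (illustrative): an IO controller that detects it is over its
 * latency target first charges the extra time to the blkg and then asks for
 * the task to be throttled on its way back to user space, roughly
 *
 *	blkcg_add_delay(blkg, now, extra_delay_ns);
 *	blkcg_schedule_throttle(blkg->q, use_memdelay);
 *
 * blk-iolatency follows approximately this pattern; extra_delay_ns and
 * use_memdelay above are placeholder names for whatever the caller computed.
 */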
  
  /**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
   *
   * Charge @delta to the blkg's current delay accumulation.  This is used to
   * throttle tasks if an IO controller thinks we need more throttling.
   */
  void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
  {
  	blkcg_scale_delay(blkg, now);
  	atomic64_add(delta, &blkg->delay_nsec);
  }
  EXPORT_SYMBOL_GPL(blkcg_add_delay);
  module_param(blkcg_debug_stats, bool, 0644);
  MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");