Commit 0049af73bb4b74d1407db59caefc5fe057ee434a

Authored by Tejun Heo
1 parent e2d57e6019

blk-throttle: reorganize throtl_service_queue passed around as argument

throtl_service_queue will be the building block of hierarchy support
and will form a tree.  This patch updates how it is passed around as an
argument to reduce confusion.

* When a service queue is used as the parent role - the host of the
  rbtree - use @parent_sq instead of @sq.

* For functions taking both @tg and @parent_sq, reorder them so that
  the order is (@tg, @parent_sq), not the other way around.  This makes
  the code follow the usual convention of specifying the primary
  target of the operation as the first argument, as illustrated below.
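
For example, throtl_enqueue_tg()'s prototype changes as follows (a minimal
before/after sketch lifted from the diff below, shown only to illustrate
the convention):

  /* before: the rbtree-hosting service queue came first */
  static void throtl_enqueue_tg(struct throtl_service_queue *sq,
                                struct throtl_grp *tg);

  /* after: the primary target @tg comes first, the host is @parent_sq */
  static void throtl_enqueue_tg(struct throtl_grp *tg,
                                struct throtl_service_queue *parent_sq);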

This patch doesn't make any functional differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>

Showing 1 changed file with 51 additions and 49 deletions

block/blk-throttle.c
1 /* 1 /*
2 * Interface for controlling IO bandwidth on a request queue 2 * Interface for controlling IO bandwidth on a request queue
3 * 3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */ 5 */
6 6
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/bio.h> 10 #include <linux/bio.h>
11 #include <linux/blktrace_api.h> 11 #include <linux/blktrace_api.h>
12 #include "blk-cgroup.h" 12 #include "blk-cgroup.h"
13 #include "blk.h" 13 #include "blk.h"
14 14
15 /* Max dispatch from a group in 1 round */ 15 /* Max dispatch from a group in 1 round */
16 static int throtl_grp_quantum = 8; 16 static int throtl_grp_quantum = 8;
17 17
18 /* Total max dispatch from all groups in one round */ 18 /* Total max dispatch from all groups in one round */
19 static int throtl_quantum = 32; 19 static int throtl_quantum = 32;
20 20
21 /* Throttling is performed over 100ms slice and after that slice is renewed */ 21 /* Throttling is performed over 100ms slice and after that slice is renewed */
22 static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22 static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 23
24 static struct blkcg_policy blkcg_policy_throtl; 24 static struct blkcg_policy blkcg_policy_throtl;
25 25
26 /* A workqueue to queue throttle related work */ 26 /* A workqueue to queue throttle related work */
27 static struct workqueue_struct *kthrotld_workqueue; 27 static struct workqueue_struct *kthrotld_workqueue;
28 28
29 struct throtl_service_queue { 29 struct throtl_service_queue {
30 struct rb_root pending_tree; /* RB tree of active tgs */ 30 struct rb_root pending_tree; /* RB tree of active tgs */
31 struct rb_node *first_pending; /* first node in the tree */ 31 struct rb_node *first_pending; /* first node in the tree */
32 unsigned int nr_pending; /* # queued in the tree */ 32 unsigned int nr_pending; /* # queued in the tree */
33 unsigned long first_pending_disptime; /* disptime of the first tg */ 33 unsigned long first_pending_disptime; /* disptime of the first tg */
34 }; 34 };
35 35
36 #define THROTL_SERVICE_QUEUE_INITIALIZER \ 36 #define THROTL_SERVICE_QUEUE_INITIALIZER \
37 (struct throtl_service_queue){ .pending_tree = RB_ROOT } 37 (struct throtl_service_queue){ .pending_tree = RB_ROOT }
38 38
39 enum tg_state_flags { 39 enum tg_state_flags {
40 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 40 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
41 }; 41 };
42 42
43 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 43 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
44 44
45 /* Per-cpu group stats */ 45 /* Per-cpu group stats */
46 struct tg_stats_cpu { 46 struct tg_stats_cpu {
47 /* total bytes transferred */ 47 /* total bytes transferred */
48 struct blkg_rwstat service_bytes; 48 struct blkg_rwstat service_bytes;
49 /* total IOs serviced, post merge */ 49 /* total IOs serviced, post merge */
50 struct blkg_rwstat serviced; 50 struct blkg_rwstat serviced;
51 }; 51 };
52 52
53 struct throtl_grp { 53 struct throtl_grp {
54 /* must be the first member */ 54 /* must be the first member */
55 struct blkg_policy_data pd; 55 struct blkg_policy_data pd;
56 56
57 /* active throtl group service_queue member */ 57 /* active throtl group service_queue member */
58 struct rb_node rb_node; 58 struct rb_node rb_node;
59 59
60 /* throtl_data this group belongs to */ 60 /* throtl_data this group belongs to */
61 struct throtl_data *td; 61 struct throtl_data *td;
62 62
63 /* 63 /*
64 * Dispatch time in jiffies. This is the estimated time when group 64 * Dispatch time in jiffies. This is the estimated time when group
65 * will unthrottle and is ready to dispatch more bio. It is used as 65 * will unthrottle and is ready to dispatch more bio. It is used as
66 * key to sort active groups in service tree. 66 * key to sort active groups in service tree.
67 */ 67 */
68 unsigned long disptime; 68 unsigned long disptime;
69 69
70 unsigned int flags; 70 unsigned int flags;
71 71
72 /* Two lists for READ and WRITE */ 72 /* Two lists for READ and WRITE */
73 struct bio_list bio_lists[2]; 73 struct bio_list bio_lists[2];
74 74
75 /* Number of queued bios on READ and WRITE lists */ 75 /* Number of queued bios on READ and WRITE lists */
76 unsigned int nr_queued[2]; 76 unsigned int nr_queued[2];
77 77
78 /* bytes per second rate limits */ 78 /* bytes per second rate limits */
79 uint64_t bps[2]; 79 uint64_t bps[2];
80 80
81 /* IOPS limits */ 81 /* IOPS limits */
82 unsigned int iops[2]; 82 unsigned int iops[2];
83 83
84 /* Number of bytes disptached in current slice */ 84 /* Number of bytes disptached in current slice */
85 uint64_t bytes_disp[2]; 85 uint64_t bytes_disp[2];
86 /* Number of bio's dispatched in current slice */ 86 /* Number of bio's dispatched in current slice */
87 unsigned int io_disp[2]; 87 unsigned int io_disp[2];
88 88
89 /* When did we start a new slice */ 89 /* When did we start a new slice */
90 unsigned long slice_start[2]; 90 unsigned long slice_start[2];
91 unsigned long slice_end[2]; 91 unsigned long slice_end[2];
92 92
93 /* Per cpu stats pointer */ 93 /* Per cpu stats pointer */
94 struct tg_stats_cpu __percpu *stats_cpu; 94 struct tg_stats_cpu __percpu *stats_cpu;
95 95
96 /* List of tgs waiting for per cpu stats memory to be allocated */ 96 /* List of tgs waiting for per cpu stats memory to be allocated */
97 struct list_head stats_alloc_node; 97 struct list_head stats_alloc_node;
98 }; 98 };
99 99
100 struct throtl_data 100 struct throtl_data
101 { 101 {
102 /* service tree for active throtl groups */ 102 /* service tree for active throtl groups */
103 struct throtl_service_queue service_queue; 103 struct throtl_service_queue service_queue;
104 104
105 struct request_queue *queue; 105 struct request_queue *queue;
106 106
107 /* Total Number of queued bios on READ and WRITE lists */ 107 /* Total Number of queued bios on READ and WRITE lists */
108 unsigned int nr_queued[2]; 108 unsigned int nr_queued[2];
109 109
110 /* 110 /*
111 * number of total undestroyed groups 111 * number of total undestroyed groups
112 */ 112 */
113 unsigned int nr_undestroyed_grps; 113 unsigned int nr_undestroyed_grps;
114 114
115 /* Work for dispatching throttled bios */ 115 /* Work for dispatching throttled bios */
116 struct delayed_work dispatch_work; 116 struct delayed_work dispatch_work;
117 }; 117 };
118 118
119 /* list and work item to allocate percpu group stats */ 119 /* list and work item to allocate percpu group stats */
120 static DEFINE_SPINLOCK(tg_stats_alloc_lock); 120 static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121 static LIST_HEAD(tg_stats_alloc_list); 121 static LIST_HEAD(tg_stats_alloc_list);
122 122
123 static void tg_stats_alloc_fn(struct work_struct *); 123 static void tg_stats_alloc_fn(struct work_struct *);
124 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 124 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 125
126 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 126 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127 { 127 {
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129 } 129 }
130 130
131 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) 131 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132 { 132 {
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); 133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134 } 134 }
135 135
136 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 136 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137 { 137 {
138 return pd_to_blkg(&tg->pd); 138 return pd_to_blkg(&tg->pd);
139 } 139 }
140 140
141 static inline struct throtl_grp *td_root_tg(struct throtl_data *td) 141 static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142 { 142 {
143 return blkg_to_tg(td->queue->root_blkg); 143 return blkg_to_tg(td->queue->root_blkg);
144 } 144 }
145 145
146 #define throtl_log_tg(tg, fmt, args...) do { \ 146 #define throtl_log_tg(tg, fmt, args...) do { \
147 char __pbuf[128]; \ 147 char __pbuf[128]; \
148 \ 148 \
149 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ 149 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
150 blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \ 150 blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
151 } while (0) 151 } while (0)
152 152
153 #define throtl_log(td, fmt, args...) \ 153 #define throtl_log(td, fmt, args...) \
154 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 154 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
155 155
156 /* 156 /*
157 * Worker for allocating per cpu stat for tgs. This is scheduled on the 157 * Worker for allocating per cpu stat for tgs. This is scheduled on the
158 * system_wq once there are some groups on the alloc_list waiting for 158 * system_wq once there are some groups on the alloc_list waiting for
159 * allocation. 159 * allocation.
160 */ 160 */
161 static void tg_stats_alloc_fn(struct work_struct *work) 161 static void tg_stats_alloc_fn(struct work_struct *work)
162 { 162 {
163 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 163 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
164 struct delayed_work *dwork = to_delayed_work(work); 164 struct delayed_work *dwork = to_delayed_work(work);
165 bool empty = false; 165 bool empty = false;
166 166
167 alloc_stats: 167 alloc_stats:
168 if (!stats_cpu) { 168 if (!stats_cpu) {
169 stats_cpu = alloc_percpu(struct tg_stats_cpu); 169 stats_cpu = alloc_percpu(struct tg_stats_cpu);
170 if (!stats_cpu) { 170 if (!stats_cpu) {
171 /* allocation failed, try again after some time */ 171 /* allocation failed, try again after some time */
172 schedule_delayed_work(dwork, msecs_to_jiffies(10)); 172 schedule_delayed_work(dwork, msecs_to_jiffies(10));
173 return; 173 return;
174 } 174 }
175 } 175 }
176 176
177 spin_lock_irq(&tg_stats_alloc_lock); 177 spin_lock_irq(&tg_stats_alloc_lock);
178 178
179 if (!list_empty(&tg_stats_alloc_list)) { 179 if (!list_empty(&tg_stats_alloc_list)) {
180 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 180 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
181 struct throtl_grp, 181 struct throtl_grp,
182 stats_alloc_node); 182 stats_alloc_node);
183 swap(tg->stats_cpu, stats_cpu); 183 swap(tg->stats_cpu, stats_cpu);
184 list_del_init(&tg->stats_alloc_node); 184 list_del_init(&tg->stats_alloc_node);
185 } 185 }
186 186
187 empty = list_empty(&tg_stats_alloc_list); 187 empty = list_empty(&tg_stats_alloc_list);
188 spin_unlock_irq(&tg_stats_alloc_lock); 188 spin_unlock_irq(&tg_stats_alloc_lock);
189 if (!empty) 189 if (!empty)
190 goto alloc_stats; 190 goto alloc_stats;
191 } 191 }
192 192
193 static void throtl_pd_init(struct blkcg_gq *blkg) 193 static void throtl_pd_init(struct blkcg_gq *blkg)
194 { 194 {
195 struct throtl_grp *tg = blkg_to_tg(blkg); 195 struct throtl_grp *tg = blkg_to_tg(blkg);
196 unsigned long flags; 196 unsigned long flags;
197 197
198 RB_CLEAR_NODE(&tg->rb_node); 198 RB_CLEAR_NODE(&tg->rb_node);
199 tg->td = blkg->q->td; 199 tg->td = blkg->q->td;
200 bio_list_init(&tg->bio_lists[0]); 200 bio_list_init(&tg->bio_lists[0]);
201 bio_list_init(&tg->bio_lists[1]); 201 bio_list_init(&tg->bio_lists[1]);
202 202
203 tg->bps[READ] = -1; 203 tg->bps[READ] = -1;
204 tg->bps[WRITE] = -1; 204 tg->bps[WRITE] = -1;
205 tg->iops[READ] = -1; 205 tg->iops[READ] = -1;
206 tg->iops[WRITE] = -1; 206 tg->iops[WRITE] = -1;
207 207
208 /* 208 /*
209 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 209 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
210 * but percpu allocator can't be called from IO path. Queue tg on 210 * but percpu allocator can't be called from IO path. Queue tg on
211 * tg_stats_alloc_list and allocate from work item. 211 * tg_stats_alloc_list and allocate from work item.
212 */ 212 */
213 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 213 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
214 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); 214 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
215 schedule_delayed_work(&tg_stats_alloc_work, 0); 215 schedule_delayed_work(&tg_stats_alloc_work, 0);
216 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 216 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
217 } 217 }
218 218
219 static void throtl_pd_exit(struct blkcg_gq *blkg) 219 static void throtl_pd_exit(struct blkcg_gq *blkg)
220 { 220 {
221 struct throtl_grp *tg = blkg_to_tg(blkg); 221 struct throtl_grp *tg = blkg_to_tg(blkg);
222 unsigned long flags; 222 unsigned long flags;
223 223
224 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 224 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
225 list_del_init(&tg->stats_alloc_node); 225 list_del_init(&tg->stats_alloc_node);
226 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 226 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
227 227
228 free_percpu(tg->stats_cpu); 228 free_percpu(tg->stats_cpu);
229 } 229 }
230 230
231 static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 231 static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
232 { 232 {
233 struct throtl_grp *tg = blkg_to_tg(blkg); 233 struct throtl_grp *tg = blkg_to_tg(blkg);
234 int cpu; 234 int cpu;
235 235
236 if (tg->stats_cpu == NULL) 236 if (tg->stats_cpu == NULL)
237 return; 237 return;
238 238
239 for_each_possible_cpu(cpu) { 239 for_each_possible_cpu(cpu) {
240 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 240 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
241 241
242 blkg_rwstat_reset(&sc->service_bytes); 242 blkg_rwstat_reset(&sc->service_bytes);
243 blkg_rwstat_reset(&sc->serviced); 243 blkg_rwstat_reset(&sc->serviced);
244 } 244 }
245 } 245 }
246 246
247 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 247 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
248 struct blkcg *blkcg) 248 struct blkcg *blkcg)
249 { 249 {
250 /* 250 /*
251 * This is the common case when there are no blkcgs. Avoid lookup 251 * This is the common case when there are no blkcgs. Avoid lookup
252 * in this case 252 * in this case
253 */ 253 */
254 if (blkcg == &blkcg_root) 254 if (blkcg == &blkcg_root)
255 return td_root_tg(td); 255 return td_root_tg(td);
256 256
257 return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 257 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
258 } 258 }
259 259
260 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 260 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
261 struct blkcg *blkcg) 261 struct blkcg *blkcg)
262 { 262 {
263 struct request_queue *q = td->queue; 263 struct request_queue *q = td->queue;
264 struct throtl_grp *tg = NULL; 264 struct throtl_grp *tg = NULL;
265 265
266 /* 266 /*
267 * This is the common case when there are no blkcgs. Avoid lookup 267 * This is the common case when there are no blkcgs. Avoid lookup
268 * in this case 268 * in this case
269 */ 269 */
270 if (blkcg == &blkcg_root) { 270 if (blkcg == &blkcg_root) {
271 tg = td_root_tg(td); 271 tg = td_root_tg(td);
272 } else { 272 } else {
273 struct blkcg_gq *blkg; 273 struct blkcg_gq *blkg;
274 274
275 blkg = blkg_lookup_create(blkcg, q); 275 blkg = blkg_lookup_create(blkcg, q);
276 276
277 /* if %NULL and @q is alive, fall back to root_tg */ 277 /* if %NULL and @q is alive, fall back to root_tg */
278 if (!IS_ERR(blkg)) 278 if (!IS_ERR(blkg))
279 tg = blkg_to_tg(blkg); 279 tg = blkg_to_tg(blkg);
280 else if (!blk_queue_dying(q)) 280 else if (!blk_queue_dying(q))
281 tg = td_root_tg(td); 281 tg = td_root_tg(td);
282 } 282 }
283 283
284 return tg; 284 return tg;
285 } 285 }
286 286
287 static struct throtl_grp *throtl_rb_first(struct throtl_service_queue *sq) 287 static struct throtl_grp *
                                                                               288 throtl_rb_first(struct throtl_service_queue *parent_sq)
288 { 289 {
289 /* Service tree is empty */ 290 /* Service tree is empty */
290 if (!sq->nr_pending) 291 if (!parent_sq->nr_pending)
291 return NULL; 292 return NULL;
292 293
293 if (!sq->first_pending) 294 if (!parent_sq->first_pending)
294 sq->first_pending = rb_first(&sq->pending_tree); 295 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
295 296
296 if (sq->first_pending) 297 if (parent_sq->first_pending)
297 return rb_entry_tg(sq->first_pending); 298 return rb_entry_tg(parent_sq->first_pending);
298 299
299 return NULL; 300 return NULL;
300 } 301 }
301 302
302 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 303 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
303 { 304 {
304 rb_erase(n, root); 305 rb_erase(n, root);
305 RB_CLEAR_NODE(n); 306 RB_CLEAR_NODE(n);
306 } 307 }
307 308
308 static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *sq) 309 static void throtl_rb_erase(struct rb_node *n,
                                                                                    310 struct throtl_service_queue *parent_sq)
309 { 311 {
310 if (sq->first_pending == n) 312 if (parent_sq->first_pending == n)
311 sq->first_pending = NULL; 313 parent_sq->first_pending = NULL;
312 rb_erase_init(n, &sq->pending_tree); 314 rb_erase_init(n, &parent_sq->pending_tree);
313 --sq->nr_pending; 315 --parent_sq->nr_pending;
314 } 316 }
315 317
316 static void update_min_dispatch_time(struct throtl_service_queue *sq) 318 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
317 { 319 {
318 struct throtl_grp *tg; 320 struct throtl_grp *tg;
319 321
320 tg = throtl_rb_first(sq); 322 tg = throtl_rb_first(parent_sq);
321 if (!tg) 323 if (!tg)
322 return; 324 return;
323 325
324 sq->first_pending_disptime = tg->disptime; 326 parent_sq->first_pending_disptime = tg->disptime;
325 } 327 }
326 328
327 static void tg_service_queue_add(struct throtl_service_queue *sq, 329 static void tg_service_queue_add(struct throtl_grp *tg,
328 struct throtl_grp *tg) 330 struct throtl_service_queue *parent_sq)
329 { 331 {
330 struct rb_node **node = &sq->pending_tree.rb_node; 332 struct rb_node **node = &parent_sq->pending_tree.rb_node;
331 struct rb_node *parent = NULL; 333 struct rb_node *parent = NULL;
332 struct throtl_grp *__tg; 334 struct throtl_grp *__tg;
333 unsigned long key = tg->disptime; 335 unsigned long key = tg->disptime;
334 int left = 1; 336 int left = 1;
335 337
336 while (*node != NULL) { 338 while (*node != NULL) {
337 parent = *node; 339 parent = *node;
338 __tg = rb_entry_tg(parent); 340 __tg = rb_entry_tg(parent);
339 341
340 if (time_before(key, __tg->disptime)) 342 if (time_before(key, __tg->disptime))
341 node = &parent->rb_left; 343 node = &parent->rb_left;
342 else { 344 else {
343 node = &parent->rb_right; 345 node = &parent->rb_right;
344 left = 0; 346 left = 0;
345 } 347 }
346 } 348 }
347 349
348 if (left) 350 if (left)
349 sq->first_pending = &tg->rb_node; 351 parent_sq->first_pending = &tg->rb_node;
350 352
351 rb_link_node(&tg->rb_node, parent, node); 353 rb_link_node(&tg->rb_node, parent, node);
352 rb_insert_color(&tg->rb_node, &sq->pending_tree); 354 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
353 } 355 }
354 356
355 static void __throtl_enqueue_tg(struct throtl_service_queue *sq, 357 static void __throtl_enqueue_tg(struct throtl_grp *tg,
356 struct throtl_grp *tg) 358 struct throtl_service_queue *parent_sq)
357 { 359 {
358 tg_service_queue_add(sq, tg); 360 tg_service_queue_add(tg, parent_sq);
359 tg->flags |= THROTL_TG_PENDING; 361 tg->flags |= THROTL_TG_PENDING;
360 sq->nr_pending++; 362 parent_sq->nr_pending++;
361 } 363 }
362 364
363 static void throtl_enqueue_tg(struct throtl_service_queue *sq, 365 static void throtl_enqueue_tg(struct throtl_grp *tg,
364 struct throtl_grp *tg) 366 struct throtl_service_queue *parent_sq)
365 { 367 {
366 if (!(tg->flags & THROTL_TG_PENDING)) 368 if (!(tg->flags & THROTL_TG_PENDING))
367 __throtl_enqueue_tg(sq, tg); 369 __throtl_enqueue_tg(tg, parent_sq);
368 } 370 }
369 371
370 static void __throtl_dequeue_tg(struct throtl_service_queue *sq, 372 static void __throtl_dequeue_tg(struct throtl_grp *tg,
371 struct throtl_grp *tg) 373 struct throtl_service_queue *parent_sq)
372 { 374 {
373 throtl_rb_erase(&tg->rb_node, sq); 375 throtl_rb_erase(&tg->rb_node, parent_sq);
374 tg->flags &= ~THROTL_TG_PENDING; 376 tg->flags &= ~THROTL_TG_PENDING;
375 } 377 }
376 378
377 static void throtl_dequeue_tg(struct throtl_service_queue *sq, 379 static void throtl_dequeue_tg(struct throtl_grp *tg,
378 struct throtl_grp *tg) 380 struct throtl_service_queue *parent_sq)
379 { 381 {
380 if (tg->flags & THROTL_TG_PENDING) 382 if (tg->flags & THROTL_TG_PENDING)
381 __throtl_dequeue_tg(sq, tg); 383 __throtl_dequeue_tg(tg, parent_sq);
382 } 384 }
383 385
384 /* Call with queue lock held */ 386 /* Call with queue lock held */
385 static void throtl_schedule_delayed_work(struct throtl_data *td, 387 static void throtl_schedule_delayed_work(struct throtl_data *td,
386 unsigned long delay) 388 unsigned long delay)
387 { 389 {
388 struct delayed_work *dwork = &td->dispatch_work; 390 struct delayed_work *dwork = &td->dispatch_work;
389 391
390 mod_delayed_work(kthrotld_workqueue, dwork, delay); 392 mod_delayed_work(kthrotld_workqueue, dwork, delay);
391 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies); 393 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
392 } 394 }
393 395
394 static void throtl_schedule_next_dispatch(struct throtl_data *td) 396 static void throtl_schedule_next_dispatch(struct throtl_data *td)
395 { 397 {
396 struct throtl_service_queue *sq = &td->service_queue; 398 struct throtl_service_queue *sq = &td->service_queue;
397 399
398 /* any pending children left? */ 400 /* any pending children left? */
399 if (!sq->nr_pending) 401 if (!sq->nr_pending)
400 return; 402 return;
401 403
402 update_min_dispatch_time(sq); 404 update_min_dispatch_time(sq);
403 405
404 if (time_before_eq(sq->first_pending_disptime, jiffies)) 406 if (time_before_eq(sq->first_pending_disptime, jiffies))
405 throtl_schedule_delayed_work(td, 0); 407 throtl_schedule_delayed_work(td, 0);
406 else 408 else
407 throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies); 409 throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
408 } 410 }
409 411
410 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 412 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
411 { 413 {
412 tg->bytes_disp[rw] = 0; 414 tg->bytes_disp[rw] = 0;
413 tg->io_disp[rw] = 0; 415 tg->io_disp[rw] = 0;
414 tg->slice_start[rw] = jiffies; 416 tg->slice_start[rw] = jiffies;
415 tg->slice_end[rw] = jiffies + throtl_slice; 417 tg->slice_end[rw] = jiffies + throtl_slice;
416 throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 418 throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
417 rw == READ ? 'R' : 'W', tg->slice_start[rw], 419 rw == READ ? 'R' : 'W', tg->slice_start[rw],
418 tg->slice_end[rw], jiffies); 420 tg->slice_end[rw], jiffies);
419 } 421 }
420 422
421 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 423 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
422 unsigned long jiffy_end) 424 unsigned long jiffy_end)
423 { 425 {
424 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 426 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
425 } 427 }
426 428
427 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 429 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
428 unsigned long jiffy_end) 430 unsigned long jiffy_end)
429 { 431 {
430 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 432 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
431 throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", 433 throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
432 rw == READ ? 'R' : 'W', tg->slice_start[rw], 434 rw == READ ? 'R' : 'W', tg->slice_start[rw],
433 tg->slice_end[rw], jiffies); 435 tg->slice_end[rw], jiffies);
434 } 436 }
435 437
436 /* Determine if previously allocated or extended slice is complete or not */ 438 /* Determine if previously allocated or extended slice is complete or not */
437 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 439 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
438 { 440 {
439 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 441 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
440 return 0; 442 return 0;
441 443
442 return 1; 444 return 1;
443 } 445 }
444 446
445 /* Trim the used slices and adjust slice start accordingly */ 447 /* Trim the used slices and adjust slice start accordingly */
446 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 448 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
447 { 449 {
448 unsigned long nr_slices, time_elapsed, io_trim; 450 unsigned long nr_slices, time_elapsed, io_trim;
449 u64 bytes_trim, tmp; 451 u64 bytes_trim, tmp;
450 452
451 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 453 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
452 454
453 /* 455 /*
454 * If bps are unlimited (-1), then time slice don't get 456 * If bps are unlimited (-1), then time slice don't get
455 * renewed. Don't try to trim the slice if slice is used. A new 457 * renewed. Don't try to trim the slice if slice is used. A new
456 * slice will start when appropriate. 458 * slice will start when appropriate.
457 */ 459 */
458 if (throtl_slice_used(tg, rw)) 460 if (throtl_slice_used(tg, rw))
459 return; 461 return;
460 462
461 /* 463 /*
462 * A bio has been dispatched. Also adjust slice_end. It might happen 464 * A bio has been dispatched. Also adjust slice_end. It might happen
463 * that initially cgroup limit was very low resulting in high 465 * that initially cgroup limit was very low resulting in high
464 * slice_end, but later limit was bumped up and bio was dispached 466 * slice_end, but later limit was bumped up and bio was dispached
465 * sooner, then we need to reduce slice_end. A high bogus slice_end 467 * sooner, then we need to reduce slice_end. A high bogus slice_end
466 * is bad because it does not allow new slice to start. 468 * is bad because it does not allow new slice to start.
467 */ 469 */
468 470
469 throtl_set_slice_end(tg, rw, jiffies + throtl_slice); 471 throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
470 472
471 time_elapsed = jiffies - tg->slice_start[rw]; 473 time_elapsed = jiffies - tg->slice_start[rw];
472 474
473 nr_slices = time_elapsed / throtl_slice; 475 nr_slices = time_elapsed / throtl_slice;
474 476
475 if (!nr_slices) 477 if (!nr_slices)
476 return; 478 return;
477 tmp = tg->bps[rw] * throtl_slice * nr_slices; 479 tmp = tg->bps[rw] * throtl_slice * nr_slices;
478 do_div(tmp, HZ); 480 do_div(tmp, HZ);
479 bytes_trim = tmp; 481 bytes_trim = tmp;
480 482
481 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; 483 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
482 484
483 if (!bytes_trim && !io_trim) 485 if (!bytes_trim && !io_trim)
484 return; 486 return;
485 487
486 if (tg->bytes_disp[rw] >= bytes_trim) 488 if (tg->bytes_disp[rw] >= bytes_trim)
487 tg->bytes_disp[rw] -= bytes_trim; 489 tg->bytes_disp[rw] -= bytes_trim;
488 else 490 else
489 tg->bytes_disp[rw] = 0; 491 tg->bytes_disp[rw] = 0;
490 492
491 if (tg->io_disp[rw] >= io_trim) 493 if (tg->io_disp[rw] >= io_trim)
492 tg->io_disp[rw] -= io_trim; 494 tg->io_disp[rw] -= io_trim;
493 else 495 else
494 tg->io_disp[rw] = 0; 496 tg->io_disp[rw] = 0;
495 497
496 tg->slice_start[rw] += nr_slices * throtl_slice; 498 tg->slice_start[rw] += nr_slices * throtl_slice;
497 499
498 throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" 500 throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
499 " start=%lu end=%lu jiffies=%lu", 501 " start=%lu end=%lu jiffies=%lu",
500 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 502 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
501 tg->slice_start[rw], tg->slice_end[rw], jiffies); 503 tg->slice_start[rw], tg->slice_end[rw], jiffies);
502 } 504 }
503 505
504 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 506 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
505 unsigned long *wait) 507 unsigned long *wait)
506 { 508 {
507 bool rw = bio_data_dir(bio); 509 bool rw = bio_data_dir(bio);
508 unsigned int io_allowed; 510 unsigned int io_allowed;
509 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 511 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
510 u64 tmp; 512 u64 tmp;
511 513
512 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 514 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
513 515
514 /* Slice has just started. Consider one slice interval */ 516 /* Slice has just started. Consider one slice interval */
515 if (!jiffy_elapsed) 517 if (!jiffy_elapsed)
516 jiffy_elapsed_rnd = throtl_slice; 518 jiffy_elapsed_rnd = throtl_slice;
517 519
518 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 520 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
519 521
520 /* 522 /*
521 * jiffy_elapsed_rnd should not be a big value as minimum iops can be 523 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
522 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 524 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
523 * will allow dispatch after 1 second and after that slice should 525 * will allow dispatch after 1 second and after that slice should
524 * have been trimmed. 526 * have been trimmed.
525 */ 527 */
526 528
527 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 529 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
528 do_div(tmp, HZ); 530 do_div(tmp, HZ);
529 531
530 if (tmp > UINT_MAX) 532 if (tmp > UINT_MAX)
531 io_allowed = UINT_MAX; 533 io_allowed = UINT_MAX;
532 else 534 else
533 io_allowed = tmp; 535 io_allowed = tmp;
534 536
535 if (tg->io_disp[rw] + 1 <= io_allowed) { 537 if (tg->io_disp[rw] + 1 <= io_allowed) {
536 if (wait) 538 if (wait)
537 *wait = 0; 539 *wait = 0;
538 return 1; 540 return 1;
539 } 541 }
540 542
541 /* Calc approx time to dispatch */ 543 /* Calc approx time to dispatch */
542 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 544 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
543 545
544 if (jiffy_wait > jiffy_elapsed) 546 if (jiffy_wait > jiffy_elapsed)
545 jiffy_wait = jiffy_wait - jiffy_elapsed; 547 jiffy_wait = jiffy_wait - jiffy_elapsed;
546 else 548 else
547 jiffy_wait = 1; 549 jiffy_wait = 1;
548 550
549 if (wait) 551 if (wait)
550 *wait = jiffy_wait; 552 *wait = jiffy_wait;
551 return 0; 553 return 0;
552 } 554 }
553 555
554 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 556 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
555 unsigned long *wait) 557 unsigned long *wait)
556 { 558 {
557 bool rw = bio_data_dir(bio); 559 bool rw = bio_data_dir(bio);
558 u64 bytes_allowed, extra_bytes, tmp; 560 u64 bytes_allowed, extra_bytes, tmp;
559 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 561 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
560 562
561 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 563 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
562 564
563 /* Slice has just started. Consider one slice interval */ 565 /* Slice has just started. Consider one slice interval */
564 if (!jiffy_elapsed) 566 if (!jiffy_elapsed)
565 jiffy_elapsed_rnd = throtl_slice; 567 jiffy_elapsed_rnd = throtl_slice;
566 568
567 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 569 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
568 570
569 tmp = tg->bps[rw] * jiffy_elapsed_rnd; 571 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
570 do_div(tmp, HZ); 572 do_div(tmp, HZ);
571 bytes_allowed = tmp; 573 bytes_allowed = tmp;
572 574
573 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { 575 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
574 if (wait) 576 if (wait)
575 *wait = 0; 577 *wait = 0;
576 return 1; 578 return 1;
577 } 579 }
578 580
579 /* Calc approx time to dispatch */ 581 /* Calc approx time to dispatch */
580 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; 582 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
581 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 583 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
582 584
583 if (!jiffy_wait) 585 if (!jiffy_wait)
584 jiffy_wait = 1; 586 jiffy_wait = 1;
585 587
586 /* 588 /*
587 * This wait time is without taking into consideration the rounding 589 * This wait time is without taking into consideration the rounding
588 * up we did. Add that time also. 590 * up we did. Add that time also.
589 */ 591 */
590 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 592 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
591 if (wait) 593 if (wait)
592 *wait = jiffy_wait; 594 *wait = jiffy_wait;
593 return 0; 595 return 0;
594 } 596 }
595 597
596 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { 598 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
597 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) 599 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
598 return 1; 600 return 1;
599 return 0; 601 return 0;
600 } 602 }
601 603
602 /* 604 /*
603 * Returns whether one can dispatch a bio or not. Also returns approx number 605 * Returns whether one can dispatch a bio or not. Also returns approx number
604 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 606 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
605 */ 607 */
606 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 608 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
607 unsigned long *wait) 609 unsigned long *wait)
608 { 610 {
609 bool rw = bio_data_dir(bio); 611 bool rw = bio_data_dir(bio);
610 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 612 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
611 613
612 /* 614 /*
613 * Currently whole state machine of group depends on first bio 615 * Currently whole state machine of group depends on first bio
614 * queued in the group bio list. So one should not be calling 616 * queued in the group bio list. So one should not be calling
615 * this function with a different bio if there are other bios 617 * this function with a different bio if there are other bios
616 * queued. 618 * queued.
617 */ 619 */
618 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); 620 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
619 621
620 /* If tg->bps = -1, then BW is unlimited */ 622 /* If tg->bps = -1, then BW is unlimited */
621 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 623 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
622 if (wait) 624 if (wait)
623 *wait = 0; 625 *wait = 0;
624 return 1; 626 return 1;
625 } 627 }
626 628
627 /* 629 /*
628 * If previous slice expired, start a new one otherwise renew/extend 630 * If previous slice expired, start a new one otherwise renew/extend
629 * existing slice to make sure it is at least throtl_slice interval 631 * existing slice to make sure it is at least throtl_slice interval
630 * long since now. 632 * long since now.
631 */ 633 */
632 if (throtl_slice_used(tg, rw)) 634 if (throtl_slice_used(tg, rw))
633 throtl_start_new_slice(tg, rw); 635 throtl_start_new_slice(tg, rw);
634 else { 636 else {
635 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 637 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
636 throtl_extend_slice(tg, rw, jiffies + throtl_slice); 638 throtl_extend_slice(tg, rw, jiffies + throtl_slice);
637 } 639 }
638 640
639 if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 641 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
640 tg_with_in_iops_limit(tg, bio, &iops_wait)) { 642 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
641 if (wait) 643 if (wait)
642 *wait = 0; 644 *wait = 0;
643 return 1; 645 return 1;
644 } 646 }
645 647
646 max_wait = max(bps_wait, iops_wait); 648 max_wait = max(bps_wait, iops_wait);
647 649
648 if (wait) 650 if (wait)
649 *wait = max_wait; 651 *wait = max_wait;
650 652
651 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 653 if (time_before(tg->slice_end[rw], jiffies + max_wait))
652 throtl_extend_slice(tg, rw, jiffies + max_wait); 654 throtl_extend_slice(tg, rw, jiffies + max_wait);
653 655
654 return 0; 656 return 0;
655 } 657 }
656 658
657 static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, 659 static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
658 int rw) 660 int rw)
659 { 661 {
660 struct throtl_grp *tg = blkg_to_tg(blkg); 662 struct throtl_grp *tg = blkg_to_tg(blkg);
661 struct tg_stats_cpu *stats_cpu; 663 struct tg_stats_cpu *stats_cpu;
662 unsigned long flags; 664 unsigned long flags;
663 665
664 /* If per cpu stats are not allocated yet, don't do any accounting. */ 666 /* If per cpu stats are not allocated yet, don't do any accounting. */
665 if (tg->stats_cpu == NULL) 667 if (tg->stats_cpu == NULL)
666 return; 668 return;
667 669
668 /* 670 /*
669 * Disabling interrupts to provide mutual exclusion between two 671 * Disabling interrupts to provide mutual exclusion between two
670 * writes on same cpu. It probably is not needed for 64bit. Not 672 * writes on same cpu. It probably is not needed for 64bit. Not
671 * optimizing that case yet. 673 * optimizing that case yet.
672 */ 674 */
673 local_irq_save(flags); 675 local_irq_save(flags);
674 676
675 stats_cpu = this_cpu_ptr(tg->stats_cpu); 677 stats_cpu = this_cpu_ptr(tg->stats_cpu);
676 678
677 blkg_rwstat_add(&stats_cpu->serviced, rw, 1); 679 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
678 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); 680 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
679 681
680 local_irq_restore(flags); 682 local_irq_restore(flags);
681 } 683 }
682 684
683 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 685 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
684 { 686 {
685 bool rw = bio_data_dir(bio); 687 bool rw = bio_data_dir(bio);
686 688
687 /* Charge the bio to the group */ 689 /* Charge the bio to the group */
688 tg->bytes_disp[rw] += bio->bi_size; 690 tg->bytes_disp[rw] += bio->bi_size;
689 tg->io_disp[rw]++; 691 tg->io_disp[rw]++;
690 692
691 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 693 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
692 } 694 }
693 695
694 static void throtl_add_bio_tg(struct throtl_service_queue *sq, 696 static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
695 struct throtl_grp *tg, struct bio *bio) 697 struct throtl_service_queue *parent_sq)
696 { 698 {
697 bool rw = bio_data_dir(bio); 699 bool rw = bio_data_dir(bio);
698 700
699 bio_list_add(&tg->bio_lists[rw], bio); 701 bio_list_add(&tg->bio_lists[rw], bio);
700 /* Take a bio reference on tg */ 702 /* Take a bio reference on tg */
701 blkg_get(tg_to_blkg(tg)); 703 blkg_get(tg_to_blkg(tg));
702 tg->nr_queued[rw]++; 704 tg->nr_queued[rw]++;
703 tg->td->nr_queued[rw]++; 705 tg->td->nr_queued[rw]++;
704 throtl_enqueue_tg(sq, tg); 706 throtl_enqueue_tg(tg, parent_sq);
705 } 707 }
706 708
707 static void tg_update_disptime(struct throtl_service_queue *sq, 709 static void tg_update_disptime(struct throtl_grp *tg,
708 struct throtl_grp *tg) 710 struct throtl_service_queue *parent_sq)
709 { 711 {
710 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 712 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
711 struct bio *bio; 713 struct bio *bio;
712 714
713 if ((bio = bio_list_peek(&tg->bio_lists[READ]))) 715 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
714 tg_may_dispatch(tg, bio, &read_wait); 716 tg_may_dispatch(tg, bio, &read_wait);
715 717
716 if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 718 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
717 tg_may_dispatch(tg, bio, &write_wait); 719 tg_may_dispatch(tg, bio, &write_wait);
718 720
719 min_wait = min(read_wait, write_wait); 721 min_wait = min(read_wait, write_wait);
720 disptime = jiffies + min_wait; 722 disptime = jiffies + min_wait;
721 723
722 /* Update dispatch time */ 724 /* Update dispatch time */
723 throtl_dequeue_tg(sq, tg); 725 throtl_dequeue_tg(tg, parent_sq);
724 tg->disptime = disptime; 726 tg->disptime = disptime;
725 throtl_enqueue_tg(sq, tg); 727 throtl_enqueue_tg(tg, parent_sq);
726 } 728 }
727 729
728 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw, 730 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
729 struct bio_list *bl) 731 struct bio_list *bl)
730 { 732 {
731 struct bio *bio; 733 struct bio *bio;
732 734
733 bio = bio_list_pop(&tg->bio_lists[rw]); 735 bio = bio_list_pop(&tg->bio_lists[rw]);
734 tg->nr_queued[rw]--; 736 tg->nr_queued[rw]--;
735 /* Drop bio reference on blkg */ 737 /* Drop bio reference on blkg */
736 blkg_put(tg_to_blkg(tg)); 738 blkg_put(tg_to_blkg(tg));
737 739
738 BUG_ON(tg->td->nr_queued[rw] <= 0); 740 BUG_ON(tg->td->nr_queued[rw] <= 0);
739 tg->td->nr_queued[rw]--; 741 tg->td->nr_queued[rw]--;
740 742
741 throtl_charge_bio(tg, bio); 743 throtl_charge_bio(tg, bio);
742 bio_list_add(bl, bio); 744 bio_list_add(bl, bio);
743 bio->bi_rw |= REQ_THROTTLED; 745 bio->bi_rw |= REQ_THROTTLED;
744 746
745 throtl_trim_slice(tg, rw); 747 throtl_trim_slice(tg, rw);
746 } 748 }
747 749
748 static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl) 750 static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
749 { 751 {
750 unsigned int nr_reads = 0, nr_writes = 0; 752 unsigned int nr_reads = 0, nr_writes = 0;
751 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 753 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
752 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 754 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
753 struct bio *bio; 755 struct bio *bio;
754 756
755 /* Try to dispatch 75% READS and 25% WRITES */ 757 /* Try to dispatch 75% READS and 25% WRITES */
756 758
757 while ((bio = bio_list_peek(&tg->bio_lists[READ])) && 759 while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
758 tg_may_dispatch(tg, bio, NULL)) { 760 tg_may_dispatch(tg, bio, NULL)) {
759 761
760 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl); 762 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
761 nr_reads++; 763 nr_reads++;
762 764
763 if (nr_reads >= max_nr_reads) 765 if (nr_reads >= max_nr_reads)
764 break; 766 break;
765 } 767 }
766 768
767 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) && 769 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
768 tg_may_dispatch(tg, bio, NULL)) { 770 tg_may_dispatch(tg, bio, NULL)) {
769 771
770 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl); 772 tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
771 nr_writes++; 773 nr_writes++;
772 774
773 if (nr_writes >= max_nr_writes) 775 if (nr_writes >= max_nr_writes)
774 break; 776 break;
775 } 777 }
776 778
777 return nr_reads + nr_writes; 779 return nr_reads + nr_writes;
778 } 780 }
779 781
780 static int throtl_select_dispatch(struct throtl_service_queue *sq, 782 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
781 struct bio_list *bl) 783 struct bio_list *bl)
782 { 784 {
783 unsigned int nr_disp = 0; 785 unsigned int nr_disp = 0;
784 struct throtl_grp *tg; 786 struct throtl_grp *tg;
785 787
786 while (1) { 788 while (1) {
787 tg = throtl_rb_first(sq); 789 tg = throtl_rb_first(parent_sq);
788 790
789 if (!tg) 791 if (!tg)
790 break; 792 break;
791 793
792 if (time_before(jiffies, tg->disptime)) 794 if (time_before(jiffies, tg->disptime))
793 break; 795 break;
794 796
795 throtl_dequeue_tg(sq, tg); 797 throtl_dequeue_tg(tg, parent_sq);
796 798
797 nr_disp += throtl_dispatch_tg(tg, bl); 799 nr_disp += throtl_dispatch_tg(tg, bl);
798 800
799 if (tg->nr_queued[0] || tg->nr_queued[1]) 801 if (tg->nr_queued[0] || tg->nr_queued[1])
800 tg_update_disptime(sq, tg); 802 tg_update_disptime(tg, parent_sq);
801 803
802 if (nr_disp >= throtl_quantum) 804 if (nr_disp >= throtl_quantum)
803 break; 805 break;
804 } 806 }
805 807
806 return nr_disp; 808 return nr_disp;
807 } 809 }
808 810
809 /* work function to dispatch throttled bios */ 811 /* work function to dispatch throttled bios */
810 void blk_throtl_dispatch_work_fn(struct work_struct *work) 812 void blk_throtl_dispatch_work_fn(struct work_struct *work)
811 { 813 {
812 struct throtl_data *td = container_of(to_delayed_work(work), 814 struct throtl_data *td = container_of(to_delayed_work(work),
813 struct throtl_data, dispatch_work); 815 struct throtl_data, dispatch_work);
814 struct request_queue *q = td->queue; 816 struct request_queue *q = td->queue;
815 unsigned int nr_disp = 0; 817 unsigned int nr_disp = 0;
816 struct bio_list bio_list_on_stack; 818 struct bio_list bio_list_on_stack;
817 struct bio *bio; 819 struct bio *bio;
818 struct blk_plug plug; 820 struct blk_plug plug;
819 821
820 spin_lock_irq(q->queue_lock); 822 spin_lock_irq(q->queue_lock);
821 823
822 bio_list_init(&bio_list_on_stack); 824 bio_list_init(&bio_list_on_stack);
823 825
824 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 826 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
825 td->nr_queued[READ] + td->nr_queued[WRITE], 827 td->nr_queued[READ] + td->nr_queued[WRITE],
826 td->nr_queued[READ], td->nr_queued[WRITE]); 828 td->nr_queued[READ], td->nr_queued[WRITE]);
827 829
828 nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack); 830 nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
829 831
830 if (nr_disp) 832 if (nr_disp)
831 throtl_log(td, "bios disp=%u", nr_disp); 833 throtl_log(td, "bios disp=%u", nr_disp);
832 834
833 throtl_schedule_next_dispatch(td); 835 throtl_schedule_next_dispatch(td);
834 836
835 spin_unlock_irq(q->queue_lock); 837 spin_unlock_irq(q->queue_lock);
836 838
837 /* 839 /*
838 * If we dispatched some requests, unplug the queue to make sure 840 * If we dispatched some requests, unplug the queue to make sure
839 * immediate dispatch 841 * immediate dispatch
840 */ 842 */
841 if (nr_disp) { 843 if (nr_disp) {
842 blk_start_plug(&plug); 844 blk_start_plug(&plug);
843 while((bio = bio_list_pop(&bio_list_on_stack))) 845 while((bio = bio_list_pop(&bio_list_on_stack)))
844 generic_make_request(bio); 846 generic_make_request(bio);
845 blk_finish_plug(&plug); 847 blk_finish_plug(&plug);
846 } 848 }
847 } 849 }
848 850
849 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 851 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
850 struct blkg_policy_data *pd, int off) 852 struct blkg_policy_data *pd, int off)
851 { 853 {
852 struct throtl_grp *tg = pd_to_tg(pd); 854 struct throtl_grp *tg = pd_to_tg(pd);
853 struct blkg_rwstat rwstat = { }, tmp; 855 struct blkg_rwstat rwstat = { }, tmp;
854 int i, cpu; 856 int i, cpu;
855 857
856 for_each_possible_cpu(cpu) { 858 for_each_possible_cpu(cpu) {
857 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 859 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
858 860
859 tmp = blkg_rwstat_read((void *)sc + off); 861 tmp = blkg_rwstat_read((void *)sc + off);
860 for (i = 0; i < BLKG_RWSTAT_NR; i++) 862 for (i = 0; i < BLKG_RWSTAT_NR; i++)
861 rwstat.cnt[i] += tmp.cnt[i]; 863 rwstat.cnt[i] += tmp.cnt[i];
862 } 864 }
863 865
864 return __blkg_prfill_rwstat(sf, pd, &rwstat); 866 return __blkg_prfill_rwstat(sf, pd, &rwstat);
865 } 867 }
866 868
867 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, 869 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
868 struct seq_file *sf) 870 struct seq_file *sf)
869 { 871 {
870 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 872 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
871 873
872 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 874 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
873 cft->private, true); 875 cft->private, true);
874 return 0; 876 return 0;
875 } 877 }
876 878
877 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 879 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
878 int off) 880 int off)
879 { 881 {
880 struct throtl_grp *tg = pd_to_tg(pd); 882 struct throtl_grp *tg = pd_to_tg(pd);
881 u64 v = *(u64 *)((void *)tg + off); 883 u64 v = *(u64 *)((void *)tg + off);
882 884
883 if (v == -1) 885 if (v == -1)
884 return 0; 886 return 0;
885 return __blkg_prfill_u64(sf, pd, v); 887 return __blkg_prfill_u64(sf, pd, v);
886 } 888 }
887 889
888 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 890 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
889 int off) 891 int off)
890 { 892 {
891 struct throtl_grp *tg = pd_to_tg(pd); 893 struct throtl_grp *tg = pd_to_tg(pd);
892 unsigned int v = *(unsigned int *)((void *)tg + off); 894 unsigned int v = *(unsigned int *)((void *)tg + off);
893 895
894 if (v == -1) 896 if (v == -1)
895 return 0; 897 return 0;
896 return __blkg_prfill_u64(sf, pd, v); 898 return __blkg_prfill_u64(sf, pd, v);
897 } 899 }
898 900
899 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, 901 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
900 struct seq_file *sf) 902 struct seq_file *sf)
901 { 903 {
902 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, 904 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
903 &blkcg_policy_throtl, cft->private, false); 905 &blkcg_policy_throtl, cft->private, false);
904 return 0; 906 return 0;
905 } 907 }
906 908
907 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, 909 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
908 struct seq_file *sf) 910 struct seq_file *sf)
909 { 911 {
910 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, 912 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
911 &blkcg_policy_throtl, cft->private, false); 913 &blkcg_policy_throtl, cft->private, false);
912 return 0; 914 return 0;
913 } 915 }
914 916
915 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, 917 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
916 bool is_u64) 918 bool is_u64)
917 { 919 {
918 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 920 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
919 struct blkg_conf_ctx ctx; 921 struct blkg_conf_ctx ctx;
920 struct throtl_grp *tg; 922 struct throtl_grp *tg;
921 struct throtl_data *td; 923 struct throtl_data *td;
922 int ret; 924 int ret;
923 925
924 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 926 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
925 if (ret) 927 if (ret)
926 return ret; 928 return ret;
927 929
928 tg = blkg_to_tg(ctx.blkg); 930 tg = blkg_to_tg(ctx.blkg);
929 td = ctx.blkg->q->td; 931 td = ctx.blkg->q->td;
930 932
931 if (!ctx.v) 933 if (!ctx.v)
932 ctx.v = -1; 934 ctx.v = -1;
933 935
934 if (is_u64) 936 if (is_u64)
935 *(u64 *)((void *)tg + cft->private) = ctx.v; 937 *(u64 *)((void *)tg + cft->private) = ctx.v;
936 else 938 else
937 *(unsigned int *)((void *)tg + cft->private) = ctx.v; 939 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
938 940
939 throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 941 throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
940 tg->bps[READ], tg->bps[WRITE], 942 tg->bps[READ], tg->bps[WRITE],
941 tg->iops[READ], tg->iops[WRITE]); 943 tg->iops[READ], tg->iops[WRITE]);
942 944
943 /* 945 /*
944 * We're already holding queue_lock and know @tg is valid. Let's 946 * We're already holding queue_lock and know @tg is valid. Let's
945 * apply the new config directly. 947 * apply the new config directly.
946 * 948 *
947 * Restart the slices for both READ and WRITES. It might happen 949 * Restart the slices for both READ and WRITES. It might happen
948 * that a group's limit are dropped suddenly and we don't want to 950 * that a group's limit are dropped suddenly and we don't want to
949 * account recently dispatched IO with new low rate. 951 * account recently dispatched IO with new low rate.
950 */ 952 */
951 throtl_start_new_slice(tg, 0); 953 throtl_start_new_slice(tg, 0);
952 throtl_start_new_slice(tg, 1); 954 throtl_start_new_slice(tg, 1);
953 955
954 if (tg->flags & THROTL_TG_PENDING) { 956 if (tg->flags & THROTL_TG_PENDING) {
955 tg_update_disptime(&td->service_queue, tg); 957 tg_update_disptime(tg, &td->service_queue);
956 throtl_schedule_next_dispatch(td); 958 throtl_schedule_next_dispatch(td);
957 } 959 }
958 960
959 blkg_conf_finish(&ctx); 961 blkg_conf_finish(&ctx);
960 return 0; 962 return 0;
961 } 963 }
962 964
963 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, 965 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
964 const char *buf) 966 const char *buf)
965 { 967 {
966 return tg_set_conf(cgrp, cft, buf, true); 968 return tg_set_conf(cgrp, cft, buf, true);
967 } 969 }
968 970
969 static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, 971 static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
970 const char *buf) 972 const char *buf)
971 { 973 {
972 return tg_set_conf(cgrp, cft, buf, false); 974 return tg_set_conf(cgrp, cft, buf, false);
973 } 975 }
974 976
975 static struct cftype throtl_files[] = { 977 static struct cftype throtl_files[] = {
976 { 978 {
977 .name = "throttle.read_bps_device", 979 .name = "throttle.read_bps_device",
978 .private = offsetof(struct throtl_grp, bps[READ]), 980 .private = offsetof(struct throtl_grp, bps[READ]),
979 .read_seq_string = tg_print_conf_u64, 981 .read_seq_string = tg_print_conf_u64,
980 .write_string = tg_set_conf_u64, 982 .write_string = tg_set_conf_u64,
981 .max_write_len = 256, 983 .max_write_len = 256,
982 }, 984 },
983 { 985 {
984 .name = "throttle.write_bps_device", 986 .name = "throttle.write_bps_device",
985 .private = offsetof(struct throtl_grp, bps[WRITE]), 987 .private = offsetof(struct throtl_grp, bps[WRITE]),
986 .read_seq_string = tg_print_conf_u64, 988 .read_seq_string = tg_print_conf_u64,
987 .write_string = tg_set_conf_u64, 989 .write_string = tg_set_conf_u64,
988 .max_write_len = 256, 990 .max_write_len = 256,
989 }, 991 },
990 { 992 {
991 .name = "throttle.read_iops_device", 993 .name = "throttle.read_iops_device",
992 .private = offsetof(struct throtl_grp, iops[READ]), 994 .private = offsetof(struct throtl_grp, iops[READ]),
993 .read_seq_string = tg_print_conf_uint, 995 .read_seq_string = tg_print_conf_uint,
994 .write_string = tg_set_conf_uint, 996 .write_string = tg_set_conf_uint,
995 .max_write_len = 256, 997 .max_write_len = 256,
996 }, 998 },
997 { 999 {
998 .name = "throttle.write_iops_device", 1000 .name = "throttle.write_iops_device",
999 .private = offsetof(struct throtl_grp, iops[WRITE]), 1001 .private = offsetof(struct throtl_grp, iops[WRITE]),
1000 .read_seq_string = tg_print_conf_uint, 1002 .read_seq_string = tg_print_conf_uint,
1001 .write_string = tg_set_conf_uint, 1003 .write_string = tg_set_conf_uint,
1002 .max_write_len = 256, 1004 .max_write_len = 256,
1003 }, 1005 },
1004 { 1006 {
1005 .name = "throttle.io_service_bytes", 1007 .name = "throttle.io_service_bytes",
1006 .private = offsetof(struct tg_stats_cpu, service_bytes), 1008 .private = offsetof(struct tg_stats_cpu, service_bytes),
1007 .read_seq_string = tg_print_cpu_rwstat, 1009 .read_seq_string = tg_print_cpu_rwstat,
1008 }, 1010 },
1009 { 1011 {
1010 .name = "throttle.io_serviced", 1012 .name = "throttle.io_serviced",
1011 .private = offsetof(struct tg_stats_cpu, serviced), 1013 .private = offsetof(struct tg_stats_cpu, serviced),
1012 .read_seq_string = tg_print_cpu_rwstat, 1014 .read_seq_string = tg_print_cpu_rwstat,
1013 }, 1015 },
1014 { } /* terminate */ 1016 { } /* terminate */
1015 }; 1017 };
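
Each bps/iops entry above shares one read and one write handler and stashes an offsetof() into throtl_grp (or tg_stats_cpu for the stats files) in ->private, so a single handler pair can serve all four limit files. A stand-alone sketch of that offset trick, under the assumption that the handlers simply add ->private to the group pointer (the real helpers sit earlier in this file):

#include <stddef.h>
#include <stdio.h>

/* Not kernel code: field names merely mirror throtl_grp's bps[]/iops[]. */
struct tg_like {
        unsigned long long bps[2];      /* [0] = READ, [1] = WRITE */
        unsigned int iops[2];
};

static unsigned long long read_u64_at(const struct tg_like *tg, size_t off)
{
        return *(const unsigned long long *)((const char *)tg + off);
}

int main(void)
{
        struct tg_like tg = { .bps = { 1048576ULL, 2097152ULL } };

        printf("read_bps  = %llu\n",
               read_u64_at(&tg, offsetof(struct tg_like, bps[0])));
        printf("write_bps = %llu\n",
               read_u64_at(&tg, offsetof(struct tg_like, bps[1])));
        return 0;
}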
1016 1018
1017 static void throtl_shutdown_wq(struct request_queue *q) 1019 static void throtl_shutdown_wq(struct request_queue *q)
1018 { 1020 {
1019 struct throtl_data *td = q->td; 1021 struct throtl_data *td = q->td;
1020 1022
1021 cancel_delayed_work_sync(&td->dispatch_work); 1023 cancel_delayed_work_sync(&td->dispatch_work);
1022 } 1024 }
1023 1025
1024 static struct blkcg_policy blkcg_policy_throtl = { 1026 static struct blkcg_policy blkcg_policy_throtl = {
1025 .pd_size = sizeof(struct throtl_grp), 1027 .pd_size = sizeof(struct throtl_grp),
1026 .cftypes = throtl_files, 1028 .cftypes = throtl_files,
1027 1029
1028 .pd_init_fn = throtl_pd_init, 1030 .pd_init_fn = throtl_pd_init,
1029 .pd_exit_fn = throtl_pd_exit, 1031 .pd_exit_fn = throtl_pd_exit,
1030 .pd_reset_stats_fn = throtl_pd_reset_stats, 1032 .pd_reset_stats_fn = throtl_pd_reset_stats,
1031 }; 1033 };
1032 1034
1033 bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1035 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1034 { 1036 {
1035 struct throtl_data *td = q->td; 1037 struct throtl_data *td = q->td;
1036 struct throtl_grp *tg; 1038 struct throtl_grp *tg;
1037 bool rw = bio_data_dir(bio), update_disptime = true; 1039 bool rw = bio_data_dir(bio), update_disptime = true;
1038 struct blkcg *blkcg; 1040 struct blkcg *blkcg;
1039 bool throttled = false; 1041 bool throttled = false;
1040 1042
1041 if (bio->bi_rw & REQ_THROTTLED) { 1043 if (bio->bi_rw & REQ_THROTTLED) {
1042 bio->bi_rw &= ~REQ_THROTTLED; 1044 bio->bi_rw &= ~REQ_THROTTLED;
1043 goto out; 1045 goto out;
1044 } 1046 }
1045 1047
1046 /* 1048 /*
1047 * A throtl_grp pointer retrieved under rcu can be used to access 1049 * A throtl_grp pointer retrieved under rcu can be used to access
1048 * basic fields like stats and io rates. If a group has no rules, 1050 * basic fields like stats and io rates. If a group has no rules,
1049 * just update the dispatch stats in lockless manner and return. 1051 * just update the dispatch stats in lockless manner and return.
1050 */ 1052 */
1051 rcu_read_lock(); 1053 rcu_read_lock();
1052 blkcg = bio_blkcg(bio); 1054 blkcg = bio_blkcg(bio);
1053 tg = throtl_lookup_tg(td, blkcg); 1055 tg = throtl_lookup_tg(td, blkcg);
1054 if (tg) { 1056 if (tg) {
1055 if (tg_no_rule_group(tg, rw)) { 1057 if (tg_no_rule_group(tg, rw)) {
1056 throtl_update_dispatch_stats(tg_to_blkg(tg), 1058 throtl_update_dispatch_stats(tg_to_blkg(tg),
1057 bio->bi_size, bio->bi_rw); 1059 bio->bi_size, bio->bi_rw);
1058 goto out_unlock_rcu; 1060 goto out_unlock_rcu;
1059 } 1061 }
1060 } 1062 }
1061 1063
1062 /* 1064 /*
1063 * Either the group has not been allocated yet or it is not an unlimited 1065 * Either the group has not been allocated yet or it is not an unlimited
1064 * IO group. 1066 * IO group.
1065 */ 1067 */
1066 spin_lock_irq(q->queue_lock); 1068 spin_lock_irq(q->queue_lock);
1067 tg = throtl_lookup_create_tg(td, blkcg); 1069 tg = throtl_lookup_create_tg(td, blkcg);
1068 if (unlikely(!tg)) 1070 if (unlikely(!tg))
1069 goto out_unlock; 1071 goto out_unlock;
1070 1072
1071 if (tg->nr_queued[rw]) { 1073 if (tg->nr_queued[rw]) {
1072 /* 1074 /*
1073 * There is already another bio queued in the same direction. No 1075 * There is already another bio queued in the same direction. No
1074 * need to update dispatch time. 1076 * need to update dispatch time.
1075 */ 1077 */
1076 update_disptime = false; 1078 update_disptime = false;
1077 goto queue_bio; 1079 goto queue_bio;
1078 1080
1079 } 1081 }
1080 1082
1081 /* Bio is within the rate limit of the group */ 1083 /* Bio is within the rate limit of the group */
1082 if (tg_may_dispatch(tg, bio, NULL)) { 1084 if (tg_may_dispatch(tg, bio, NULL)) {
1083 throtl_charge_bio(tg, bio); 1085 throtl_charge_bio(tg, bio);
1084 1086
1085 /* 1087 /*
1086 * We need to trim the slice even when bios are not being queued; 1088 * We need to trim the slice even when bios are not being queued;
1087 * otherwise a bio might not be queued for 1089 * otherwise a bio might not be queued for
1088 * a long time, the slice keeps on extending, and trim is not 1090 * a long time, the slice keeps on extending, and trim is not
1089 * called for a long time. Now if limits are reduced suddenly, 1091 * called for a long time. Now if limits are reduced suddenly,
1090 * we take into account all the IO dispatched so far at the new 1092 * we take into account all the IO dispatched so far at the new
1091 * low rate and newly queued IO gets a really long dispatch 1093 * low rate and newly queued IO gets a really long dispatch
1092 * time. 1094 * time.
1093 * 1095 *
1094 * So keep on trimming the slice even if the bio is not queued. 1096 * So keep on trimming the slice even if the bio is not queued.
1095 */ 1097 */
1096 throtl_trim_slice(tg, rw); 1098 throtl_trim_slice(tg, rw);
1097 goto out_unlock; 1099 goto out_unlock;
1098 } 1100 }
1099 1101
1100 queue_bio: 1102 queue_bio:
1101 throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" 1103 throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1102 " iodisp=%u iops=%u queued=%d/%d", 1104 " iodisp=%u iops=%u queued=%d/%d",
1103 rw == READ ? 'R' : 'W', 1105 rw == READ ? 'R' : 'W',
1104 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 1106 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1105 tg->io_disp[rw], tg->iops[rw], 1107 tg->io_disp[rw], tg->iops[rw],
1106 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1108 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1107 1109
1108 bio_associate_current(bio); 1110 bio_associate_current(bio);
1109 throtl_add_bio_tg(&q->td->service_queue, tg, bio); 1111 throtl_add_bio_tg(bio, tg, &q->td->service_queue);
1110 throttled = true; 1112 throttled = true;
1111 1113
1112 if (update_disptime) { 1114 if (update_disptime) {
1113 tg_update_disptime(&td->service_queue, tg); 1115 tg_update_disptime(tg, &td->service_queue);
1114 throtl_schedule_next_dispatch(td); 1116 throtl_schedule_next_dispatch(td);
1115 } 1117 }
1116 1118
1117 out_unlock: 1119 out_unlock:
1118 spin_unlock_irq(q->queue_lock); 1120 spin_unlock_irq(q->queue_lock);
1119 out_unlock_rcu: 1121 out_unlock_rcu:
1120 rcu_read_unlock(); 1122 rcu_read_unlock();
1121 out: 1123 out:
1122 return throttled; 1124 return throttled;
1123 } 1125 }
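
The early REQ_THROTTLED check keeps a bio from being throttled twice: a bio that was held back and later re-issued arrives with the flag set, so blk_throtl_bio() clears it and lets the bio pass. A toy model of that handshake, assuming (it is not shown in this hunk) that the dispatch side marks the bio before re-issuing it:

#include <stdbool.h>
#include <stdio.h>

#define TOY_REQ_THROTTLED (1u << 0)     /* stand-in for REQ_THROTTLED */

struct toy_bio { unsigned int bi_rw; };

/* Mirrors the check at the top of blk_throtl_bio(): a re-issued bio carries
 * the flag, gets it cleared, and is passed through untouched. */
static bool toy_throtl_bio(struct toy_bio *bio)
{
        if (bio->bi_rw & TOY_REQ_THROTTLED) {
                bio->bi_rw &= ~TOY_REQ_THROTTLED;
                return false;                   /* not throttled again */
        }
        return true;                            /* pretend it was queued */
}

/* Assumed behaviour of the dispatch side: mark the bio before re-issuing. */
static void toy_dispatch(struct toy_bio *bio)
{
        bio->bi_rw |= TOY_REQ_THROTTLED;
}

int main(void)
{
        struct toy_bio bio = { .bi_rw = 0 };

        printf("first submit throttled: %d\n", toy_throtl_bio(&bio)); /* 1 */
        toy_dispatch(&bio);
        printf("re-issue throttled:     %d\n", toy_throtl_bio(&bio)); /* 0 */
        return 0;
}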
1124 1126
1125 /** 1127 /**
1126 * blk_throtl_drain - drain throttled bios 1128 * blk_throtl_drain - drain throttled bios
1127 * @q: request_queue to drain throttled bios for 1129 * @q: request_queue to drain throttled bios for
1128 * 1130 *
1129 * Dispatch all currently throttled bios on @q through ->make_request_fn(). 1131 * Dispatch all currently throttled bios on @q through ->make_request_fn().
1130 */ 1132 */
1131 void blk_throtl_drain(struct request_queue *q) 1133 void blk_throtl_drain(struct request_queue *q)
1132 __releases(q->queue_lock) __acquires(q->queue_lock) 1134 __releases(q->queue_lock) __acquires(q->queue_lock)
1133 { 1135 {
1134 struct throtl_data *td = q->td; 1136 struct throtl_data *td = q->td;
1135 struct throtl_service_queue *sq = &td->service_queue; 1137 struct throtl_service_queue *parent_sq = &td->service_queue;
1136 struct throtl_grp *tg; 1138 struct throtl_grp *tg;
1137 struct bio_list bl; 1139 struct bio_list bl;
1138 struct bio *bio; 1140 struct bio *bio;
1139 1141
1140 queue_lockdep_assert_held(q); 1142 queue_lockdep_assert_held(q);
1141 1143
1142 bio_list_init(&bl); 1144 bio_list_init(&bl);
1143 1145
1144 while ((tg = throtl_rb_first(sq))) { 1146 while ((tg = throtl_rb_first(parent_sq))) {
1145 throtl_dequeue_tg(sq, tg); 1147 throtl_dequeue_tg(tg, parent_sq);
1146 1148
1147 while ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1149 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1148 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl); 1150 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
1149 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 1151 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1150 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl); 1152 tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
1151 } 1153 }
1152 spin_unlock_irq(q->queue_lock); 1154 spin_unlock_irq(q->queue_lock);
1153 1155
1154 while ((bio = bio_list_pop(&bl))) 1156 while ((bio = bio_list_pop(&bl)))
1155 generic_make_request(bio); 1157 generic_make_request(bio);
1156 1158
1157 spin_lock_irq(q->queue_lock); 1159 spin_lock_irq(q->queue_lock);
1158 } 1160 }
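
blk_throtl_drain() walks the pending tree under queue_lock, moves every queued bio onto a private bio_list, and only then drops the lock to push the bios through generic_make_request(). A generic sketch of that "collect under the lock, issue after dropping it" pattern, with plain pthreads and a toy list standing in for queue_lock and bio_list:

#include <pthread.h>
#include <stdio.h>

struct item { int id; struct item *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pending;                    /* guarded by @lock */

/* Called with @lock held; drops it while issuing and re-takes it before
 * returning -- the contract blk_throtl_drain() documents with its
 * __releases()/__acquires() annotations. */
static void drain_locked(void)
{
        struct item *it, *local = pending;      /* steal the whole list */

        pending = NULL;
        pthread_mutex_unlock(&lock);            /* don't issue under the lock */

        for (it = local; it; it = it->next)
                printf("issuing item %d\n", it->id);

        pthread_mutex_lock(&lock);
}

int main(void)
{
        struct item b = { .id = 2 }, a = { .id = 1, .next = &b };

        pthread_mutex_lock(&lock);
        pending = &a;
        drain_locked();
        pthread_mutex_unlock(&lock);
        return 0;
}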
1159 1161
1160 int blk_throtl_init(struct request_queue *q) 1162 int blk_throtl_init(struct request_queue *q)
1161 { 1163 {
1162 struct throtl_data *td; 1164 struct throtl_data *td;
1163 int ret; 1165 int ret;
1164 1166
1165 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1167 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1166 if (!td) 1168 if (!td)
1167 return -ENOMEM; 1169 return -ENOMEM;
1168 1170
1169 td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER; 1171 td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER;
1170 INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1172 INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1171 1173
1172 q->td = td; 1174 q->td = td;
1173 td->queue = q; 1175 td->queue = q;
1174 1176
1175 /* activate policy */ 1177 /* activate policy */
1176 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 1178 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1177 if (ret) 1179 if (ret)
1178 kfree(td); 1180 kfree(td);
1179 return ret; 1181 return ret;
1180 } 1182 }
1181 1183
1182 void blk_throtl_exit(struct request_queue *q) 1184 void blk_throtl_exit(struct request_queue *q)
1183 { 1185 {
1184 BUG_ON(!q->td); 1186 BUG_ON(!q->td);
1185 throtl_shutdown_wq(q); 1187 throtl_shutdown_wq(q);
1186 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1188 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1187 kfree(q->td); 1189 kfree(q->td);
1188 } 1190 }
1189 1191
1190 static int __init throtl_init(void) 1192 static int __init throtl_init(void)
1191 { 1193 {
1192 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1194 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1193 if (!kthrotld_workqueue) 1195 if (!kthrotld_workqueue)
1194 panic("Failed to create kthrotld\n"); 1196 panic("Failed to create kthrotld\n");
1195 1197
1196 return blkcg_policy_register(&blkcg_policy_throtl); 1198 return blkcg_policy_register(&blkcg_policy_throtl);
1197 } 1199 }
1198 1200
1199 module_init(throtl_init); 1201 module_init(throtl_init);
1200 1202